Add specializations for the Steam store and GOG

This commit is contained in:
Viktor Lofgren 2024-12-11 18:32:45 +01:00
parent e65d75a0f9
commit 0a53ac68a0
21 changed files with 7821 additions and 30 deletions

View File

@ -150,7 +150,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.length = length;
ret.standard = standard;
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.title = specialization.getTitle(doc, dld, crawledDocument.url);
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
@ -29,8 +30,8 @@ import java.util.stream.Collectors;
public class BlogSpecialization extends DefaultSpecialization {
@Inject
public BlogSpecialization(SummaryExtractor summaryExtractor) {
super(summaryExtractor);
public BlogSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override

View File

@ -2,8 +2,12 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
@ -14,10 +18,12 @@ import java.util.Set;
public class DefaultSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
private final SummaryExtractor summaryExtractor;
private final TitleExtractor titleExtractor;
@Inject
public DefaultSpecialization(SummaryExtractor summaryExtractor) {
public DefaultSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
this.summaryExtractor = summaryExtractor;
this.titleExtractor = titleExtractor;
}
@Override
@ -46,4 +52,14 @@ public class DefaultSpecialization implements HtmlProcessorSpecializations.HtmlP
return summaryExtractor.extractSummary(doc, cleanedWords);
}
@Override
public String getTitle(Document original, DocumentLanguageData dld, String url) {
return titleExtractor.getTitleAbbreviated(original, dld, url);
}
public boolean shouldIndex(EdgeUrl url) { return true; }
public double lengthModifier() { return 1.0; }
public void amendWords(Document doc, DocumentKeywordsBuilder words) {}
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.util.Set;
/**
 * Processing rules for game pages on the GOG store.
 *
 * <p>Title and summary are taken from dedicated page elements when they are
 * present and non-blank; otherwise the default extraction inherited from
 * {@link DefaultSpecialization} is used.
 */
@Singleton
public class GogStoreSpecialization extends DefaultSpecialization {

    @Inject
    public GogStoreSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
        super(summaryExtractor, titleExtractor);
    }

    /**
     * Applies the default pruning and then removes every element matching
     * {@code .age-gate} from the result.
     */
    @Override
    public Document prune(Document original) {
        var pruned = super.prune(original);
        pruned.select(".age-gate").remove();
        return pruned;
    }

    /**
     * Returns the text of the first {@code .description} element, truncated to
     * 255 characters. Falls back to the default summary when no such element
     * exists or when its text is blank, so an empty description block does not
     * produce an empty summary.
     */
    @Override
    public String getSummary(Document original,
                             Set<String> importantWords) {
        var desc = original.select(".description").first();
        if (desc != null) {
            String text = desc.text();
            if (!text.isBlank()) {
                return StringUtils.truncate(text, 255);
            }
        }
        return super.getSummary(original, importantWords);
    }

    /**
     * Returns the text of the first {@code .productcard-basics__title} element,
     * truncated to 128 characters. Falls back to the default title extraction
     * when the element is missing or its text is blank.
     */
    @Override
    public String getTitle(Document original, DocumentLanguageData dld, String url) {
        var productTitle = original.select(".productcard-basics__title").first();
        if (productTitle != null) {
            String text = productTitle.text();
            if (!text.isBlank()) {
                return StringUtils.truncate(text, 128);
            }
        }
        return super.getTitle(original, dld, url);
    }

    /**
     * Only paths beginning with {@code /en/game/} are indexed.
     *
     * <p>NOTE(review): the dispatcher routes any {@code www.gog.com} path
     * containing {@code /game/} to this class, so game pages under other
     * locale prefixes are rejected here; confirm that is intended.
     */
    @Override
    public boolean shouldIndex(EdgeUrl url) {
        return url.path.startsWith("/en/game/");
    }
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
@ -19,8 +20,10 @@ public class HtmlProcessorSpecializations {
private final PhpBBSpecialization phpBBSpecialization;
private final JavadocSpecialization javadocSpecialization;
private final MariadbKbSpecialization mariadbKbSpecialization;
private final SteamStoreSpecialization steamStoreSpecialization;
private final WikiSpecialization wikiSpecialization;
private final BlogSpecialization blogSpecialization;
private final GogStoreSpecialization gogStoreSpecialization;
private final DefaultSpecialization defaultSpecialization;
@Inject
@ -30,8 +33,10 @@ public class HtmlProcessorSpecializations {
PhpBBSpecialization phpBBSpecialization,
JavadocSpecialization javadocSpecialization,
MariadbKbSpecialization mariadbKbSpecialization,
SteamStoreSpecialization steamStoreSpecialization,
WikiSpecialization wikiSpecialization,
BlogSpecialization blogSpecialization,
GogStoreSpecialization gogStoreSpecialization,
DefaultSpecialization defaultSpecialization) {
this.domainTypes = domainTypes;
this.lemmySpecialization = lemmySpecialization;
@ -39,8 +44,10 @@ public class HtmlProcessorSpecializations {
this.phpBBSpecialization = phpBBSpecialization;
this.javadocSpecialization = javadocSpecialization;
this.mariadbKbSpecialization = mariadbKbSpecialization;
this.steamStoreSpecialization = steamStoreSpecialization;
this.wikiSpecialization = wikiSpecialization;
this.blogSpecialization = blogSpecialization;
this.gogStoreSpecialization = gogStoreSpecialization;
this.defaultSpecialization = defaultSpecialization;
}
@ -59,6 +66,14 @@ public class HtmlProcessorSpecializations {
return mariadbKbSpecialization;
}
if (url.domain.toString().equals("store.steampowered.com")) {
return steamStoreSpecialization;
}
if (url.domain.toString().equals("www.gog.com") && url.path.contains("/game/")) {
return gogStoreSpecialization;
}
if (generator.keywords().contains("lemmy")) {
return lemmySpecialization;
}
@ -86,11 +101,11 @@ public class HtmlProcessorSpecializations {
Document prune(Document original);
String getSummary(Document original,
Set<String> importantWords);
String getTitle(Document original, DocumentLanguageData dld, String url);
default boolean shouldIndex(EdgeUrl url) { return true; }
default double lengthModifier() { return 1.0; }
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
boolean shouldIndex(EdgeUrl url);
double lengthModifier();
void amendWords(Document doc, DocumentKeywordsBuilder words);
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
@ -15,8 +16,8 @@ public class JavadocSpecialization extends DefaultSpecialization {
private static final Logger logger = LoggerFactory.getLogger(JavadocSpecialization.class);
@Inject
public JavadocSpecialization(SummaryExtractor summaryExtractor) {
super(summaryExtractor);
public JavadocSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@ -11,12 +12,13 @@ import java.util.Set;
/** This class is used to specify how to process a website running Lemmy */
@Singleton
public class LemmySpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
public class LemmySpecialization extends DefaultSpecialization {
private static final Logger logger = LoggerFactory.getLogger(LemmySpecialization.class);
private final SummaryExtractor summaryExtractor;
@Inject
public LemmySpecialization(SummaryExtractor summaryExtractor) {
public LemmySpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
this.summaryExtractor = summaryExtractor;
}

View File

@ -2,22 +2,25 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@Singleton
public class MariadbKbSpecialization extends DefaultSpecialization {
private static final Logger logger = LoggerFactory.getLogger(MariadbKbSpecialization.class);
@Inject
public MariadbKbSpecialization(SummaryExtractor summaryExtractor) {
super(summaryExtractor);
public MariadbKbSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override

View File

@ -2,8 +2,9 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -12,8 +13,8 @@ public class PhpBBSpecialization extends DefaultSpecialization {
private static final Logger logger = LoggerFactory.getLogger(PhpBBSpecialization.class);
@Inject
public PhpBBSpecialization(SummaryExtractor summaryExtractor) {
super(summaryExtractor);
public PhpBBSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override

View File

@ -0,0 +1,75 @@
package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.util.Set;
@Singleton
public class SteamStoreSpecialization extends DefaultSpecialization {
@Inject
public SteamStoreSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override
public Document prune(Document original) {
var glanceCtn = original.select(".glance_ctn").clone().first();
var gameAreaDesc= original.select("#game_area_description").clone().first();
var appHubName = original.select("#appHubAppName").clone().first();
var newDoc = new Document(original.baseUri());
var title = newDoc.head().appendElement("title");
var bodyTag = newDoc.appendElement("body");
if (appHubName != null) {
title.appendText(appHubName.text());
bodyTag.appendChild(appHubName);
}
if (glanceCtn != null) {
bodyTag.appendChild(glanceCtn);
}
if (gameAreaDesc != null) {
bodyTag.appendChild(gameAreaDesc);
}
return newDoc;
}
public String getTitle(Document original, DocumentLanguageData dld, String url) {
var appHubName = original.select("#appHubAppName").first();
if (appHubName != null) {
return StringUtils.truncate(appHubName.text(), 128);
}
return super.getTitle(original, dld, url);
}
@Override
public String getSummary(Document original, Set<String> importantWords) {
// Trust wikis to generate a useful summary
var gameDesc = original.select(".game_description_snippet");
gameDesc = gameDesc.clone();
gameDesc.select("h2").remove();
String desc = gameDesc.text();
if (!desc.isBlank()) {
return StringUtils.truncate(desc, 255);
}
else {
return super.getSummary(original, importantWords);
}
}
@Override
public boolean shouldIndex(EdgeUrl url) {
return url.path.startsWith("/app/");
}
}

View File

@ -2,9 +2,10 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@ -15,8 +16,8 @@ import java.util.Set;
public class WikiSpecialization extends DefaultSpecialization {
@Inject
public WikiSpecialization(SummaryExtractor summaryExtractor) {
super(summaryExtractor);
public WikiSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
}
@Override

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@ -10,12 +11,13 @@ import org.slf4j.LoggerFactory;
import java.util.Set;
@Singleton
public class XenForoSpecialization implements HtmlProcessorSpecializations.HtmlProcessorSpecializationIf {
public class XenForoSpecialization extends DefaultSpecialization {
private static final Logger logger = LoggerFactory.getLogger(XenForoSpecialization.class);
private final SummaryExtractor summaryExtractor;
@Inject
public XenForoSpecialization(SummaryExtractor summaryExtractor) {
public XenForoSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
super(summaryExtractor, titleExtractor);
this.summaryExtractor = summaryExtractor;
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -3,13 +3,13 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertFalse;
class BlogSpecializationTest {
@Test
void shouldIndex() throws Exception {
var spec = new BlogSpecialization(null);
var spec = new BlogSpecialization(null, null);
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/22/")));
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/2023/00/")));
assertFalse(spec.shouldIndex(new EdgeUrl("https://blog.marginalia.nu/00/22/")));

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.Set;
/**
 * Smoke tests for {@link GogStoreSpecialization} run against the saved page
 * {@code html/gog-store.html}. The tests print their results and make no
 * assertions; they only fail if the code under test throws.
 */
class GogSpecializationTest {
    static GogStoreSpecialization specialization;
    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();

    String storePage = CommonTestData.loadTestData("html/gog-store.html");

    @BeforeAll
    public static void setUpAll() {
        var summaryExtractor = new SummaryExtractor(255,
                null,
                null,
                null,
                null,
                null);
        var titleExtractor = new TitleExtractor(128);

        specialization = new GogStoreSpecialization(summaryExtractor, titleExtractor);
    }

    /** Prints the pruned form of the sample page. */
    @Test
    void prune() {
        var parsedPage = Jsoup.parse(storePage);
        var prunedPage = specialization.prune(parsedPage);
        System.out.println(prunedPage);
    }

    /** Prints the generator detected for the sample page. */
    @Test
    void generatorExtraction() throws Exception {
        var pageUrl = new EdgeUrl("https://www.example.com/");
        var parsedPage = Jsoup.parse(storePage);
        var headers = new DocumentHeaders("");

        var gen = generatorExtractor.detectGenerator(pageUrl, parsedPage, headers);
        System.out.println(gen);
    }

    /** Prints the summary extracted from the sample page. */
    @Test
    void getSummary() {
        var parsedPage = Jsoup.parse(storePage);
        String summary = specialization.getSummary(parsedPage, Set.of(""));
        System.out.println(summary);
    }
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
@ -26,7 +27,8 @@ class JavadocSpecializationTest {
null,
null,
null,
null));
null),
new TitleExtractor(128));
}
@Test

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
@ -29,7 +30,8 @@ class LemmySpecializationTest {
null,
null,
null,
null));
null),
new TitleExtractor(128));
}
@Test

View File

@ -0,0 +1,53 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.Set;
/**
 * Smoke tests for {@link SteamStoreSpecialization} run against the saved page
 * {@code html/steam-store.html}. The tests print their results and make no
 * assertions; they only fail if the code under test throws.
 */
class SteamSpecializationTest {
    static SteamStoreSpecialization specialization;
    static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();

    String storePage = CommonTestData.loadTestData("html/steam-store.html");

    @BeforeAll
    public static void setUpAll() {
        var summaryExtractor = new SummaryExtractor(255,
                null,
                null,
                null,
                null,
                null);
        var titleExtractor = new TitleExtractor(128);

        specialization = new SteamStoreSpecialization(summaryExtractor, titleExtractor);
    }

    /** Prints the pruned form of the sample page. */
    @Test
    void prune() {
        var parsedPage = Jsoup.parse(storePage);
        var prunedPage = specialization.prune(parsedPage);
        System.out.println(prunedPage);
    }

    /** Prints the generator detected for the sample page. */
    @Test
    void generatorExtraction() throws Exception {
        var pageUrl = new EdgeUrl("https://www.example.com/");
        var parsedPage = Jsoup.parse(storePage);
        var headers = new DocumentHeaders("");

        var gen = generatorExtractor.detectGenerator(pageUrl, parsedPage, headers);
        System.out.println(gen);
    }

    /** Prints the summary extracted from the sample page. */
    @Test
    void getSummary() {
        var parsedPage = Jsoup.parse(storePage);
        String summary = specialization.getSummary(parsedPage, Set.of(""));
        System.out.println(summary);
    }
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.test.CommonTestData;
import org.jsoup.Jsoup;
@ -23,7 +24,9 @@ class WikiSpecializationTest {
null,
null,
null,
null));
null),
new TitleExtractor(128)
);
}
@Test

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization;
import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.converting.processor.logic.TitleExtractor;
import nu.marginalia.converting.processor.summary.SummaryExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.test.CommonTestData;
@ -27,7 +28,9 @@ class XenForoSpecializationTest {
null,
null,
null,
null));
null),
new TitleExtractor(128)
);
}
@Test