From ca1807caae00de5371085299d93d38bb181b2852 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 7 Jan 2025 15:41:05 +0100
Subject: [PATCH] (specialization) Add new specialization for cppreference.com

Give this reference website some synthetically generated tokens to improve
the likelihood of a good match.
---
 .../CppreferenceSpecialization.java           | 198 ++++++++++++++++++
 .../HtmlProcessorSpecializations.java         |  12 +-
 .../CppreferenceSpecializationTest.java       |  46 ++++
 3 files changed, 255 insertions(+), 1 deletion(-)
 create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java
 create mode 100644 code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java

diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java
new file mode 100644
index 00000000..dca53b8e
--- /dev/null
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java
@@ -0,0 +1,198 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.TitleExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Specialization for pages on cppreference.com.  Prunes a fixed set of
+ * elements, prefers the element after the declaration table as summary, and
+ * appends synthetic tokens derived from the page title to the title.
+ */
+@Singleton
+public class CppreferenceSpecialization extends WikiSpecialization {
+
+    /** Separates the documented name(s) from the site name in a page title. */
+    private static final String SITE_SUFFIX_SEPARATOR = " - ";
+
+    @Inject
+    public CppreferenceSpecialization(SummaryExtractor summaryExtractor,
+                                      TitleExtractor titleExtractor) {
+        super(summaryExtractor, titleExtractor);
+    }
+
+    /**
+     * Returns a pruned clone of {@code original}; the argument is not modified.
+     * Elements with the classes listed below are removed, and the tokens from
+     * {@link #extractExtraTokens(String)} are appended to the title.
+     */
+    @Override
+    public Document prune(Document original) {
+        var doc = original.clone();
+
+        doc.getElementsByClass("t-nv").remove();
+        doc.getElementsByClass("toc").remove();
+        doc.getElementsByClass("mw-head").remove();
+        doc.getElementsByClass("printfooter").remove();
+        doc.getElementsByClass("cpp-footer-base").remove();
+
+        List<String> extraTokens = extractExtraTokens(doc.title());
+        if (!extraTokens.isEmpty()) {
+            doc.title(doc.title() + " " + String.join(" ", extraTokens));
+        }
+
+        return doc;
+    }
+
+    /**
+     * Uses the text of the element following the first {@code t-dcl-begin}
+     * element as the summary; defers to the superclass when that element is
+     * missing or has no text.
+     */
+    @Override
+    public String getSummary(Document doc, Set<String> importantWords) {
+        Element declTable = doc.getElementsByClass("t-dcl-begin").first();
+        if (declTable != null) {
+            Element nextPar = declTable.nextElementSibling();
+            if (nextPar != null && !nextPar.text().isBlank()) {
+                return nextPar.text();
+            }
+        }
+
+        return super.getSummary(doc, importantWords);
+    }
+
+    /**
+     * Derives extra search tokens from a page title such as
+     * {@code "std::multimap<K,T>::crend - cppreference.com"}.
+     * <p>
+     * The site suffix and template argument lists are dropped, the remainder
+     * is split on commas, and for each qualified name the full name is emitted;
+     * names starting with {@code std::} additionally yield the name without
+     * that prefix and, when a further {@code ::} follows, the parts on either
+     * side of it.
+     *
+     * @param title the page title, may be {@code null}
+     * @return the distinct tokens in first-seen order; empty when the title
+     *         has no {@code ::} or no {@code -}
+     */
+    public List<String> extractExtraTokens(String title) {
+        if (title == null || !title.contains("::")) {
+            return List.of();
+        }
+
+        String names;
+        int suffixStart = title.lastIndexOf(SITE_SUFFIX_SEPARATOR);
+        if (suffixStart >= 0) {
+            // Cut at the last " - " so that operator->, operator-= and
+            // operator-- are not mistaken for the start of the site suffix
+            names = title.substring(0, suffixStart);
+        } else if (title.contains("-")) {
+            names = StringUtils.split(title, '-')[0];
+        } else {
+            return List.of();
+        }
+
+        // A set, since comma-separated names usually share their class prefix
+        Set<String> tokens = new LinkedHashSet<>();
+
+        for (String part : stripTemplateArguments(names).split("\\s*,\\s*")) {
+            // Trim before looking for ')' so trailing whitespace can't hide it
+            String symbol = stripArgumentList(part.trim());
+            if (!symbol.contains("::")) {
+                continue;
+            }
+
+            tokens.add(symbol);
+            if (symbol.startsWith("std::")) {
+                tokens.add(symbol.substring(5));
+
+                int ss = symbol.indexOf("::", 5);
+                if (ss > 0) {
+                    tokens.add(symbol.substring(0, ss));
+                    tokens.add(symbol.substring(ss + 2));
+                }
+            }
+        }
+
+        tokens.remove("");
+        return new ArrayList<>(tokens);
+    }
+
+    /** Turns {@code foo(...)} into {@code foo} and {@code (foo)} into {@code foo}; {@code foo()} is kept. */
+    private static String stripArgumentList(String part) {
+        if (!part.endsWith(")") || part.endsWith("()")) {
+            return part;
+        }
+
+        int parenStart = part.indexOf('(');
+        if (parenStart > 0) {
+            return part.substring(0, parenStart).trim();
+        }
+        if (parenStart == 0) {
+            return part.substring(1, part.length() - 1).trim();
+        }
+        return part;
+    }
+
+    /** Removes balanced, possibly nested {@code <...>} lists that follow an identifier. */
+    private static String stripTemplateArguments(String name) {
+        StringBuilder out = new StringBuilder(name.length());
+
+        int i = 0;
+        while (i < name.length()) {
+            char c = name.charAt(i);
+            if (c == '<' && opensTemplateList(out)) {
+                int close = findClosingAngle(name, i);
+                if (close > i) {
+                    i = close + 1;
+                    continue;
+                }
+            }
+            out.append(c);
+            i++;
+        }
+
+        return out.toString();
+    }
+
+    /** True if a {@code <} after {@code preceding} starts template arguments rather than an operator name. */
+    private static boolean opensTemplateList(CharSequence preceding) {
+        int len = preceding.length();
+        if (len == 0) {
+            return false;
+        }
+
+        char prev = preceding.charAt(len - 1);
+        if (!Character.isLetterOrDigit(prev) && prev != '_') {
+            return false;
+        }
+
+        // operator<, operator<< and operator<=> are names, not template lists
+        return !preceding.toString().endsWith("operator");
+    }
+
+    /** Index of the {@code >} matching the {@code <} at {@code open}, or -1 if it is never closed. */
+    private static int findClosingAngle(String name, int open) {
+        int depth = 0;
+        for (int j = open; j < name.length(); j++) {
+            char c = name.charAt(j);
+            if (c == '<') {
+                depth++;
+            } else if (c == '>' && --depth == 0) {
+                return j;
+            }
+        }
+        return -1;
+    }
+
+}
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java
index b9fe6390..443352f0 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java
@@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
     private final WikiSpecialization wikiSpecialization;
     private final BlogSpecialization blogSpecialization;
     private final GogStoreSpecialization gogStoreSpecialization;
+    private final CppreferenceSpecialization cppreferenceSpecialization;
     private final DefaultSpecialization defaultSpecialization;
 
     @Inject
@@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
                                         WikiSpecialization wikiSpecialization,
                                         BlogSpecialization blogSpecialization,
                                         GogStoreSpecialization gogStoreSpecialization,
+                                        CppreferenceSpecialization cppreferenceSpecialization,
                                         DefaultSpecialization defaultSpecialization) {
         this.domainTypes = domainTypes;
         this.lemmySpecialization = lemmySpecialization;
@@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
         this.wikiSpecialization = wikiSpecialization;
         this.blogSpecialization = blogSpecialization;
         this.gogStoreSpecialization = gogStoreSpecialization;
+        this.cppreferenceSpecialization = cppreferenceSpecialization;
         this.defaultSpecialization = defaultSpecialization;
     }
 
@@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
             return mariadbKbSpecialization;
         }
 
+        if (url.domain.getTopDomain().equals("cppreference.com")) {
+            return cppreferenceSpecialization;
+        }
+
         if (url.domain.toString().equals("store.steampowered.com")) {
             return steamStoreSpecialization;
         }
@@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
         if (generator.keywords().contains("javadoc")) {
             return javadocSpecialization;
         }
+
+        // Must be toward the end, as some specializations are for
+        // wiki-generator content
         if (generator.type() == GeneratorType.WIKI) {
             return wikiSpecialization;
         }
@@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
         boolean shouldIndex(EdgeUrl url);
         double lengthModifier();
 
-        void amendWords(Document doc, DocumentKeywordsBuilder words);
+        default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
     }
 
 }
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java
new file mode 100644
index 00000000..3f73b31a
--- /dev/null
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java
@@ -0,0 +1,46 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+class CppreferenceSpecializationTest {
+    CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
+
+    @Test
+    public void testTitleMagic() {
+
+        List<String> ret;
+
+        ret = specialization.extractExtraTokens("std::multimap::crend - cppreference.com");
+        Assertions.assertTrue(ret.contains("std::multimap::crend"));
+        Assertions.assertTrue(ret.contains("multimap::crend"));
+        Assertions.assertTrue(ret.contains("std::multimap"));
+        Assertions.assertTrue(ret.contains("crend"));
+
+        ret = specialization.extractExtraTokens("std::coroutine_handle::operator(), std::coroutine_handle::resume - cppreference.com");
+        Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
+        Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
+    }
+
+    @Test
+    public void testTemplateArgumentsAreStripped() {
+        List<String> ret = specialization.extractExtraTokens("std::vector<std::pair<int,int>>::size - cppreference.com");
+        Assertions.assertEquals(List.of("std::vector::size", "vector::size", "std::vector", "size"), ret);
+    }
+
+    @Test
+    public void testArrowOperatorIsNotMistakenForSiteSuffix() {
+        List<String> ret = specialization.extractExtraTokens("std::unique_ptr<T,Deleter>::operator*, std::unique_ptr<T,Deleter>::operator-> - cppreference.com");
+        Assertions.assertTrue(ret.contains("std::unique_ptr::operator*"));
+        Assertions.assertTrue(ret.contains("std::unique_ptr::operator->"));
+    }
+
+    @Test
+    public void testArgumentListIsStripped() {
+        List<String> ret = specialization.extractExtraTokens("std::swap(std::vector) - cppreference.com");
+        Assertions.assertEquals(List.of("std::swap", "swap"), ret);
+    }
+
+}