diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index 7133b87a..b87b1193 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -233,9 +233,19 @@ public class QueryParser { entity.replace(new QueryToken.RankTerm(limit, str)); } else if (str.startsWith("qs=")) { entity.replace(new QueryToken.QsTerm(str.substring(3))); - } else if (str.contains(":")) { + } else if (str.startsWith("site:") + || str.startsWith("format:") + || str.startsWith("file:") + || str.startsWith("tld:") + || str.startsWith("ip:") + || str.startsWith("as:") + || str.startsWith("asn:") + || str.startsWith("generator:") + ) + { entity.replace(new QueryToken.AdviceTerm(str, t.displayStr())); } + } private static SpecificationLimit parseSpecificationLimit(String str) { diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 26a9928d..7bbcdfb7 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -208,6 +208,12 @@ public class QueryFactoryTest { System.out.println(subquery); } + @Test + public void testCplusPlus() { + var subquery = parseAndGetSpecs("std::vector::push_back vector"); + System.out.println(subquery); + } + @Test public void testQuotedApostrophe() { var subquery = parseAndGetSpecs("\"bob's cars\""); diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java 
b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java index 0644cf76..ce7fb3b4 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java @@ -152,7 +152,10 @@ public class DocumentPositionMapper { } boolean matchesWordPattern(String s) { - // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} + if (s.length() > 48) + return false; + + // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,8} String wordPartSeparator = ".-_/:+*"; @@ -169,7 +172,7 @@ public class DocumentPositionMapper { if (i == 0) return false; - for (int j = 0; j < 5; j++) { + for (int j = 0; j < 8; j++) { if (i == s.length()) return true; if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java index a00dd3ae..533b0993 100644 --- a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java @@ -30,9 +30,11 @@ class DocumentPositionMapperTest { Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef")); Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test")); - Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test")); + Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test")); 
Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24")); Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector")); + Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back")); + Assertions.assertTrue(positionMapper.matchesWordPattern("c++")); Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h")); Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java
new file mode 100644
index 00000000..dca53b8e
--- /dev/null
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecialization.java
@@ -0,0 +1,157 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.converting.processor.logic.TitleExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.logging.log4j.util.Strings;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Processor specialization for pages on cppreference.com.
+ *
+ * <p>It strips site chrome from the document, takes the summary from the element
+ * following the declaration table, and appends shortened forms of the qualified
+ * C++ names in the title to the title itself.
+ */
+@Singleton
+public class CppreferenceSpecialization extends WikiSpecialization {
+
+    /** Separates the page name from the site name in a page title. */
+    private static final String SITE_SEPARATOR = " - ";
+
+    @Inject
+    public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
+        super(summaryExtractor, titleExtractor);
+    }
+
+    /**
+     * Returns a pruned clone of {@code original}; the argument itself is not modified.
+     *
+     * <p>Elements carrying the classes listed below are removed, and the tokens from
+     * {@link #extractExtraTokens(String)} are appended to the title.
+     */
+    @Override
+    public Document prune(Document original) {
+        var doc = original.clone();
+
+        doc.getElementsByClass("t-nv").remove();
+        doc.getElementsByClass("toc").remove();
+        doc.getElementsByClass("mw-head").remove();
+        doc.getElementsByClass("printfooter").remove();
+        doc.getElementsByClass("cpp-footer-base").remove();
+
+        doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
+
+        return doc;
+    }
+
+    /**
+     * Returns the text of the element that follows the first {@code t-dcl-begin}
+     * element, or the inherited summary when either of the two is missing.
+     */
+    @Override
+    public String getSummary(Document doc, Set importantWords) {
+        // NOTE(review): importantWords is kept raw, exactly as it appears in this patch;
+        // it should carry the same type argument as the overridden method - confirm.
+
+        Element declTable = doc.getElementsByClass("t-dcl-begin").first();
+        if (declTable != null) {
+            var nextPar = declTable.nextElementSibling();
+            if (nextPar != null) {
+                return nextPar.text();
+            }
+        }
+
+        return super.getSummary(doc, importantWords);
+    }
+
+    /**
+     * Derives additional search tokens from a page title such as
+     * {@code "std::multimap::crend - cppreference.com"}.
+     *
+     * <p>The site name is cut off, template argument lists are dropped, and the rest
+     * is split on commas.  Every part containing {@code ::} is added as-is; a part
+     * starting with {@code std::} additionally yields the name without that prefix
+     * and, when a further {@code ::} follows, the text before and after it.
+     *
+     * @param title the page title
+     * @return the derived tokens in encounter order, possibly with duplicates; empty
+     *         when the title contains no {@code ::} or no hyphen
+     */
+    public List<String> extractExtraTokens(String title) {
+        if (!title.contains("::") || !title.contains("-")) {
+            return List.of();
+        }
+
+        // Cut at the LAST " - " so hyphens inside the page name (operator->,
+        // operator-=, operator--) survive.  Titles lacking that separator are cut
+        // at the first hyphen, as before.
+        int siteStart = title.lastIndexOf(SITE_SEPARATOR);
+        String name = siteStart >= 0
+                ? title.substring(0, siteStart)
+                : StringUtils.split(title, '-')[0];
+
+        name = stripTemplateArguments(name);
+
+        List<String> tokens = new ArrayList<>();
+
+        for (var rawPart : name.split("\\s*,\\s*")) {
+            // Trim first: trailing whitespace would otherwise defeat the endsWith checks.
+            String part = rawPart.trim();
+
+            if (part.endsWith(")") && !part.endsWith("()")) {
+                int parenStart = part.indexOf('(');
+                if (parenStart > 0) { // foo(...) -> foo
+                    part = part.substring(0, parenStart).trim();
+                }
+                else if (parenStart == 0) { // (foo) -> foo
+                    part = part.substring(1, part.length() - 1).trim();
+                }
+            }
+
+            if (part.contains("::")) {
+                tokens.add(part);
+                if (part.startsWith("std::")) {
+                    tokens.add(part.substring(5)); // name without the "std::" prefix
+
+                    int ss = part.indexOf("::", 5);
+                    if (ss > 0) {
+                        tokens.add(part.substring(0, ss));
+                        tokens.add(part.substring(ss + 2));
+                    }
+                }
+            }
+        }
+
+        return tokens;
+    }
+
+    /**
+     * Removes {@code <...>} spans from {@code name}, pairing each {@code <} with the
+     * first {@code >} after it.  Nested argument lists are not balanced.
+     */
+    private static String stripTemplateArguments(String name) {
+        for (;;) {
+            int lbidx = name.indexOf('<');
+            if (lbidx <= 0) {
+                return name;
+            }
+
+            // Search from the '<' so that an earlier '>' (as in operator->) is not
+            // mistaken for the closing bracket.
+            int rbidx = name.indexOf('>', lbidx);
+            if (rbidx < 0) {
+                return name;
+            }
+
+            name = name.substring(0, lbidx) + name.substring(rbidx + 1);
+        }
+    }
+}
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index b9fe6390..443352f0 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations { private final WikiSpecialization wikiSpecialization; private final BlogSpecialization blogSpecialization; private final GogStoreSpecialization gogStoreSpecialization; + private final CppreferenceSpecialization cppreferenceSpecialization; private final DefaultSpecialization defaultSpecialization; @Inject @@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations { WikiSpecialization wikiSpecialization, BlogSpecialization blogSpecialization, GogStoreSpecialization gogStoreSpecialization, + CppreferenceSpecialization cppreferenceSpecialization, DefaultSpecialization defaultSpecialization) { this.domainTypes = domainTypes; this.lemmySpecialization = lemmySpecialization; @@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations { this.wikiSpecialization = wikiSpecialization; this.blogSpecialization = blogSpecialization; this.gogStoreSpecialization = gogStoreSpecialization; +
this.cppreferenceSpecialization = cppreferenceSpecialization; this.defaultSpecialization = defaultSpecialization; } @@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations { return mariadbKbSpecialization; } + if (url.domain.getTopDomain().equals("cppreference.com")) { + return cppreferenceSpecialization; + } + if (url.domain.toString().equals("store.steampowered.com")) { return steamStoreSpecialization; } @@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations { if (generator.keywords().contains("javadoc")) { return javadocSpecialization; } + + // Must be toward the end, as some specializations are for + // wiki-generator content if (generator.type() == GeneratorType.WIKI) { return wikiSpecialization; } @@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations { boolean shouldIndex(EdgeUrl url); double lengthModifier(); - void amendWords(Document doc, DocumentKeywordsBuilder words); + default void amendWords(Document doc, DocumentKeywordsBuilder words) {} } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java index ea9eb045..19ee00d2 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.converting.processor.logic.TitleExtractor; import nu.marginalia.converting.processor.summary.SummaryExtractor; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -93,6 +92,8 @@ public class WikiSpecialization extends DefaultSpecialization { return 
true; } - public void amendWords(Document doc, DocumentKeywordsBuilder words) { + @Override + public double lengthModifier() { + return 2.5; } }
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java
new file mode 100644
index 00000000..3f73b31a
--- /dev/null
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/CppreferenceSpecializationTest.java
@@ -0,0 +1,39 @@
+package nu.marginalia.converting.processor.plugin.specialization;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+/** Exercises the title-derived tokens of {@link CppreferenceSpecialization}. */
+class CppreferenceSpecializationTest {
+    // extractExtraTokens never uses the extractors, hence the null arguments
+    CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
+
+    @Test
+    public void testTitleMagic() {
+
+        List<String> ret;
+
+        ret = specialization.extractExtraTokens("std::multimap::crend - cppreference.com");
+        Assertions.assertTrue(ret.contains("std::multimap::crend"));
+        Assertions.assertTrue(ret.contains("multimap::crend"));
+        Assertions.assertTrue(ret.contains("std::multimap"));
+        Assertions.assertTrue(ret.contains("crend"));
+
+        // a comma-separated title yields tokens for every listed name
+        ret = specialization.extractExtraTokens("std::coroutine_handle::operator(), std::coroutine_handle::resume - cppreference.com");
+        Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
+        Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
+        Assertions.assertTrue(ret.contains("std::coroutine_handle"));
+        Assertions.assertTrue(ret.contains("operator()"));
+        Assertions.assertTrue(ret.contains("resume"));
+    }
+
+    @Test
+    public void testTitleWithoutQualifiedName() {
+        // no "::" in the title, so there is nothing to derive
+        Assertions.assertTrue(specialization.extractExtraTokens("Main Page - cppreference.com").isEmpty());
+    }
+
+}
\ No newline at end of file