mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(specialization) Add new specialization for cppreference.com
Give this reference website some synthetically generated tokens to improve the likelihood of a good match.
This commit is contained in:
parent
26c20e18ac
commit
ca1807caae
@ -0,0 +1,113 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.converting.processor.logic.TitleExtractor;
|
||||||
|
import nu.marginalia.converting.processor.summary.SummaryExtractor;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.logging.log4j.util.Strings;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class CppreferenceSpecialization extends WikiSpecialization {
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public CppreferenceSpecialization(SummaryExtractor summaryExtractor, TitleExtractor titleExtractor) {
|
||||||
|
super(summaryExtractor, titleExtractor);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document prune(Document original) {
|
||||||
|
var doc = original.clone();
|
||||||
|
|
||||||
|
doc.getElementsByClass("t-nv").remove();
|
||||||
|
doc.getElementsByClass("toc").remove();
|
||||||
|
doc.getElementsByClass("mw-head").remove();
|
||||||
|
doc.getElementsByClass("printfooter").remove();
|
||||||
|
doc.getElementsByClass("cpp-footer-base").remove();
|
||||||
|
|
||||||
|
doc.title(doc.title() + " " + Strings.join(extractExtraTokens(doc.title()), ' '));
|
||||||
|
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSummary(Document doc, Set<String> importantWords) {
|
||||||
|
|
||||||
|
Element declTable = doc.getElementsByClass("t-dcl-begin").first();
|
||||||
|
if (declTable != null) {
|
||||||
|
var nextPar = declTable.nextElementSibling();
|
||||||
|
if (nextPar != null) {
|
||||||
|
return nextPar.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return super.getSummary(doc, importantWords);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<String> extractExtraTokens(String title) {
|
||||||
|
|
||||||
|
if (!title.contains("::")) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
if (!title.contains("-")) {
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
|
||||||
|
title = StringUtils.split(title, '-')[0];
|
||||||
|
|
||||||
|
String name = title;
|
||||||
|
for (;;) {
|
||||||
|
int lbidx = name.indexOf('<');
|
||||||
|
int rbidx = name.indexOf('>');
|
||||||
|
|
||||||
|
if (lbidx > 0 && rbidx > lbidx) {
|
||||||
|
String className = name.substring(0, lbidx);
|
||||||
|
String methodName = name.substring(rbidx + 1);
|
||||||
|
name = className + methodName;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
List<String> tokens = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var part : name.split("\\s*,\\s*")) {
|
||||||
|
if (part.endsWith(")") && !part.endsWith("()")) {
|
||||||
|
int parenStart = part.indexOf('(');
|
||||||
|
if (parenStart > 0) { // foo(...) -> foo
|
||||||
|
part = part.substring(0, parenStart);
|
||||||
|
}
|
||||||
|
else if (parenStart == 0) { // (foo) -> foo
|
||||||
|
part = part.substring(1, part.length() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
part = part.trim();
|
||||||
|
if (part.contains("::")) {
|
||||||
|
tokens.add(part);
|
||||||
|
if (part.startsWith("std::")) {
|
||||||
|
tokens.add(part.substring(5));
|
||||||
|
|
||||||
|
int ss = part.indexOf("::", 5);
|
||||||
|
if (ss > 0) {
|
||||||
|
tokens.add(part.substring(0, ss));
|
||||||
|
tokens.add(part.substring(ss+2));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -24,6 +24,7 @@ public class HtmlProcessorSpecializations {
|
|||||||
private final WikiSpecialization wikiSpecialization;
|
private final WikiSpecialization wikiSpecialization;
|
||||||
private final BlogSpecialization blogSpecialization;
|
private final BlogSpecialization blogSpecialization;
|
||||||
private final GogStoreSpecialization gogStoreSpecialization;
|
private final GogStoreSpecialization gogStoreSpecialization;
|
||||||
|
private final CppreferenceSpecialization cppreferenceSpecialization;
|
||||||
private final DefaultSpecialization defaultSpecialization;
|
private final DefaultSpecialization defaultSpecialization;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
@ -37,6 +38,7 @@ public class HtmlProcessorSpecializations {
|
|||||||
WikiSpecialization wikiSpecialization,
|
WikiSpecialization wikiSpecialization,
|
||||||
BlogSpecialization blogSpecialization,
|
BlogSpecialization blogSpecialization,
|
||||||
GogStoreSpecialization gogStoreSpecialization,
|
GogStoreSpecialization gogStoreSpecialization,
|
||||||
|
CppreferenceSpecialization cppreferenceSpecialization,
|
||||||
DefaultSpecialization defaultSpecialization) {
|
DefaultSpecialization defaultSpecialization) {
|
||||||
this.domainTypes = domainTypes;
|
this.domainTypes = domainTypes;
|
||||||
this.lemmySpecialization = lemmySpecialization;
|
this.lemmySpecialization = lemmySpecialization;
|
||||||
@ -48,6 +50,7 @@ public class HtmlProcessorSpecializations {
|
|||||||
this.wikiSpecialization = wikiSpecialization;
|
this.wikiSpecialization = wikiSpecialization;
|
||||||
this.blogSpecialization = blogSpecialization;
|
this.blogSpecialization = blogSpecialization;
|
||||||
this.gogStoreSpecialization = gogStoreSpecialization;
|
this.gogStoreSpecialization = gogStoreSpecialization;
|
||||||
|
this.cppreferenceSpecialization = cppreferenceSpecialization;
|
||||||
this.defaultSpecialization = defaultSpecialization;
|
this.defaultSpecialization = defaultSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,6 +69,10 @@ public class HtmlProcessorSpecializations {
|
|||||||
return mariadbKbSpecialization;
|
return mariadbKbSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (url.domain.getTopDomain().equals("cppreference.com")) {
|
||||||
|
return cppreferenceSpecialization;
|
||||||
|
}
|
||||||
|
|
||||||
if (url.domain.toString().equals("store.steampowered.com")) {
|
if (url.domain.toString().equals("store.steampowered.com")) {
|
||||||
return steamStoreSpecialization;
|
return steamStoreSpecialization;
|
||||||
}
|
}
|
||||||
@ -86,6 +93,9 @@ public class HtmlProcessorSpecializations {
|
|||||||
if (generator.keywords().contains("javadoc")) {
|
if (generator.keywords().contains("javadoc")) {
|
||||||
return javadocSpecialization;
|
return javadocSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Must be toward the end, as some specializations are for
|
||||||
|
// wiki-generator content
|
||||||
if (generator.type() == GeneratorType.WIKI) {
|
if (generator.type() == GeneratorType.WIKI) {
|
||||||
return wikiSpecialization;
|
return wikiSpecialization;
|
||||||
}
|
}
|
||||||
@ -105,7 +115,7 @@ public class HtmlProcessorSpecializations {
|
|||||||
|
|
||||||
boolean shouldIndex(EdgeUrl url);
|
boolean shouldIndex(EdgeUrl url);
|
||||||
double lengthModifier();
|
double lengthModifier();
|
||||||
void amendWords(Document doc, DocumentKeywordsBuilder words);
|
|
||||||
|
|
||||||
|
default void amendWords(Document doc, DocumentKeywordsBuilder words) {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,27 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
class CppreferenceSpecializationTest {
|
||||||
|
CppreferenceSpecialization specialization = new CppreferenceSpecialization(null, null);
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTitleMagic() {
|
||||||
|
|
||||||
|
List<String> ret;
|
||||||
|
|
||||||
|
ret = specialization.extractExtraTokens("std::multimap<Key, T, Compare, Allocator>::crend - cppreference.com");
|
||||||
|
Assertions.assertTrue(ret.contains("std::multimap::crend"));
|
||||||
|
Assertions.assertTrue(ret.contains("multimap::crend"));
|
||||||
|
Assertions.assertTrue(ret.contains("std::multimap"));
|
||||||
|
Assertions.assertTrue(ret.contains("crend"));
|
||||||
|
|
||||||
|
ret = specialization.extractExtraTokens("std::coroutine_handle<Promise>::operator(), std::coroutine_handle<Promise>::resume - cppreference.com");
|
||||||
|
Assertions.assertTrue(ret.contains("std::coroutine_handle::operator()"));
|
||||||
|
Assertions.assertTrue(ret.contains("std::coroutine_handle::resume"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user