Clean up preconverter code

This commit is contained in:
vlofgren 2022-08-08 18:08:18 +02:00
parent 2af2c50f34
commit 0f59675f7c
8 changed files with 67 additions and 33 deletions

View File

@ -1,6 +1,8 @@
package nu.marginalia.wmsa.edge.search; package nu.marginalia.wmsa.edge.search;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@ -25,6 +27,9 @@ public enum EdgeSearchProfile {
ACADEMIA("academia", ACADEMIA("academia",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
3), 3),
FOOD("food",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
2, 0),
; ;
@ -49,7 +54,15 @@ public enum EdgeSearchProfile {
case "default" -> DEFAULT; case "default" -> DEFAULT;
case "corpo" -> CORPO; case "corpo" -> CORPO;
case "academia" -> ACADEMIA; case "academia" -> ACADEMIA;
case "food" -> FOOD;
default -> YOLO; default -> YOLO;
}; };
} }
/**
 * Adds the search terms this profile implies to the given subquery.
 *
 * <p>Only the {@code FOOD} profile contributes anything: it appends the
 * keyword of {@code HtmlFeature.CATEGORY_FOOD} to the subquery's
 * {@code searchTermsInclude}. Every other profile leaves the subquery
 * untouched.
 *
 * @param subquery the subquery whose include-terms may be extended in place
 */
public void addTacitTerms(EdgeSearchSubquery subquery) {
    // Profiles other than FOOD carry no implicit terms.
    if (this != FOOD) {
        return;
    }

    final var foodKeyword = HtmlFeature.CATEGORY_FOOD.getKeyword();
    subquery.searchTermsInclude.add(foodKeyword);
}
} }

View File

@ -1,6 +1,9 @@
package nu.marginalia.wmsa.edge.search.command; package nu.marginalia.wmsa.edge.search.command;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.Arrays;
public enum SearchJsParameter { public enum SearchJsParameter {
DEFAULT("default"), DEFAULT("default"),
@ -21,4 +24,8 @@ public enum SearchJsParameter {
return DEFAULT; return DEFAULT;
} }
/**
 * Adds the exclusion terms implied by this JavaScript setting to the given
 * subquery.
 *
 * <p>Every entry of {@code implictExcludeSearchTerms} is appended to the
 * subquery's {@code searchTermsExclude}.
 *
 * @param subquery the subquery whose exclude-terms are extended in place
 */
public void addTacitTerms(EdgeSearchSubquery subquery) {
    final var tacitExclusions = Arrays.asList(implictExcludeSearchTerms);

    subquery.searchTermsExclude.addAll(tacitExclusions);
}
} }

View File

@ -46,9 +46,15 @@ public class EnglishDictionary {
var variants = findWordVariants(s); var variants = findWordVariants(s);
long freqBaseline = dict.getTermFreq(s); long freqBaseline = dict.getTermFreq(s);
return variants.stream() var ret = variants.stream()
.filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var) .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var)
).collect(Collectors.toList()); ).collect(Collectors.toList());
if (s.equals("recipe") || s.equals("recipes")) {
ret.add("category:food");
}
return ret;
} }

View File

@ -54,7 +54,6 @@ public class QueryFactory {
} }
private List<EdgeSearchSubquery> reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) { private List<EdgeSearchSubquery> reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) {
final var jsSetting = params.jsSetting();
final var profile = params.profile(); final var profile = params.profile();
List<EdgeSearchSubquery> subqueries = List<EdgeSearchSubquery> subqueries =
@ -66,10 +65,6 @@ public class QueryFactory {
} }
} }
subqueries.forEach(sq -> {
sq.searchTermsExclude.addAll(Arrays.asList(jsSetting.implictExcludeSearchTerms));
});
subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder)); subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder));
return subqueries; return subqueries;
@ -132,7 +127,12 @@ public class QueryFactory {
} }
} }
subqueries.add(new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords)); EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords);
params.profile().addTacitTerms(subquery);
params.jsSetting().addTacitTerms(subquery);
subqueries.add(subquery);
} }

View File

@ -5,12 +5,12 @@ import com.google.inject.Singleton;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor; import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import opennlp.tools.stemmer.PorterStemmer; import opennlp.tools.stemmer.PorterStemmer;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -115,7 +115,6 @@ public class QueryVariants {
for (var span : goodSpans) { for (var span : goodSpans) {
alternativeQueries.addAll(joinTerms(span)); alternativeQueries.addAll(joinTerms(span));
// alternativeQueries.addAll(swapTerms(span));
} }
for (var ls : goodSpans) { for (var ls : goodSpans) {
@ -134,8 +133,11 @@ public class QueryVariants {
} }
} }
QueryVariantSet returnValue = new QueryVariantSet(); QueryVariantSet returnValue = new QueryVariantSet();
returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); returnValue.faithful.addAll(evaluateQueries(faithfulQueries));
returnValue.faithful.addAll(evaluateQueries(alternativeQueries)); returnValue.faithful.addAll(evaluateQueries(alternativeQueries));
returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue));

View File

@ -38,6 +38,7 @@
<option value="modern">Blogs and Personal Websites</option> <option value="modern">Blogs and Personal Websites</option>
<option value="academia">Academia, Forums, Big Websites</option> <option value="academia">Academia, Forums, Big Websites</option>
<option value="yolo" selected>Default Ranking Algorithm</option> <option value="yolo" selected>Default Ranking Algorithm</option>
<option value="food">Recipes &#127859;</option>
<option value="corpo">Experimental</option> <option value="corpo">Experimental</option>
</select> </select>
<select name="js" id="js"> <select name="js" id="js">

View File

@ -11,6 +11,7 @@
<option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option> <option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
<option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option> <option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
<option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option> <option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
<option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes &#127859;</option>
<option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option> <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option>
</select> </select>
<select name="js" id="js"> <select name="js" id="js">

View File

@ -1,14 +1,12 @@
package nu.marginalia.wmsa.edge.search.query; package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.util.List;
class QueryVariantsTest { class QueryVariantsTest {
QueryVariants variants; QueryVariants variants;
QueryParser parser; QueryParser parser;
@ -25,30 +23,36 @@ class QueryVariantsTest {
parser = new QueryParser(new EnglishDictionary(dict), variants); parser = new QueryParser(new EnglishDictionary(dict), variants);
} }
@Test @SuppressWarnings("unchecked") @Test
void getQueryVariants() { void getQueryFood() {
System.out.println(se.extractSentence("we are alone")); System.out.println(se.extractSentence("we are alone"));
testCase("DOS", List.of("DOS")); testCase("Omelet recipe");
testCase("dos", List.of("dos"));
testCase("we are alone", List.of("dos"));
testCase("3D Realms", List.of("dos"));
testCase("I am alone", List.of("dos"));
testCase("plato cave", List.of("dos"));
testCase("The internet is dead", List.of("dos"));
testCase("TRS80", List.of("trs_80"), List.of("trs80"));
testCase("TRS-80", List.of("trs-80"), List.of("trs80"));
testCase("TRS-80", List.of("trs-80"), List.of("trs80"));
testCase("Raspberry Pi 2", List.of("trs-80"), List.of("trs80"));
testCase("Duke Nukem 3D", List.of("trs-80"), List.of("trs80"));
testCase("The Man of Tomorrow", List.of("trs-80"), List.of("trs80"));
testCase("Computer Manual", List.of("trs-80"), List.of("trs80"));
testCase("Knitting", List.of("trs-80"), List.of("trs80"));
testCase("capcom", List.of("trs-80"), List.of("trs80"));
testCase("the man of tomorrow", List.of("trs-80"), List.of("trs80"));
} }
private void testCase(String input, List<String>... expected) { @Test
void getQueryVariants() {
    // Smoke test: nothing is asserted; each query is passed to testCase
    // in the order listed below.
    System.out.println(se.extractSentence("we are alone"));

    // "TRS-80" is listed twice on purpose so the sequence of testCase calls
    // matches the original hand-written list exactly.
    final String[] sampleQueries = {
            "DOS",
            "dos",
            "we are alone",
            "3D Realms",
            "I am alone",
            "plato cave",
            "The internet is dead",
            "TRS80",
            "TRS-80",
            "TRS-80",
            "Raspberry Pi 2",
            "Duke Nukem 3D",
            "The Man of Tomorrow",
            "Computer Manual",
            "Knitting",
            "capcom",
            "the man of tomorrow",
    };

    for (String query : sampleQueries) {
        testCase(query);
    }
}
private void testCase(String input) {
var tokens = variants.getQueryVariants(parser.extractBasicTokens(input)); var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
System.out.println(tokens); System.out.println(tokens);
} }