From 2af2c50f34a3ab0419ede4b4ad98b5b53113c3d7 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 8 Aug 2022 15:29:44 +0200 Subject: [PATCH 1/2] Clean up preconverter code --- .../conversion/SearchIndexPreconverter.java | 68 +++++++++++-------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java index 37560b61..57d63825 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java @@ -23,18 +23,6 @@ public class SearchIndexPreconverter { public record Shard(int bucket, int block) {} - private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) { - public static ShardOutput fromFile(Shard s, File f) { - try { - var v = new RandomAccessFile(f, "rw"); - v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES); - return new ShardOutput(s, v, v.getChannel()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - @SneakyThrows @Inject public SearchIndexPreconverter(File inputFile, @@ -57,7 +45,9 @@ public class SearchIndexPreconverter { logger.info("{}", indexJournalReader.fileHeader); - ShardOutput[] outputs = outputFiles.entrySet().stream().map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())).toArray(ShardOutput[]::new); + ShardOutput[] outputs = outputFiles.entrySet().stream() + .map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())) + .toArray(ShardOutput[]::new); var lock = partitioner.getReadLock(); try { @@ -69,18 +59,14 @@ public class SearchIndexPreconverter { continue; } - int domainId = entry.domainId(); buffer.clear(); entry.copyToBuffer(buffer); - for (int i = 0; i < outputs.length; i++) { - if (outputs[i].shard.block == entry.header.block().id - && partitioner.filterUnsafe(domainId, outputs[i].shard.bucket)) - { + for (ShardOutput output : outputs) { + if (output.shouldWrite(partitioner, entry)) { buffer.flip(); - while (buffer.position() < buffer.limit()) - outputs[i].fc.write(buffer); + output.write(buffer); } } } @@ -90,16 +76,42 @@ public class SearchIndexPreconverter { } logger.info("Finalizing preconversion"); - for (int i = 0; i < outputs.length; i++) { - long pos = outputs[i].raf.getFilePointer(); - outputs[i].raf.seek(0); - outputs[i].raf.writeLong(pos); - outputs[i].raf.writeLong(wordCountOriginal); - outputs[i].fc.force(true); - outputs[i].fc.close(); - outputs[i].raf.close(); + for (ShardOutput output : outputs) { + output.finish(wordCountOriginal); } } + private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) { + public static ShardOutput fromFile(Shard s, File f) { + try { + var v = new RandomAccessFile(f, "rw"); + v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES); + return new ShardOutput(s, v, v.getChannel()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) { + return shard.block == entry.header.block().id + && partitioner.filterUnsafe(entry.domainId(), shard.bucket); + } + + public void finish(long wordCountOriginal) throws IOException { + long pos = raf.getFilePointer(); + raf.seek(0); + raf.writeLong(pos); + raf.writeLong(wordCountOriginal); + fc.force(true); + fc.close(); + raf.close(); + } + + public void write(ByteBuffer buffer) throws IOException { + while (buffer.position() < buffer.limit()) + fc.write(buffer); + } + }; + } From 0f59675f7c5c405346212e790b4d1e3834981e66 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Mon, 8 Aug 2022 18:08:18 +0200 Subject: [PATCH 2/2] Clean up preconverter code --- .../wmsa/edge/search/EdgeSearchProfile.java | 13 +++++ .../search/command/SearchJsParameter.java | 7 +++ .../edge/search/query/EnglishDictionary.java | 8 ++- .../wmsa/edge/search/query/QueryFactory.java | 12 ++--- .../wmsa/edge/search/query/QueryVariants.java | 6 ++- .../src/main/resources/static/edge/index.html | 1 + .../templates/edge/parts/search-form.hdb | 1 + .../edge/search/query/QueryVariantsTest.java | 52 ++++++++++--------- 8 files changed, 67 insertions(+), 33 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index c08d9ca3..64f8f8b1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -1,6 +1,8 @@ package nu.marginalia.wmsa.edge.search; +import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; import java.util.Arrays; import java.util.List; @@ -25,6 +27,9 @@ public enum EdgeSearchProfile { ACADEMIA("academia", List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), 3), + FOOD("food", + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + 2, 0), ; @@ -49,7 +54,15 @@ public enum EdgeSearchProfile { case "default" -> DEFAULT; case "corpo" -> CORPO; case "academia" -> ACADEMIA; + case "food" -> FOOD; default -> YOLO; }; } + + public void addTacitTerms(EdgeSearchSubquery subquery) { + if (this == FOOD) { + subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); + } + + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java index 29367549..f42b3525 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/SearchJsParameter.java @@ -1,6 +1,9 @@ package nu.marginalia.wmsa.edge.search.command; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; + import javax.annotation.Nullable; +import java.util.Arrays; public enum SearchJsParameter { DEFAULT("default"), @@ -21,4 +24,8 @@ public enum SearchJsParameter { return DEFAULT; } + + public void addTacitTerms(EdgeSearchSubquery subquery) { + subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java index 982cb158..4a2086ca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/EnglishDictionary.java @@ -46,9 +46,15 @@ public class EnglishDictionary { var variants = findWordVariants(s); long freqBaseline = dict.getTermFreq(s); - return variants.stream() + var ret = variants.stream() .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var) ).collect(Collectors.toList()); + + if (s.equals("recipe") || s.equals("recipes")) { + ret.add("category:food"); + } + + return ret; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 53d80cd6..3badd593 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -54,7 +54,6 @@ public class QueryFactory { } private List reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) { - final var jsSetting = params.jsSetting(); final var profile = params.profile(); List subqueries = @@ -66,10 +65,6 @@ public class QueryFactory { } } - subqueries.forEach(sq -> { - sq.searchTermsExclude.addAll(Arrays.asList(jsSetting.implictExcludeSearchTerms)); - }); - subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder)); return subqueries; @@ -132,7 +127,12 @@ public class QueryFactory { } } - subqueries.add(new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords)); + EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords); + + params.profile().addTacitTerms(subquery); + params.jsSetting().addTacitTerms(subquery); + + subqueries.add(subquery); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java index cbfa1de5..2b509397 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java @@ -5,12 +5,12 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.KeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordSpan; +import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import opennlp.tools.stemmer.PorterStemmer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -115,7 +115,6 @@ public class QueryVariants { for (var span : goodSpans) { alternativeQueries.addAll(joinTerms(span)); -// alternativeQueries.addAll(swapTerms(span)); } for (var ls : goodSpans) { @@ -134,8 +133,11 @@ public class QueryVariants { } } + QueryVariantSet returnValue = new QueryVariantSet(); + returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); + returnValue.faithful.addAll(evaluateQueries(alternativeQueries)); returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html index 47d6e314..bf925208 100644 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ b/marginalia_nu/src/main/resources/static/edge/index.html @@ -38,6 +38,7 @@ +