Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 13:19:02 +00:00
Merge pull request 'Cooking mode' (#81) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/81
Commit dbb5a4d3bf
SearchIndexPreconverter.java
@@ -23,18 +23,6 @@ public class SearchIndexPreconverter {
 
     public record Shard(int bucket, int block) {}
 
-    private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) {
-        public static ShardOutput fromFile(Shard s, File f) {
-            try {
-                var v = new RandomAccessFile(f, "rw");
-                v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
-                return new ShardOutput(s, v, v.getChannel());
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        }
-    };
-
     @SneakyThrows
     @Inject
     public SearchIndexPreconverter(File inputFile,
@@ -57,7 +45,9 @@ public class SearchIndexPreconverter {
 
         logger.info("{}", indexJournalReader.fileHeader);
 
-        ShardOutput[] outputs = outputFiles.entrySet().stream().map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue())).toArray(ShardOutput[]::new);
+        ShardOutput[] outputs = outputFiles.entrySet().stream()
+                .map(entry -> ShardOutput.fromFile(entry.getKey(), entry.getValue()))
+                .toArray(ShardOutput[]::new);
 
         var lock = partitioner.getReadLock();
         try {
@@ -69,18 +59,14 @@ public class SearchIndexPreconverter {
                     continue;
                 }
 
-                int domainId = entry.domainId();
                 buffer.clear();
                 entry.copyToBuffer(buffer);
 
-                for (int i = 0; i < outputs.length; i++) {
-                    if (outputs[i].shard.block == entry.header.block().id
-                            && partitioner.filterUnsafe(domainId, outputs[i].shard.bucket))
-                    {
+                for (ShardOutput output : outputs) {
+                    if (output.shouldWrite(partitioner, entry)) {
                         buffer.flip();
 
-                        while (buffer.position() < buffer.limit())
-                            outputs[i].fc.write(buffer);
+                        output.write(buffer);
                     }
                 }
             }
@@ -90,16 +76,42 @@ public class SearchIndexPreconverter {
         }
         logger.info("Finalizing preconversion");
 
-        for (int i = 0; i < outputs.length; i++) {
-            long pos = outputs[i].raf.getFilePointer();
-            outputs[i].raf.seek(0);
-            outputs[i].raf.writeLong(pos);
-            outputs[i].raf.writeLong(wordCountOriginal);
-            outputs[i].fc.force(true);
-            outputs[i].fc.close();
-            outputs[i].raf.close();
+        for (ShardOutput output : outputs) {
+            output.finish(wordCountOriginal);
         }
     }
 
+    private record ShardOutput(Shard shard, RandomAccessFile raf, FileChannel fc) {
+        public static ShardOutput fromFile(Shard s, File f) {
+            try {
+                var v = new RandomAccessFile(f, "rw");
+                v.seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
+                return new ShardOutput(s, v, v.getChannel());
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) {
+            return shard.block == entry.header.block().id
+                    && partitioner.filterUnsafe(entry.domainId(), shard.bucket);
+        }
+
+        public void finish(long wordCountOriginal) throws IOException {
+            long pos = raf.getFilePointer();
+            raf.seek(0);
+            raf.writeLong(pos);
+            raf.writeLong(wordCountOriginal);
+            fc.force(true);
+            fc.close();
+            raf.close();
+        }
+
+        public void write(ByteBuffer buffer) throws IOException {
+            while (buffer.position() < buffer.limit())
+                fc.write(buffer);
+        }
+    };
+
 }
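A minimal standalone sketch (not part of the commit; the file name, header size, and word count below are illustrative stand-ins) of the two I/O patterns the new ShardOutput record encapsulates: payload writes begin past a fixed-size header, FileChannel.write is drained in a loop because it may consume fewer bytes than remain in the buffer, and finish() seeks back to offset 0 to record the end position and word count before closing.

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

public class ShardOutputSketch {
    static final int HEADER_SIZE_BYTES = 16; // stand-in for FILE_HEADER_SIZE_BYTES

    public static void main(String[] args) throws IOException {
        RandomAccessFile raf = new RandomAccessFile("shard-demo.dat", "rw");
        FileChannel fc = raf.getChannel();
        raf.seek(HEADER_SIZE_BYTES); // leave room for the header, as fromFile does

        ByteBuffer buffer = ByteBuffer.wrap("entry-bytes".getBytes());
        // same drain loop as ShardOutput.write: a single write() call is not
        // guaranteed to flush the whole buffer
        while (buffer.position() < buffer.limit())
            fc.write(buffer);

        // same steps as ShardOutput.finish: record the end position and word
        // count in the header, then flush and close
        long pos = raf.getFilePointer();
        raf.seek(0);
        raf.writeLong(pos);
        raf.writeLong(42L); // illustrative word count
        fc.force(true);
        fc.close();
        raf.close();
    }
}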
EdgeSearchProfile.java
@@ -1,6 +1,8 @@
 package nu.marginalia.wmsa.edge.search;
 
+import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
 
 import java.util.Arrays;
 import java.util.List;
@@ -25,6 +27,9 @@ public enum EdgeSearchProfile {
     ACADEMIA("academia",
             List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
             3),
+    FOOD("food",
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            2, 0),
     ;
 
 
@@ -49,7 +54,15 @@ public enum EdgeSearchProfile {
             case "default" -> DEFAULT;
             case "corpo" -> CORPO;
             case "academia" -> ACADEMIA;
+            case "food" -> FOOD;
             default -> YOLO;
         };
     }
+
+    public void addTacitTerms(EdgeSearchSubquery subquery) {
+        if (this == FOOD) {
+            subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
+        }
+
+    }
 }
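A minimal sketch (assumed names, not part of the commit) of the tacit-term pattern addTacitTerms introduces: selecting the FOOD profile silently adds an include term the user never typed. The literal "category:food" matches the keyword the commit adds in EnglishDictionary below, and is assumed here to be what HtmlFeature.CATEGORY_FOOD.getKeyword() returns.

import java.util.ArrayList;
import java.util.List;

public class TacitTermsDemo {
    enum Profile {
        DEFAULT, FOOD;

        // mirrors EdgeSearchProfile.addTacitTerms: only FOOD injects a term
        void addTacitTerms(List<String> searchTermsInclude) {
            if (this == FOOD) {
                searchTermsInclude.add("category:food");
            }
        }
    }

    public static void main(String[] args) {
        List<String> include = new ArrayList<>(List.of("omelet"));
        Profile.FOOD.addTacitTerms(include);
        System.out.println(include); // [omelet, category:food]
    }
}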
SearchJsParameter.java
@@ -1,6 +1,9 @@
 package nu.marginalia.wmsa.edge.search.command;
 
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
+
 import javax.annotation.Nullable;
+import java.util.Arrays;
 
 public enum SearchJsParameter {
     DEFAULT("default"),
@@ -21,4 +24,8 @@ public enum SearchJsParameter {
 
         return DEFAULT;
     }
+
+    public void addTacitTerms(EdgeSearchSubquery subquery) {
+        subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
+    }
 }
EnglishDictionary.java
@@ -46,9 +46,15 @@ public class EnglishDictionary {
         var variants = findWordVariants(s);
         long freqBaseline = dict.getTermFreq(s);
 
-        return variants.stream()
+        var ret = variants.stream()
                 .filter(var -> freqBaseline*10 > dict.getTermFreq(var) && freqBaseline/10 < dict.getTermFreq(var)
                 ).collect(Collectors.toList());
+
+        if (s.equals("recipe") || s.equals("recipes")) {
+            ret.add("category:food");
+        }
+
+        return ret;
     }
 
 
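A minimal sketch (hypothetical frequency values) of the frequency-band filter kept above: a variant survives only if its term frequency lies within a factor of ten of the original word's frequency, which keeps common inflections while dropping rare misspellings.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class VariantFilterDemo {
    static final Map<String, Long> FREQ = Map.of(
            "recipe", 10_000L,  // baseline word
            "recipes", 8_000L,  // within the 10x band -> kept
            "recipee", 3L);     // below baseline/10 -> dropped

    public static void main(String[] args) {
        long freqBaseline = FREQ.get("recipe");
        List<String> kept = List.of("recipes", "recipee").stream()
                .filter(v -> freqBaseline * 10 > FREQ.getOrDefault(v, 0L)
                        && freqBaseline / 10 < FREQ.getOrDefault(v, 0L))
                .collect(Collectors.toList());
        System.out.println(kept); // [recipes]
    }
}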
QueryFactory.java
@@ -54,7 +54,6 @@ public class QueryFactory {
     }
 
     private List<EdgeSearchSubquery> reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) {
-        final var jsSetting = params.jsSetting();
         final var profile = params.profile();
 
         List<EdgeSearchSubquery> subqueries =
@@ -66,10 +65,6 @@ public class QueryFactory {
             }
         }
 
-        subqueries.forEach(sq -> {
-            sq.searchTermsExclude.addAll(Arrays.asList(jsSetting.implictExcludeSearchTerms));
-        });
-
        subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder));
 
        return subqueries;
@@ -132,7 +127,12 @@ public class QueryFactory {
            }
        }
 
-        subqueries.add(new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords));
+        EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.TitleKeywords);
+
+        params.profile().addTacitTerms(subquery);
+        params.jsSetting().addTacitTerms(subquery);
+
+        subqueries.add(subquery);
     }
 
 
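A minimal sketch (hypothetical types and terms) of the refactored flow above: instead of a blanket forEach that appended the JS setting's exclude terms to every subquery, the subquery is now built first, and each tacit-term source (the search profile and the JS parameter) may amend it before it is registered.

import java.util.ArrayList;
import java.util.List;

public class TacitFlowDemo {
    // stand-in for EdgeSearchSubquery's term lists
    static class Subquery {
        final List<String> searchTermsInclude = new ArrayList<>();
        final List<String> searchTermsExclude = new ArrayList<>();
    }

    interface TacitTermSource {
        void addTacitTerms(Subquery sq);
    }

    public static void main(String[] args) {
        Subquery sq = new Subquery();
        sq.searchTermsInclude.add("omelet");

        // like EdgeSearchProfile.FOOD
        TacitTermSource profile = s -> s.searchTermsInclude.add("category:food");
        // like SearchJsParameter; this exclude term is purely illustrative
        TacitTermSource jsSetting = s -> s.searchTermsExclude.add("special:scripts");

        profile.addTacitTerms(sq);
        jsSetting.addTacitTerms(sq);

        System.out.println(sq.searchTermsInclude + " / " + sq.searchTermsExclude);
    }
}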
QueryVariants.java
@@ -5,12 +5,12 @@ import com.google.inject.Singleton;
 import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.ToString;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.KeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import opennlp.tools.stemmer.PorterStemmer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -115,7 +115,6 @@ public class QueryVariants {
 
         for (var span : goodSpans) {
             alternativeQueries.addAll(joinTerms(span));
-            // alternativeQueries.addAll(swapTerms(span));
         }
 
         for (var ls : goodSpans) {
@@ -134,8 +133,11 @@ public class QueryVariants {
             }
 
         }
 
         QueryVariantSet returnValue = new QueryVariantSet();
+
         returnValue.faithful.addAll(evaluateQueries(faithfulQueries));
+
         returnValue.faithful.addAll(evaluateQueries(alternativeQueries));
+
         returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue));
Search form templates:
@@ -38,6 +38,7 @@
             <option value="modern">Blogs and Personal Websites</option>
             <option value="academia">Academia, Forums, Big Websites</option>
             <option value="yolo" selected>Default Ranking Algorithm</option>
+            <option value="food">Recipes 🍳</option>
             <option value="corpo">Experimental</option>
         </select>
         <select name="js" id="js">
@@ -11,6 +11,7 @@
             <option {{#eq profile "modern"}}selected{{/eq}} value="modern">Blogs and Personal Websites</option>
             <option {{#eq profile "academia"}}selected{{/eq}} value="academia">Academia, Forums, Big Websites</option>
             <option {{#eq profile "yolo"}}selected{{/eq}} value="yolo">Default Ranking Algorithm</option>
+            <option {{#eq profile "food"}}selected{{/eq}} value="food">Recipes 🍳</option>
             <option {{#eq profile "corpo"}}selected{{/eq}} value="corpo">Experimental</option>
         </select>
         <select name="js" id="js">
QueryVariantsTest.java
@@ -1,14 +1,12 @@
 package nu.marginalia.wmsa.edge.search.query;
 
 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
-import java.util.List;
-
 class QueryVariantsTest {
     QueryVariants variants;
     QueryParser parser;
@@ -25,30 +23,36 @@ class QueryVariantsTest {
         parser = new QueryParser(new EnglishDictionary(dict), variants);
     }
 
-    @Test @SuppressWarnings("unchecked")
-    void getQueryVariants() {
+    @Test
+    void getQueryFood() {
         System.out.println(se.extractSentence("we are alone"));
-        testCase("DOS", List.of("DOS"));
-        testCase("dos", List.of("dos"));
-        testCase("we are alone", List.of("dos"));
-        testCase("3D Realms", List.of("dos"));
-        testCase("I am alone", List.of("dos"));
-        testCase("plato cave", List.of("dos"));
-        testCase("The internet is dead", List.of("dos"));
-
-        testCase("TRS80", List.of("trs_80"), List.of("trs80"));
-        testCase("TRS-80", List.of("trs-80"), List.of("trs80"));
-        testCase("TRS-80", List.of("trs-80"), List.of("trs80"));
-        testCase("Raspberry Pi 2", List.of("trs-80"), List.of("trs80"));
-        testCase("Duke Nukem 3D", List.of("trs-80"), List.of("trs80"));
-        testCase("The Man of Tomorrow", List.of("trs-80"), List.of("trs80"));
-        testCase("Computer Manual", List.of("trs-80"), List.of("trs80"));
-        testCase("Knitting", List.of("trs-80"), List.of("trs80"));
-        testCase("capcom", List.of("trs-80"), List.of("trs80"));
-        testCase("the man of tomorrow", List.of("trs-80"), List.of("trs80"));
+        testCase("Omelet recipe");
     }
 
-    private void testCase(String input, List<String>... expected) {
+    @Test
+    void getQueryVariants() {
+        System.out.println(se.extractSentence("we are alone"));
+        testCase("DOS");
+        testCase("dos");
+        testCase("we are alone");
+        testCase("3D Realms");
+        testCase("I am alone");
+        testCase("plato cave");
+        testCase("The internet is dead");
+
+        testCase("TRS80");
+        testCase("TRS-80");
+        testCase("TRS-80");
+        testCase("Raspberry Pi 2");
+        testCase("Duke Nukem 3D");
+        testCase("The Man of Tomorrow");
+        testCase("Computer Manual");
+        testCase("Knitting");
+        testCase("capcom");
+        testCase("the man of tomorrow");
+    }
+
+    private void testCase(String input) {
         var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
         System.out.println(tokens);
     }