Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(forward-index, valuator) HTML features in valuator
Put it in the forward index for easy access during index-side valuation.
commit 704de50a9b
parent fcfe07fb7d
@@ -2,7 +2,6 @@ package nu.marginalia.index.client.model.results;
 
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 
 import java.util.Objects;
@@ -14,15 +13,19 @@ public final class SearchResultKeywordScore {
     private final long encodedDocMetadata;
     private final boolean hasPriorityTerms;
 
+    private final int htmlFeatures;
+
     public SearchResultKeywordScore(int subquery,
                                     String keyword,
                                     long encodedWordMetadata,
                                     long encodedDocMetadata,
+                                    int htmlFeatures,
                                     boolean hasPriorityTerms) {
         this.subquery = subquery;
         this.keyword = keyword;
         this.encodedWordMetadata = encodedWordMetadata;
         this.encodedDocMetadata = encodedDocMetadata;
+        this.htmlFeatures = htmlFeatures;
         this.hasPriorityTerms = hasPriorityTerms;
     }
 
@@ -58,6 +61,10 @@ public final class SearchResultKeywordScore {
         return encodedDocMetadata;
     }
 
+    public int htmlFeatures() {
+        return htmlFeatures;
+    }
+
     public boolean hasPriorityTerms() {
         return hasPriorityTerms;
     }
@@ -88,8 +88,9 @@ public class ForwardIndexConverter {
                 int ranking = domainRankings.getRanking(entry.domainId());
                 long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
 
-                docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
                 docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
+                docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
+                docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
             });
 
             progress.progress(TaskSteps.FORCE);
@@ -1,8 +1,9 @@
 package nu.marginalia.index.forward;
 
 class ForwardIndexParameters {
-    public static final int ENTRY_SIZE = 2;
+    public static final int ENTRY_SIZE = 3;
     public static final int DOMAIN_OFFSET = 0;
     public static final int METADATA_OFFSET = 1;
+    public static final int FEATURES_OFFSET = 2;
 
 }
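The forward index keeps one fixed-size block of longs per document, so widening ENTRY_SIZE from 2 to 3 is what makes room for the feature bits. A minimal sketch of the addressing scheme, with a plain long[] standing in for the memory-mapped LongArray the real code reads through:

```java
// Sketch of the widened forward index entry layout; a plain long[] stands in
// for the memory-mapped LongArray used by the real converter and reader.
public class ForwardEntryLayoutDemo {
    static final int ENTRY_SIZE = 3;       // was 2 before this commit
    static final int DOMAIN_OFFSET = 0;
    static final int METADATA_OFFSET = 1;
    static final int FEATURES_OFFSET = 2;  // the new slot

    public static void main(String[] args) {
        long[] docFileData = new long[2 * ENTRY_SIZE];  // room for two documents

        long entryOffset = 1L * ENTRY_SIZE;             // start of the second record
        docFileData[(int) (entryOffset + DOMAIN_OFFSET)]   = 42;      // domain id
        docFileData[(int) (entryOffset + METADATA_OFFSET)] = 0xCAFE;  // encoded doc metadata
        docFileData[(int) (entryOffset + FEATURES_OFFSET)] = 0b101;   // HTML feature bit mask

        // Read side, mirroring ForwardIndexReader.getHtmlFeatures():
        long idx = 1;  // the document's index, as idxForDoc() would return it
        int features = (int) docFileData[(int) (ENTRY_SIZE * idx + FEATURES_OFFSET)];
        System.out.println(features);  // prints 5
    }
}
```

Since the on-disk record size changes, forward indexes written before this commit would presumably need to be reconverted.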
@@ -77,6 +77,13 @@ public class ForwardIndexReader {
         return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
     }
 
+    public int getHtmlFeatures(long docId) {
+        long offset = idxForDoc(docId);
+        if (offset < 0) return 0;
+
+        return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
+    }
+
     public int getDomainId(long docId) {
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
@@ -7,7 +7,7 @@ import nu.marginalia.model.id.EdgeId;
 public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
 
     public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
-        return new IndexJournalEntryBuilder(documentId, documentMeta);
+        return new IndexJournalEntryBuilder(0, documentId, documentMeta);
     }
 
     public static IndexJournalEntryBuilder builder(int domainId,
@@ -15,7 +15,9 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {
 
 
-        return builder(new EdgeId<>(domainId), new EdgeId<>(urlId), documentMeta);
+        return builder(new EdgeId<>(domainId),
+                new EdgeId<>(urlId),
+                documentMeta);
     }
 
     public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
@@ -23,6 +25,8 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
                                                    long documentMeta) {
 
 
-        return new IndexJournalEntryBuilder(IndexJournalEntryHeader.combineIds(domainId, urlId), documentMeta);
+        return new IndexJournalEntryBuilder(0,
+                IndexJournalEntryHeader.combineIds(domainId, urlId),
+                documentMeta);
     }
 }
@@ -4,10 +4,15 @@ import gnu.trove.list.array.TLongArrayList;
 
 public class IndexJournalEntryBuilder {
     private final long documentId;
+    private final int documentFeatures;
     private final long documentMeta;
     private final TLongArrayList items = new TLongArrayList();
 
-    public IndexJournalEntryBuilder(long documentId, long documentMeta) {
+    public IndexJournalEntryBuilder(
+            int documentFeatures,
+            long documentId,
+            long documentMeta) {
+        this.documentFeatures = documentFeatures;
         this.documentId = documentId;
         this.documentMeta = documentMeta;
     }
@@ -22,7 +27,10 @@ public class IndexJournalEntryBuilder {
 
     public IndexJournalEntry build() {
         return new IndexJournalEntry(
-                new IndexJournalEntryHeader(items.size(), documentId, documentMeta),
+                new IndexJournalEntryHeader(items.size(),
+                        documentFeatures,
+                        documentId,
+                        documentMeta),
                 new IndexJournalEntryData(items.toArray())
         );
     }
@@ -4,10 +4,19 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;
 
-public record IndexJournalEntryHeader(int entrySize, long combinedId, long documentMeta) {
+public record IndexJournalEntryHeader(int entrySize,
+                                      int documentFeatures,
+                                      long combinedId,
+                                      long documentMeta) {
 
-    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, long documentMeta) {
-        this(-1, combineIds(domainId, urlId), documentMeta);
+    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
+                                   int documentFeatures,
+                                   EdgeId<EdgeUrl> urlId,
+                                   long documentMeta) {
+        this(-1,
+             documentFeatures,
+             combineIds(domainId, urlId),
+             documentMeta);
     }
 
     static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
@@ -30,6 +30,7 @@ public class IndexJournalReadEntry {
 
         var header = new IndexJournalEntryHeader(
                 (int) (sizeBlock >>> 32L),
+                (int) (sizeBlock & 0xFFFF_FFFFL),
                 docId,
                 meta);
 
@@ -72,7 +72,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
         }
 
         dataBuffer.putInt(entry.size());
-        dataBuffer.putInt(0);
+        dataBuffer.putInt(header.documentFeatures());
         dataBuffer.putLong(header.combinedId());
         dataBuffer.putLong(header.documentMeta());
 
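Together with the reader hunk above, this keeps the journal record format symmetric: the writer emits the entry size and the new features field as two consecutive ints, and the reader recovers them from a single long, size from the high half and features from the low half. A round-trip sketch of that packing, assuming the buffer's default big-endian byte order:

```java
import java.nio.ByteBuffer;

// Round-trip sketch: two ints written back-to-back read out as one long.
public class SizeBlockDemo {
    public static void main(String[] args) {
        int entrySize = 7;
        int documentFeatures = 0b1010;

        ByteBuffer buf = ByteBuffer.allocate(8);   // big-endian by default
        buf.putInt(entrySize);                     // what the writer does
        buf.putInt(documentFeatures);
        buf.flip();

        long sizeBlock = buf.getLong();            // what the reader does
        System.out.println((int) (sizeBlock >>> 32L));          // prints 7
        System.out.println((int) (sizeBlock & 0xFFFF_FFFFL));   // prints 10
    }
}
```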
@@ -106,7 +106,7 @@ class ReverseIndexFullConverterTest2 {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -106,7 +106,7 @@ class ReverseIndexPriorityConverterTest2 {
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);
-        var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+        var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -5,6 +5,7 @@ import nu.marginalia.index.client.model.results.ResultRankingParameters;
 import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.ranking.factors.*;
 
@@ -48,19 +49,20 @@ public class ResultValuator {
         double bestScore = 10;
 
         long documentMetadata = documentMetadata(scores);
+        int features = htmlFeatures(scores);
         var rankingParams = ctx.params;
 
         int rank = DocumentMetadata.decodeRank(documentMetadata);
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
-        int urlTypePenalty = getUrlTypePenalty(documentMetadata);
+        int size = DocumentMetadata.decodeSize(documentMetadata);
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);
 
         double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
 
-        final double qualityPenalty = -quality * rankingParams.qualityPenalty;
+        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
         final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
         final double topologyBonus = Math.log(1 + topology);
         final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
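Two new inputs feed the valuation here: the per-document HTML feature mask, now carried on every keyword score, and the document flags, which travel in the low byte of the encoded document metadata, hence the & 0xFF. A tiny sketch of that flag extraction; the bit layout shown is illustrative, not the real DocumentMetadata encoding:

```java
// Illustrative only: document flags ride in the low 8 bits of the encoded
// metadata long, so masking with 0xFF recovers them for flagsPenalty().
public class DocFlagsDemo {
    public static void main(String[] args) {
        long encodedDocMetadata = (0xABCDL << 8) | 0b0000_0010L; // fake upper fields + one flag bit
        long docFlags = encodedDocMetadata & 0xFF;
        System.out.println(docFlags);  // prints 2
        // DocumentFlags.GeneratorForum.isPresent(docFlags) tests one of these bits
    }
}
```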
@@ -80,7 +82,7 @@ public class ResultValuator {
                         + rankingBonus
                         + topologyBonus
                         + temporalBias
-                        + urlTypePenalty
+                        + flagsPenalty
                         + priorityTermBonus.calculate(scores);
 
         for (int set = 0; set <= sets; set++) {
@@ -93,7 +95,8 @@ public class ResultValuator {
             final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
             final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);
 
-            double score = normalize(bm25 + bm25p + tcf + overallPart, keywordSet.length());
+            double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
+            double score = normalize(nonNormalizedScore, keywordSet.length());
 
             bestScore = min(bestScore, score);
 
@@ -102,16 +105,55 @@ public class ResultValuator {
         return bestScore;
     }
 
-    private int getUrlTypePenalty(long documentMetadata) {
+    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+        if (size < 400) {
+            if (quality < 5)
+                return 0;
+            return -quality * rankingParams.qualityPenalty;
+        }
+        else {
+            return -quality * rankingParams.qualityPenalty * 20;
+        }
+    }
 
-        // Long urls-that-look-like-this tend to be poor search results
-        if (DocumentMetadata.hasFlags(documentMetadata,
-                HtmlFeature.LONG_URL.getFeatureBit()
-                        | HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) {
-            return 2;
+    private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
+
+        // Short-circuit for index-service, which does not have the feature flags
+        if (featureFlags == 0)
+            return 0;
+
+        double penalty = 0;
+
+        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+
+        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
+        double largeSiteFactor = 1.;
+
+        if (!isForum && size > 400) {
+            // Long urls-that-look-like-this tend to be poor search results
+            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
+                penalty += 30.0;
+            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
+                penalty += 30.;
+            else penalty += 5.;
+
+            largeSiteFactor = 2;
         }
 
-        return 0;
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum) {
+            penalty = Math.min(0, penalty - 2);
+        }
+
+        return (int) -penalty;
     }
 
     private long documentMetadata(List<SearchResultKeywordScore> rawScores) {
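A worked example of the new penalty: a large (size > 400) non-forum page whose mask has KEBAB_CASE_URL, TRACKING_ADTECH and TRACKING set picks up 30 for the URL shape, which also doubles largeSiteFactor, then 5.0 × 2 for adtech and 2.5 × 2 for plain tracking, 45 in total, returned as -45. The sketch below re-implements just that arithmetic with stand-in bit constants so it runs on its own; the real code goes through HtmlFeature and DocumentMetadata.hasFlags:

```java
// Self-contained re-implementation of the flagsPenalty arithmetic for one
// sample document; the bit constants are stand-ins for HtmlFeature bits.
public class FlagsPenaltyDemo {
    static final int KEBAB_CASE_URL  = 1 << 0;
    static final int LONG_URL        = 1 << 1;
    static final int TRACKING_ADTECH = 1 << 2;
    static final int AFFILIATE_LINK  = 1 << 3;
    static final int TRACKING        = 1 << 4;

    static int flagsPenalty(int featureFlags, boolean isForum, int size) {
        if (featureFlags == 0) return 0;  // index-service short-circuit

        double penalty = 0;
        double largeSiteFactor = 1.;

        if (!isForum && size > 400) {
            if ((featureFlags & KEBAB_CASE_URL) != 0) penalty += 30.0;
            else if ((featureFlags & LONG_URL) != 0)  penalty += 30.;
            else penalty += 5.;
            largeSiteFactor = 2;
        }

        if ((featureFlags & TRACKING_ADTECH) != 0) penalty += 5.0 * largeSiteFactor;
        if ((featureFlags & AFFILIATE_LINK) != 0)  penalty += 5.0 * largeSiteFactor;
        if ((featureFlags & TRACKING) != 0)        penalty += 2.5 * largeSiteFactor;

        if (isForum) penalty = Math.min(0, penalty - 2);  // forums are largely exempted

        return (int) -penalty;
    }

    public static void main(String[] args) {
        int mask = KEBAB_CASE_URL | TRACKING_ADTECH | TRACKING;
        System.out.println(flagsPenalty(mask, false, 900)); // prints -45
    }
}
```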
@@ -121,6 +163,13 @@ public class ResultValuator {
         return 0;
     }
 
+    private int htmlFeatures(List<SearchResultKeywordScore> rawScores) {
+        for (var score : rawScores) {
+            return score.htmlFeatures();
+        }
+        return 0;
+    }
+
     private ResultKeywordSet createKeywordSet(ValuatorListPool<SearchResultKeywordScore> listPool,
                                               List<SearchResultKeywordScore> rawScores,
                                               int thisSet)
@@ -40,20 +40,20 @@ class ResultValuatorTest {
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
         List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
 
         List<SearchResultKeywordScore> highCountSubjectSet = List.of(
                 new SearchResultKeywordScore(0, "bob",
                         wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
                         docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
-                        false)
+                        0, false)
         );
 
 
@@ -89,7 +89,7 @@ class TermCoherenceFactorTest {
 
         for (int i = 0; i < positionMasks.length; i++) {
             keywords.add(new SearchResultKeywordScore(0, "",
-                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, false));
+                    new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0, false));
         }
 
         return new ResultKeywordSet(keywords);
@@ -19,7 +19,7 @@ public interface Interpreter {
     default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
     default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {}
 
-    default void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {}
+    default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {}
 
     default void loadDomainRedirect(DomainLink link) {}
 
@@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.InstructionTag;
 import nu.marginalia.converting.instruction.Interpreter;
 import nu.marginalia.model.EdgeUrl;
 
-public record LoadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
+public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction {
 
     @Override
     public void apply(Interpreter interpreter) {
-        interpreter.loadKeywords(url, metadata, words);
+        interpreter.loadKeywords(url, features, metadata, words);
     }
 
     @Override
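For context, the converter hands work to the loader as serialized Instruction objects, and apply() double-dispatches into whichever Interpreter receives them; that is why the record and the interface grow the features parameter in lock-step. A stripped-down sketch of the pattern, with stand-in types in place of the real converting-model classes:

```java
import java.util.function.Consumer;

// Stand-in types; the real Instruction/Interpreter live in the converting-model module.
interface Interp {
    default void loadKeywords(String url, int features) {}
}

record LoadKw(String url, int features) {
    void apply(Interp interpreter) {
        interpreter.loadKeywords(url, features);  // double dispatch into the interpreter
    }
}

public class InstructionDemo {
    public static void main(String[] args) {
        Consumer<LoadKw> sink = instr -> instr.apply(new Interp() {
            @Override
            public void loadKeywords(String url, int features) {
                System.out.println(url + " features=" + Integer.toBinaryString(features));
            }
        });
        sink.accept(new LoadKw("https://example.com/", 0b101));
    }
}
```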
@@ -130,7 +130,7 @@ public class InstructionWriterFactory {
         }
 
         @Override
-        public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+        public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
             keywords++;
         }
 
@@ -31,11 +31,16 @@ public class DocumentsCompiler {
         }
     }
 
-    public void compileWords(Consumer<Instruction> instructionConsumer, ProcessedDocument doc) {
+    public void compileWords(Consumer<Instruction> instructionConsumer,
+                             ProcessedDocument doc) {
         var words = doc.words;
 
         if (words != null) {
-            instructionConsumer.accept(new LoadKeywords(doc.url, doc.details.metadata, words.build()));
+            instructionConsumer.accept(new LoadKeywords(doc.url,
+                    HtmlFeature.encode(doc.details.features),
+                    doc.details.metadata,
+                    words.build())
+            );
         }
     }
 
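HtmlFeature.encode folds the document's feature set into the single int that now travels from the converter all the way into the forward index. A sketch of the presumed bit-mask convention, consistent with the getFeatureBit() calls in the valuator; the enum constants and ordinal-based bits below are stand-ins, not the real definitions:

```java
import java.util.EnumSet;
import java.util.Set;

// Stand-in for nu.marginalia.model.crawl.HtmlFeature; assumes one bit per feature.
enum DemoHtmlFeature {
    TRACKING, TRACKING_ADTECH, AFFILIATE_LINK, KEBAB_CASE_URL, LONG_URL;

    int getFeatureBit() { return 1 << ordinal(); }

    static int encode(Set<DemoHtmlFeature> features) {
        int mask = 0;
        for (var f : features) mask |= f.getFeatureBit();
        return mask;
    }
}

public class FeatureMaskDemo {
    public static void main(String[] args) {
        int mask = DemoHtmlFeature.encode(EnumSet.of(
                DemoHtmlFeature.TRACKING, DemoHtmlFeature.AFFILIATE_LINK));
        // Membership is tested the same way DocumentMetadata.hasFlags does it:
        boolean tracks = (mask & DemoHtmlFeature.TRACKING.getFeatureBit()) != 0;
        System.out.println(Integer.toBinaryString(mask) + " tracking=" + tracks);
    }
}
```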
@@ -17,7 +17,7 @@ import nu.marginalia.model.crawl.UrlIndexingState;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
-import java.io.IOException;
+import java.io.*;
 import java.nio.file.Path;
 import java.time.LocalTime;
 import java.util.*;
@@ -143,4 +143,5 @@ public class ConvertingIntegrationTest {
 
         return SerializableCrawlDataStream.fromIterator(data.iterator());
     }
+
 }
@@ -18,7 +18,11 @@ public class IndexLoadKeywords implements Runnable {
     private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
     private final LoaderIndexJournalWriter journalWriter;
 
-    private record InsertTask(int urlId, int domainId, DocumentMetadata metadata, DocumentKeywords wordSet) {}
+    private record InsertTask(int urlId,
+                              int domainId,
+                              int features,
+                              DocumentMetadata metadata,
+                              DocumentKeywords wordSet) {}
 
     private final Thread runThread;
 
@@ -36,7 +40,10 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);
             if (data != null) {
-                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.metadata(), data.wordSet);
+                journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId),
+                        data.features,
+                        data.metadata(),
+                        data.wordSet);
             }
         }
     }
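The feature mask rides through the loader's bounded insert queue inside the widened InsertTask record, and the run loop polls with a one-second timeout so the canceled flag gets rechecked regularly. A minimal sketch of that hand-off with simplified types:

```java
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

// Simplified producer/consumer hand-off mirroring IndexLoadKeywords.
public class InsertQueueDemo {
    record Task(int urlId, int domainId, int features) {}

    public static void main(String[] args) throws InterruptedException {
        var queue = new LinkedBlockingQueue<Task>(32);

        Thread consumer = new Thread(() -> {
            try {
                // Poll with a timeout so a cancellation flag could be rechecked periodically
                var task = queue.poll(1, TimeUnit.SECONDS);
                if (task != null)
                    System.out.println("writing features=" + task.features());
            } catch (InterruptedException ignored) {}
        });
        consumer.start();

        queue.put(new Task(1, 42, 0b101));  // blocks if the queue is full
        consumer.join();
    }
}
```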
@@ -49,7 +56,11 @@ public class IndexLoadKeywords implements Runnable {
         }
     }
 
-    public void load(LoaderData loaderData, EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException {
+    public void load(LoaderData loaderData,
+                     EdgeUrl url,
+                     int features,
+                     DocumentMetadata metadata,
+                     DocumentKeywords words) throws InterruptedException {
         int domainId = loaderData.getDomainId(url.domain);
         int urlId = loaderData.getUrlId(url);
 
@@ -58,6 +69,6 @@ public class IndexLoadKeywords implements Runnable {
             return;
         }
 
-        insertQueue.put(new InsertTask(urlId, domainId, metadata, words));
+        insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words));
     }
 }
@@ -103,9 +103,9 @@ public class Loader implements Interpreter, AutoCloseable {
     }
 
     @Override
-    public void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words) {
+    public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {
         try {
-            indexLoadKeywords.load(data, url, metadata, words);
+            indexLoadKeywords.load(data, url, features, metadata, words);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
@@ -60,6 +60,7 @@ public class LoaderIndexJournalWriter {
 
     @SneakyThrows
     public void putWords(EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
+                         int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
         if (wordSet.keywords().length == 0) {
@@ -76,10 +77,10 @@ public class LoaderIndexJournalWriter {
         // with a chonky work queue is a fairly decent improvement
         for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) {
             try {
-                keywordInsertionExecutor.submit(() -> loadWords(domain, url, metadata, chunk));
+                keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk));
             }
             catch (RejectedExecutionException ex) {
-                loadWords(domain, url, metadata, chunk);
+                loadWords(domain, url, features, metadata, chunk);
             }
         }
 
@@ -87,6 +88,7 @@ public class LoaderIndexJournalWriter {
 
     private void loadWords(EdgeId<EdgeDomain> domain,
                            EdgeId<EdgeUrl> url,
+                           int features,
                            DocumentMetadata metadata,
                            DocumentKeywords wordSet) {
         if (null == metadata) {
@@ -95,7 +97,7 @@ public class LoaderIndexJournalWriter {
         }
 
         var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata()));
-        var header = new IndexJournalEntryHeader(domain, url, metadata.encode());
+        var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode());
 
         indexWriter.put(header, entry);
     }
@@ -196,6 +196,9 @@ public class SearchIndex {
     public long getDocumentMetadata(long docId) {
         return indexReader.getDocumentMetadata(docId);
     }
+    public int getHtmlFeatures(long docId) {
+        return indexReader.getHtmlFeatures(docId);
+    }
 
     public int getDomainId(long docId) {
         return indexReader.getDomainId(docId);
@@ -67,4 +67,8 @@ public class SearchIndexReader {
     public int totalDocCount() {
         return forwardIndexReader.totalDocCount();
     }
+
+    public int getHtmlFeatures(long docId) {
+        return forwardIndexReader.getHtmlFeatures(docId);
+    }
 }
@@ -34,6 +34,10 @@ public class IndexMetadataService {
         return index.getDocumentMetadata(urlId);
     }
 
+    public int getHtmlFeatures(long urlId) {
+        return index.getHtmlFeatures(urlId);
+    }
+
     public int getDomainId(long urlId) {
         return index.getDomainId(urlId);
     }
@@ -59,6 +59,7 @@ public class IndexResultValuator {
         searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
 
         long docMetadata = metadataService.getDocumentMetadata(urlIdInt);
+        int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt);
 
         int maxFlagsCount = 0;
         boolean anyAllSynthetic = false;
@@ -85,6 +86,7 @@ public class IndexResultValuator {
                     searchTerm,
                     metadata,
                     docMetadata,
+                    htmlFeatures,
                     resultsWithPriorityTerms.contains(searchResult.combinedId)
             );
 
@@ -177,7 +177,7 @@ public class IndexQueryServiceIntegrationTest {
 
         long fullId = id | ((long) (32 - (id % 32)) << 32);
 
-        var header = new IndexJournalEntryHeader(factors.length, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {
@@ -190,7 +190,7 @@ public class IndexQueryServiceIntegrationTest {
 
     public void loadDataWithDomain(int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, id | ((long) domain << 32), DocumentMetadata.defaultValue());
+        var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue());
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {