mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Fix a bug in document metadata encoding that broke year-based queries.
This commit is contained in:
parent
ec7ce7b0b3
commit
2ab26f37b8
@ -62,8 +62,8 @@ public record DocumentMetadata(int avgSentLength,
|
||||
this(defaultValue());
|
||||
}
|
||||
|
||||
public DocumentMetadata(int avgSentLength, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
|
||||
this(avgSentLength, 0, 0, 0, year, sets, quality, encodeFlags(flags));
|
||||
public DocumentMetadata(int avgSentLength, int year, int quality, EnumSet<DocumentFlags> flags) {
|
||||
this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags));
|
||||
}
|
||||
|
||||
public DocumentMetadata withSize(int size, int topology) {
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -73,7 +74,7 @@ class DocumentMetadataTest {
|
||||
|
||||
@Test
|
||||
public void encRank() {
|
||||
var meta = new DocumentMetadata(0, 5, 22, 8, EnumSet.noneOf(DocumentFlags.class))
|
||||
var meta = new DocumentMetadata(0, 22, 8, EnumSet.noneOf(DocumentFlags.class))
|
||||
.withSize(0xffffffff, 5).encode();
|
||||
var enc2 = DocumentMetadata.encodeRank(meta, 83);
|
||||
|
||||
@ -81,4 +82,24 @@ class DocumentMetadataTest {
|
||||
assertEquals(5, DocumentMetadata.decodeTopology(enc2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testYear() {
|
||||
for (int year = 1996; year < 2023; year++) {
|
||||
var meta = new DocumentMetadata(~0, new PubDate(null, year).yearByte(), ~0, EnumSet.allOf(DocumentFlags.class))
|
||||
.withSize(~0, ~0);
|
||||
|
||||
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);
|
||||
|
||||
assertEquals(year, DocumentMetadata.decodeYear(encoded));
|
||||
}
|
||||
|
||||
for (int year = 1996; year < 2023; year++) {
|
||||
var meta = new DocumentMetadata(0, new PubDate(null, year).yearByte(), 0, EnumSet.noneOf(DocumentFlags.class))
|
||||
.withSize(0, 0);
|
||||
|
||||
var encoded = DocumentMetadata.encodeRank(meta.encode(), 0);
|
||||
|
||||
assertEquals(year, DocumentMetadata.decodeYear(encoded));
|
||||
}
|
||||
}
|
||||
}
|
@ -39,20 +39,20 @@ class ResultValuatorTest {
|
||||
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
|
||||
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
|
||||
@ -76,8 +76,8 @@ class ResultValuatorTest {
|
||||
System.out.println(highCountSubject);
|
||||
}
|
||||
|
||||
private long docMetadata(int topology, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
|
||||
return new DocumentMetadata(topology, PubDate.toYearByte(year), sets, quality, flags).encode();
|
||||
private long docMetadata(int topology, int year, int quality, EnumSet<DocumentFlags> flags) {
|
||||
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
|
||||
}
|
||||
|
||||
private long wordMetadata(Set<Integer> positions, Set<WordFlags> wordFlags) {
|
||||
|
@ -28,6 +28,8 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.pubdate.PubDateSniffer;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
@ -40,6 +42,7 @@ import static nu.marginalia.converting.model.DisqualifiedException.*;
|
||||
|
||||
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final double minDocumentQuality;
|
||||
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
@ -131,7 +134,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
ret.metadata = new DocumentMetadata(
|
||||
documentLengthLogic.getEncodedAverageLength(dld),
|
||||
url.depth(), pubDate.yearByte(), (int) -ret.quality, documentFlags);
|
||||
pubDate.yearByte(), (int) -ret.quality, documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||
|
||||
|
@ -92,7 +92,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
|
||||
|
||||
ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
|
||||
url.depth(), pubDate.yearByte(), (int) -ret.quality, documentFlags);
|
||||
pubDate.yearByte(), (int) -ret.quality, documentFlags);
|
||||
|
||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||
|
||||
|
@ -5,11 +5,13 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.bigstring.BigString;
|
||||
import nu.marginalia.converting.model.HtmlStandard;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -47,6 +49,19 @@ public class ConvertingIntegrationTest {
|
||||
assertTrue(ret.documents.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
|
||||
var ret = domainProcessor.process(readMarginaliaWorkingSet());
|
||||
ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
|
||||
int year = PubDate.fromYearByte(doc.details.metadata.year());
|
||||
Integer yearMeta = doc.details.pubYear;
|
||||
if (yearMeta != null) {
|
||||
assertEquals(year, (int) yearMeta, doc.url.toString());
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMemexMarginaliaNu() throws IOException {
|
||||
var ret = domainProcessor.process(readMarginaliaWorkingSet());
|
||||
|
Loading…
Reference in New Issue
Block a user