Bug fix for document metadata encoding that breaks year-based queries.

This commit is contained in:
Viktor Lofgren 2023-04-14 16:56:49 +02:00
parent ec7ce7b0b3
commit 2ab26f37b8
6 changed files with 49 additions and 10 deletions

View File

@ -62,8 +62,8 @@ public record DocumentMetadata(int avgSentLength,
this(defaultValue());
}
public DocumentMetadata(int avgSentLength, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
this(avgSentLength, 0, 0, 0, year, sets, quality, encodeFlags(flags));
public DocumentMetadata(int avgSentLength, int year, int quality, EnumSet<DocumentFlags> flags) {
this(avgSentLength, 0, 0, 0, year, 0, quality, encodeFlags(flags));
}
public DocumentMetadata withSize(int size, int topology) {

View File

@ -1,6 +1,7 @@
package nu.marginalia.model;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import org.junit.jupiter.api.Test;
@ -73,7 +74,7 @@ class DocumentMetadataTest {
@Test
public void encRank() {
var meta = new DocumentMetadata(0, 5, 22, 8, EnumSet.noneOf(DocumentFlags.class))
var meta = new DocumentMetadata(0, 22, 8, EnumSet.noneOf(DocumentFlags.class))
.withSize(0xffffffff, 5).encode();
var enc2 = DocumentMetadata.encodeRank(meta, 83);
@ -81,4 +82,24 @@ class DocumentMetadataTest {
assertEquals(5, DocumentMetadata.decodeTopology(enc2));
}
@Test
public void testYear() {
    // Round-trip every year in [1996, 2022] through encode() -> encodeRank(.., 0) -> decodeYear(),
    // first with ~0 passed for every other numeric argument and all flags set.
    // NOTE(review): presumably this guards against neighbouring fields bleeding into the year bits — confirm.
    for (int year = 1996; year < 2023; year++) {
        var yearByte = new PubDate(null, year).yearByte();
        var saturated = new DocumentMetadata(~0, yearByte, ~0, EnumSet.allOf(DocumentFlags.class));
        var packed = saturated.withSize(~0, ~0).encode();
        var ranked = DocumentMetadata.encodeRank(packed, 0);

        assertEquals(year, DocumentMetadata.decodeYear(ranked));
    }

    // Same round-trip, this time with 0 for every other numeric argument and no flags set.
    for (int year = 1996; year < 2023; year++) {
        var yearByte = new PubDate(null, year).yearByte();
        var blank = new DocumentMetadata(0, yearByte, 0, EnumSet.noneOf(DocumentFlags.class));
        var packed = blank.withSize(0, 0).encode();
        var ranked = DocumentMetadata.encodeRank(packed, 0);

        assertEquals(year, DocumentMetadata.decodeYear(ranked));
    }
}
}

View File

@ -39,20 +39,20 @@ class ResultValuatorTest {
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
false)
);
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
false)
);
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
false)
);
@ -76,8 +76,8 @@ class ResultValuatorTest {
System.out.println(highCountSubject);
}
private long docMetadata(int topology, int year, int sets, int quality, EnumSet<DocumentFlags> flags) {
return new DocumentMetadata(topology, PubDate.toYearByte(year), sets, quality, flags).encode();
private long docMetadata(int topology, int year, int quality, EnumSet<DocumentFlags> flags) {
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
}
private long wordMetadata(Set<Integer> positions, Set<WordFlags> wordFlags) {

View File

@ -28,6 +28,8 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.pubdate.PubDateSniffer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URISyntaxException;
import java.nio.file.Path;
@ -40,6 +42,7 @@ import static nu.marginalia.converting.model.DisqualifiedException.*;
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final double minDocumentQuality;
private final SentenceExtractor sentenceExtractor;
@ -131,7 +134,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.metadata = new DocumentMetadata(
documentLengthLogic.getEncodedAverageLength(dld),
url.depth(), pubDate.yearByte(), (int) -ret.quality, documentFlags);
pubDate.yearByte(), (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);

View File

@ -92,7 +92,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld),
url.depth(), pubDate.yearByte(), (int) -ret.quality, documentFlags);
pubDate.yearByte(), (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);

View File

@ -5,11 +5,13 @@ import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -47,6 +49,19 @@ public class ConvertingIntegrationTest {
assertTrue(ret.documents.isEmpty());
}
@Test
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
    var processed = domainProcessor.process(readMarginaliaWorkingSet());

    // For each fully processed document, the year decoded from the metadata's year byte
    // must equal details.pubYear whenever pubYear is present.
    processed.documents.stream()
            .filter(ProcessedDocument::isProcessedFully)
            .forEach(document -> {
                int decodedYear = PubDate.fromYearByte(document.details.metadata.year());
                Integer declaredYear = document.details.pubYear;

                // Documents without a pubYear have nothing to compare against.
                if (declaredYear == null) {
                    return;
                }

                // The URL is the failure message so an inconsistent document can be identified.
                assertEquals(decodedYear, declaredYear.intValue(), document.url.toString());
            });
}
@Test
public void testMemexMarginaliaNu() throws IOException {
var ret = domainProcessor.process(readMarginaliaWorkingSet());