(converter) Route sizeHint to SideloadProcessing

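Route the sizeHint from the input parquet file to SideloadProcessing, so that it can set sideloadSizeAdvice from the actual size of the input instead of always using a fixed "large" value of 10,000. If the sizeHint is 0, the previous value of 10,000 is still used as a fallback.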

This is necessary to populate the KNOWN_URL column in the domain data table, which is important because it is used, for example, to calculate how far to re-crawl the site in the future.
This commit is contained in:
Viktor Lofgren 2023-12-30 13:05:10 +01:00
parent 0b112cb4d4
commit 7ba296ccdf
2 changed files with 10 additions and 10 deletions

View File

@ -57,19 +57,20 @@ public class DomainProcessor {
} }
public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) { public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) {
if (domain.sizeHint() > 10_000) { final int sizeHint = domain.sizeHint();
if (sizeHint > 10_000) {
// If the file is too big, we run a processing mode that doesn't // If the file is too big, we run a processing mode that doesn't
// require loading the entire dataset into RAM // require loading the entire dataset into RAM
logger.info("Sideloading {}", domain.path()); return sideloadProcessing(domain, sizeHint);
return sideloadProcessing(domain);
} }
return fullProcessing(domain); return fullProcessing(domain);
} }
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) { public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
try { try {
return new SideloadProcessing(dataStream); return new SideloadProcessing(dataStream, sizeHint);
} }
catch (Exception ex) { catch (Exception ex) {
logger.warn("Failed to process domain sideload", ex); logger.warn("Failed to process domain sideload", ex);
@ -86,17 +87,16 @@ public class DomainProcessor {
private final DomainLinks externalDomainLinks; private final DomainLinks externalDomainLinks;
private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
this.dataStream = dataStream; this.dataStream = dataStream;
if (!dataStream.hasNext() if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
|| !(dataStream.next() instanceof CrawledDomain crawledDomain))
{ {
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
} }
domain = new ProcessedDomain(); domain = new ProcessedDomain();
domain.sizeloadSizeAdvice = 10_000; domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
documentDecorator = new DocumentDecorator(anchorTextKeywords); documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, domain, documentDecorator); processDomain(crawledDomain, domain, documentDecorator);

View File

@ -96,7 +96,7 @@ public class ConvertingIntegrationTest {
@Test @Test
public void testMemexMarginaliaNuSideloadProcessing() throws IOException { public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
assertNotNull(ret); assertNotNull(ret);
assertEquals("memex.marginalia.nu", ret.id()); assertEquals("memex.marginalia.nu", ret.id());