diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 00f7aa51..3e954637 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.model; import lombok.ToString; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; +import org.jetbrains.annotations.Nullable; import java.util.List; import java.util.Optional; @@ -16,6 +17,12 @@ public class ProcessedDomain { public EdgeDomain redirect; public String ip; + + /** Used by the sideloader to give advice on how many documents are crawled + * without actually having to count (which would take forever) */ + @Nullable + public Integer sizeloadSizeAdvice; + public int size() { return Optional.ofNullable(documents).map(List::size).orElse(1); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java index 07cf780b..a8e729e3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java @@ -49,6 +49,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable { ret.domain = new EdgeDomain(domainName); ret.ip = "0.0.0.0"; ret.state = DomainIndexingState.ACTIVE; + ret.sizeloadSizeAdvice = 1000; return ret; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 8ca80c45..f0686b4c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -69,7 +69,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC ret.domain = baseUrl.getDomain(); ret.ip = "0.0.0.0"; ret.state = DomainIndexingState.ACTIVE; - + ret.sizeloadSizeAdvice = 5_000_000; return ret; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index a39bdab8..46ad47e2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -54,6 +54,13 @@ public class StackexchangeSideloader implements SideloadSource { ret.ip = "127.0.0.1"; ret.state = DomainIndexingState.ACTIVE; + if (domainName.contains("stackoverflow.com")) { + ret.sizeloadSizeAdvice = 5_000_000; + } + else { + ret.sizeloadSizeAdvice = 1000; + } + return ret; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java index 2d8c1bda..97406ff0 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -53,6 +53,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { ret.domain = domain; ret.ip = "0.0.0.0"; ret.state = DomainIndexingState.ACTIVE; + ret.sizeloadSizeAdvice = 1000; return ret; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index cc9f0467..239d748c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -224,6 +224,13 @@ public class ConverterBatchWriter implements AutoCloseable { record DomainMetadata(int known, int good, int visited) { public static DomainMetadata from(ProcessedDomain domain) { + if (domain.sizeloadSizeAdvice != null) { + return new DomainMetadata( + domain.sizeloadSizeAdvice, + domain.sizeloadSizeAdvice, + domain.sizeloadSizeAdvice + ); + } var documents = domain.documents; if (documents == null) {