(converter) Add sizeloadSizeAdvice field to ProcessedDomain and set it in several sideloaders

Since the sideloaders don't populate the documents list in ProcessedDomain (to keep the memory footprint manageable), the code that estimates knownUrls etc. sets those counts to zero, which has negative effects on the ranking of the sideloaded domains. This change populates the counts with a rough placeholder value within a sane ballpark, ensuring that these domains show up in the rankings.
This commit is contained in:
Viktor Lofgren 2023-12-19 18:37:51 +01:00
parent 5bd3934d22
commit dd8fb04886
6 changed files with 24 additions and 1 deletions

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.jetbrains.annotations.Nullable;
import java.util.List;
import java.util.Optional;
@ -16,6 +17,12 @@ public class ProcessedDomain {
public EdgeDomain redirect;
public String ip;
/**
 * Size hint supplied by the sideloader: advice on how many documents are crawled
 * for this domain, so that the count does not have to be computed
 * (which would take forever).
 * <p>
 * May be {@code null}, in which case no advice has been given.
 * <p>
 * NOTE(review): the name looks like a typo for "sideloadSizeAdvice" — confirm before
 * renaming, since other classes assign this public field directly.
 */
@Nullable
public Integer sizeloadSizeAdvice;
/**
 * Reports how many documents this domain holds.
 *
 * @return the number of entries in {@code documents}, or {@code 1} when
 *         {@code documents} is {@code null}
 */
public int size() {
    // A missing document list is reported as size 1, not 0.
    if (documents == null) {
        return 1;
    }
    return documents.size();
}

View File

@ -49,6 +49,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
ret.domain = new EdgeDomain(domainName);
ret.ip = "0.0.0.0";
ret.state = DomainIndexingState.ACTIVE;
ret.sizeloadSizeAdvice = 1000;
return ret;
}

View File

@ -69,7 +69,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
ret.domain = baseUrl.getDomain();
ret.ip = "0.0.0.0";
ret.state = DomainIndexingState.ACTIVE;
ret.sizeloadSizeAdvice = 5_000_000;
return ret;
}

View File

@ -54,6 +54,13 @@ public class StackexchangeSideloader implements SideloadSource {
ret.ip = "127.0.0.1";
ret.state = DomainIndexingState.ACTIVE;
if (domainName.contains("stackoverflow.com")) {
ret.sizeloadSizeAdvice = 5_000_000;
}
else {
ret.sizeloadSizeAdvice = 1000;
}
return ret;
}

View File

@ -53,6 +53,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
ret.domain = domain;
ret.ip = "0.0.0.0";
ret.state = DomainIndexingState.ACTIVE;
ret.sizeloadSizeAdvice = 1000;
return ret;
}

View File

@ -224,6 +224,13 @@ public class ConverterBatchWriter implements AutoCloseable {
record DomainMetadata(int known, int good, int visited) {
public static DomainMetadata from(ProcessedDomain domain) {
if (domain.sizeloadSizeAdvice != null) {
return new DomainMetadata(
domain.sizeloadSizeAdvice,
domain.sizeloadSizeAdvice,
domain.sizeloadSizeAdvice
);
}
var documents = domain.documents;
if (documents == null) {