mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(converter) Add sizeloadSizeAdvice field to ProcessedDomain and set it in several sideloaders
Since the sideloaders don't populate the documents list in ProcessedDomain (to keep the memory footprint manageable), the code that estimates knownUrls etc. sets those counts to zero, which has negative effects on the ranking of sideloaded domains. This change populates the counts with a made-up value within a sane ballpark, ensuring that these domains show up in the rankings.
This commit is contained in:
parent
5bd3934d22
commit
dd8fb04886
@ -3,6 +3,7 @@ package nu.marginalia.converting.model;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
@ -16,6 +17,12 @@ public class ProcessedDomain {
|
||||
public EdgeDomain redirect;
|
||||
public String ip;
|
||||
|
||||
|
||||
/** Used by the sideloader to give advice on how many documents are crawled
|
||||
* without actually having to count (which would take forever) */
|
||||
@Nullable
|
||||
public Integer sizeloadSizeAdvice;
|
||||
|
||||
public int size() {
|
||||
return Optional.ofNullable(documents).map(List::size).orElse(1);
|
||||
}
|
||||
|
@ -49,6 +49,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
ret.domain = new EdgeDomain(domainName);
|
||||
ret.ip = "0.0.0.0";
|
||||
ret.state = DomainIndexingState.ACTIVE;
|
||||
ret.sizeloadSizeAdvice = 1000;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -69,7 +69,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
ret.domain = baseUrl.getDomain();
|
||||
ret.ip = "0.0.0.0";
|
||||
ret.state = DomainIndexingState.ACTIVE;
|
||||
|
||||
ret.sizeloadSizeAdvice = 5_000_000;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -54,6 +54,13 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
ret.ip = "127.0.0.1";
|
||||
ret.state = DomainIndexingState.ACTIVE;
|
||||
|
||||
if (domainName.contains("stackoverflow.com")) {
|
||||
ret.sizeloadSizeAdvice = 5_000_000;
|
||||
}
|
||||
else {
|
||||
ret.sizeloadSizeAdvice = 1000;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -53,6 +53,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
ret.domain = domain;
|
||||
ret.ip = "0.0.0.0";
|
||||
ret.state = DomainIndexingState.ACTIVE;
|
||||
ret.sizeloadSizeAdvice = 1000;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -224,6 +224,13 @@ public class ConverterBatchWriter implements AutoCloseable {
|
||||
record DomainMetadata(int known, int good, int visited) {
|
||||
|
||||
public static DomainMetadata from(ProcessedDomain domain) {
|
||||
if (domain.sizeloadSizeAdvice != null) {
|
||||
return new DomainMetadata(
|
||||
domain.sizeloadSizeAdvice,
|
||||
domain.sizeloadSizeAdvice,
|
||||
domain.sizeloadSizeAdvice
|
||||
);
|
||||
}
|
||||
|
||||
var documents = domain.documents;
|
||||
if (documents == null) {
|
||||
|
Loading…
Reference in New Issue
Block a user