mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(sideload) Fix sideloading so that sideloaded documents don't get disproportionately good rankings
Also add type flags so that e.g. Wikipedia shows up in the wikis filter.
This commit is contained in:
parent
e9a01caa5c
commit
e5cee1f46d
@ -3,15 +3,22 @@ package nu.marginalia.converting.sideload;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||||
|
import nu.marginalia.model.html.HtmlStandard;
|
||||||
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@ -27,6 +34,7 @@ public class SideloaderProcessing {
|
|||||||
String body,
|
String body,
|
||||||
List<String> extraKeywords,
|
List<String> extraKeywords,
|
||||||
DomainLinks domainLinks,
|
DomainLinks domainLinks,
|
||||||
|
GeneratorType type,
|
||||||
int size) throws URISyntaxException {
|
int size) throws URISyntaxException {
|
||||||
var crawledDoc = new CrawledDocument(
|
var crawledDoc = new CrawledDocument(
|
||||||
"encyclopedia.marginalia.nu",
|
"encyclopedia.marginalia.nu",
|
||||||
@ -55,6 +63,27 @@ public class SideloaderProcessing {
|
|||||||
|
|
||||||
ret.details = details.details();
|
ret.details = details.details();
|
||||||
|
|
||||||
|
// Add a few things that we know about the document
|
||||||
|
// that we can't get from the sideloaded data since it's
|
||||||
|
// so stripped down
|
||||||
|
|
||||||
|
ret.details.standard = HtmlStandard.HTML5;
|
||||||
|
ret.details.pubYear = LocalDateTime.now().getYear();
|
||||||
|
ret.details.features.add(HtmlFeature.JS);
|
||||||
|
ret.details.features.add(HtmlFeature.TRACKING);
|
||||||
|
ret.details.quality = -10;
|
||||||
|
ret.details.generator = type;
|
||||||
|
|
||||||
|
ret.details.metadata = new DocumentMetadata(3,
|
||||||
|
PubDate.toYearByte(ret.details.pubYear),
|
||||||
|
(int) -ret.details.quality,
|
||||||
|
switch (type) {
|
||||||
|
case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki);
|
||||||
|
case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs);
|
||||||
|
default -> EnumSet.noneOf(DocumentFlags.class);
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
|
// FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org
|
||||||
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
|
// We don't have access to the article name at this point to generate an equivalent URL... It's not a huge
|
||||||
// deal but something to keep in mind
|
// deal but something to keep in mind
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload.dirtree;
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.converting.sideload.SideloadSource;
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
@ -79,7 +80,9 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return sideloaderProcessing
|
return sideloaderProcessing
|
||||||
.processDocument(url, body, extraKeywords, new DomainLinks(), 10_000);
|
.processDocument(url, body, extraKeywords, new DomainLinks(),
|
||||||
|
GeneratorType.DOCS,
|
||||||
|
10_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -6,6 +6,7 @@ import lombok.SneakyThrows;
|
|||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||||
import nu.marginalia.converting.model.DisqualifiedException;
|
import nu.marginalia.converting.model.DisqualifiedException;
|
||||||
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.converting.sideload.SideloadSource;
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
@ -184,6 +185,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
fullHtml.toString(),
|
fullHtml.toString(),
|
||||||
List.of("encyclopedia", "wiki"),
|
List.of("encyclopedia", "wiki"),
|
||||||
domainLinks,
|
domainLinks,
|
||||||
|
GeneratorType.WIKI,
|
||||||
10_000_000);
|
10_000_000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,9 +126,13 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
|
|
||||||
ret.details = new ProcessedDocumentDetails();
|
ret.details = new ProcessedDocumentDetails();
|
||||||
ret.details.pubYear = post.year();
|
ret.details.pubYear = post.year();
|
||||||
ret.details.quality = 10;
|
ret.details.quality = -10;
|
||||||
ret.details.metadata = new DocumentMetadata(3,
|
ret.details.metadata = new DocumentMetadata(3,
|
||||||
PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class));
|
PubDate.toYearByte(ret.details.pubYear),
|
||||||
|
(int) -ret.details.quality,
|
||||||
|
EnumSet.of(DocumentFlags.GeneratorDocs));
|
||||||
|
ret.details.features.add(HtmlFeature.JS);
|
||||||
|
ret.details.features.add(HtmlFeature.TRACKING);
|
||||||
|
|
||||||
ret.details.metadata.withSizeAndTopology(10000, 0);
|
ret.details.metadata.withSizeAndTopology(10000, 0);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user