From e5cee1f46d411c17702b279491f91227d76f4b9e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 12 Nov 2023 14:56:26 +0100 Subject: [PATCH] (sideload) Fix sideloading so that it doesn't get disproportionately good rankings Also add type flags so that e.g. wikipedia shows up in the wikis filter. --- .../sideload/SideloaderProcessing.java | 29 +++++++++++++++++++ .../sideload/dirtree/DirtreeSideloader.java | 5 +++- .../EncyclopediaMarginaliaNuSideloader.java | 2 ++ .../StackexchangeSideloader.java | 8 +++-- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 576d7031..a7aa70ba 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -3,15 +3,22 @@ package nu.marginalia.converting.sideload; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; import java.net.URISyntaxException; import java.time.LocalDateTime; +import java.util.EnumSet; import java.util.List; @Singleton @@ -27,6 +34,7 @@ public class SideloaderProcessing { String body, List extraKeywords, DomainLinks domainLinks, + GeneratorType type, int size) throws URISyntaxException { var crawledDoc = new CrawledDocument( "encyclopedia.marginalia.nu", @@ -55,6 +63,27 @@ public class SideloaderProcessing { ret.details = details.details(); + // Add a few things that we know about the document + // that we can't get from the sideloaded data since it's + // so stripped down + + ret.details.standard = HtmlStandard.HTML5; + ret.details.pubYear = LocalDateTime.now().getYear(); + ret.details.features.add(HtmlFeature.JS); + ret.details.features.add(HtmlFeature.TRACKING); + ret.details.quality = -10; + ret.details.generator = type; + + ret.details.metadata = new DocumentMetadata(3, + PubDate.toYearByte(ret.details.pubYear), + (int) -ret.details.quality, + switch (type) { + case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki); + case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs); + default -> EnumSet.noneOf(DocumentFlags.class); + }); + + // FIXME (2023-11-06): For encyclopedia loading, this will likely only work when the domain specified is en.wikipedia.org // We don't have access to the article name at this point to generate an equivalent URL... It's not a huge // deal but something to keep in mind diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java index b5542e9b..07cf780b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload.dirtree; import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; @@ -79,7 +80,9 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable { } return sideloaderProcessing - .processDocument(url, body, extraKeywords, new DomainLinks(), 10_000); + .processDocument(url, body, extraKeywords, new DomainLinks(), + GeneratorType.DOCS, + 10_000); } @Override diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 490bf56a..aab62ef9 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; @@ -184,6 +185,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC fullHtml.toString(), List.of("encyclopedia", "wiki"), domainLinks, + GeneratorType.WIKI, 10_000_000); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index 22d9eb33..b96d4a1b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -126,9 +126,13 @@ public class StackexchangeSideloader implements SideloadSource { ret.details = new ProcessedDocumentDetails(); ret.details.pubYear = post.year(); - ret.details.quality = 10; + ret.details.quality = -10; ret.details.metadata = new DocumentMetadata(3, - PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.noneOf(DocumentFlags.class)); + PubDate.toYearByte(ret.details.pubYear), + (int) -ret.details.quality, + EnumSet.of(DocumentFlags.GeneratorDocs)); + ret.details.features.add(HtmlFeature.JS); + ret.details.features.add(HtmlFeature.TRACKING); ret.details.metadata.withSizeAndTopology(10000, 0);