From acf7bcc7a6a938d69f32dbf44a454c024addb323 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 27 Dec 2023 13:57:59 +0100 Subject: [PATCH 01/61] (converter) Refactor the DomainProcessor for new format of crawl data With the new crawler modifications, the crawl data comes in a slightly different order, and a result of this is that we can optimize the converter. This is a breaking change that will be incompatible with the old style of crawl data, hence it will linger as a branch for a while. The first step is to move stuff out of the domain processor into the document processor. --- .../processor/DocumentDecorator.java | 33 ++++ .../processor/DocumentProcessor.java | 17 ++- .../converting/processor/DomainProcessor.java | 143 ++++++++---------- .../logic/LshDocumentDeduplicator.java | 73 +++------ .../converting/ConvertingIntegrationTest.java | 5 +- 5 files changed, 134 insertions(+), 137 deletions(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java new file mode 100644 index 00000000..d3002df2 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java @@ -0,0 +1,33 @@ +package nu.marginalia.converting.processor; + +import nu.marginalia.atags.AnchorTextKeywords; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.converting.model.ProcessedDocument; + +import java.util.HashSet; +import java.util.Set; + +public class DocumentDecorator { + private final Set extraSearchTerms = new HashSet<>(); + private final AnchorTextKeywords keywords; + private final DomainLinks externalDomainLinks; + + public DocumentDecorator(AnchorTextKeywords keywords, DomainLinks externalDomainLinks) { + 
this.keywords = keywords; + this.externalDomainLinks = externalDomainLinks; + } + + public void addTerm(String term) { + extraSearchTerms.add(term); + } + + public void apply(ProcessedDocument doc) { + if (doc == null) + return; + if (doc.words == null) + return; + + doc.words.addAllSyntheticTerms(extraSearchTerms); + doc.words.addAnchorTerms(keywords.getAnchorTextKeywords(externalDomainLinks, doc.url)); + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 4b5d9173..d10da715 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocument; @@ -38,7 +39,7 @@ public class DocumentProcessor { processorPlugins.add(plainTextDocumentProcessorPlugin); } - public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) { + public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) { ProcessedDocument ret = new ProcessedDocument(); try { @@ -51,7 +52,7 @@ public class DocumentProcessor { default -> DocumentClass.EXTERNALLY_LINKED_MULTI; }; - processDocument(crawledDocument, documentClass, ret); + processDocument(crawledDocument, documentClass, documentDecorator, ret); } catch (DisqualifiedException ex) { 
ret.state = UrlIndexingState.DISQUALIFIED; @@ -67,7 +68,7 @@ public class DocumentProcessor { return ret; } - private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -90,6 +91,16 @@ public class DocumentProcessor { ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); + + documentDecorator.apply(ret); + + if (Boolean.TRUE.equals(crawledDocument.hasCookies) + && ret.details != null + && ret.details.features != null) + { + ret.details.features.add(HtmlFeature.COOKIES); + } + } private AbstractDocumentProcessorPlugin findPlugin(CrawledDocument crawledDocument) throws DisqualifiedException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 6e993a24..e8b89e94 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -17,7 +17,6 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; -import nu.marginalia.model.crawl.HtmlFeature; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; @@ -32,7 +31,6 @@ public class 
DomainProcessor { private final SiteWords siteWords; private final AnchorTagsSource anchorTagsSource; private final AnchorTextKeywords anchorTextKeywords; - private final LshDocumentDeduplicator documentDeduplicator; private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -42,12 +40,11 @@ public class DomainProcessor { SiteWords siteWords, AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, - LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException + GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.anchorTextKeywords = anchorTextKeywords; - this.documentDeduplicator = documentDeduplicator; this.anchorTagsSource = anchorTagsSourceFactory.create(); this.geoIpDictionary = geoIpDictionary; @@ -61,117 +58,101 @@ public class DomainProcessor { return null; } - var ret = new ProcessedDomain(); + ProcessedDomain ret = new ProcessedDomain(); List docs = new ArrayList<>(); Set processedUrls = new HashSet<>(); - boolean cookies = false; - String ip = ""; - DomainLinks externalDomainLinks = null; - while (dataStream.hasNext()) { - var data = dataStream.next(); + DocumentDecorator documentDecorator = null; - // Do a lazy load of the external domain links since we don't know the domain - // until we see the first document - if (externalDomainLinks == null) { - var domain = data.getDomain(); + try (var deduplicator = new LshDocumentDeduplicator()){ + while (dataStream.hasNext()) { + var data = dataStream.next(); - if (domain != null) { - externalDomainLinks = anchorTagsSource.getAnchorTags(domain); - } - } + // Do a lazy load of the external domain links since we don't know the domain + // until we see the first document + if (externalDomainLinks == null) { + var domain = data.getDomain(); - if (data instanceof CrawledDomain crawledDomain) { - ret.domain = new 
EdgeDomain(crawledDomain.domain); - ret.ip = crawledDomain.ip; - - cookies = crawledDomain.hasCookies(); - ip = crawledDomain.ip; - - if (crawledDomain.redirectDomain != null) { - ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); - } - ret.documents = docs; - ret.state = getState(crawledDomain.crawlerStatus); - } - else if (data instanceof CrawledDocument doc) { - try { - if (doc.url == null || !processedUrls.add(doc.url)) - continue; - - if (Boolean.TRUE.equals(doc.hasCookies)) { - cookies = true; + if (domain != null) { + externalDomainLinks = anchorTagsSource.getAnchorTags(domain); } - - // This case should never be reachable, as we should have initiated - // the externalDomainLinks variable above if we made it past the - // doc.url == null check; but we'll leave it here just in case - // to make debugging easier if we break this. - assert externalDomainLinks != null : "externalDomainLinks has not been initialized"; - - docs.add(documentProcessor.process(doc, externalDomainLinks)); } - catch (Exception ex) { - logger.warn("Failed to process " + doc.url, ex); + + if (data instanceof CrawledDomain crawledDomain) { + documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); + + ret = processDomain(crawledDomain, ret, documentDecorator); + + ret.documents = docs; + + } else if (data instanceof CrawledDocument doc) { + try { + if (doc.url == null || !processedUrls.add(doc.url)) + continue; + + var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator); + + deduplicator.markIfDuplicate(processedDoc); + + docs.add(processedDoc); + } catch (Exception ex) { + logger.warn("Failed to process " + doc.url, ex); + } } } + } // Add late keywords and features from domain-level information - List terms = new ArrayList<>(); - - addIpInfo(terms, ip); - - if (cookies) { - terms.add(HtmlFeature.COOKIES.getKeyword()); - } - - if (isAcademicDomain(ret.domain)) { - terms.add("special:academia"); - } - - for (var 
document : ret.documents) { - if (document.details == null) - continue; - - if (cookies) { - document.details.features.add(HtmlFeature.COOKIES); - } - - document.words.addAllSyntheticTerms(terms); - - document.words.addAnchorTerms( - anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, document.url) - ); - } - documentDeduplicator.deduplicate(ret.documents); calculateStatistics(ret, externalDomainLinks); return ret; } - private void addIpInfo(List terms, String ip) { - terms.add("ip:"+ip); + private ProcessedDomain processDomain(CrawledDomain crawledDomain, + ProcessedDomain ret, + DocumentDecorator decorator) + { + ret.domain = new EdgeDomain(crawledDomain.domain); + ret.ip = crawledDomain.ip; + + addIpInfo(decorator, crawledDomain.ip); + + if (isAcademicDomain(ret.domain)) { + decorator.addTerm("special:academia"); + } + + if (crawledDomain.redirectDomain != null) { + ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); + } + ret.state = getState(crawledDomain.crawlerStatus); + + return ret; + } + + + private void addIpInfo(DocumentDecorator decorator, String ip) { + decorator.addTerm("ip:"+ip); // Add IP location country as a term String country = geoIpDictionary.getCountry(ip); if (!country.isBlank()) { // use the ip:-prefix as there's no real confusion between e.g. 
ip:127.0.0.1 and ip:uk - terms.add("ip:"+country.toLowerCase()); + decorator.addTerm("ip:"+country.toLowerCase()); } // Add ASN as a term geoIpDictionary.getAsnInfo(ip).ifPresent(asnInfo -> { - terms.add("as:"+asnInfo.asn()); + decorator.addTerm("as:"+asnInfo.asn()); for (var orgPart : StringUtils.split(asnInfo.org(), '-')) { - terms.add("as:"+orgPart.toLowerCase()); + decorator.addTerm("as:"+orgPart.toLowerCase()); } if (isCloudy(asnInfo)) { - terms.add("special:cloud"); + decorator.addTerm("special:cloud"); } }); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java index 275c1a49..2a312544 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java @@ -1,74 +1,43 @@ package nu.marginalia.converting.processor.logic; -import com.google.inject.Singleton; +import gnu.trove.list.array.TLongArrayList; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.lsh.EasyLSH; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; /** Deduplicates documents based on their LSH * * @see EasyLSH */ -@Singleton -public class LshDocumentDeduplicator { +public class LshDocumentDeduplicator implements AutoCloseable { - private final int DISTANCE_THRESHOLD = 2; - private final Logger logger = LoggerFactory.getLogger(getClass()); + private final TLongArrayList hashCodes = new TLongArrayList(1000); + private static final int DISTANCE_THRESHOLD = 2; - public void deduplicate(List documents) { - ProcessedDocument[] goodDocuments = documents.stream() - .filter(ProcessedDocument::isProcessedFully) - 
.filter(doc -> doc.words.size() > 100) - .toArray(ProcessedDocument[]::new); - - long[] hashCodes = new long[goodDocuments.length]; - for (int i = 0; i < goodDocuments.length; i++) { - hashCodes[i] = goodDocuments[i].details.hashCode; + public void markIfDuplicate(ProcessedDocument document) { + if (!document.isProcessedFully()) { + return; } - // These arrays can be fairly large (~10,000) so we need to be - // careful about what we do in this O(n^2) loop + if (document.words.size() < 100) { + return; + } - for (int i = 0; i < hashCodes.length; i++) { - for (int j = 0; j < hashCodes.length; j++) { - // This is basically just a 64 bit XOR and a POPCOUNT so it's pretty fast. - if (EasyLSH.hammingDistance(hashCodes[i], hashCodes[j]) < DISTANCE_THRESHOLD) { - if (i == j) - continue; + long hashCode = document.details.hashCode; - if (flagIfDuplicate(goodDocuments[i], goodDocuments[j])) { - break; - } - } + for (int i = 0; i < hashCodes.size(); i++) { + if (EasyLSH.hammingDistance(hashCode, hashCodes.get(i)) < DISTANCE_THRESHOLD) { + document.state = UrlIndexingState.DISQUALIFIED; + document.stateReason = "Duplicate"; + return; } } + + hashCodes.add(hashCode); } - private boolean flagIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) { - - // This document has already been disqualified as a duplicate - if (thisDoc.state != UrlIndexingState.OK) - return false; - - - // We might consider using thisDoc.details.metadata.topology() here instead of the - // URL length to determine which document is the "better" one. 
- if (thisDoc.url.path.length() - < otherDoc.url.path.length()) - { - logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url); - - otherDoc.state = UrlIndexingState.DISQUALIFIED; - otherDoc.stateReason = "Duplicate"; - - return true; - } - - return false; - + @Override + public void close() throws Exception { + hashCodes.clear(1); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index b7963bdf..09973f1b 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -139,10 +139,13 @@ public class ConvertingIntegrationTest { private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) { List data = new ArrayList<>(); + + data.add(domain); + if (domain.doc != null) { data.addAll(domain.doc); } - data.add(domain); + return SerializableCrawlDataStream.fromIterator(data.iterator()); } From 24051fec03804d6e33ec94a04bc098e37b58b724 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 27 Dec 2023 18:20:03 +0100 Subject: [PATCH 02/61] (converter) WIP Run sideload-style processing for large domains The processor normally retains the domain data in memory after processing to be able to do additional site-wide analysis. This works well, except there are a number of outlier websites that have an absurd number of documents that can rapidly fill up the heap of the process. These websites now receive a simplified treatment. This is executed in the converter batch writer thread. This is slower, but the documents will not be persisted in memory. 
--- .../marginalia/converting/ConverterMain.java | 8 +- .../converting/model/ProcessedDomain.java | 18 +- .../converting/processor/DomainProcessor.java | 155 ++++++++++++++++-- .../writer/ConverterBatchWritableIf.java | 9 + .../writer/ConverterBatchWriter.java | 15 +- .../writer/ConverterBatchWriterIf.java | 15 ++ .../converting/writer/ConverterWriter.java | 7 +- .../converting/ConvertingIntegrationTest.java | 6 +- ...CrawlingThenConvertingIntegrationTest.java | 2 +- .../experiments/SiteStatisticsExperiment.java | 2 +- 10 files changed, 208 insertions(+), 29 deletions(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWritableIf.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriterIf.java diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 3bada914..b4b3f96e 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -6,9 +6,9 @@ import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; -import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSourceFactory; +import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.storage.FileStorageService; @@ -109,7 +109,7 @@ public class ConverterMain { taskHeartbeat.progress(sideloadSource.domainName(), i++, sideloadSources.size()); - 
writer.write(sideloadSource); + writer.writeSideloadSource(sideloadSource); } taskHeartbeat.progress("Finished", i, sideloadSources.size()); @@ -139,8 +139,8 @@ public class ConverterMain { { pool.submit(() -> { try { - ProcessedDomain processed = processor.process(domain); - converterWriter.accept(processed); + ConverterBatchWritableIf writable = processor.createWritable(domain); + converterWriter.accept(writable); } catch (Exception ex) { logger.info("Error in processing", ex); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 3e954637..2146f52b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -1,15 +1,18 @@ package nu.marginalia.converting.model; import lombok.ToString; +import nu.marginalia.converting.writer.ConverterBatchWritableIf; +import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import org.jetbrains.annotations.Nullable; +import java.io.IOException; import java.util.List; import java.util.Optional; @ToString -public class ProcessedDomain { +public class ProcessedDomain implements ConverterBatchWritableIf { public EdgeDomain domain; public List documents; @@ -26,4 +29,17 @@ public class ProcessedDomain { public int size() { return Optional.ofNullable(documents).map(List::size).orElse(1); } + + @Override + public void write(ConverterBatchWriter writer) throws IOException { + writer.writeDomainData(this); + } + + @Override + public String id() { + return domain.toString(); + } + + @Override + public void close() {} } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e8b89e94..6d46a85f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -8,6 +8,9 @@ import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.writer.ConverterBatchWritableIf; +import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; import nu.marginalia.geoip.GeoIpDictionary; @@ -17,11 +20,15 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.sql.SQLException; import java.util.*; import java.util.regex.Pattern; @@ -33,6 +40,11 @@ public class DomainProcessor { private final AnchorTextKeywords anchorTextKeywords; private final GeoIpDictionary geoIpDictionary; + + // The threshold for running a cheaper sideloading-style process + // (10 MB is ~ 99.5%th percentile of domain data sizes) + private static final long DOMAIN_SIDELOAD_THRESHOLD = 
10_000_000L; + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject @@ -51,9 +63,130 @@ public class DomainProcessor { geoIpDictionary.waitReady(); } + public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) throws IOException { + Path filePath = domain.path(); + + if (filePath != null && Files.size(filePath) > DOMAIN_SIDELOAD_THRESHOLD) { + // If the file is too big, we run a processing mode that doesn't + // require loading the entire dataset into RAM + return sideloadProcessing(domain); + } + + return fullProcessing(domain); + } + + public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) { + try { + return new SideloadProcessing(dataStream); + } + catch (Exception ex) { + logger.warn("Failed to process domain sideload", ex); + return null; + } + + } + + class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { + private final SerializableCrawlDataStream dataStream; + private final ProcessedDomain domain; + private final DocumentDecorator documentDecorator; + private final Set processedUrls = new HashSet<>(); + private final DomainLinks externalDomainLinks; + private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); + + SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { + this.dataStream = dataStream; + + if (!dataStream.hasNext()) { + throw new IllegalStateException("No data in stream"); + } + if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) { + throw new IllegalStateException("First record must be a domain"); + } + + domain = new ProcessedDomain(); + externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); + documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); + + processDomain(crawledDomain, domain, documentDecorator); + } + + @Override + public ProcessedDomain getDomain() { + return domain; + } + + @Override + public Iterator 
getDocumentsStream() { + return new DocumentsIterator(); + } + + class DocumentsIterator implements Iterator { + ProcessedDocument next = null; + @Override + public boolean hasNext() { + try { + while (next != null + && dataStream.hasNext() + && dataStream.next() instanceof CrawledDocument doc) + { + if (doc.url == null || !processedUrls.add(doc.url)) + continue; + + var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator); + + deduplicator.markIfDuplicate(processedDoc); + next = processedDoc; + + if (processedDoc.isProcessedFully()) { + // This is a bit sketchy, but we need to set the size and topology to something + processedDoc.details.metadata = processedDoc.details.metadata.withSizeAndTopology( + 10_000, externalDomainLinks.countForUrl(processedDoc.url)); + } + + return true; + } + } + catch (IOException ex) { + logger.warn("Failed to process domain sideload", ex); + } + + return false; + } + + @Override + public ProcessedDocument next() { + try { + if (next == null && !hasNext()) + throw new NoSuchElementException(); + return next; + } finally { + next = null; + } + } + } + + @Override + public void write(ConverterBatchWriter writer) throws IOException { + writer.writeSideloadSource(this); + } + + @Override + public String id() { + return domain.domain.toString(); + } + + @Override + public void close() throws Exception { + dataStream.close(); + deduplicator.close(); + } + } + + @SneakyThrows @Nullable - public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) { if (!dataStream.hasNext()) { return null; } @@ -83,8 +216,7 @@ public class DomainProcessor { if (data instanceof CrawledDomain crawledDomain) { documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); - ret = processDomain(crawledDomain, ret, documentDecorator); - + processDomain(crawledDomain, ret, documentDecorator); ret.documents = docs; } else 
if (data instanceof CrawledDocument doc) { @@ -112,25 +244,23 @@ public class DomainProcessor { return ret; } - private ProcessedDomain processDomain(CrawledDomain crawledDomain, - ProcessedDomain ret, + private void processDomain(CrawledDomain crawledDomain, + ProcessedDomain domain, DocumentDecorator decorator) { - ret.domain = new EdgeDomain(crawledDomain.domain); - ret.ip = crawledDomain.ip; + domain.domain = new EdgeDomain(crawledDomain.domain); + domain.ip = crawledDomain.ip; addIpInfo(decorator, crawledDomain.ip); - if (isAcademicDomain(ret.domain)) { + if (isAcademicDomain(domain.domain)) { decorator.addTerm("special:academia"); } if (crawledDomain.redirectDomain != null) { - ret.redirect = new EdgeDomain(crawledDomain.redirectDomain); + domain.redirect = new EdgeDomain(crawledDomain.redirectDomain); } - ret.state = getState(crawledDomain.crawlerStatus); - - return ret; + domain.state = getState(crawledDomain.crawlerStatus); } @@ -232,4 +362,5 @@ public class DomainProcessor { }; } + } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWritableIf.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWritableIf.java new file mode 100644 index 00000000..c3b4ae65 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWritableIf.java @@ -0,0 +1,9 @@ +package nu.marginalia.converting.writer; + +import java.io.IOException; + +public interface ConverterBatchWritableIf { + void write(ConverterBatchWriter writer) throws IOException; + String id(); + void close() throws Exception; +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 239d748c..73333320 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -27,7 +27,7 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.Future; /** Writer for a single batch of converter parquet files */ -public class ConverterBatchWriter implements AutoCloseable { +public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf { private final DomainRecordParquetFileWriter domainWriter; private final DomainLinkRecordParquetFileWriter domainLinkWriter; private final DocumentRecordParquetFileWriter documentWriter; @@ -46,7 +46,13 @@ public class ConverterBatchWriter implements AutoCloseable { ); } - public void write(SideloadSource sideloadSource) throws IOException { + @Override + public void write(ConverterBatchWritableIf writable) throws IOException { + writable.write(this); + } + + @Override + public void writeSideloadSource(SideloadSource sideloadSource) throws IOException { var domain = sideloadSource.getDomain(); writeDomainData(domain); @@ -54,7 +60,8 @@ public class ConverterBatchWriter implements AutoCloseable { writeDocumentData(domain.domain, sideloadSource.getDocumentsStream()); } - public void write(ProcessedDomain domain) { + @Override + public void writeProcessedDomain(ProcessedDomain domain) { var results = ForkJoinPool.commonPool().invokeAll( writeTasks(domain) ); @@ -180,7 +187,7 @@ public class ConverterBatchWriter implements AutoCloseable { return this; } - private Object writeDomainData(ProcessedDomain domain) throws IOException { + public Object writeDomainData(ProcessedDomain domain) throws IOException { DomainMetadata metadata = DomainMetadata.from(domain); List feeds = getFeedUrls(domain); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriterIf.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriterIf.java new file mode 100644 index 00000000..eb6e14f4 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterBatchWriterIf.java @@ -0,0 +1,15 @@ +package nu.marginalia.converting.writer; + +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; + +import java.io.IOException; + +public interface ConverterBatchWriterIf { + + void write(ConverterBatchWritableIf writable) throws IOException; + + void writeSideloadSource(SideloadSource sideloadSource) throws IOException; + + void writeProcessedDomain(ProcessedDomain domain); +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 6cb4f332..6bac2804 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -24,7 +24,7 @@ public class ConverterWriter implements AutoCloseable { private final Duration switchInterval = Duration.of(10, ChronoUnit.MINUTES); - private final ArrayBlockingQueue domainData + private final ArrayBlockingQueue domainData = new ArrayBlockingQueue<>(1); private final Thread workerThread; @@ -42,7 +42,7 @@ public class ConverterWriter implements AutoCloseable { } @SneakyThrows - public void accept(@Nullable ProcessedDomain domain) { + public void accept(@Nullable ConverterBatchWritableIf domain) { if (null == domain) return; @@ -66,10 +66,11 @@ public class ConverterWriter implements AutoCloseable { if (data == null) continue; - String id = data.domain.toString(); + String id = data.id(); if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) { 
logger.warn("Skipping already logged item {}", id); + data.close(); continue; } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 09973f1b..c22f2c66 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -43,7 +43,7 @@ public class ConvertingIntegrationTest { var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1", docs, Collections.emptyList()); - var ret = domainProcessor.process(asSerializableCrawlData(domain)); + var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain)); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -51,7 +51,7 @@ public class ConvertingIntegrationTest { } @Test public void testMemexMarginaliaNuDateInternalConsistency() throws IOException { - var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> { int year = PubDate.fromYearByte(doc.details.metadata.year()); Integer yearMeta = doc.details.pubYear; @@ -64,7 +64,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { - var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); diff --git 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 535eac31..3e6bc5eb 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -251,7 +251,7 @@ public class CrawlingThenConvertingIntegrationTest { private ProcessedDomain process() { try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { - return domainProcessor.process(stream); + return domainProcessor.fullProcessing(stream); } catch (Exception e) { Assertions.fail(e); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java index 98c11e7f..0afb290f 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java @@ -22,7 +22,7 @@ public class SiteStatisticsExperiment extends Experiment { @Override public boolean process(SerializableCrawlDataStream stream) { - var ret = domainProcessor.process(stream); + var ret = domainProcessor.fullProcessing(stream); ret.documents.stream() .filter(ProcessedDocument::isProcessedFully) From b37223c05308bd086ec6017d5c150d5247ebc000 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 27 Dec 2023 18:33:16 +0100 Subject: [PATCH 03/61] (converter) Basic test coverage for sideloading-style processing --- .../converting/processor/DomainProcessor.java | 18 +++++----- .../converting/ConvertingIntegrationTest.java | 35 ++++++++++++++++++- 2 files changed, 43 
insertions(+), 10 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 6d46a85f..290702c1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -75,7 +75,7 @@ public class DomainProcessor { return fullProcessing(domain); } - public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) { + public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) { try { return new SideloadProcessing(dataStream); } @@ -86,7 +86,7 @@ public class DomainProcessor { } - class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { + public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { private final SerializableCrawlDataStream dataStream; private final ProcessedDomain domain; private final DocumentDecorator documentDecorator; @@ -97,10 +97,9 @@ public class DomainProcessor { SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { this.dataStream = dataStream; - if (!dataStream.hasNext()) { - throw new IllegalStateException("No data in stream"); - } - if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) { + if (!dataStream.hasNext() + || !(dataStream.next() instanceof CrawledDomain crawledDomain)) + { throw new IllegalStateException("First record must be a domain"); } @@ -126,10 +125,11 @@ public class DomainProcessor { @Override public boolean hasNext() { try { - while (next != null - && dataStream.hasNext() - && dataStream.next() instanceof CrawledDocument doc) + while (next == null + && dataStream.hasNext()) { + if (!(dataStream.next() instanceof CrawledDocument doc)) + continue; if 
(doc.url == null || !processedUrls.add(doc.url)) continue; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index c22f2c66..e253ecb6 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -63,7 +63,7 @@ public class ConvertingIntegrationTest { } @Test - public void testMemexMarginaliaNu() throws IOException { + public void testMemexMarginaliaNuFullProcessing() throws IOException { var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); @@ -94,6 +94,39 @@ public class ConvertingIntegrationTest { } } + @Test + public void testMemexMarginaliaNuSideloadProcessing() throws IOException { + var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); + assertEquals("memex.marginalia.nu", ret.id()); + + var domain = ret.getDomain(); + assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu")); + + List docsAll = new ArrayList<>(); + Map resultsByStatusCount = new HashMap<>(); + ret.getDocumentsStream().forEachRemaining(docsAll::add); + assertTrue(docsAll.size() > 25); + + docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum)); + + assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25); + + for (var doc : docsAll) { + + if (!doc.isProcessedFully()) { + continue; + } + + var details = doc.details; + + assertTrue(details.title.length() > 4); + assertTrue(details.description.length() > 4); + assertEquals(HtmlStandard.HTML5, details.standard); + + } + } + private CrawledDomain readMarginaliaWorkingSet() throws IOException { 
String index = readClassPathFile("memex-marginalia/index"); String[] files = index.split("\n"); From 7428ba2dd713cb0b90f60065a72eec54594a1a8e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 27 Dec 2023 19:29:26 +0100 Subject: [PATCH 04/61] (converter) Basic test coverage for sideloading-style processing --- .../nu/marginalia/converting/ConvertingIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index e253ecb6..141777d6 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -120,10 +120,10 @@ public class ConvertingIntegrationTest { var details = doc.details; + assertTrue(details.metadata.size() > 0); assertTrue(details.title.length() > 4); assertTrue(details.description.length() > 4); assertEquals(HtmlStandard.HTML5, details.standard); - } } From c847d83011de0a4bcbbcc2df85f7aaae90d37f59 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Dec 2023 19:14:16 +0100 Subject: [PATCH 05/61] (converter) Add size hint to converter sideload processing --- .../java/nu/marginalia/converting/processor/DomainProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 290702c1..72797ee1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -104,6 +104,7 @@ 
public class DomainProcessor { } domain = new ProcessedDomain(); + domain.sizeloadSizeAdvice = 10_000; externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); From bcecc93e399fbbe219f390bb9a2b9d13afdf1351 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Dec 2023 19:45:35 +0100 Subject: [PATCH 06/61] (converter) Swallow errors in parquet data stream --- .../io/format/ParquetSerializableCrawlDataStream.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 71159526..01457bd3 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -46,7 +46,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial createDomainRecord(nextRecord); wroteDomainRecord = true; } - createDocumentRecord(nextRecord); + + try { + createDocumentRecord(nextRecord); + } + catch (Exception ex) { + logger.error("Failed to create document record", ex); + } } return !nextQ.isEmpty(); } From c4885998796e58237b2bb29172a46977378db785 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Dec 2023 19:52:26 +0100 Subject: [PATCH 07/61] (converter) Fix NPE in converter --- .../converting/processor/DocumentDecorator.java | 6 ++---- .../converting/processor/DocumentProcessor.java | 10 ++++++---- .../converting/processor/DomainProcessor.java | 7 ++++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java index d3002df2..02e22f4f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentDecorator.java @@ -10,18 +10,16 @@ import java.util.Set; public class DocumentDecorator { private final Set extraSearchTerms = new HashSet<>(); private final AnchorTextKeywords keywords; - private final DomainLinks externalDomainLinks; - public DocumentDecorator(AnchorTextKeywords keywords, DomainLinks externalDomainLinks) { + public DocumentDecorator(AnchorTextKeywords keywords) { this.keywords = keywords; - this.externalDomainLinks = externalDomainLinks; } public void addTerm(String term) { extraSearchTerms.add(term); } - public void apply(ProcessedDocument doc) { + public void apply(ProcessedDocument doc, DomainLinks externalDomainLinks) { if (doc == null) return; if (doc.words == null) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index d10da715..a9043e33 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -39,7 +39,9 @@ public class DocumentProcessor { processorPlugins.add(plainTextDocumentProcessorPlugin); } - public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) { + public ProcessedDocument process(CrawledDocument crawledDocument, + DomainLinks externalDomainLinks, + 
DocumentDecorator documentDecorator) { ProcessedDocument ret = new ProcessedDocument(); try { @@ -52,7 +54,7 @@ public class DocumentProcessor { default -> DocumentClass.EXTERNALLY_LINKED_MULTI; }; - processDocument(crawledDocument, documentClass, documentDecorator, ret); + processDocument(crawledDocument, documentClass, documentDecorator, externalDomainLinks, ret); } catch (DisqualifiedException ex) { ret.state = UrlIndexingState.DISQUALIFIED; @@ -68,7 +70,7 @@ public class DocumentProcessor { return ret; } - private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -92,7 +94,7 @@ public class DocumentProcessor { ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); - documentDecorator.apply(ret); + documentDecorator.apply(ret, externalDomainLinks); if (Boolean.TRUE.equals(crawledDocument.hasCookies) && ret.details != null diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 72797ee1..8aff30eb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -105,10 +105,11 @@ public class DomainProcessor { domain = new ProcessedDomain(); domain.sizeloadSizeAdvice = 10_000; - externalDomainLinks = 
anchorTagsSource.getAnchorTags(domain.domain); - documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); + documentDecorator = new DocumentDecorator(anchorTextKeywords); processDomain(crawledDomain, domain, documentDecorator); + + externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); } @Override @@ -215,7 +216,7 @@ public class DomainProcessor { } if (data instanceof CrawledDomain crawledDomain) { - documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); + documentDecorator = new DocumentDecorator(anchorTextKeywords); processDomain(crawledDomain, ret, documentDecorator); ret.documents = docs; From 407915a86e45e9a8f84ffd62f8c781d156d931fb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Dec 2023 19:52:26 +0100 Subject: [PATCH 08/61] (converter) Fix NPEs in converter due to the new data format --- .../main/java/nu/marginalia/contenttype/ContentType.java | 3 +++ .../nu/marginalia/contenttype/DocumentBodyToString.java | 6 +++++- .../io/format/ParquetSerializableCrawlDataStream.java | 5 ++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java index 095497c8..6a8c25c8 100644 --- a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -8,6 +8,9 @@ import org.apache.commons.lang3.StringUtils; */ public record ContentType(String contentType, String charset) { public static ContentType parse(String contentTypeHeader) { + if (contentTypeHeader == null || contentTypeHeader.isBlank()) + return new ContentType(null, null); + String[] parts = StringUtils.split(contentTypeHeader, ";", 2); String contentType = parts[0].trim(); String charset = parts.length > 1 ? 
parts[1].trim() : "UTF-8"; diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/DocumentBodyToString.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/DocumentBodyToString.java index d4d6e9b7..7fe604f4 100644 --- a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/DocumentBodyToString.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/DocumentBodyToString.java @@ -8,7 +8,11 @@ public class DocumentBodyToString { public static String getStringData(ContentType type, byte[] data) { Charset charset; try { - charset = Charset.forName(type.charset()); + if (type.charset() == null || type.charset().isBlank()) + charset = StandardCharsets.UTF_8; + else { + charset = Charset.forName(type.charset()); + } } catch (IllegalCharsetNameException ex) { // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe? diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 01457bd3..bdcda9dc 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -100,7 +100,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want return; } - else { + else if (nextRecord.body != null) { try { bodyString = DocumentBodyToString.getStringData( ContentType.parse(nextRecord.contentType), @@ -110,6 +110,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial status = 
CrawlerDocumentStatus.BAD_CHARSET; } } + else { + status = CrawlerDocumentStatus.ERROR; + } nextQ.add(new CrawledDocument("", nextRecord.url, From dec3b1092d332c84780187404ebfced9a21f5397 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 13:58:08 +0100 Subject: [PATCH 09/61] (converter) Fix bugs in conversion This commit adds a safety check that the URL of the document is from the correct domain. It also adds a sizeHint() method to SerializableCrawlDataStream which *may* provide an indication if the stream is very large and benefits from sideload-style processing (which is slow). It furthermore addresses a bug where the ProcessedDomain.write() invoked the wrong method on ConverterBatchWriter and only wrote the domain metadata, not the rest... --- .../io/SerializableCrawlDataStream.java | 4 ++++ .../ParquetSerializableCrawlDataStream.java | 16 +++++++++++++ ...rawledDocumentParquetRecordFileReader.java | 23 +++++++++++++++++++ .../converting/model/ProcessedDomain.java | 2 +- .../processor/DocumentProcessor.java | 8 +++++++ .../converting/processor/DomainProcessor.java | 21 +++++------------ 6 files changed, 58 insertions(+), 16 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index 9598d002..ce01ebce 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -17,6 +17,10 @@ public interface SerializableCrawlDataStream extends AutoCloseable { SerializableCrawlData next() throws IOException; + /** Return a size hint for the stream. 
0 is returned if the hint is not available, + * or if the file seems too small to bother with */ + default int sizeHint() { return 0; } + boolean hasNext() throws IOException; @Nullable diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index bdcda9dc..94fafe29 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -14,6 +14,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.file.Files; import java.nio.file.Path; import java.util.*; @@ -37,6 +38,21 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial return path; } + public int sizeHint() { + // Only calculate size hint for large files + // (the reason we calculate them in the first place is to assess whether it is large + // because it has many documents, or because it is a small number of large documents) + try { + if (Files.size(path) > 10_000_000) { + return CrawledDocumentParquetRecordFileReader.countGoodStatusCodes(path); + } + } catch (IOException e) { + // suppressed + } + + return 0; + } + @Override @SneakyThrows public boolean hasNext() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java index 7e8c7501..31d644ec 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java +++ 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -1,11 +1,13 @@ package nu.marginalia.crawling.parquet; +import blue.strategic.parquet.Hydrator; import blue.strategic.parquet.HydratorSupplier; import blue.strategic.parquet.ParquetReader; import org.jetbrains.annotations.NotNull; import java.io.IOException; import java.nio.file.Path; +import java.util.List; import java.util.stream.Stream; public class CrawledDocumentParquetRecordFileReader { @@ -16,4 +18,25 @@ public class CrawledDocumentParquetRecordFileReader { HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator())); } + /** Count the number of documents with a 200 status code */ + public static int countGoodStatusCodes(Path path) throws IOException { + return (int) ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(new Hydrator() { + @Override + public Integer start() { return 0; } + @Override + public Integer add(Integer target, String heading, Object value) { + if ("statusCode".equals(heading) && Integer.valueOf(200).equals(value)) { + return 1; + } + return 0; + } + @Override + public Integer finish(Integer target) { return target; } + }), + List.of("statusCode")) + .mapToInt(Integer::valueOf) + .count(); + } } + diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java index 2146f52b..b7be3f8b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java @@ -32,7 +32,7 @@ public class ProcessedDomain implements ConverterBatchWritableIf { @Override public void write(ConverterBatchWriter writer) throws IOException { - writer.writeDomainData(this); + writer.writeProcessedDomain(this); } 
@Override diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index a9043e33..96392920 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; @@ -40,6 +41,7 @@ public class DocumentProcessor { } public ProcessedDocument process(CrawledDocument crawledDocument, + EdgeDomain domain, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) { ProcessedDocument ret = new ProcessedDocument(); @@ -48,6 +50,12 @@ public class DocumentProcessor { // We must always provide the URL, even if we don't process the document ret.url = getDocumentUrl(crawledDocument); + if (!Objects.equals(ret.url.domain, domain)) { + ret.state = UrlIndexingState.DISQUALIFIED; + ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString(); + return ret; + } + DocumentClass documentClass = switch (externalDomainLinks.countForUrl(ret.url)) { case 0 -> DocumentClass.NORMAL; case 1 -> DocumentClass.EXTERNALLY_LINKED_ONCE; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 8aff30eb..391be0df 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -20,15 +20,12 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; -import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.sql.SQLException; import java.util.*; import java.util.regex.Pattern; @@ -40,11 +37,6 @@ public class DomainProcessor { private final AnchorTextKeywords anchorTextKeywords; private final GeoIpDictionary geoIpDictionary; - - // The threshold for running a cheaper sideloading-style process - // (10 MB is ~ 99.5%th percentile of domain data sizes) - private static final long DOMAIN_SIDELOAD_THRESHOLD = 10_000_000L; - private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject @@ -63,12 +55,11 @@ public class DomainProcessor { geoIpDictionary.waitReady(); } - public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) throws IOException { - Path filePath = domain.path(); - - if (filePath != null && Files.size(filePath) > DOMAIN_SIDELOAD_THRESHOLD) { + public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) { + if (domain.sizeHint() > 10_000) { // If the file is too big, we run a processing mode that doesn't // require loading the entire dataset into RAM + logger.info("Sideloading {}", domain.path()); return sideloadProcessing(domain); } @@ -100,7 +91,7 @@ public class DomainProcessor { if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain 
crawledDomain)) { - throw new IllegalStateException("First record must be a domain"); + throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); } domain = new ProcessedDomain(); @@ -135,7 +126,7 @@ public class DomainProcessor { if (doc.url == null || !processedUrls.add(doc.url)) continue; - var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator); + var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator); deduplicator.markIfDuplicate(processedDoc); next = processedDoc; @@ -226,7 +217,7 @@ public class DomainProcessor { if (doc.url == null || !processedUrls.add(doc.url)) continue; - var processedDoc = documentProcessor.process(doc, externalDomainLinks, documentDecorator); + var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator); deduplicator.markIfDuplicate(processedDoc); From e7dd28b926d6886dc7e2a177fdae3d38098263d0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 14:25:48 +0100 Subject: [PATCH 10/61] (converter) Optimize sideload-loading Use ProcessingIterator to fan out processing of documents across more cores, instead of doing all of it in the writer thread blocking everything else with slow single-threaded processing. 
--- .../converting/processor/DomainProcessor.java | 50 ++++++------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 391be0df..f108321a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -20,6 +20,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; @@ -110,26 +111,21 @@ public class DomainProcessor { @Override public Iterator getDocumentsStream() { - return new DocumentsIterator(); - } + return new ProcessingIterator<>(24, 16, (taskConsumer) -> { + while (dataStream.hasNext()) + { + if (!(dataStream.next() instanceof CrawledDocument doc)) + continue; + if (doc.url == null || !processedUrls.add(doc.url)) + continue; - class DocumentsIterator implements Iterator { - ProcessedDocument next = null; - @Override - public boolean hasNext() { - try { - while (next == null - && dataStream.hasNext()) - { - if (!(dataStream.next() instanceof CrawledDocument doc)) - continue; - if (doc.url == null || !processedUrls.add(doc.url)) - continue; + taskConsumer.accept(() -> { var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator); - deduplicator.markIfDuplicate(processedDoc); - next = processedDoc; + synchronized (deduplicator) { + deduplicator.markIfDuplicate(processedDoc); + } if 
(processedDoc.isProcessedFully()) { // This is a bit sketchy, but we need to set the size and topology to something @@ -137,26 +133,10 @@ public class DomainProcessor { 10_000, externalDomainLinks.countForUrl(processedDoc.url)); } - return true; - } + return processedDoc; + }); } - catch (IOException ex) { - logger.warn("Failed to process domain sideload", ex); - } - - return false; - } - - @Override - public ProcessedDocument next() { - try { - if (next == null && !hasNext()) - throw new NoSuchElementException(); - return next; - } finally { - next = null; - } - } + }); } @Override From 647d38007f6f10878942f3ea3f61c37eba976a97 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 14:27:58 +0100 Subject: [PATCH 11/61] Reduce queue polling time in ProcessingIterator Updated ProcessingIterator's queue polling from one second to 50 milliseconds for improved performance. This facilitates faster document processing across more cores, reducing bottlenecks and slow single-threaded processing. 
--- .../src/main/java/nu/marginalia/util/ProcessingIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java index 15dbc087..edcc038a 100644 --- a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -75,7 +75,7 @@ public class ProcessingIterator implements Iterator { return true; do { - next = queue.poll(1, TimeUnit.SECONDS); + next = queue.poll(50, TimeUnit.MILLISECONDS); if (next != null) { return true; } From a1f3ccdd6d2283c5510aa11ded39d677803a1066 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 14:59:39 +0100 Subject: [PATCH 12/61] Fix bug in ProcessingIterator where it would run the tasks in only one single thread instead of using the pool --- .../src/main/java/nu/marginalia/util/ProcessingIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java index edcc038a..08459b76 100644 --- a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -38,7 +38,7 @@ public class ProcessingIterator implements Iterator { private void executeJob(ProcessingJob job) { try { - job.run(this::executeTask); + job.run(j -> executorService.submit(() -> executeTask(j))); } catch (Exception e) { logger.warn("Exception while processing", e); } finally { From ba8a75c84bf1d182b136e74106792f09e9e5df26 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 15:10:32 
+0100 Subject: [PATCH 13/61] Fix bug in ProcessingIterator where it would run the tasks in only one single thread instead of using the pool --- .../nu/marginalia/util/ProcessingIterator.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java index 08459b76..52c93bb4 100644 --- a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -38,7 +38,7 @@ public class ProcessingIterator implements Iterator { private void executeJob(ProcessingJob job) { try { - job.run(j -> executorService.submit(() -> executeTask(j))); + job.run(this::executeTask); } catch (Exception e) { logger.warn("Exception while processing", e); } finally { @@ -53,13 +53,15 @@ public class ProcessingIterator implements Iterator { return; } - try { - queue.put(task.get()); - } catch (Exception e) { - logger.warn("Exception while processing", e); - } finally { - sem.release(); - } + executorService.submit(() -> { + try { + queue.put(task.get()); + } catch (Exception e) { + logger.warn("Exception while processing", e); + } finally { + sem.release(); + } + }); } /** Returns true if there are more documents to be processed. From 68ac8d3e0954fa672dc23848f56fe039983c36ed Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 16:37:00 +0100 Subject: [PATCH 14/61] (search) Fetch fewer linking and similar domains. Showing a total of 200 connected domains is not very informative. 
--- .../java/nu/marginalia/search/svc/SearchSiteInfoService.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index a6e9e381..28c5740d 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -145,10 +145,10 @@ public class SearchSiteInfoService { else { domainInfo = assistantClient.domainInformation(ctx, domainId).blockingFirst(); similarSet = assistantClient - .similarDomains(ctx, domainId, 100) + .similarDomains(ctx, domainId, 25) .blockingFirst(); linkingDomains = assistantClient - .linkedDomains(ctx, domainId, 100) + .linkedDomains(ctx, domainId, 25) .blockingFirst(); } From 0b112cb4d4f0fe621553ce3c986682e6d1dacff4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Dec 2023 19:41:37 +0100 Subject: [PATCH 15/61] (warc) Update URL encoding in WarcProtocolReconstructor The URI query string is now URL encoded in the WarcProtocolReconstructor. This change ensures proper encoding of special characters as per the standard URL encoding rules and improves URL validity during the crawling process. 
--- .../crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index ad29056f..40d98d73 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -26,7 +26,7 @@ public class WarcProtocolReconstructor { requestStringBuilder.append(request.method()).append(" ").append(encodedURL); if (uri.getQuery() != null) { - requestStringBuilder.append("?").append(uri.getQuery()); + requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8)); } requestStringBuilder.append(" HTTP/1.1\r\n"); requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n"); From 7ba296ccdfea1ba6f57358ba3d94ec96101b99c0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 30 Dec 2023 13:05:10 +0100 Subject: [PATCH 16/61] (converter) Route sizeHint to SideloadProcessing Route the sizeHint from the input parquet file to SideloadProcessing, so that it can set sideloadSizeAdvice appropriately, instead of using a fixed "large" number. This is necessary to populate the KNOWN_URL column in the domain data table, which is important as it is used in e.g. calculating how far to re-crawl the site in the future. 
--- .../converting/processor/DomainProcessor.java | 18 +++++++++--------- .../converting/ConvertingIntegrationTest.java | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f108321a..e97aa057 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -57,19 +57,20 @@ public class DomainProcessor { } public ConverterBatchWritableIf createWritable(SerializableCrawlDataStream domain) { - if (domain.sizeHint() > 10_000) { + final int sizeHint = domain.sizeHint(); + + if (sizeHint > 10_000) { // If the file is too big, we run a processing mode that doesn't // require loading the entire dataset into RAM - logger.info("Sideloading {}", domain.path()); - return sideloadProcessing(domain); + return sideloadProcessing(domain, sizeHint); } return fullProcessing(domain); } - public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) { + public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) { try { - return new SideloadProcessing(dataStream); + return new SideloadProcessing(dataStream, sizeHint); } catch (Exception ex) { logger.warn("Failed to process domain sideload", ex); @@ -86,17 +87,16 @@ public class DomainProcessor { private final DomainLinks externalDomainLinks; private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); - SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException { + SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { this.dataStream = dataStream; - if (!dataStream.hasNext() - || !(dataStream.next() 
instanceof CrawledDomain crawledDomain)) + if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain)) { throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); } domain = new ProcessedDomain(); - domain.sizeloadSizeAdvice = 10_000; + domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; documentDecorator = new DocumentDecorator(anchorTextKeywords); processDomain(crawledDomain, domain, documentDecorator); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index 141777d6..61de3c38 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -96,7 +96,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNuSideloadProcessing() throws IOException { - var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet())); + var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100); assertNotNull(ret); assertEquals("memex.marginalia.nu", ret.id()); From 70c83b60a18fa72f0306df8095b02855b838edc7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 30 Dec 2023 13:36:18 +0100 Subject: [PATCH 17/61] (converter) Clean up fullProcessing() This function made some very flimsy-looking assumptions about the order of an iterable. These are still made, but more explicitly so. 
--- .../converting/processor/DomainProcessor.java | 62 ++++++++----------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e97aa057..a25384a3 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -99,6 +99,7 @@ public class DomainProcessor { domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; documentDecorator = new DocumentDecorator(anchorTextKeywords); + processDomain(crawledDomain, domain, documentDecorator); externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); @@ -164,50 +165,41 @@ public class DomainProcessor { return null; } - ProcessedDomain ret = new ProcessedDomain(); List docs = new ArrayList<>(); Set processedUrls = new HashSet<>(); - DomainLinks externalDomainLinks = null; + if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) { + throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); + } - DocumentDecorator documentDecorator = null; + DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain()); + DocumentDecorator documentDecorator = new DocumentDecorator(anchorTextKeywords); - try (var deduplicator = new LshDocumentDeduplicator()){ + // Process Domain Record + + ProcessedDomain ret = new ProcessedDomain(); + processDomain(crawledDomain, ret, documentDecorator); + ret.documents = docs; + + // Process Documents + + try (var deduplicator = new LshDocumentDeduplicator()) { while (dataStream.hasNext()) { - var data = dataStream.next(); + if (!(dataStream.next() instanceof CrawledDocument doc)) + continue; + if (doc.url == 
null) + continue; + if (!processedUrls.add(doc.url)) + continue; - // Do a lazy load of the external domain links since we don't know the domain - // until we see the first document - if (externalDomainLinks == null) { - var domain = data.getDomain(); - - if (domain != null) { - externalDomainLinks = anchorTagsSource.getAnchorTags(domain); - } - } - - if (data instanceof CrawledDomain crawledDomain) { - documentDecorator = new DocumentDecorator(anchorTextKeywords); - - processDomain(crawledDomain, ret, documentDecorator); - ret.documents = docs; - - } else if (data instanceof CrawledDocument doc) { - try { - if (doc.url == null || !processedUrls.add(doc.url)) - continue; - - var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator); - - deduplicator.markIfDuplicate(processedDoc); - - docs.add(processedDoc); - } catch (Exception ex) { - logger.warn("Failed to process " + doc.url, ex); - } + try { + var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator); + deduplicator.markIfDuplicate(processedDoc); + docs.add(processedDoc); + } catch (Exception ex) { + logger.warn("Failed to process " + doc.url, ex); } } - } // Add late keywords and features from domain-level information From 7a1d20ed0a17532ef9f91ad8c62ea8b9db5364d6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 30 Dec 2023 13:53:55 +0100 Subject: [PATCH 18/61] (converter) Better use of ProcessingIterator Modify ProcessingIterator to be constructed via a factory, to enable re-use of its backing executor service. This reduces thread churn in the converter sideloader style processing of regular crawl data.
--- .../marginalia/util/ProcessingIterator.java | 39 +++++++++++++++---- .../util/ProcessingIteratorTest.java | 3 +- .../converting/processor/DomainProcessor.java | 3 +- .../EncyclopediaMarginaliaNuSideloader.java | 3 +- 4 files changed, 38 insertions(+), 10 deletions(-) rename code/{processes/converting-process => libraries/blocking-thread-pool}/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java (89%) diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java index 52c93bb4..523381fa 100644 --- a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -26,16 +26,20 @@ public class ProcessingIterator implements Iterator { private final int parallelism; - public ProcessingIterator(int queueSize, int parallelism, ProcessingJob task) { + ProcessingIterator(ExecutorService executorService, int queueSize, int parallelism, ProcessingJob task) { this.parallelism = parallelism; queue = new LinkedBlockingQueue<>(queueSize); - executorService = Executors.newFixedThreadPool(parallelism); + this.executorService = executorService; sem = new Semaphore(parallelism); executorService.submit(() -> executeJob(task)); } + public static Factory factory(int queueSize, int parallelism) { + return new Factory(queueSize, parallelism); + } + private void executeJob(ProcessingJob job) { try { job.run(this::executeTask); @@ -83,10 +87,6 @@ public class ProcessingIterator implements Iterator { } } while (expectMore()); - if (!executorService.isShutdown()) { - executorService.shutdown(); - } - return false; } @@ -128,14 +128,39 @@ public class ProcessingIterator implements Iterator { * performed in parallel */ public interface ProcessingJob { + void run(Consumer> output) throws Exception; } - /** * A single 
task that produces a result to be iterable via the Iterator interface * (along with other tasks' outputs) */ public interface Task { + T get() throws Exception; } + + public static class Factory { + private final int queueSize; + private final int parallelism; + private final ExecutorService executorService; + + Factory(int queueSize, int parallelism) { + this.queueSize = queueSize; + this.parallelism = parallelism; + this.executorService = Executors.newFixedThreadPool(parallelism); + } + + public ProcessingIterator create(ProcessingJob task) { + return new ProcessingIterator<>(executorService, queueSize, parallelism, task); + } + + public void stop() { + if (!executorService.isShutdown()) { + executorService.shutdown(); + } + } + } + } + diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java b/code/libraries/blocking-thread-pool/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java similarity index 89% rename from code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java rename to code/libraries/blocking-thread-pool/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java index d20b7ddf..dd0f8d14 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java +++ b/code/libraries/blocking-thread-pool/src/test/java/nu/marginalia/util/ProcessingIteratorTest.java @@ -3,6 +3,7 @@ package nu.marginalia.util; import org.junit.jupiter.api.Test; import java.util.HashSet; +import java.util.Iterator; import java.util.Set; import java.util.concurrent.TimeUnit; @@ -14,7 +15,7 @@ class ProcessingIteratorTest { @Test public void test() { Set output = new HashSet<>(); - var iter = new ProcessingIterator(2, 2, q -> { + Iterator iter = ProcessingIterator.factory(2, 2).create(q -> { for (int i = 0; i < 10_000; i++) { int j = i; q.accept(() -> task(j)); diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index a25384a3..630f97f7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -86,6 +86,7 @@ public class DomainProcessor { private final Set processedUrls = new HashSet<>(); private final DomainLinks externalDomainLinks; private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); + private static ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(24, 16); SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { this.dataStream = dataStream; @@ -112,7 +113,7 @@ public class DomainProcessor { @Override public Iterator getDocumentsStream() { - return new ProcessingIterator<>(24, 16, (taskConsumer) -> { + return iteratorFactory.create((taskConsumer) -> { while (dataStream.hasNext()) { if (!(dataStream.next() instanceof CrawledDocument doc)) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index f0686b4c..3220703a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -76,7 +76,8 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC @SneakyThrows @Override public Iterator getDocumentsStream() { - return new 
ProcessingIterator<>(24, 16, (taskConsumer) -> { + // This leaks a thread pool, but it doesn't matter since this is a one-off process + return ProcessingIterator.factory(24, 16).create((taskConsumer) -> { DomainLinks domainLinks = getDomainLinks(); var stmt = connection.prepareStatement(""" From 0fe44c9bf2e064782f9de55a2f9065a98d5437d1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 30 Dec 2023 13:56:44 +0100 Subject: [PATCH 19/61] (crawler) Fix broken test A necessary step was accidentally deleted when cleaning up these tests previously. --- .../nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index d5cf0c9c..3e8eb775 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -272,6 +272,7 @@ class CrawlerRetreiverTest { tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz"); doCrawl(tempFileWarc1, specs); + convertToParquet(tempFileWarc1, tempFileParquet1); doCrawlWithReferenceStream(specs, CrawledDomainReader.createDataStream(tempFileParquet1) ); From 75d87c73d1accfc0477eba66581dd51445110270 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 30 Dec 2023 13:56:44 +0100 Subject: [PATCH 20/61] (crawler) Disable Java's infinite DNS caching --- .../src/main/java/nu/marginalia/crawl/CrawlerMain.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index edbf219f..4b97200b 100644 --- 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -41,6 +41,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.security.Security; import java.sql.SQLException; import java.util.*; import java.util.concurrent.*; @@ -100,11 +101,15 @@ public class CrawlerMain { } public static void main(String... args) throws Exception { + if (!AbortMonitor.getInstance().isAlive()) { System.err.println("Remove abort file first"); return; } + // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result) + Security.setProperty("networkaddress.cache.ttl" , "3600"); + // This must run *early* System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); From 7f3f3f577c855d48ea60b1f759c3acabce87065a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 15:20:57 +0100 Subject: [PATCH 21/61] (backup) Add task heartbeats to the backup service --- .../java/nu/marginalia/svc/BackupService.java | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java index b84d2bec..e78c5e2f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java @@ -3,6 +3,7 @@ package nu.marginalia.svc; import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdOutputStream; import nu.marginalia.IndexLocations; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageBaseType; import 
nu.marginalia.storage.model.FileStorageId; @@ -21,10 +22,19 @@ import java.util.List; public class BackupService { private final FileStorageService storageService; + private final ServiceHeartbeat serviceHeartbeat; + + public enum BackupHeartbeatSteps { + LINKS, + JOURNAL, + DONE + } @Inject - public BackupService(FileStorageService storageService) { + public BackupService(FileStorageService storageService, + ServiceHeartbeat serviceHeartbeat) { this.storageService = storageService; + this.serviceHeartbeat = serviceHeartbeat; } /** Create a new backup of the contents in the _STAGING storage areas. @@ -42,13 +52,22 @@ public class BackupService { storageService.relateFileStorages(associatedId, backupStorage.id()); } - var indexStagingStorage = IndexLocations.getIndexConstructionArea(storageService); var linkdbStagingStorage = IndexLocations.getLinkdbWritePath(storageService); - backupFileCompressed("links.db", linkdbStagingStorage, backupStorage.asPath()); - // This file format is already compressed - backupJournal(indexStagingStorage, backupStorage.asPath()); + + try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Backup")) { + heartbeat.progress(BackupHeartbeatSteps.LINKS); + backupFileCompressed("links.db", linkdbStagingStorage, backupStorage.asPath()); + + heartbeat.progress(BackupHeartbeatSteps.JOURNAL); + // This file format is already compressed + backupJournal(indexStagingStorage, backupStorage.asPath()); + + heartbeat.progress(BackupHeartbeatSteps.DONE); + } + + } @@ -59,8 +78,15 @@ public class BackupService { var indexStagingStorage = IndexLocations.getIndexConstructionArea(storageService); var linkdbStagingStorage = IndexLocations.getLinkdbWritePath(storageService); - restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage); - restoreJournal(indexStagingStorage, backupStorage); + try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Restore Backup")) { + 
heartbeat.progress(BackupHeartbeatSteps.LINKS); + restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage); + + heartbeat.progress(BackupHeartbeatSteps.JOURNAL); + restoreJournal(indexStagingStorage, backupStorage); + + heartbeat.progress(BackupHeartbeatSteps.DONE); + } } From e46e174b598ec6db9217459f6bf855a37a2dc290 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 15:21:51 +0100 Subject: [PATCH 22/61] (keyword-extractor) Add another test for Name-extractor --- .../extractors/NameLikeKeywordsTest.java | 41 +++ .../src/test/resources/test-data/java.html | 348 ++++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 code/features-convert/keyword-extraction/src/test/resources/test-data/java.html diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java index b08a2353..20147193 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java @@ -1,12 +1,18 @@ package nu.marginalia.keyword.extractors; import com.google.common.collect.Sets; +import lombok.SneakyThrows; +import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.test.util.TestLanguageModels; +import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; +import java.nio.charset.Charset; import java.util.Collections; +import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -50,4 +56,39 @@ class NameLikeKeywordsTest { assertEquals(Collections.emptySet(), Sets.symmetricDifference(actual, expected)); } + + 
@Test + @SneakyThrows + public void testWikiArticle() { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0)); + + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + var ke = new KeywordExtractor(); + + var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2); + System.out.println("Names: " + nameWords.words()); + } + + @Test + @SneakyThrows + public void testWikiArticleP1() { + String html = """ +

Java is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers write once, run anywhere (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.

+

Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.

+

As of September 2023 , Java 21 is the latest version, while Java 17, 11 and 8 are the current long-term support (LTS) versions.

"""; + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0)); + + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + var ke = new KeywordExtractor(); + + var nameWords = new NameLikeKeywords(ke, se.extractSentences(doc), 2); + System.out.println("Names: " + nameWords.words()); + } } \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/src/test/resources/test-data/java.html b/code/features-convert/keyword-extraction/src/test/resources/test-data/java.html new file mode 100644 index 00000000..3f37ae8c --- /dev/null +++ b/code/features-convert/keyword-extraction/src/test/resources/test-data/java.html @@ -0,0 +1,348 @@ + + + + + Java (programming language) + + + + +
+
+

Java (programming language)

+
+
+


 Java is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers write once, run anywhere (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019, Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.

+

Java was originally developed by James Gosling at Sun Microsystems. It was released in May 1995 as a core component of Sun Microsystems' Java platform. The original and reference implementation Java compilers, virtual machines, and class libraries were originally released by Sun under proprietary licenses. As of May 2007, in compliance with the specifications of the Java Community Process, Sun had relicensed most of its Java technologies under the GPL-2.0-only license. Oracle offers its own HotSpot Java Virtual Machine, however the official reference implementation is the OpenJDK JVM which is free open-source software and used by most developers and is the default JVM for almost all Linux distributions.

+


 As of September 2023, Java 21 is the latest version, while Java 17, 11 and 8 are the current long-term support (LTS) versions.

+
+
+

History

+

James Gosling, Mike Sheridan, and Patrick Naughton initiated the Java language project in June 1991. Java was originally designed for interactive television, but it was too advanced for the digital cable television industry at the time. The language was initially called Oak after an oak tree that stood outside Gosling's office. Later the project went by the name Green and was finally renamed Java, from Java coffee, a type of coffee from Indonesia. Gosling designed Java with a C/C++-style syntax that system and application programmers would find familiar.

+

Sun Microsystems released the first public implementation as Java 1.0 in 1996. It promised write once, run anywhere (WORA) functionality, providing no-cost run-times on popular platforms. Fairly secure and featuring configurable security, it allowed network- and file-access restrictions. Major web browsers soon incorporated the ability to run Java applets within web pages, and Java quickly became popular. The Java 1.0 compiler was re-written in Java by Arthur van Hoff to comply strictly with the Java 1.0 language specification. With the advent of Java 2 (released initially as J2SE 1.2 in December 1998 – 1999), new versions had multiple configurations built for different types of platforms. J2EE included technologies and APIs for enterprise applications typically run in server environments, while J2ME featured APIs optimized for mobile applications. The desktop version was renamed J2SE. In 2006, for marketing purposes, Sun renamed new J2 versions as Java EE, Java ME, and Java SE, respectively.

+

In 1997, Sun Microsystems approached the ISO/IEC JTC 1 standards body and later the Ecma International to formalize Java, but it soon withdrew from the process. Java remains a de facto standard, controlled through the Java Community Process. At one time, Sun made most of its Java implementations available without charge, despite their proprietary software status. Sun generated revenue from Java through the selling of licenses for specialized products such as the Java Enterprise System.

+

On November 13, 2006, Sun released much of its Java virtual machine (JVM) as free and open-source software (FOSS), under the terms of the GPL-2.0-only license. On May 8, 2007, Sun finished the process, making all of its JVM's core code available under free software/open-source distribution terms, aside from a small portion of code to which Sun did not hold the copyright.

+

Sun's vice-president Rich Green said that Sun's ideal role with regard to Java was as an evangelist. Following Oracle Corporation's acquisition of Sun Microsystems in 2009–10, Oracle has described itself as the steward of Java technology with a relentless commitment to fostering a community of participation and transparency. This did not prevent Oracle from filing a lawsuit against Google shortly after that for using Java inside the Android SDK (see the Android section).

+

On April 2, 2010, James Gosling resigned from Oracle.

+

In January 2016, Oracle announced that Java run-time environments based on JDK 9 will discontinue the browser plugin.

+

Java software runs on everything from laptops to data centers, game consoles to scientific supercomputers.

+

Oracle (and others) highly recommend uninstalling outdated and unsupported versions of Java, due to unresolved security issues in older versions.

+
+

Principles

+

There were five primary goals in the creation of the Java language:

+
+
    +
  1. It must be simple, object-oriented, and familiar.
  2. +
  3. It must be robust and secure.
  4. +
  5. It must be architecture-neutral and portable.
  6. +
  7. It must execute with high performance.
  8. +
  9. It must be interpreted, threaded, and dynamic.
  10. +
+
+
+
+

Versions

+


 As of September 2023, Java 8, 11, 17 and 21 are supported as Long-Term Support (LTS) versions.

+

Oracle released the last zero-cost public update for the legacy version Java 8 LTS in January 2019 for commercial use, although it will otherwise still support Java 8 with public updates for personal use indefinitely. Other vendors have begun to offer zero-cost builds of OpenJDK 18 and 8, 11 and 17 that are still receiving security and other upgrades.

+

Major release versions of Java, along with their release dates:

+
+
+
+

Editions

+

Sun has defined and supports four editions of Java targeting different application environments and segmented many of its APIs so that they belong to one of the platforms. The platforms are:

+ +

The classes in the Java APIs are organized into separate groups called packages. Each package contains a set of related interfaces, classes, subpackages and exceptions.

+

Sun also provided an edition called Personal Java that has been superseded by later, standards-based Java ME configuration-profile pairings.

+
+
+

Execution system

+
+

Java JVM and bytecode

+

One design goal of Java is portability, which means that programs written for the Java platform must run similarly on any combination of hardware and operating system with adequate run time support. This is achieved by compiling the Java language code to an intermediate representation called Java bytecode, instead of directly to architecture-specific machine code. Java bytecode instructions are analogous to machine code, but they are intended to be executed by a virtual machine (VM) written specifically for the host hardware. End-users commonly use a Java Runtime Environment (JRE) installed on their device for standalone Java applications or a web browser for Java applets.

+

Standard libraries provide a generic way to access host-specific features such as graphics, threading, and networking.

+

The use of universal bytecode makes porting simple. However, the overhead of interpreting bytecode into machine instructions made interpreted programs almost always run more slowly than native executables. Just-in-time (JIT) compilers that compile byte-codes to machine code during runtime were introduced from an early stage. Java's Hotspot compiler is actually two compilers in one; and with GraalVM (included in e.g. Java 11, but removed as of Java 16) allowing tiered compilation. Java itself is platform-independent and is adapted to the particular platform it is to run on by a Java virtual machine (JVM), which translates the Java bytecode into the platform's machine language.

+
+

Performance

+

Programs written in Java have a reputation for being slower and requiring more memory than those written in C++. However, Java programs' execution speed improved significantly with the introduction of just-in-time compilation in 1997/1998 for Java 1.1, the addition of language features supporting better code analysis (such as inner classes, the StringBuilder class, optional assertions, etc.), and optimizations in the Java virtual machine, such as HotSpot becoming Sun's default JVM in 2000. With Java 1.5, the performance was improved with the addition of the java.util.concurrent package, including lock-free implementations of the ConcurrentMaps and other multi-core collections, and it was improved further with Java 1.6.

+
+
+
+

Non-JVM

+

Some platforms offer direct hardware support for Java; there are micro controllers that can run Java bytecode in hardware instead of a software Java virtual machine, and some ARM-based processors could have hardware support for executing Java bytecode through their Jazelle option, though support has mostly been dropped in current implementations of ARM.

+
+
+

Automatic memory management

+

Java uses an automatic garbage collector to manage memory in the object lifecycle. The programmer determines when objects are created, and the Java runtime is responsible for recovering the memory once objects are no longer in use. Once no references to an object remain, the unreachable memory becomes eligible to be freed automatically by the garbage collector. Something similar to a memory leak may still occur if a programmer's code holds a reference to an object that is no longer needed, typically when objects that are no longer needed are stored in containers that are still in use. If methods for a non-existent object are called, a null pointer exception is thrown.

+

One of the ideas behind Java's automatic memory management model is that programmers can be spared the burden of having to perform manual memory management. In some languages, memory for the creation of objects is implicitly allocated on the stack or explicitly allocated and deallocated from the heap. In the latter case, the responsibility of managing memory resides with the programmer. If the program does not deallocate an object, a memory leak occurs. If the program attempts to access or deallocate memory that has already been deallocated, the result is undefined and difficult to predict, and the program is likely to become unstable or crash. This can be partially remedied by the use of smart pointers, but these add overhead and complexity. Garbage collection does not prevent logical memory leaks, i.e. those where the memory is still referenced but never used.

+

Garbage collection may happen at any time. Ideally, it will occur when a program is idle. It is guaranteed to be triggered if there is insufficient free memory on the heap to allocate a new object; this can cause a program to stall momentarily. Explicit memory management is not possible in Java.

+

Java does not support C/C++ style pointer arithmetic, where object addresses can be arithmetically manipulated (e.g. by adding or subtracting an offset). This allows the garbage collector to relocate referenced objects and ensures type safety and security.

+

As in C++ and some other object-oriented languages, variables of Java's primitive data types are either stored directly in fields (for objects) or on the stack (for methods) rather than on the heap, as is commonly true for non-primitive data types (but see escape analysis). This was a conscious decision by Java's designers for performance reasons.

+

Java contains multiple types of garbage collectors. Since Java 9, HotSpot uses the Garbage First Garbage Collector (G1GC) as the default. However, there are also several other garbage collectors that can be used to manage the heap. For most applications in Java, G1GC is sufficient. Previously, the Parallel Garbage Collector was used in Java 8.

+

Having solved the memory management problem does not relieve the programmer of the burden of handling properly other kinds of resources, like network or database connections, file handles, etc., especially in the presence of exceptions.

+
+
+
+

Syntax

+

The syntax of Java is largely influenced by C++ and C. Unlike C++, which combines the syntax for structured, generic, and object-oriented programming, Java was built almost exclusively as an object-oriented language. All code is written inside classes, and every data item is an object, with the exception of the primitive data types, (i.e. integers, floating-point numbers, boolean values, and characters), which are not objects for performance reasons. Java reuses some popular aspects of C++ (such as the printf method).

+

Unlike C++, Java does not support operator overloading or multiple inheritance for classes, though multiple inheritance is supported for interfaces.

+

Java uses comments similar to those of C++. There are three different styles of comments: a single line style marked with two slashes (//), a multiple line style opened with /* and closed with */, and the Javadoc commenting style opened with /** and closed with */. The Javadoc style of commenting allows the user to run the Javadoc executable to create documentation for the program and can be read by some integrated development environments (IDEs) such as Eclipse to allow developers to access documentation within the IDE.

+
+

Hello world example

+

The traditional Hello world program can be written in Java as:

+
+
publicclass Main{
+publicstaticvoidmain(String[]args){
+System.out.println("Hello World!");// Prints the string to the console.
+}
+}
+
+
+

All source files must be named after the public class they contain, appending the suffix .java, for example, HelloWorldApp.java. It must first be compiled into bytecode, using a Java compiler, producing a file with the .class suffix (Main.class, in this case). Only then can it be executed or launched. The Java source file may only contain one public class, but it can contain multiple classes with a non-public access modifier and any number of public inner classes. When the source file contains multiple classes, it is necessary to make one class (introduced by the class keyword) public (preceded by the public keyword) and name the source file with that public class name.

+

A class that is not declared public may be stored in any .java file. The compiler will generate a class file for each class defined in the source file. The name of the class file is the name of the class, with .class appended. For class file generation, anonymous classes are treated as if their name were the concatenation of the name of their enclosing class, a $, and an integer.

+

The keyword public denotes that a method can be called from code in other classes, or that a class may be used by classes outside the class hierarchy. The class hierarchy is related to the name of the directory in which the .java file is located. This is called an access level modifier. Other access level modifiers include the keywords private (a method that can only be accessed in the same class) and protected (which allows code from the same package to access). If a piece of code attempts to access private methods or protected methods, the JVM will throw a SecurityException.

+

The keyword static in front of a method indicates a static method, which is associated only with the class and not with any specific instance of that class. Only static methods can be invoked without a reference to an object. Static methods cannot access any class members that are not also static. Methods that are not designated static are instance methods and require a specific instance of a class to operate.

+

The keyword void indicates that the main method does not return any value to the caller. If a Java program is to exit with an error code, it must call System.exit() explicitly.

+

The method name main is not a keyword in the Java language. It is simply the name of the method the Java launcher calls to pass control to the program. Java classes that run in managed environments such as applets and Enterprise JavaBeans do not use or need a main() method. A Java program may contain multiple classes that have main methods, which means that the VM needs to be explicitly told which class to launch from.

+

The main method must accept an array of String objects. By convention, it is referenced as args although any other legal identifier name can be used. Since Java 5, the main method can also use variable arguments, in the form of public static void main(String... args), allowing the main method to be invoked with an arbitrary number of String arguments. The effect of this alternate declaration is semantically identical (to the args parameter which is still an array of String objects), but it allows an alternative syntax for creating and passing the array.

+

The Java launcher launches Java by loading a given class (specified on the command line or as an attribute in a JAR) and starting its public static void main(String[]) method. Stand-alone programs must declare this method explicitly. The String[] args parameter is an array of String objects containing any arguments passed to the class. The parameters to main are often passed by means of a command line.

+

Printing is part of a Java standard library: The System class defines a public static field called out. The out object is an instance of the PrintStream class and provides many methods for printing data to standard out, including println(String) which also appends a new line to the passed string.

+

The string "Hello World!" is automatically converted to a String object by the compiler.

+
+
+

Example with methods

+
+
// This is an example of a single line comment using two slashes
+
+/*
+* This is an example of a multiple line comment using the slash and asterisk.
+* This type of comment can be used to hold a lot of information or deactivate
+* code, but it is very important to remember to close the comment.
+*/
+
+packagefibsandlies;
+
+importjava.util.Map;
+importjava.util.HashMap;
+
+/**
+* This is an example of a Javadoc comment; Javadoc can compile documentation
+* from this text. Javadoc comments must immediately precede the class, method,
+* or field being documented.
+* @author Wikipedia Volunteers
+*/
+publicclass FibCalculatorextendsFibonacciimplementsCalculator{
+privatestaticMap<Integer,Integer>memoized=newHashMap<>();
+
+/*
+* The main method written as follows is used by the JVM as a starting point
+* for the program.
+*/
+publicstaticvoidmain(String[]args){
+memoized.put(1,1);
+memoized.put(2,1);
+System.out.println(fibonacci(12));// Get the 12th Fibonacci number and print to console
+}
+
+/**
+* An example of a method written in Java, wrapped in a class.
+* Given a non-negative number FIBINDEX, returns
+* the Nth Fibonacci number, where N equals FIBINDEX.
+*
+* @param fibIndex The index of the Fibonacci number
+* @return the Fibonacci number
+*/
+publicstaticintfibonacci(intfibIndex){
+if(memoized.containsKey(fibIndex)){
+returnmemoized.get(fibIndex);
+}
+
+intanswer=fibonacci(fibIndex-1)+fibonacci(fibIndex-2);
+memoized.put(fibIndex,answer);
+returnanswer;
+}
+}
+
+
+
+
+
+

Special classes

+
+

Applet

+

Java applets were programs that were embedded in other applications, typically in a Web page displayed in a web browser. The Java applet API is now deprecated since Java 9 in 2017.

+
+
+

Servlet

+

Java servlet technology provides Web developers with a simple, consistent mechanism for extending the functionality of a Web server and for accessing existing business systems. Servlets are server-side Java EE components that generate responses to requests from clients. Most of the time, this means generating HTML pages in response to HTTP requests, although there are a number of other standard servlet classes available, for example for WebSocket communication.

+

The Java servlet API has to some extent been superseded (but still used under the hood) by two standard Java technologies for web services:

+ +

Typical implementations of these APIs on Application Servers or Servlet Containers use a standard servlet for handling all interactions with the HTTP requests and responses that delegate to the web service methods for the actual business logic.

+
+
+

JavaServer Pages

+

JavaServer Pages (JSP) are server-side Java EE components that generate responses, typically HTML pages, to HTTP requests from clients. JSPs embed Java code in an HTML page by using the special delimiters <% and %>. A JSP is compiled to a Java servlet, a Java application in its own right, the first time it is accessed. After that, the generated servlet creates the response.

+
+
+

Swing application

+

Swing is a graphical user interface library for the Java SE platform. It is possible to specify a different look and feel through the pluggable look and feel system of Swing. Clones of Windows, GTK+, and Motif are supplied by Sun. Apple also provides an Aqua look and feel for macOS. Where prior implementations of these looks and feels may have been considered lacking, Swing in Java SE 6 addresses this problem by using more native GUI widget drawing routines of the underlying platforms.

+
+
+

JavaFX application

+

JavaFX is a software platform for creating and delivering desktop applications, as well as rich web applications that can run across a wide variety of devices. JavaFX is intended to replace Swing as the standard GUI library for Java SE, but since JDK 11 JavaFX has not been in the core JDK and instead in a separate module. JavaFX has support for desktop computers and web browsers on Microsoft Windows, Linux, and macOS. JavaFX does not have support for native OS look and feels.

+
+
+

Generics

+

In 2004, generics were added to the Java language, as part of J2SE 5.0. Prior to the introduction of generics, each variable declaration had to be of a specific type. For container classes, for example, this is a problem because there is no easy way to create a container that accepts only specific types of objects. Either the container operates on all subtypes of a class or interface, usually Object, or a different container class has to be created for each contained class. Generics allow compile-time type checking without having to create many container classes, each containing almost identical code. In addition to enabling more efficient code, certain runtime exceptions are prevented from occurring, by issuing compile-time errors. If Java prevented all runtime type errors (ClassCastExceptions) from occurring, it would be type safe.

+

In 2016, the type system of Java was proven unsound in that it is possible to use generics to construct classes and methods that allow assignment of an instance one class to a variable of another unrelated class. Such code is accepted by the compiler, but fails at run time with a class cast exception.

+
+
+
+

Criticism

+

Criticisms directed at Java include the implementation of generics, speed, the handling of unsigned numbers, the implementation of floating-point arithmetic, and a history of security vulnerabilities in the primary Java VM implementation HotSpot.

+
+
+

Class libraries

+

The Java Class Library is the standard library, developed to support application development in Java. It is controlled by Oracle in cooperation with others through the Java Community Process program. Companies or individuals participating in this process can influence the design and development of the APIs. This process has been a subject of controversy during the 2010s. The class library contains features such as:

+ +
+
+

Documentation

+

Javadoc is a comprehensive documentation system, created by Sun Microsystems. It provides developers with an organized system for documenting their code. Javadoc comments have an extra asterisk at the beginning, i.e. the delimiters are /** and */, whereas the normal multi-line comments in Java are delimited by /* and */, and single-line comments start with //.

+
+
+

Implementations

+

Oracle Corporation is the current owner of the official implementation of the Java SE platform, following their acquisition of Sun Microsystems on January 27, 2010. This implementation is based on the original implementation of Java by Sun. The Oracle implementation is available for Microsoft Windows (still works for XP, while only later versions are currently officially supported), macOS, Linux, and Solaris. Because Java lacks any formal standardization recognized by Ecma International, ISO/IEC, ANSI, or other third-party standards organizations, the Oracle implementation is the de facto standard.

+

The Oracle implementation is packaged into two different distributions: The Java Runtime Environment (JRE) which contains the parts of the Java SE platform required to run Java programs and is intended for end users, and the Java Development Kit (JDK), which is intended for software developers and includes development tools such as the Java compiler, Javadoc, Jar, and a debugger. Oracle has also released GraalVM, a high performance Java dynamic compiler and interpreter.

+

OpenJDK is another notable Java SE implementation that is licensed under the GNU GPL. The implementation started when Sun began releasing the Java source code under the GPL. As of Java SE 7, OpenJDK is the official Java reference implementation.

+

The goal of Java is to make all implementations of Java compatible. Historically, Sun's trademark license for usage of the Java brand insists that all implementations be compatible. This resulted in a legal dispute with Microsoft after Sun claimed that the Microsoft implementation did not support Java remote method invocation (RMI) or Java Native Interface (JNI) and had added platform-specific features of their own. Sun sued in 1997, and, in 2001, won a settlement of US$20 million, as well as a court order enforcing the terms of the license from Sun. As a result, Microsoft no longer ships Java with Windows.

+

Platform-independent Java is essential to Java EE, and an even more rigorous validation is required to certify an implementation. This environment enables portable server-side applications.

+
+
+

Use outside the Java platform

+

The Java programming language requires the presence of a software platform in order for compiled programs to be executed.

+

Oracle supplies the Java platform for use with Java. The Android SDK is an alternative software platform, used primarily for developing Android applications with its own GUI system.

+
+

Android

+

The Java language is a key pillar in Android, an open source mobile operating system. Although Android, built on the Linux kernel, is written largely in C, the Android SDK uses the Java language as the basis for Android applications but does not use any of its standard GUI, SE, ME or other established Java standards. The bytecode language supported by the Android SDK is incompatible with Java bytecode and runs on its own virtual machine, optimized for low-memory devices such as smartphones and tablet computers. Depending on the Android version, the bytecode is either interpreted by the Dalvik virtual machine or compiled into native code by the Android Runtime.

+

Android does not provide the full Java SE standard library, although the Android SDK does include an independent implementation of a large subset of it. It supports Java 6 and some Java 7 features, offering an implementation compatible with the standard library (Apache Harmony).

+
+

Controversy

+

The use of Java-related technology in Android led to a legal dispute between Oracle and Google. On May 7, 2012, a San Francisco jury found that if APIs could be copyrighted, then Google had infringed Oracle's copyrights by the use of Java in Android devices. District Judge William Alsup ruled on May 31, 2012, that APIs cannot be copyrighted, but this was reversed by the United States Court of Appeals for the Federal Circuit in May 2014. On May 26, 2016, the district court decided in favor of Google, ruling the copyright infringement of the Java API in Android constitutes fair use. In March 2018, this ruling was overturned by the Appeals Court, which remanded the case of determining the damages to the federal court in San Francisco. Google filed a petition for writ of certiorari with the Supreme Court of the United States in January 2019 to challenge the two rulings that were made by the Appeals Court in Oracle's favor. On April 5, 2021, the Court ruled 6-2 in Google's favor, that its use of Java APIs should be considered fair use. However, the court refused to rule on the copyrightability of APIs, choosing instead to determine their ruling by considering Java's API copyrightable "purely for argument’s sake."

+
+
+
+
+

See also

+ +
+

Comparison of Java with other languages

+ +
+
+
+
+
+
+
+

External links

+ +
+
+
+
+ +
+
+
+ This encyclopedia contains articles issued from Wikipedia. + The text is licensed under CC BY-SA 3.0. + The wikipedia contents are from OpenZIM dumps, which typically lag behind the main Wikipedia project by up to a year. +
+ + \ No newline at end of file From dc90c9ac65f08f62fac0a5d6f88b4c3160596a20 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 16:19:38 +0100 Subject: [PATCH 23/61] (sideload) Just index based on first paragraph This seems like it would make the wikipedia search result worse, but it drastically improves the result quality! This is because wikipedia has a lot of articles that each talk about a lot of irrelevant concepts, and indexing the entire document means tangentially relevant results tend to displace the most relevant results. --- .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 3220703a..009adf3a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -121,6 +121,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC fullHtml.append("

"); fullHtml.append(part); fullHtml.append("

"); + break; // Only take the first part, this improves accuracy a lot } fullHtml.append(""); From 8f522470eda99dfea9fc6ae57297a09dc536e61d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 17:16:29 +0100 Subject: [PATCH 24/61] (index) Adjust rank weightings to fix bad wikipedia results There was as bug where if the input of ResultValuator.normalize() was negative, it was truncated to zero. This meant that "bad" results always rank the same. The penalty factor "overallPart" was moved outside of the function and was re-weighted to accomplish a better normalization. Some of the weights were also re-adjusted based on what appears to produce better results. Needs evaluation. --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 3 ++- .../nu/marginalia/index/results/IndexResultDecorator.java | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 30b647e9..390a02b8 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -108,7 +108,8 @@ public class ResultValuator { } } - return normalize(bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.25 + overallPart); + + return normalize(2* bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.5) - overallPart / 4; } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java index cf352331..376972b8 100644 --- 
a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java @@ -10,6 +10,8 @@ import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.linkdb.LinkdbReader; import nu.marginalia.linkdb.model.LdbUrlDetail; import nu.marginalia.ranking.ResultValuator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.ArrayList; @@ -21,6 +23,8 @@ import java.util.Map; @Singleton public class IndexResultDecorator { + private static final Logger logger = LoggerFactory.getLogger(IndexResultDecorator.class); + private final LinkdbReader linkdbReader; private final ResultValuator valuator; From 50771045d03c183672e1bb46a9369e670f8a1a42 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 18:43:17 +0100 Subject: [PATCH 25/61] (index) Further ranking adjustments --- .../java/nu/marginalia/ranking/ResultValuator.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 390a02b8..99e59f4e 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -109,7 +109,12 @@ public class ResultValuator { } - return normalize(2* bestTcf + bestBM25F + bestBM25P + bestBM25PN * 0.5) - overallPart / 4; + double overallPartPositive = Math.max(0, overallPart); + double overallPartNegative = Math.min(0, overallPart); + + // Renormalize to 0...15, where 0 is the best possible score; + // this is a historical artifact of the original ranking function + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * 
bestBM25PN + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { @@ -210,11 +215,11 @@ public class ResultValuator { return 1 + maxSet; } - public static double normalize(double value) { + public static double normalize(double value, double penalty) { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)) + Math.sqrt(penalty); } } From fc6e3b6da0d0f0b4f8ea810c7c455c502e009d8c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 1 Jan 2024 18:51:03 +0100 Subject: [PATCH 26/61] (index) Further ranking adjustments --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 99e59f4e..a6b2d925 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -137,11 +137,13 @@ public class ResultValuator { double penalty = 0; boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags); + boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags); + boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags); // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site double largeSiteFactor = 1.; - if (!isForum && size > 400) { + if (!isForum && !isWiki && !isDocs && size > 400) { // Long urls-that-look-like-this tend to be poor search results if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) penalty += 30.0; @@ -161,7 +163,7 @@ public class ResultValuator { if (DocumentMetadata.hasFlags(featureFlags, 
HtmlFeature.TRACKING.getFeatureBit())) penalty += 2.5 * largeSiteFactor; - if (isForum) { + if (isForum || isWiki || isDocs) { penalty = Math.min(0, penalty - 2); } From 310a880fa88c969b7b5915071eca097b518e5ed0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 2 Jan 2024 12:24:52 +0100 Subject: [PATCH 27/61] (index) Further ranking adjustments --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 4 ++-- .../main/java/nu/marginalia/index/svc/IndexQueryService.java | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index a6b2d925..2a856258 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -110,7 +110,7 @@ public class ResultValuator { double overallPartPositive = Math.max(0, overallPart); - double overallPartNegative = Math.min(0, overallPart); + double overallPartNegative = -Math.min(0, overallPart); // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function @@ -163,7 +163,7 @@ public class ResultValuator { if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) penalty += 2.5 * largeSiteFactor; - if (isForum || isWiki || isDocs) { + if (isForum || isWiki) { penalty = Math.min(0, penalty - 2); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index a912beee..476ea991 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -265,8 +265,6 @@ public class IndexQueryService extends IndexApiImplBase { return new SearchResultSet(resultDecorator.decorateAndRerank(bestResults, rankingContext)); } - /* This is used in result ranking, and is also routed back up the search service in order to recalculate BM-25 - * accurately */ private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) { final var termToId = searchTermsSvc.getAllIncludeTerms(subqueries); final Map termFrequencies = new HashMap<>(termToId.size()); From f0d9618dfcde0e94e92d8e0046993b45833e739e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 2 Jan 2024 12:34:58 +0100 Subject: [PATCH 28/61] (sideload) Reduce quality assessment. This will make these sideloaded results rank much better as there is a pretty harsh penalty for large low-q websites. --- .../nu/marginalia/converting/sideload/SideloaderProcessing.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 3e871f9a..14b35b6a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -80,7 +80,7 @@ public class SideloaderProcessing { ret.details.pubYear = LocalDateTime.now().getYear(); ret.details.features.add(HtmlFeature.JS); ret.details.features.add(HtmlFeature.TRACKING); - ret.details.quality = -10; + ret.details.quality = -4.5; ret.details.generator = type; ret.details.metadata = new DocumentMetadata(3, From 4ce692ccaf3b0e6db746801864f738b682fe6500 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 
2024 13:40:44 +0100 Subject: [PATCH 29/61] (converter) Use SimpleBlockingThreadPool in ProcessingIterator --- .../nu/marginalia/ranking/ResultValuator.java | 23 ++++++++++- .../marginalia/util/ProcessingIterator.java | 41 +++++-------------- .../converting/processor/DomainProcessor.java | 4 +- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 2a856258..6322c09c 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -80,6 +80,14 @@ public class ResultValuator { temporalBias = 0; } + logger.info("averageSentenceLengthPenalty: " + averageSentenceLengthPenalty); + logger.info("documentLengthPenalty: " + documentLengthPenalty); + logger.info("qualityPenalty: " + qualityPenalty); + logger.info("rankingBonus: " + rankingBonus); + logger.info("topologyBonus: " + topologyBonus); + logger.info("temporalBias: " + temporalBias); + logger.info("flagsPenalty: " + flagsPenalty); + double overallPart = averageSentenceLengthPenalty + documentLengthPenalty + qualityPenalty @@ -112,9 +120,22 @@ public class ResultValuator { double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); + + logger.info("bestTcf: " + bestTcf); + logger.info("bestBM25F: " + bestBM25F); + logger.info("bestBM25P: " + bestBM25P); + logger.info("bestBM25PN: " + bestBM25PN); + logger.info("overallPartPositive: " + overallPartPositive); + logger.info("overallPartNegative: " + overallPartNegative); + // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + 
overallPartPositive, overallPartNegative); + double ret = normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); + + logger.info("ret: " + ret); + + return ret; + } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java index 523381fa..c143f88a 100644 --- a/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java +++ b/code/libraries/blocking-thread-pool/src/main/java/nu/marginalia/util/ProcessingIterator.java @@ -19,21 +19,16 @@ public class ProcessingIterator implements Iterator { private final LinkedBlockingQueue queue; private final AtomicBoolean isFinished = new AtomicBoolean(false); - private final ExecutorService executorService; - private final Semaphore sem; + private final SimpleBlockingThreadPool pool; private T next = null; - private final int parallelism; - - ProcessingIterator(ExecutorService executorService, int queueSize, int parallelism, ProcessingJob task) { - this.parallelism = parallelism; - + @SneakyThrows + ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob task) { queue = new LinkedBlockingQueue<>(queueSize); - this.executorService = executorService; - sem = new Semaphore(parallelism); + this.pool = pool; - executorService.submit(() -> executeJob(task)); + pool.submit(() -> executeJob(task)); } public static Factory factory(int queueSize, int parallelism) { @@ -50,20 +45,13 @@ public class ProcessingIterator implements Iterator { } } + @SneakyThrows private void executeTask(Task task) { - try { - sem.acquire(); - } catch (InterruptedException e) { - return; - } - - executorService.submit(() -> { + pool.submit(() -> { try { queue.put(task.get()); } catch (Exception e) { 
logger.warn("Exception while processing", e); - } finally { - sem.release(); } }); } @@ -97,7 +85,7 @@ public class ProcessingIterator implements Iterator { private boolean expectMore() { return !isFinished.get() // we are still reading from the database || !queue.isEmpty() // ... or we have documents in the queue - || sem.availablePermits() < parallelism; // ... or we are still processing documents + || pool.getActiveCount() > 0; // ... or we are still processing documents } /** Returns the next document to be processed. @@ -142,24 +130,17 @@ public class ProcessingIterator implements Iterator { public static class Factory { private final int queueSize; - private final int parallelism; - private final ExecutorService executorService; + private final SimpleBlockingThreadPool pool; Factory(int queueSize, int parallelism) { this.queueSize = queueSize; - this.parallelism = parallelism; - this.executorService = Executors.newFixedThreadPool(parallelism); + this.pool = new SimpleBlockingThreadPool("sideload", parallelism, 4); } public ProcessingIterator create(ProcessingJob task) { - return new ProcessingIterator<>(executorService, queueSize, parallelism, task); + return new ProcessingIterator<>(pool, queueSize, task); } - public void stop() { - if (!executorService.isShutdown()) { - executorService.shutdown(); - } - } } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 630f97f7..a7f62aca 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -86,7 +86,9 @@ public class DomainProcessor { private final Set processedUrls = new HashSet<>(); private final DomainLinks externalDomainLinks; private final LshDocumentDeduplicator 
deduplicator = new LshDocumentDeduplicator(); - private static ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(24, 16); + private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8, + Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors()) + ); SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { this.dataStream = dataStream; From 32436d099c6ebe4a26d02c963f4f9ce3f1ecfb34 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 13:49:39 +0100 Subject: [PATCH 30/61] (language-processing) Add maximum length limit for text input in SentenceExtractor Added a new constant, MAX_TEXT_LENGTH, to the SentenceExtractor class. If the length of the text input exceeds this limit, the text is truncated to fit within the limit. This modification is designed to prevent excessive resource usage for unusually long text inputs. --- .../nu/marginalia/language/sentence/SentenceExtractor.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 4cbdaf29..178cdee4 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -42,6 +42,7 @@ public class SentenceExtractor { * that might otherwise use an undue amount of processing power. 250 words is about 10X longer than * this comment. 
*/ private static final int MAX_SENTENCE_LENGTH = 250; + private static final int MAX_TEXT_LENGTH = 65536; @SneakyThrows @Inject public SentenceExtractor(LanguageModels models) { @@ -136,6 +137,11 @@ public class SentenceExtractor { String[] sentences; String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); + + if (text.length() > MAX_TEXT_LENGTH) { + textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH); + } + try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } From 0806aa6dfe32d4939ec274dec9b3f8dc0625b7e1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 13:59:05 +0100 Subject: [PATCH 31/61] (language-processing) Add maximum length limit for text input in SentenceExtractor Added a new constant, MAX_TEXT_LENGTH, to the SentenceExtractor class. If the length of the text input exceeds this limit, the text is truncated to fit within the limit. This modification is designed to prevent excessive resource usage for unusually long text inputs. 
--- .../language/sentence/SentenceExtractor.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 178cdee4..13ba2e76 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -96,7 +96,7 @@ public class SentenceExtractor { title = doc.getElementsByTag("h2").text(); } - if (title.trim().length() < 3 && textSentences.length > 0) { + if (title.trim().length() < 3) { for (DocumentSentence textSentence : textSentences) { if (textSentence.length() > 0) { title = textSentence.originalSentence.toLowerCase(); @@ -138,10 +138,6 @@ public class SentenceExtractor { String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - if (text.length() > MAX_TEXT_LENGTH) { - textNormalizedSpaces = textNormalizedSpaces.substring(0, MAX_TEXT_LENGTH); - } - try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } @@ -221,7 +217,12 @@ public class SentenceExtractor { public String asText(Document dc) { String text = dc.getElementsByTag("body").text(); - return text.substring(0, (int) (text.length()*0.95)); + if (text.length() > MAX_TEXT_LENGTH) { + return text.substring(0, MAX_TEXT_LENGTH); + } + else { + return text.substring(0, (int) (text.length() * 0.95)); + } } From 0b9f3d1751fc224ef2b242f30506ca668c63f561 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 14:32:00 +0100 Subject: [PATCH 32/61] (*) Remove accidental commit of debug logging --- .../nu/marginalia/ranking/ResultValuator.java | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git 
a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 6322c09c..2a856258 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -80,14 +80,6 @@ public class ResultValuator { temporalBias = 0; } - logger.info("averageSentenceLengthPenalty: " + averageSentenceLengthPenalty); - logger.info("documentLengthPenalty: " + documentLengthPenalty); - logger.info("qualityPenalty: " + qualityPenalty); - logger.info("rankingBonus: " + rankingBonus); - logger.info("topologyBonus: " + topologyBonus); - logger.info("temporalBias: " + temporalBias); - logger.info("flagsPenalty: " + flagsPenalty); - double overallPart = averageSentenceLengthPenalty + documentLengthPenalty + qualityPenalty @@ -120,22 +112,9 @@ public class ResultValuator { double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); - - logger.info("bestTcf: " + bestTcf); - logger.info("bestBM25F: " + bestBM25F); - logger.info("bestBM25P: " + bestBM25P); - logger.info("bestBM25PN: " + bestBM25PN); - logger.info("overallPartPositive: " + overallPartPositive); - logger.info("overallPartNegative: " + overallPartNegative); - // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - double ret = normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); - - logger.info("ret: " + ret); - - return ret; - + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { From 
f732f6ae6f154117ba580949df642340404a991a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 14:53:53 +0100 Subject: [PATCH 33/61] (index) Tweak result valuation renormalization --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 2a856258..2d564e5b 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -221,7 +221,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)) + Math.sqrt(penalty); + return Math.sqrt((1.0 + scalingFactor + penalty) / (1.0 + value)); } } From 1f3b89cf28892cc280c12cbfcbe541d8d07fb0a5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:20:18 +0100 Subject: [PATCH 34/61] (index) Reduce the value of site and site-adjacent in BM25P calculations --- .../ranking/factors/Bm25Factor.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java index 13c99ecc..a11281db 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -53,9 +53,11 @@ public class Bm25Factor { } private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { + int pcount = keyword.positionCount(); + double qcount = 0.; if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - 
qcount += 2.; + qcount += 0.5; if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) qcount += 0.5; if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) @@ -66,12 +68,16 @@ public class Bm25Factor { qcount += 2.5; if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) qcount += 1.5; - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; + + if (pcount > 2) { + if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + return qcount; } From ac1aca36b0927c5b7dde056098806a24dd4015b9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:20:38 +0100 Subject: [PATCH 35/61] (valuation) Increase the penalty for adtech a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 2d564e5b..1fa2a133 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -155,7 +155,7 @@ public class ResultValuator { } if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) - penalty += 5.0 * largeSiteFactor; + penalty += 7.5 * largeSiteFactor; if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) penalty += 5.0 * 
largeSiteFactor; From a19879d4940b0d3f3cf4331519daf9e4313e2f00 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:32:33 +0100 Subject: [PATCH 36/61] (valuation) Tweaking penalties a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 1fa2a133..678f93be 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -221,7 +221,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor + penalty) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor + 5 * penalty) / (1.0 + value)); } } From 78c00ad512561f562cb8e535d4d3f922126409a0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:52:57 +0100 Subject: [PATCH 37/61] (valuation) Tweaking penalties a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 678f93be..7fc431ba 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -221,7 +221,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor + 5 * penalty) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); } } From c770f0b68b264c97b2c0ccbf6d612bec2955184a Mon Sep 17 00:00:00 2001 
From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:59:21 +0100 Subject: [PATCH 38/61] (valuation) Tweaking penalties a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 7fc431ba..5acaa260 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -160,6 +160,9 @@ public class ResultValuator { if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) penalty += 5.0 * largeSiteFactor; + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit())) + penalty += 2.5 * largeSiteFactor; + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) penalty += 2.5 * largeSiteFactor; @@ -221,7 +224,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor + 5 * penalty) / (1.0 + value)); } } From 87048511fea06f9b6d2b009c1d84cfa7e0e47368 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:02:25 +0100 Subject: [PATCH 39/61] (valuation) Tweaking penalties a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 5acaa260..961a9e81 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ 
b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -224,7 +224,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor + 5 * penalty) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); } } From 7bbaedef97c731e1987c2693dbbbed1490ae8584 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:23:00 +0100 Subject: [PATCH 40/61] (search) Add query strategy requiring link --- .../java/nu/marginalia/index/query/limit/QueryStrategy.java | 1 + .../java/nu/marginalia/index/results/IndexResultValuator.java | 4 +++- .../java/nu/marginalia/query/svc/QueryLimitsAccumulator.java | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java index c15ab6ea..024828f9 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java @@ -9,6 +9,7 @@ public enum QueryStrategy { REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_URL, REQUIRE_FIELD_DOMAIN, + REQUIRE_FIELD_LINK, AUTO } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index e19d3809..1e51fbd6 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -172,7 +172,9 @@ public class IndexResultValuator { else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { return WordMetadata.hasFlags(termScore.encodedWordMetadata(), 
WordFlags.UrlDomain.asBit()); } - + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + } return true; } diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java index 663d4cfc..f1f17bed 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java @@ -45,6 +45,7 @@ public class QueryLimitsAccumulator implements TokenVisitor { case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; + case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; case "SENTENCE" -> QueryStrategy.SENTENCE; case "TOPIC" -> QueryStrategy.TOPIC; default -> QueryStrategy.AUTO; From 1e06aee6a24cd6ebf869397783ff295362b93adf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:30:46 +0100 Subject: [PATCH 41/61] (index) Adjust BM25 parameters --- .../ranking/factors/Bm25Factor.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java index a11281db..43a63ab6 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -56,16 +56,27 @@ public class Bm25Factor { int pcount = keyword.positionCount(); double qcount = 0.; + if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) qcount += 0.5; if 
((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) + + if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { qcount += 2.5; + + if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + } + else { + if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) + qcount += 1.5; + } + if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) qcount += 1.5; From f5999449426ec143df72f55f460f042abe0d8a58 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:51:26 +0100 Subject: [PATCH 42/61] (converter) Penalize chatgpt content farm spam --- .../processor/logic/DocumentValuator.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 218f16b8..a64277c5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -11,6 +11,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeVisitor; +import java.util.List; import java.util.Set; public class DocumentValuator { @@ -21,6 +22,7 @@ public class DocumentValuator { int textLength) throws 
DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); + double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument); int rawLength = crawledDocument.documentBody.length(); @@ -30,7 +32,22 @@ public class DocumentValuator { return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - - scriptPenalty; + - scriptPenalty + - chatGptPenalty; + } + + private double getChatGptContentFarmPenalty(Document parsedDocument) { + // easily 90% of modern AI-authored content farm spam have this exact string in one of the headings + + for (String tagName : List.of("h1", "h2", "h3")) { + for (var elem : parsedDocument.getElementsByTag(tagName)) { + if (elem.text().startsWith("Benefits of")) { + return 10; + } + } + } + + return 0; } From 41a540a6294d95c398a54d901a531d012de2a9c1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:04:38 +0100 Subject: [PATCH 43/61] (converter) Penalize chatgpt content farm spam --- .../processor/logic/DocumentValuator.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index a64277c5..af080a3a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -37,17 +37,31 @@ public class DocumentValuator { } private double getChatGptContentFarmPenalty(Document parsedDocument) { - // easily 90% of modern AI-authored content farm spam have this exact string in one of the headings + // easily 90% of modern AI-authored content farm spam has these nonsense headers + boolean benefitsOf = false, keyBenefits = false, keyTakeaways = 
false; + + outer: for (String tagName : List.of("h1", "h2", "h3")) { for (var elem : parsedDocument.getElementsByTag(tagName)) { - if (elem.text().startsWith("Benefits of")) { - return 10; - } + if (benefitsOf && keyBenefits && keyTakeaways) + break outer; + + String text = elem.text().toLowerCase(); + + benefitsOf = benefitsOf || text.startsWith("benefits of"); + keyBenefits = keyBenefits || text.startsWith("key benefits"); + keyTakeaways = keyTakeaways || text.startsWith("key takeaways"); } } - return 0; + double penalty = 0; + + if (benefitsOf) penalty += 10; + if (keyBenefits) penalty += 5; + if (keyTakeaways) penalty += 5; + + return penalty; } From 7af07cef95552bf76e59dfc69b282323655c0a53 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:21:12 +0100 Subject: [PATCH 44/61] (feature) Add another doubleclick variant to the adtech trackers --- .../marginalia/converting/processor/logic/FeatureExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 741b6740..83b06bd5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -39,6 +39,7 @@ public class FeatureExtractor { "googlesyndication.com", "smartadserver.com", "doubleclick.com", + "doubleclick.net", "2mdn.com", "dmtry.com", "amazon-adsystem.com", From 1f66568d59a189d4dc30795195498d73cfabf7ab Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:27:25 +0100 Subject: [PATCH 45/61] (feature) More trackers --- .../converting/processor/logic/FeatureExtractor.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 83b06bd5..d6bddbc2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -58,7 +58,9 @@ public class FeatureExtractor { "personalized-ads-consent", "_taboola", "nativeads", - "skimlinks" + "skimlinks", + "juicyads.com", + "counter.yadro.ru" ); private final AdblockSimulator adblockSimulator; From f7560cb1d8008814c23f08231ee514ff9bf66fbc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:31:02 +0100 Subject: [PATCH 46/61] (feature) More trackers --- .../marginalia/converting/processor/logic/FeatureExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index d6bddbc2..c38f63f9 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -59,6 +59,7 @@ public class FeatureExtractor { "_taboola", "nativeads", "skimlinks", + "moapt", "juicyads.com", "counter.yadro.ru" ); From 60361f88eda6c67ec0ea7c664da45ed6d306df10 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 23:14:03 +0100 Subject: [PATCH 47/61] (converter) Add upper 128KB limit to how much HTML we'll parse --- .../processor/plugin/HtmlDocumentProcessorPlugin.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 7d973909..44da6008 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -107,6 +107,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.LANGUAGE); } + if (documentBody.length() > 128_000) { // 128kb + documentBody = documentBody.substring(0, 128_000); + } + Document doc = Jsoup.parse(documentBody); if (!metaRobotsTag.allowIndexingByMetaTag(doc)) { From 343ea9c6d8d2618ab4b9beb2676923261576ab2f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Jan 2024 13:18:07 +0100 Subject: [PATCH 48/61] (search) Fetch fewer results per page This is a test to evaluate how this impacts load times. 
--- .../main/java/nu/marginalia/search/SearchQueryParamFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java index 6b913402..95439273 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -31,7 +31,7 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), List.of(), - new QueryLimits(1, 100, 200, 8192), + new QueryLimits(1, 25, 200, 8192), profile.searchSetIdentifier ); From 4078708aea13dbd8b26b5cc65359e84bf463ba5a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Jan 2024 13:27:14 +0100 Subject: [PATCH 49/61] (qs) Better metrics for QS --- .../src/main/java/nu/marginalia/query/QueryGRPCService.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java index 2322c1ee..9e14ef15 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java @@ -23,6 +23,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { private static final Histogram wmsa_qs_query_time_grpc = Histogram.build() .name("wmsa_qs_query_time_grpc") + .labelNames("timeout", "count") .linearBuckets(0.05, 0.05, 15) .help("QS-side query time (GRPC endpoint)") .register(); @@ -69,7 +70,10 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { io.grpc.stub.StreamObserver responseObserver) { 
try { - wmsa_qs_query_time_grpc.time(() -> { + wmsa_qs_query_time_grpc + .labels(Integer.toString(request.getQueryLimits().getTimeoutMs()), + Integer.toString(request.getQueryLimits().getResultsTotal())) + .time(() -> { var params = QueryProtobufCodec.convertRequest(request); var query = queryFactory.createQuery(params); From 6d2e14a656be8175a9662c2138e4353152023d83 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jan 2024 13:17:29 +0100 Subject: [PATCH 50/61] (build) Remove false dependency between icp and index-service This dependency causes the executor service docker image to change when the index service docker image changes. --- code/processes/index-constructor-process/build.gradle | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index d3b81107..e92db1b6 100644 --- a/code/processes/index-constructor-process/build.gradle +++ b/code/processes/index-constructor-process/build.gradle @@ -32,8 +32,6 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:domain-ranking') - implementation project(':code:services-core:index-service') - implementation libs.bundles.slf4j implementation libs.guice implementation libs.bundles.mariadb From 41ca50ff0eec3c0be38eca4ef313f6de6ce92d80 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jan 2024 13:19:59 +0100 Subject: [PATCH 51/61] (build) Enable reproducible builds in build.gradle Settings for enabling reproducible builds for all subprojects were added to improve build consistency. This includes preserving file timestamps and ordering files reproducibly. This is primarily of help for docker, since it uses hashes to determine if a file or image layer has changed.
--- build.gradle | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index d7f74837..4bda69de 100644 --- a/build.gradle +++ b/build.gradle @@ -11,8 +11,8 @@ version 'SNAPSHOT' compileJava.options.encoding = "UTF-8" compileTestJava.options.encoding = "UTF-8" -// Enable preview features for the entire project subprojects.forEach {it -> + // Enable preview features for the entire project it.tasks.withType(JavaCompile).configureEach { options.compilerArgs += ['--enable-preview'] } @@ -22,6 +22,12 @@ subprojects.forEach {it -> it.tasks.withType(Test).configureEach { jvmArgs += ['--enable-preview'] } + + // Enable reproducible builds for the entire project + it.tasks.withType(AbstractArchiveTask).configureEach { + preserveFileTimestamps = false + reproducibleFileOrder = true + } } allprojects { From edc1acbb7e3383dfb6822f05add72e4270178240 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 8 Jan 2024 15:53:13 +0100 Subject: [PATCH 52/61] (*) Replace EC_DOMAIN_LINK table with files and in-memory caching The EC_DOMAIN_LINK MariaDB table stores links between domains. This is problematic, as both updating and querying this table is very slow in relation to how small the data is (~10 GB). This slowness is largely caused by the database enforcing ACID guarantees we don't particularly need. This changeset replaces the EC_DOMAIN_LINK table with a file in each index node containing 32 bit integer pairs corresponding to links between two domains. This file is loaded in memory in each node, and can be queried via the Query Service. A migration step is needed before this file is created in each node. Until that happens, the actual data is loaded from the EC_DOMAIN_LINK table, but accessed as though it was a file. The changeset also migrates/renames the links.db file to documents.db to avoid naming confusion between the two. 
--- .../src/main/protobuf/index-api.proto | 22 ++ code/api/query-api/build.gradle | 2 + .../marginalia/query/client/QueryClient.java | 123 +++++++- code/common/linkdb/build.gradle | 2 + code/common/linkdb/readme.md | 27 +- ...inkdbReader.java => DocumentDbReader.java} | 14 +- ...inkdbWriter.java => DocumentDbWriter.java} | 17 +- .../nu/marginalia/linkdb/DomainLinkDb.java | 39 +++ .../marginalia/linkdb/DomainLinkDbLoader.java | 45 +++ .../marginalia/linkdb/DomainLinkDbWriter.java | 29 ++ .../marginalia/linkdb/FileDomainLinkDb.java | 125 +++++++++ .../nu/marginalia/linkdb/LinkdbFileNames.java | 7 + .../nu/marginalia/linkdb/SqlDomainLinkDb.java | 158 +++++++++++ .../linkdb/model/DocdbUrlDetail.java | 18 ++ .../marginalia/linkdb/model/LdbUrlDetail.java | 18 -- ...linkdb-document.sql => docdb-document.sql} | 0 ...terTest.java => DocumentDbWriterTest.java} | 12 +- .../marginalia/linkdb/DomainLinkDbTest.java | 50 ++++ .../domain-ranking/build.gradle | 2 + .../ranking/data/RankingDomainFetcher.java | 30 +- ...RankingDomainFetcherForSimilarityData.java | 16 +- .../tool/CreateBrowseDomainRanksTool.java | 71 ----- .../ranking/tool/PerusePageRankV2.java | 264 ------------------ .../ranking/tool/PrintDomainRanksTool.java | 67 ----- .../ranking/tool/UpdateDomainRanksTool.java | 85 ------ .../browse/DbBrowseDomainsSimilarCosine.java | 1 - .../browse/DbBrowseDomainsSimilarOldAlgo.java | 132 --------- .../nu/marginalia/loading/LoaderMain.java | 10 +- .../nu/marginalia/loading/LoaderModule.java | 26 +- .../documents/DocumentLoaderService.java | 19 +- .../links/DomainLinksLoaderService.java | 77 ++--- .../links/DomainLinksLoaderServiceTest.java | 176 ------------ .../build.gradle | 2 + .../adjacencies/AdjacenciesData.java | 67 ++--- .../WebsiteAdjacenciesCalculator.java | 25 +- .../command/commands/BrowseCommand.java | 2 +- .../search/results/BrowseResultCleaner.java | 3 +- .../search/svc/SearchBrowseService.java | 41 +-- .../assistant-service/build.gradle | 1 + 
.../domains/DomainInformationService.java | 21 +- .../domains/SimilarDomainsService.java | 53 ++-- .../executor-service/build.gradle | 1 + .../actor/task/ExportDataActor.java | 3 - .../java/nu/marginalia/svc/BackupService.java | 14 +- .../java/nu/marginalia/index/IndexModule.java | 58 +++- .../nu/marginalia/index/IndexService.java | 38 ++- .../index/results/IndexResultDecorator.java | 16 +- .../index/svc/IndexDomainLinksService.java | 104 +++++++ .../marginalia/index/svc/IndexOpsService.java | 1 + ...IndexQueryServiceIntegrationSmokeTest.java | 35 +-- .../svc/IndexQueryServiceIntegrationTest.java | 17 +- ...ndexQueryServiceIntegrationTestModule.java | 7 +- .../query/QueryGRPCDomainLinksService.java | 96 +++++++ .../nu/marginalia/query/QueryGRPCService.java | 82 ++---- .../marginalia/query/QueryGrpcStubPool.java | 64 +++++ .../nu/marginalia/query/QueryService.java | 2 + 56 files changed, 1261 insertions(+), 1176 deletions(-) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{LinkdbReader.java => DocumentDbReader.java} (91%) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{LinkdbWriter.java => DocumentDbWriter.java} (83%) create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbFileNames.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/DocdbUrlDetail.java delete mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java rename code/common/linkdb/src/main/resources/db/{linkdb-document.sql => 
docdb-document.sql} (100%) rename code/common/linkdb/src/test/java/nu/marginalia/linkdb/{LinkdbWriterTest.java => DocumentDbWriterTest.java} (76%) create mode 100644 code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java delete mode 100644 code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java delete mode 100644 code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java delete mode 100644 code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java delete mode 100644 code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java delete mode 100644 code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java delete mode 100644 code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java create mode 100644 code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java create mode 100644 code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCDomainLinksService.java create mode 100644 code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGrpcStubPool.java diff --git a/code/api/index-api/src/main/protobuf/index-api.proto b/code/api/index-api/src/main/protobuf/index-api.proto index a53842d4..e7b5ae69 100644 --- a/code/api/index-api/src/main/protobuf/index-api.proto +++ b/code/api/index-api/src/main/protobuf/index-api.proto @@ -4,6 +4,28 @@ package actorapi; option java_package="nu.marginalia.index.api"; option java_multiple_files=true; +service IndexDomainLinksApi { + rpc getAllLinks(Empty) returns (stream RpcDomainIdPairs) {} + rpc getLinksFromDomain(RpcDomainId) returns (RpcDomainIdList) {} + rpc getLinksToDomain(RpcDomainId) returns (RpcDomainIdList) {} + rpc countLinksFromDomain(RpcDomainId) returns 
(RpcDomainIdCount) {} + rpc countLinksToDomain(RpcDomainId) returns (RpcDomainIdCount) {} +} + +message RpcDomainId { + int32 domainId = 1; +} +message RpcDomainIdList { + repeated int32 domainId = 1 [packed=true]; +} +message RpcDomainIdCount { + int32 idCount = 1; +} +message RpcDomainIdPairs { + repeated int32 sourceIds = 1 [packed=true]; + repeated int32 destIds = 2 [packed=true]; +} + service QueryApi { rpc query(RpcQsQuery) returns (RpcQsResponse) {} } diff --git a/code/api/query-api/build.gradle b/code/api/query-api/build.gradle index 524d21df..ed893ae1 100644 --- a/code/api/query-api/build.gradle +++ b/code/api/query-api/build.gradle @@ -20,8 +20,10 @@ dependencies { implementation libs.bundles.slf4j + implementation libs.roaringbitmap implementation libs.prometheus implementation libs.notnull + implementation libs.trove implementation libs.guice implementation libs.rxjava implementation libs.gson diff --git a/code/api/query-api/src/main/java/nu/marginalia/query/client/QueryClient.java b/code/api/query-api/src/main/java/nu/marginalia/query/client/QueryClient.java index 37308576..6c6e63a4 100644 --- a/code/api/query-api/src/main/java/nu/marginalia/query/client/QueryClient.java +++ b/code/api/query-api/src/main/java/nu/marginalia/query/client/QueryClient.java @@ -2,24 +2,33 @@ package nu.marginalia.query.client; import com.google.inject.Inject; import com.google.inject.Singleton; +import gnu.trove.list.array.TIntArrayList; import io.grpc.ManagedChannel; import io.grpc.ManagedChannelBuilder; import io.prometheus.client.Summary; import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.Context; +import nu.marginalia.index.api.Empty; +import nu.marginalia.index.api.IndexDomainLinksApiGrpc; import nu.marginalia.index.api.QueryApiGrpc; +import nu.marginalia.index.api.RpcDomainId; import nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.client.model.results.SearchResultSet; import 
nu.marginalia.model.gson.GsonFactory; import nu.marginalia.query.QueryProtobufCodec; import nu.marginalia.query.model.QueryParams; import nu.marginalia.query.model.QueryResponse; +import nu.marginalia.service.descriptor.ServiceDescriptor; import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.id.ServiceId; +import org.roaringbitmap.PeekableCharIterator; +import org.roaringbitmap.longlong.PeekableLongIterator; +import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.CheckReturnValue; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -36,13 +45,15 @@ public class QueryClient extends AbstractDynamicClient { .register(); private final Map channels = new ConcurrentHashMap<>(); - private final Map queryApis = new ConcurrentHashMap<>(); + private final Map queryIndexApis = new ConcurrentHashMap<>(); + private final Map domainLinkApis = new ConcurrentHashMap<>(); record ServiceAndNode(String service, int node) { public String getHostName() { return service; } } + private ManagedChannel getChannel(ServiceAndNode serviceAndNode) { return channels.computeIfAbsent(serviceAndNode, san -> ManagedChannelBuilder @@ -52,13 +63,21 @@ public class QueryClient extends AbstractDynamicClient { } public QueryApiGrpc.QueryApiBlockingStub queryApi(int node) { - return queryApis.computeIfAbsent(new ServiceAndNode("query-service", node), n -> + return queryIndexApis.computeIfAbsent(new ServiceAndNode("query-service", node), n -> QueryApiGrpc.newBlockingStub( getChannel(n) ) ); } + public IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub domainApi(int node) { + return domainLinkApis.computeIfAbsent(new ServiceAndNode("query-service", node), n -> + IndexDomainLinksApiGrpc.newBlockingStub( + getChannel(n) + ) + ); + } + private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject @@ -66,6 +85,9 @@ public class QueryClient 
extends AbstractDynamicClient { super(descriptors.forId(ServiceId.Query), GsonFactory::get); } + public QueryClient() { + super(new ServiceDescriptor(ServiceId.Query, "query-service"), GsonFactory::get); + } /** Delegate an Index API style query directly to the index service */ @CheckReturnValue @@ -82,4 +104,101 @@ public class QueryClient extends AbstractDynamicClient { ); } + public AllLinks getAllDomainLinks() { + AllLinks links = new AllLinks(); + + domainApi(0).getAllLinks(Empty.newBuilder().build()).forEachRemaining(pairs -> { + for (int i = 0; i < pairs.getDestIdsCount(); i++) { + links.add(pairs.getSourceIds(i), pairs.getDestIds(i)); + } + }); + + return links; + } + + public List getLinksToDomain(int domainId) { + try { + return domainApi(0).getLinksToDomain(RpcDomainId + .newBuilder() + .setDomainId(domainId) + .build()) + .getDomainIdList(); + } + catch (Exception e) { + logger.error("API Exception", e); + return List.of(); + } + } + + public List getLinksFromDomain(int domainId) { + try { + return domainApi(0).getLinksFromDomain(RpcDomainId + .newBuilder() + .setDomainId(domainId) + .build()) + .getDomainIdList(); + } + catch (Exception e) { + logger.error("API Exception", e); + return List.of(); + } + } + + public int countLinksToDomain(int domainId) { + try { + return domainApi(0).countLinksToDomain(RpcDomainId + .newBuilder() + .setDomainId(domainId) + .build()) + .getIdCount(); + } + catch (Exception e) { + logger.error("API Exception", e); + return 0; + } + } + + public int countLinksFromDomain(int domainId) { + try { + return domainApi(0).countLinksFromDomain(RpcDomainId + .newBuilder() + .setDomainId(domainId) + .build()) + .getIdCount(); + } + catch (Exception e) { + logger.error("API Exception", e); + return 0; + } + } + public static class AllLinks { + private final Roaring64Bitmap sourceToDest = new Roaring64Bitmap(); + + public void add(int source, int dest) { + sourceToDest.add(Integer.toUnsignedLong(source) << 32 | 
Integer.toUnsignedLong(dest)); + } + + public Iterator iterator() { + return new Iterator(); + } + + public class Iterator { + private final PeekableLongIterator base = sourceToDest.getLongIterator(); + long val = Long.MIN_VALUE; + + public boolean advance() { + if (base.hasNext()) { + val = base.next(); + return true; + } + return false; + } + public int source() { + return (int) (val >>> 32); + } + public int dest() { + return (int) (val & 0xFFFF_FFFFL); + } + } + } } diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle index 1f223144..08cd4db0 100644 --- a/code/common/linkdb/build.gradle +++ b/code/common/linkdb/build.gradle @@ -16,6 +16,7 @@ configurations { dependencies { implementation project(':code:common:model') + implementation project(':code:common:service') implementation libs.bundles.slf4j @@ -23,6 +24,7 @@ dependencies { implementation libs.bundles.gson implementation libs.notnull + implementation libs.bundles.mariadb implementation libs.sqlite implementation libs.commons.lang3 diff --git a/code/common/linkdb/readme.md b/code/common/linkdb/readme.md index 567ec746..ab86b931 100644 --- a/code/common/linkdb/readme.md +++ b/code/common/linkdb/readme.md @@ -1,11 +1,30 @@ -The link database contains information about links, +## Domain Link Database + +The domain link database contains information about links +between domains. It is a static in-memory database loaded +from a binary file. 
+
+* [DomainLinkDb](src/main/java/nu/marginalia/linkdb/DomainLinkDb.java)
+* * [FileDomainLinkDb](src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java)
+* * [SqlDomainLinkDb](src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java)
+* [DomainLinkDbWriter](src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java)
+* [DomainLinkDbLoader](src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java)
+
+## Document Database
+
+The document database contains information about documents,
 such as their ID, their URL, their title, their description,
 and so forth.
 
-The link database is a sqlite file. The reason this information
+The document database is a sqlite file. The reason this information
 is not in the MariaDB database is that this would make updates
 to this information take effect in production immediately,
 even before the information was searchable.
 
-It is constructed by the [loading-process](../../processes/loading-process), and consumed
-by the [index-service](../../services-core/index-service).
\ No newline at end of file
+* [DocumentLinkDbWriter](src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java)
+* [DocumentLinkDbLoader](src/main/java/nu/marginalia/linkdb/DocumentDbReader.java)
+
+
+## See Also
+
+These databases are constructed by the [loading-process](../../processes/loading-process), and consumed by the [index-service](../../services-core/index-service).
\ No newline at end of file diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java similarity index 91% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java index 027b2371..6d7aefd6 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; import gnu.trove.list.TLongList; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; @@ -23,21 +23,21 @@ import java.util.ArrayList; import java.util.List; @Singleton -public class LinkdbReader { +public class DocumentDbReader { private final Path dbFile; private volatile Connection connection; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException { + public DocumentDbReader(@Named("docdb-file") Path dbFile) throws SQLException { this.dbFile = dbFile; if (Files.exists(dbFile)) { connection = createConnection(); } else { - logger.warn("No linkdb file {}", dbFile); + logger.warn("No docdb file {}", dbFile); } } @@ -107,8 +107,8 @@ public class LinkdbReader { return ret; } - public List getUrlDetails(TLongList ids) throws SQLException { - List ret = new ArrayList<>(ids.size()); + public List getUrlDetails(TLongList ids) throws SQLException { + List ret = new ArrayList<>(ids.size()); if (connection == null || connection.isClosed()) @@ -126,7 +126,7 @@ public class LinkdbReader { var rs = stmt.executeQuery(); if (rs.next()) { var url = new 
EdgeUrl(rs.getString("URL")); - ret.add(new LdbUrlDetail( + ret.add(new DocdbUrlDetail( rs.getLong("ID"), url, rs.getString("TITLE"), diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java similarity index 83% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java index fa9cad7e..88277e9d 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java @@ -1,24 +1,23 @@ package nu.marginalia.linkdb; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import java.io.IOException; import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; -import java.sql.Types; import java.util.List; -public class LinkdbWriter { +public class DocumentDbWriter { private final Connection connection; - public LinkdbWriter(Path outputFile) throws SQLException { + public DocumentDbWriter(Path outputFile) throws SQLException { String connStr = "jdbc:sqlite:" + outputFile.toString(); connection = DriverManager.getConnection(connStr); - try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql"); + try (var stream = ClassLoader.getSystemResourceAsStream("db/docdb-document.sql"); var stmt = connection.createStatement() ) { var sql = new String(stream.readAllBytes()); @@ -31,11 +30,11 @@ public class LinkdbWriter { } } - public void add(LdbUrlDetail ldbUrlDetail) throws SQLException { - add(List.of(ldbUrlDetail)); + public void add(DocdbUrlDetail docdbUrlDetail) throws SQLException { + add(List.of(docdbUrlDetail)); } - public void add(List ldbUrlDetail) throws SQLException { + public void add(List docdbUrlDetail) throws SQLException { try (var stmt = 
connection.prepareStatement(""" INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) @@ -43,7 +42,7 @@ public class LinkdbWriter { """)) { int i = 0; - for (var document : ldbUrlDetail) { + for (var document : docdbUrlDetail) { var url = document.url(); stmt.setLong(1, document.urlId()); diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java new file mode 100644 index 00000000..b9af1dea --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java @@ -0,0 +1,39 @@ +package nu.marginalia.linkdb; + +import gnu.trove.list.array.TIntArrayList; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.Arrays; + +/** A database of source-destination pairs of domain IDs. The database is loaded into memory from + * a source. The database is then kept in memory, reloading it upon switchInput(). + */ +public interface DomainLinkDb { + /** Replace the current db file with the provided file. The provided file will be deleted. + * The in-memory database MAY be updated to reflect the change. + * */ + void switchInput(Path filename) throws Exception; + + /** Find all destinations for the given source. */ + TIntArrayList findDestinations(int source); + + /** Count the number of destinations for the given source. */ + int countDestinations(int source); + + /** Find all sources for the given destination. */ + TIntArrayList findSources(int dest); + + + /** Count the number of sources for the given destination. */ + int countSources(int source); + + /** Iterate over all source-destination pairs. 
*/ + void forEach(SourceDestConsumer consumer); + + + interface SourceDestConsumer { + void accept(int source, int dest); + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java new file mode 100644 index 00000000..de8c6d96 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java @@ -0,0 +1,45 @@ +package nu.marginalia.linkdb; + +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class DomainLinkDbLoader implements AutoCloseable { + private final DataInputStream stream; + private final Path filename; + + private long nextVal; + + public DomainLinkDbLoader(Path filename) throws IOException { + this.stream = new DataInputStream(Files.newInputStream(filename)); + this.filename = filename; + } + + public int size() throws IOException { + return (int) (Files.size(filename) / 8); + } + + public boolean next() { + try { + nextVal = stream.readLong(); + return true; + } + catch (IOException ex) { + return false; + } + } + + public int getSource() { + return (int) (nextVal >>> 32); + } + public int getDest() { + return (int) (nextVal & 0xffff_ffffL); + } + + public void close() throws IOException { + stream.close(); + } + + +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java new file mode 100644 index 00000000..f275ba01 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java @@ -0,0 +1,29 @@ +package nu.marginalia.linkdb; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class DomainLinkDbWriter implements AutoCloseable { + private final DataOutputStream 
stream; + + public DomainLinkDbWriter(Path fileName) throws IOException { + this.stream = new DataOutputStream(Files.newOutputStream(fileName, + StandardOpenOption.CREATE, + StandardOpenOption.WRITE, + StandardOpenOption.TRUNCATE_EXISTING) + ); + } + + public void write(int sourceDomainId, int destDomainId) throws IOException { + stream.writeLong(Integer.toUnsignedLong(sourceDomainId) << 32 + | Integer.toUnsignedLong(destDomainId)); + } + + @Override + public void close() throws IOException { + stream.close(); + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java new file mode 100644 index 00000000..53f53417 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java @@ -0,0 +1,125 @@ +package nu.marginalia.linkdb; + +import com.google.inject.name.Named; +import gnu.trove.list.array.TIntArrayList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Arrays; + +/** Canonical DomainLinkDb implementation. The database is loaded into memory from + * a file. The database is then kept in memory, reloading it upon switchInput(). 
+ */ +public class FileDomainLinkDb implements DomainLinkDb { + private static final Logger logger = LoggerFactory.getLogger(FileDomainLinkDb.class); + private final Path filename; + private volatile long[] sourceToDest = new long[0]; + private volatile long[] destToSource = new long[0]; + + public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException { + this.filename = filename; + if (Files.exists(filename)) { + switchInput(filename); + } + } + + @Override + public void switchInput(Path newFilename) throws IOException { + Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING); + loadInput(filename); + } + + public void loadInput(Path filename) throws IOException { + try (var loader = new DomainLinkDbLoader(filename)) { + int size = loader.size(); + + var newSourceToDest = new long[size]; + var newDestToSource = new long[size]; + + int i = 0; + while (loader.next()) { + long source = loader.getSource(); + long dest = loader.getDest(); + + newSourceToDest[i] = (source << 32) | dest; + newDestToSource[i] = (dest << 32) | source; + + i++; + } + + Arrays.sort(newSourceToDest); + Arrays.sort(newDestToSource); + + sourceToDest = newSourceToDest; + destToSource = newDestToSource; + } + } + + @Override + public TIntArrayList findDestinations(int source) { + return findRelated(sourceToDest, source); + } + + @Override + public TIntArrayList findSources(int dest) { + return findRelated(destToSource, dest); + } + + @Override + public int countDestinations(int source) { + return countRelated(sourceToDest, source); + } + + @Override + public int countSources(int dest) { + return countRelated(destToSource, dest); + } + + @Override + public void forEach(SourceDestConsumer consumer) { + for (long val : sourceToDest) { + consumer.accept((int) (val >>> 32), (int) (val & 0xFFFF_FFFFL)); + } + } + + private TIntArrayList findRelated(long[] range, int key) { + long keyLong = Integer.toUnsignedLong(key) << 32; + long nextKeyLong = 
Integer.toUnsignedLong(key + 1) << 32; + + int start = Arrays.binarySearch(range, keyLong); + + if (start < 0) { + // Key is not found, get the insertion point + start = -start - 1; + } + + TIntArrayList result = new TIntArrayList(); + + for (int i = start; i < range.length && range[i] < nextKeyLong; i++) { + result.add((int) (range[i] & 0xFFFF_FFFFL)); + } + + return result; + } + + private int countRelated(long[] range, int key) { + long keyLong = Integer.toUnsignedLong(key) << 32; + long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32; + + int start = Arrays.binarySearch(range, keyLong); + + if (start < 0) { + // Key is not found, get the insertion point + start = -start - 1; + } + + int num = 0; + for (int i = start; i < range.length && range[i] < nextKeyLong; i++, num++); + return num; + } + +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbFileNames.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbFileNames.java new file mode 100644 index 00000000..a39769d2 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbFileNames.java @@ -0,0 +1,7 @@ +package nu.marginalia.linkdb; + +public class LinkdbFileNames { + public static String DEPRECATED_LINKDB_FILE_NAME = "links.db"; + public static String DOCDB_FILE_NAME = "documents.db"; + public static String DOMAIN_LINKS_FILE_NAME = "domain-links.dat"; +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java new file mode 100644 index 00000000..4a98eaa9 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java @@ -0,0 +1,158 @@ +package nu.marginalia.linkdb; + +import com.google.inject.name.Named; +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.list.array.TIntArrayList; +import gnu.trove.list.array.TLongArrayList; +import nu.marginalia.service.module.ServiceConfiguration; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Arrays; + +/** DomainLinkDb implementation that goes through the motions of + * being a File-backed DomainLinkDb, but actually uses the legacy SQL database + * for loading the data. + *

+ * This is part of the migration path to using FileDomainLinkDb. + */ +public class SqlDomainLinkDb implements DomainLinkDb { + private volatile long[] sourceToDest = new long[0]; + private volatile long[] destToSource = new long[0]; + private static final Logger logger = LoggerFactory.getLogger(SqlDomainLinkDb.class); + + private final Path filename; + private final HikariDataSource dataSource; + private final int node; + + public SqlDomainLinkDb(@Named("domain-linkdb-file") Path filename, + HikariDataSource dataSource, + ServiceConfiguration configuration) + { + this.filename = filename; + this.dataSource = dataSource; + + node = configuration.node(); + + Thread.ofPlatform().start(() -> { + try { + loadDb(); + } catch (Exception e) { + logger.error("Failed to load linkdb", e); + } + }); + } + + @Override + public void switchInput(Path newFilename) throws IOException { + Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING); + + loadDb(); + } + + public void loadDb() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement( + STR.""" + SELECT + SOURCE_DOMAIN_ID, + DEST_DOMAIN_ID + FROM EC_DOMAIN_LINK + INNER JOIN EC_DOMAIN + ON EC_DOMAIN.ID = EC_DOMAIN_LINK.SOURCE_DOMAIN_ID + WHERE NODE_AFFINITY=\{node} + """); + var rs = stmt.executeQuery()) + { + TLongArrayList sourceToDest = new TLongArrayList(10_000_000); + TLongArrayList destToSource = new TLongArrayList(10_000_000); + + while (rs.next()) { + long source = Integer.toUnsignedLong(rs.getInt(1)); + long dest = Integer.toUnsignedLong(rs.getInt(2)); + + sourceToDest.add((source << 32) | dest); + destToSource.add((dest << 32) | source); + } + + sourceToDest.sort(); + destToSource.sort(); + + this.sourceToDest = sourceToDest.toArray(); + this.destToSource = destToSource.toArray(); + } + catch (Exception ex) { + logger.error("Failed to load linkdb", ex); + } + + logger.info("LinkDB loaded, size = {}", sourceToDest.length); + } + + @Override + public TIntArrayList 
findDestinations(int source) { + return findRelated(sourceToDest, source); + } + + @Override + public TIntArrayList findSources(int dest) { + return findRelated(destToSource, dest); + } + + @Override + public int countDestinations(int source) { + return countRelated(sourceToDest, source); + } + + @Override + public int countSources(int dest) { + return countRelated(destToSource, dest); + } + + @Override + public void forEach(SourceDestConsumer consumer) { + for (long val : sourceToDest) { + consumer.accept((int) (val >>> 32), (int) (val & 0xFFFF_FFFFL)); + } + } + + private TIntArrayList findRelated(long[] range, int key) { + long keyLong = Integer.toUnsignedLong(key) << 32; + long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32; + + int start = Arrays.binarySearch(range, keyLong); + + if (start < 0) { + // Key is not found, get the insertion point + start = -start - 1; + } + + TIntArrayList result = new TIntArrayList(); + + for (int i = start; i < range.length && range[i] < nextKeyLong; i++) { + result.add((int) (range[i] & 0xFFFF_FFFFL)); + } + + return result; + } + + private int countRelated(long[] range, int key) { + long keyLong = Integer.toUnsignedLong(key) << 32; + long nextKeyLong = Integer.toUnsignedLong(key + 1) << 32; + + int start = Arrays.binarySearch(range, keyLong); + + if (start < 0) { + // Key is not found, get the insertion point + start = -start - 1; + } + + int num = 0; + for (int i = start; i < range.length && range[i] < nextKeyLong; i++, num++); + return num; + } + +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/DocdbUrlDetail.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/DocdbUrlDetail.java new file mode 100644 index 00000000..a360571b --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/DocdbUrlDetail.java @@ -0,0 +1,18 @@ +package nu.marginalia.linkdb.model; + +import nu.marginalia.model.EdgeUrl; + +public record DocdbUrlDetail(long urlId, + EdgeUrl url, + 
String title, + String description, + double urlQuality, + String format, + int features, + Integer pubYear, + long dataHash, + int wordsTotal + ) + +{ +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java deleted file mode 100644 index 9b743c9c..00000000 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java +++ /dev/null @@ -1,18 +0,0 @@ -package nu.marginalia.linkdb.model; - -import nu.marginalia.model.EdgeUrl; - -public record LdbUrlDetail(long urlId, - EdgeUrl url, - String title, - String description, - double urlQuality, - String format, - int features, - Integer pubYear, - long dataHash, - int wordsTotal - ) - -{ -} diff --git a/code/common/linkdb/src/main/resources/db/linkdb-document.sql b/code/common/linkdb/src/main/resources/db/docdb-document.sql similarity index 100% rename from code/common/linkdb/src/main/resources/db/linkdb-document.sql rename to code/common/linkdb/src/main/resources/db/docdb-document.sql diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java similarity index 76% rename from code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java rename to code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java index 598e6b67..b28b5ed4 100644 --- a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java @@ -1,7 +1,7 @@ package nu.marginalia.linkdb; import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeDomain; import org.junit.jupiter.api.Test; @@ -10,13 +10,13 @@ import java.nio.file.Files; import java.nio.file.Path; import 
java.sql.SQLException; -public class LinkdbWriterTest { +public class DocumentDbWriterTest { @Test public void testCreate() throws IOException { - Path tempPath = Files.createTempFile("linkdb", ".db"); + Path tempPath = Files.createTempFile("docdb", ".db"); try { - var writer = new LinkdbWriter(tempPath); - writer.add(new LdbUrlDetail( + var writer = new DocumentDbWriter(tempPath); + writer.add(new DocdbUrlDetail( 1, new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null), "Test", @@ -30,7 +30,7 @@ public class LinkdbWriterTest { )); writer.close(); - var reader = new LinkdbReader(tempPath); + var reader = new DocumentDbReader(tempPath); var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1})); System.out.println(deets); } catch (SQLException e) { diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java new file mode 100644 index 00000000..1014ba73 --- /dev/null +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java @@ -0,0 +1,50 @@ +package nu.marginalia.linkdb; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class DomainLinkDbTest { + Path fileName; + @BeforeEach + public void setUp() throws IOException { + fileName = Files.createTempFile("test", ".db"); + } + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); + } + + @Test + public void testWriteRead() { + try (var writer = new DomainLinkDbWriter(fileName)) { + writer.write(1, 2); + writer.write(2, 3); + writer.write(3, 4); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + + try (var reader = new DomainLinkDbLoader(fileName)) { + Assertions.assertTrue(reader.next()); 
+ Assertions.assertEquals(1, reader.getSource()); + Assertions.assertEquals(2, reader.getDest()); + Assertions.assertTrue(reader.next()); + Assertions.assertEquals(2, reader.getSource()); + Assertions.assertEquals(3, reader.getDest()); + Assertions.assertTrue(reader.next()); + Assertions.assertEquals(3, reader.getSource()); + Assertions.assertEquals(4, reader.getDest()); + Assertions.assertFalse(reader.next()); + } + catch (IOException ex) { + throw new RuntimeException(ex); + } + } +} diff --git a/code/features-index/domain-ranking/build.gradle b/code/features-index/domain-ranking/build.gradle index bfd613e9..885787eb 100644 --- a/code/features-index/domain-ranking/build.gradle +++ b/code/features-index/domain-ranking/build.gradle @@ -17,6 +17,8 @@ dependencies { implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:service') + implementation project(':code:common:service-client') + implementation project(':code:api:query-api') implementation libs.bundles.slf4j implementation libs.bundles.mariadb diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java index 2499d51f..1be9a6e2 100644 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcher.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.db.DomainBlacklistImpl; import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.query.client.QueryClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,14 +16,18 @@ import java.util.function.IntConsumer; @Singleton public class RankingDomainFetcher { protected final HikariDataSource dataSource; 
+ private final QueryClient queryClient; protected final DomainBlacklistImpl blacklist; protected final Logger logger = LoggerFactory.getLogger(getClass()); protected boolean getNames = false; @Inject - public RankingDomainFetcher(HikariDataSource dataSource, DomainBlacklistImpl blacklist) { + public RankingDomainFetcher(HikariDataSource dataSource, + QueryClient queryClient, + DomainBlacklistImpl blacklist) { this.dataSource = dataSource; + this.queryClient = queryClient; this.blacklist = blacklist; } @@ -33,10 +38,10 @@ public class RankingDomainFetcher { public void getDomains(Consumer consumer) { String query; if (getNames) { - query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID"; } else { - query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID"; + query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE NODE_AFFINITY>0 GROUP BY EC_DOMAIN.ID"; } getDomains(query, consumer); @@ -77,23 +82,14 @@ public class RankingDomainFetcher { } public void eachDomainLink(DomainLinkConsumer consumer) { - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareStatement("SELECT 
SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) - { - stmt.setFetchSize(10000); - var rsp = stmt.executeQuery(); + var allLinks = queryClient.getAllDomainLinks(); + var iter = allLinks.iterator(); - while (rsp.next()) { - int src = rsp.getInt(1); - int dst = rsp.getInt(2); - - consumer.accept(src, dst); - } - } - catch (SQLException ex) { - logger.error("Failed to fetch domain links", ex); + while (iter.advance()) { + consumer.accept(iter.source(), iter.dest()); } + } public void domainsByPattern(String pattern, IntConsumer idConsumer) { diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java index eccb87ad..ae801166 100644 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.db.DomainBlacklistImpl; +import nu.marginalia.query.client.QueryClient; import org.slf4j.LoggerFactory; import java.sql.SQLException; @@ -14,8 +15,8 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher final boolean hasData; @Inject - public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, DomainBlacklistImpl blacklist) { - super(dataSource, blacklist); + public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, QueryClient queryClient, DomainBlacklistImpl blacklist) { + super(dataSource, queryClient, blacklist); hasData = isDomainNeighborTablePopulated(dataSource); } @@ -61,17 +62,6 @@ public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher } public void 
getDomains(Consumer consumer) { -// String query = -// """ -// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) -// FROM EC_DOMAIN -// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID -// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID -// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID -// GROUP BY EC_DOMAIN.ID -// HAVING COUNT(SOURCE_DOMAIN_ID)>5 -// """; - String query; if (getNames) { query = diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java deleted file mode 100644 index 17b2e195..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/CreateBrowseDomainRanksTool.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.ranking.tool; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.ranking.StandardPageRank; -import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; -import nu.marginalia.service.module.DatabaseModule; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.concurrent.LinkedBlockingQueue; - -public class CreateBrowseDomainRanksTool { - - private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class); - - - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @SneakyThrows - public static void main(String... 
args) { - Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - long start = System.currentTimeMillis(); - var uploader = new Thread(() -> uploadThread(conn), "Uploader"); - - logger.info("Ranking"); - var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds)); - var rpr = new StandardPageRank(domains, args); - - uploader.start(); - - var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new); - - rankData.forEach(i -> { - try { - uploadQueue.put(i); - } catch (InterruptedException e) { - e.printStackTrace(); - } - return true; - }); - - long end = System.currentTimeMillis(); - running = false; - uploader.join(); - - logger.info("Done in {}", (end - start)/1000.0); - } - - public static void uploadThread(HikariDataSource dataSource) { - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) { - while (running || (!running && !uploadQueue.isEmpty())) { - var job = uploadQueue.take(); - stmt.setInt(1, job); - stmt.executeUpdate(); - } - } - } catch (SQLException | InterruptedException throwables) { - throwables.printStackTrace(); - } - } -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java deleted file mode 100644 index be64a4e2..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PerusePageRankV2.java +++ /dev/null @@ -1,264 +0,0 @@ -package nu.marginalia.ranking.tool; - - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntIntHashMap; -import gnu.trove.map.hash.TIntObjectHashMap; -import 
it.unimi.dsi.fastutil.ints.IntArrays; -import it.unimi.dsi.fastutil.ints.IntComparator; -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import nu.marginalia.ranking.RankingAlgorithm; -import nu.marginalia.ranking.data.RankingDomainData; -import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.service.module.DatabaseModule; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.Arrays; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.stream.IntStream; - -public class PerusePageRankV2 { - - final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); - final TIntIntHashMap domainIndexToId = new TIntIntHashMap(); - final TIntIntHashMap domainIdToIndex = new TIntIntHashMap(); - - TIntArrayList[] linkDataSrc2Dest; - TIntArrayList[] linkDataDest2Src; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - public int indexMax() { - return domainIndexToId.size(); - } - - public int getDomainId(int idx) { - return domainIndexToId.get(idx); - } - - @SneakyThrows - public static void main(String... 
args) { - var ds = new DatabaseModule().provideConnection(); - var blacklist = new DomainBlacklistImpl(ds); - var rank = new PerusePageRankV2(new RankingDomainFetcher(ds, blacklist)); - - long start = System.currentTimeMillis(); - var uploader = new Thread(() -> uploadThread(ds)); - uploader.start(); - - IntStream.range(0, rank.indexMax()).parallel().forEach(i -> { - int[] ids = rank.pageRank(i, 25).toArray(); - try { - uploadQueue.put(new LinkAdjacencies(rank.getDomainId(i), ids)); - } catch (InterruptedException e) { - e.printStackTrace(); - } - }); - - long end = System.currentTimeMillis(); - running = false; - uploader.join(); - System.out.printf("%2.2f", (end - start)/1000.0); - } - - @AllArgsConstructor - static class LinkAdjacencies { - public final int id; - public final int[] neighbors; - } - - public static void uploadThread(HikariDataSource dataSource) { - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN_NEIGHBORS(DOMAIN_ID, NEIGHBOR_ID, ADJ_IDX) VALUES (?,?,?) 
ON DUPLICATE KEY UPDATE NEIGHBOR_ID=VALUES(NEIGHBOR_ID)")) { - while (running || (!running && !uploadQueue.isEmpty())) { - var job = uploadQueue.take(); - for (int i = 0; i < job.neighbors.length; i++) { - stmt.setInt(1, job.id); - stmt.setInt(2, job.neighbors[i]); - stmt.setInt(3, i); - stmt.addBatch(); - } - stmt.executeBatch(); - } - } - } catch (SQLException | InterruptedException throwables) { - throwables.printStackTrace(); - } - } - - public PerusePageRankV2(RankingDomainFetcher domainFetcher) { - - domainFetcher.getDomains(domainData -> { - int id = domainData.id; - - domainsById.put(id, domainData); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - }); - domainFetcher.getPeripheralDomains(domainData -> { - int id = domainData.id; - - domainsById.put(id, domainData); - - domainIndexToId.put(domainIndexToId.size(), id); - domainIdToIndex.put(id, domainIdToIndex.size()); - }); - - linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()]; - linkDataDest2Src = new TIntArrayList[domainIndexToId.size()]; - - domainFetcher.eachDomainLink((src, dst) -> { - if (src == dst) return; - - if (domainsById.contains(src) && domainsById.contains(dst)) { - - int srcIdx = domainIdToIndex.get(src); - int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias()); - - if (linkDataSrc2Dest[srcIdx] == null) { - linkDataSrc2Dest[srcIdx] = new TIntArrayList(); - } - linkDataSrc2Dest[srcIdx].add(dstIdx); - - if (linkDataDest2Src[dstIdx] == null) { - linkDataDest2Src[dstIdx] = new TIntArrayList(); - } - linkDataDest2Src[dstIdx].add(srcIdx); - } - }); - } - - public TIntList pageRank(int origin, int resultCount) { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 10; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - - newRank.increment(origin, 
dNorm/oldNorm); - - rank = newRank; - } - - rank.increment(origin, -1); - - return rank.getRanking(resultCount); - } - - @NotNull - private RankVector createNewRankVector(RankVector rank) { - - double rankNorm = rank.norm(); - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - double newRankValue = 0; - - if (links != null && links.size() > 0) { - - - for (int j = 0; j < links.size(); j++) { - var revLinks = linkDataDest2Src[links.getQuick(j)]; - newRankValue += rank.get(links.getQuick(j)) / revLinks.size(); - } - } - - newRank.set(domainId, 0.85*newRankValue/rankNorm); - } - - return newRank; - } - - public class RankVector { - private final double[] rank; - public RankVector(double defaultValue) { - rank = new double[domainIndexToId.size()]; - if (defaultValue != 0.) { - Arrays.fill(rank, defaultValue); - } - } - - public void set(int id, double value) { - rank[id] = value; - } - - public void increment(int id, double value) { - rank[id] += value; - } - - public double get(int id) { - if (id >= rank.length) return 0.; - - return rank[id]; - } - - public double norm() { - double v = 0.; - for (int i = 0; i < rank.length; i++) { - if (rank[i] > 0) { v+=rank[i]; } - else { v -= rank[i]; } - } - return v; - } - - public double norm(RankingAlgorithm.RankVector other) { - double v = 0.; - for (int i = 0; i < rank.length; i++) { - double dv = rank[i] - other.get(i); - - if (dv > 0) { v+=dv; } - else { v -= dv; } - } - return v; - } - - public TIntList getRanking(int numResults) { - if (numResults < 0) { - numResults = domainIdToIndex.size(); - } - TIntArrayList list = new TIntArrayList(numResults); - - int[] nodes = new int[rank.length]; - Arrays.setAll(nodes, i->i); - IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]); - IntArrays.quickSort(nodes, comp); - - int i; - - for (i = 0; i < numResults; i++) { - int id = domainIndexToId.get(nodes[i]); 
- - if (!domainsById.get(id).isAlias()) - list.add(id); - } - - for (; i < nodes.length && domainsById.size() < numResults; i++) { - int id = domainIndexToId.get(nodes[i]); - - if (!domainsById.get(id).isAlias()) - list.add(id); - } - - - return list; - } - } - -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java deleted file mode 100644 index 9877f393..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/PrintDomainRanksTool.java +++ /dev/null @@ -1,67 +0,0 @@ -package nu.marginalia.ranking.tool; - -import lombok.SneakyThrows; -import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.ranking.data.RankingDomainFetcher; -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.ranking.StandardPageRank; -import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; -import nu.marginalia.service.module.DatabaseModule; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.atomic.AtomicInteger; - -public class PrintDomainRanksTool { - - private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class); - - private volatile static int rankMax; - - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @SneakyThrows - public static void main(String... 
args) { - Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - long start = System.currentTimeMillis(); - - logger.info("Ranking"); - var ds = new DatabaseModule().provideConnection(); - - RankingDomainFetcher domains; - if (Boolean.getBoolean("use-link-data")) { - domains = new RankingDomainFetcher(ds, new DomainBlacklistImpl(ds)); - domains.retainNames(); - } - else { - domains = new RankingDomainFetcherForSimilarityData(ds, new DomainBlacklistImpl(ds)); - domains.retainNames(); - } - - var rpr = new StandardPageRank(domains, args); - - rankMax = rpr.size(); - - var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); - - AtomicInteger cnt = new AtomicInteger(); - rankData.forEach(i -> { - - var data = rpr.getDomainData(i); - - System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state); - return true; - }); - - long end = System.currentTimeMillis(); - running = false; - - logger.info("Done in {}", (end - start)/1000.0); - } - -} diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java deleted file mode 100644 index 7e57bc8a..00000000 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/tool/UpdateDomainRanksTool.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.ranking.tool; - -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.ranking.StandardPageRank; -import nu.marginalia.ranking.accumulator.RankingResultListAccumulator; -import nu.marginalia.ranking.data.RankingDomainFetcherForSimilarityData; - -import nu.marginalia.db.DomainBlacklistImpl; -import nu.marginalia.service.module.DatabaseModule; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import 
java.util.concurrent.LinkedBlockingQueue; - -public class UpdateDomainRanksTool { - - private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class); - - private volatile static int rankMax; - - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @SneakyThrows - public static void main(String... args) { - Driver driver = new Driver(); - var conn = new DatabaseModule().provideConnection(); - - long start = System.currentTimeMillis(); - var uploader = new Thread(() -> uploadThread(conn), "Uploader"); - - logger.info("Ranking"); - var domains = new RankingDomainFetcherForSimilarityData(conn, new DomainBlacklistImpl(conn)); - var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); - - rankMax = rpr.size(); - uploader.start(); - - var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); - - rankData.forEach(i -> { - try { - uploadQueue.put(i); - } catch (InterruptedException e) { - e.printStackTrace(); - } - return true; - }); - - long end = System.currentTimeMillis(); - running = false; - uploader.join(); - - logger.info("Done in {}", (end - start)/1000.0); - } - - public static void uploadThread(HikariDataSource dataSource) { - int i = 0; - - try (var conn = dataSource.getConnection()) { - logger.info("Resetting rank"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) { - stmt.executeUpdate(); - } - - logger.info("Updating ranks"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? 
WHERE ID=?")) { - while (running || (!running && !uploadQueue.isEmpty())) { - var job = uploadQueue.take(); - stmt.setDouble(1, i++ / (double) rankMax); - stmt.setInt(2, job); - stmt.executeUpdate(); - } - } - - logger.info("Recalculating quality"); - - } catch (SQLException | InterruptedException throwables) { - throwables.printStackTrace(); - } - } -} diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java index 63a276a2..f75a87de 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -3,7 +3,6 @@ package nu.marginalia.browse; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.set.hash.TIntHashSet; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklist; diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java deleted file mode 100644 index bf155040..00000000 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java +++ /dev/null @@ -1,132 +0,0 @@ -package nu.marginalia.browse; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.db.DomainBlacklist; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.*; - -@Singleton 
-public class DbBrowseDomainsSimilarOldAlgo { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final HikariDataSource dataSource; - - @Inject - public DbBrowseDomainsSimilarOldAlgo(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - public List getDomainNeighborsAdjacent(int domainId, DomainBlacklist blacklist, int count) { - final Set domains = new HashSet<>(count*3); - - final String q = """ - SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT, INDEXED - FROM EC_DOMAIN_NEIGHBORS - INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID - WHERE - STATE<2 - AND KNOWN_URLS<1000 - AND DOMAIN_ALIAS IS NULL - AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? - GROUP BY EC_DOMAIN.ID - HAVING CNT < 100 - ORDER BY ADJ_IDX - LIMIT ? - """; - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement(q)) { - stmt.setFetchSize(count); - stmt.setInt(1, domainId); - stmt.setInt(2, count); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); - } - } - } - - if (domains.size() < count/2) { - final String q2 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID - WHERE B.SOURCE_DOMAIN_ID=? 
- AND STATE<2 - AND KNOWN_URLS<1000 - AND DOMAIN_ALIAS IS NULL - GROUP BY EC_DOMAIN.ID - HAVING COUNT(*) < 100 ORDER BY RANK ASC LIMIT ?"""; - try (var stmt = connection.prepareStatement(q2)) { - - stmt.setFetchSize(count/2); - stmt.setInt(1, domainId); - stmt.setInt(2, count/2 - domains.size()); - var rsp = stmt.executeQuery(); - while (rsp.next() && domains.size() < count/2) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); - } - } - } - } - - if (domains.size() < count/2) { - final String q3 = """ - SELECT EC_DOMAIN.ID, DOMAIN_NAME, INDEXED - FROM EC_DOMAIN - INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID - INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID - INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID - WHERE B.DEST_DOMAIN_ID=? - AND STATE<2 - AND KNOWN_URLS<1000 - AND DOMAIN_ALIAS IS NULL - GROUP BY EC_DOMAIN.ID - HAVING COUNT(*) < 100 - ORDER BY RANK ASC - LIMIT ?"""; - try (var stmt = connection.prepareStatement(q3)) { - stmt.setFetchSize(count/2); - stmt.setInt(1, domainId); - stmt.setInt(2, count/2 - domains.size()); - - var rsp = stmt.executeQuery(); - while (rsp.next() && domains.size() < count/2) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, rsp.getBoolean("INDEXED"))); - } - } - } - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - - - return new ArrayList<>(domains); - } - - -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 98e66c5f..a91678d8 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -9,7 +9,7 @@ import lombok.SneakyThrows; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.linkdb.LinkdbWriter; +import nu.marginalia.linkdb.DocumentDbWriter; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; import nu.marginalia.loading.domains.DomainIdRegistry; @@ -43,7 +43,7 @@ public class LoaderMain { private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; - private final LinkdbWriter linkdbWriter; + private final DocumentDbWriter documentDbWriter; private final LoaderIndexJournalWriter journalWriter; private final DomainLoaderService domainService; private final DomainLinksLoaderService linksService; @@ -77,7 +77,7 @@ public class LoaderMain { public LoaderMain(ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, - LinkdbWriter linkdbWriter, + DocumentDbWriter documentDbWriter, LoaderIndexJournalWriter journalWriter, DomainLoaderService domainService, DomainLinksLoaderService linksService, @@ -90,7 +90,7 @@ public class LoaderMain { this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; - this.linkdbWriter = linkdbWriter; + this.documentDbWriter = documentDbWriter; this.journalWriter = journalWriter; this.domainService = domainService; this.linksService = linksService; @@ -132,7 +132,7 @@ public class LoaderMain { } finally { journalWriter.close(); - linkdbWriter.close(); + documentDbWriter.close(); heartbeat.shutDown(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index abd1d08a..1ba5d9ca 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -9,8 +9,9 @@ import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; import nu.marginalia.IndexLocations; +import nu.marginalia.linkdb.DomainLinkDbWriter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.linkdb.LinkdbWriter; +import nu.marginalia.linkdb.DocumentDbWriter; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; @@ -20,6 +21,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; +import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME; + public class LoaderModule extends AbstractModule { public LoaderModule() { @@ -34,14 +38,26 @@ public class LoaderModule extends AbstractModule { } @Inject @Provides @Singleton - private LinkdbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException { - - Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve("links.db"); + private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException { + // Migrate + Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME); if (Files.exists(dbPath)) { Files.delete(dbPath); } - return new LinkdbWriter(dbPath); + return new DocumentDbWriter(dbPath); + } + + @Inject @Provides @Singleton + private DomainLinkDbWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException { + + Path dbPath = 
IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME); + + if (Files.exists(dbPath)) { + Files.delete(dbPath); + } + + return new DomainLinkDbWriter(dbPath); } private Gson createGson() { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java index b0c86dcc..bed93d7e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -4,9 +4,8 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.io.processed.ProcessedDataFileNames; -import nu.marginalia.linkdb.LinkdbWriter; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.EdgeUrl; @@ -26,11 +25,11 @@ import java.util.List; public class DocumentLoaderService { private static final Logger logger = LoggerFactory.getLogger(DocumentLoaderService.class); - private final LinkdbWriter linkdbWriter; + private final DocumentDbWriter documentDbWriter; @Inject - public DocumentLoaderService(LinkdbWriter linkdbWriter) { - this.linkdbWriter = linkdbWriter; + public DocumentLoaderService(DocumentDbWriter documentDbWriter) { + this.documentDbWriter = documentDbWriter; } public boolean loadDocuments( @@ -73,7 +72,7 @@ public class DocumentLoaderService { class LinkdbLoader implements AutoCloseable { private final DomainIdRegistry domainIdRegistry; - private final List details = new ArrayList<>(1000); + private final List details = 
new ArrayList<>(1000); LinkdbLoader(DomainIdRegistry domainIdRegistry) { this.domainIdRegistry = domainIdRegistry; @@ -88,7 +87,7 @@ public class DocumentLoaderService { projection.ordinal ); - details.add(new LdbUrlDetail( + details.add(new DocdbUrlDetail( urlId, new EdgeUrl(projection.url), projection.title, @@ -102,7 +101,7 @@ public class DocumentLoaderService { )); if (details.size() > 100) { - linkdbWriter.add(details); + documentDbWriter.add(details); details.clear(); } @@ -111,7 +110,7 @@ public class DocumentLoaderService { @Override public void close() throws SQLException { if (!details.isEmpty()) { - linkdbWriter.add(details); + documentDbWriter.add(details); } } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java index 6f3a6d8f..272b3936 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -2,10 +2,9 @@ package nu.marginalia.loading.links; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; +import lombok.SneakyThrows; import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; -import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.linkdb.DomainLinkDbWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.processed.DomainLinkRecord; @@ -15,28 +14,22 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.sql.Connection; -import java.sql.PreparedStatement; -import java.sql.SQLException; @Singleton public class DomainLinksLoaderService { - private final 
HikariDataSource dataSource; private static final Logger logger = LoggerFactory.getLogger(DomainLinksLoaderService.class); - private final int nodeId; + + private final DomainLinkDbWriter domainLinkDbWriter; + @Inject - public DomainLinksLoaderService(HikariDataSource dataSource, - ProcessConfiguration processConfiguration) { - this.dataSource = dataSource; - this.nodeId = processConfiguration.node(); + public DomainLinksLoaderService(DomainLinkDbWriter domainLinkDbWriter) { + this.domainLinkDbWriter = domainLinkDbWriter; } public boolean loadLinks(DomainIdRegistry domainIdRegistry, ProcessHeartbeat heartbeat, - LoaderInputData inputData) throws IOException, SQLException { - - dropLinkData(); + LoaderInputData inputData) throws IOException { try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS")) { var linkFiles = inputData.listDomainLinkFiles(); @@ -56,17 +49,7 @@ public class DomainLinksLoaderService { return true; } - private void dropLinkData() throws SQLException { - logger.info("Clearing EC_DOMAIN_LINK"); - - try (var conn = dataSource.getConnection(); - var call = conn.prepareCall("CALL PURGE_LINKS_TABLE(?)")) { - call.setInt(1, nodeId); - call.executeUpdate(); - } - } - - private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException, SQLException { + private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException { try (var domainStream = DomainLinkRecordParquetFileReader.stream(file); var linkLoader = new LinkLoader(domainIdRegistry)) { @@ -76,49 +59,21 @@ public class DomainLinksLoaderService { } class LinkLoader implements AutoCloseable { - private final Connection connection; - private final PreparedStatement insertStatement; private final DomainIdRegistry domainIdRegistry; - private int batchSize = 0; - private int total = 0; - - public LinkLoader(DomainIdRegistry domainIdRegistry) throws SQLException { + public LinkLoader(DomainIdRegistry domainIdRegistry) { this.domainIdRegistry = 
domainIdRegistry; - - connection = dataSource.getConnection(); - insertStatement = connection.prepareStatement(""" - INSERT IGNORE INTO EC_DOMAIN_LINK(SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) - VALUES (?, ?) - """); } + @SneakyThrows void accept(DomainLinkRecord record) { - try { - insertStatement.setInt(1, domainIdRegistry.getDomainId(record.source)); - insertStatement.setInt(2, domainIdRegistry.getDomainId(record.dest)); - insertStatement.addBatch(); - if (++batchSize > 1000) { - batchSize = 0; - insertStatement.executeBatch(); - } - total++; - } - catch (SQLException ex) { - throw new RuntimeException(ex); - } + domainLinkDbWriter.write( + domainIdRegistry.getDomainId(record.source), + domainIdRegistry.getDomainId(record.dest) + ); } @Override - public void close() throws SQLException { - if (batchSize > 0) { - insertStatement.executeBatch(); - } - - logger.info("Inserted {} links", total); - - insertStatement.close(); - connection.close(); - } + public void close() {} } } diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java deleted file mode 100644 index 9852b630..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loading/links/DomainLinksLoaderServiceTest.java +++ /dev/null @@ -1,176 +0,0 @@ -package nu.marginalia.loading.links; - -import com.google.common.collect.Lists; -import com.zaxxer.hikari.HikariConfig; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; -import nu.marginalia.io.processed.DomainRecordParquetFileWriter; -import nu.marginalia.io.processed.ProcessedDataFileNames; -import nu.marginalia.loader.DbTestUtil; -import nu.marginalia.loading.LoaderInputData; -import nu.marginalia.loading.domains.DomainLoaderService; -import 
nu.marginalia.model.processed.DomainLinkRecord; -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat; -import nu.marginalia.process.control.ProcessHeartbeat; -import org.junit.jupiter.api.*; -import org.mockito.Mockito; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.*; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -@Tag("slow") -@Testcontainers -@Disabled // Error in the SQL loading mechanism, we don't deal with DELIMITER correctly - // which means we can't get around flyway's bugs necessitating DELIMITER. -class DomainLinksLoaderServiceTest { - List toDelete = new ArrayList<>(); - ProcessHeartbeat heartbeat; - - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - - @BeforeEach - public void setUp() { - - HikariConfig config = new HikariConfig(); - config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); - config.setUsername("wmsa"); - config.setPassword("wmsa"); - - dataSource = new HikariDataSource(config); - - List migrations = List.of( - "db/migration/V23_11_0_007__domain_node_affinity.sql", - "db/migration/V23_11_0_008__purge_procedure.sql" - ); - for (String migration : migrations) { - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(migration), - "Could not load migration script " + migration); - var conn = dataSource.getConnection(); - var stmt = conn.createStatement() - ) { - String script = new String(resource.readAllBytes()); - String[] cmds = 
script.split("\\s*;\\s*"); - for (String cmd : cmds) { - if (cmd.isBlank()) - continue; - System.out.println(cmd); - stmt.executeUpdate(cmd); - } - } catch (IOException | SQLException ex) { - - } - } - - heartbeat = Mockito.mock(ProcessHeartbeat.class); - - Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn( - Mockito.mock(ProcessAdHocTaskHeartbeat.class) - ); - } - - @AfterEach - public void tearDown() throws IOException { - for (var path : Lists.reverse(toDelete)) { - Files.deleteIfExists(path); - } - - toDelete.clear(); - dataSource.close(); - } - - @Test - public void test() throws IOException, SQLException { - Path workDir = Files.createTempDirectory(getClass().getSimpleName()); - Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0); - Path parquetFile2 = ProcessedDataFileNames.domainLinkFileName(workDir, 0); - Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 1); - - toDelete.add(workDir); - toDelete.add(parquetFile1); - toDelete.add(parquetFile2); - toDelete.add(parquetFile3); - - List domains1 = List.of("www.marginalia.nu", "search.marginalia.nu"); - List linkDomains1 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com"); - List linkDomains2 = List.of("maya.land", "xkcd.com", "aaronsw.com"); - - try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) { - for (var domain : domains1) { - pw.write(dr(domain)); - } - } - try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile2)) { - for (var domain : linkDomains1) { - pw.write(dl("www.marginalia.nu", domain)); - } - } - try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) { - for (var domain : linkDomains2) { - pw.write(dl("search.marginalia.nu", domain)); - } - } - - try (var dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - var conn = dataSource.getConnection(); - var query = conn.prepareStatement(""" - SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK - """) - ) { - var 
domainService = new DomainLoaderService(dataSource, new ProcessConfiguration("test", 1, UUID.randomUUID())); - var input = new LoaderInputData(workDir, 2); - var domainRegistry = domainService.getOrCreateDomainIds(input); - - var dls = new DomainLinksLoaderService(dataSource, new ProcessConfiguration("test", 1, UUID.randomUUID())); - dls.loadLinks(domainRegistry, heartbeat, input); - - Map> expected = new HashMap<>(); - Map> actual = new HashMap<>(); - expected.put(domainRegistry.getDomainId("www.marginalia.nu"), new HashSet<>()); - expected.put(domainRegistry.getDomainId("search.marginalia.nu"), new HashSet<>()); - - for (var domain : linkDomains1) { - expected.get(domainRegistry.getDomainId("www.marginalia.nu")).add(domainRegistry.getDomainId(domain)); - } - for (var domain : linkDomains2) { - expected.get(domainRegistry.getDomainId("search.marginalia.nu")).add(domainRegistry.getDomainId(domain)); - } - - var rs = query.executeQuery(); - while (rs.next()) { - actual.computeIfAbsent(rs.getInt(1), k -> new HashSet<>()) - .add(rs.getInt(2)); - } - - assertEquals(expected, actual); - - } - - - } - - private DomainRecord dr(String domainName) { - return new DomainRecord(domainName, 0, 0, 0, null, null, null, null); - } - - private DomainLinkRecord dl(String sourceDomainName, String destDomainName) { - return new DomainLinkRecord(sourceDomainName, destDomainName); - } -} \ No newline at end of file diff --git a/code/processes/website-adjacencies-calculator/build.gradle b/code/processes/website-adjacencies-calculator/build.gradle index 94378fff..479c2744 100644 --- a/code/processes/website-adjacencies-calculator/build.gradle +++ b/code/processes/website-adjacencies-calculator/build.gradle @@ -21,7 +21,9 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:db') implementation project(':code:common:process') + implementation project(':code:common:service-client') implementation project(':code:common:service') + implementation 
project(':code:api:query-api') implementation libs.bundles.slf4j diff --git a/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AdjacenciesData.java b/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AdjacenciesData.java index 4ddc087b..61c2ceee 100644 --- a/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AdjacenciesData.java +++ b/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AdjacenciesData.java @@ -1,26 +1,25 @@ package nu.marginalia.adjacencies; -import com.zaxxer.hikari.HikariDataSource; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.query.client.QueryClient; import org.roaringbitmap.RoaringBitmap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import java.sql.ResultSet; -import java.sql.SQLException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; public class AdjacenciesData { - TIntList idsList = new TIntArrayList(100_000); - ArrayList itemsList = new ArrayList<>(100_000); + private static final Logger logger = LoggerFactory.getLogger(AdjacenciesData.class); + private final TIntList idsList = new TIntArrayList(100_000); + private final ArrayList itemsList = new ArrayList<>(100_000); - TIntObjectHashMap dToSMap = new TIntObjectHashMap<>(100_000); - TIntObjectHashMap sToDMap = new TIntObjectHashMap<>(100_000); - - RoaringBitmap indexed = new RoaringBitmap(); + private final TIntObjectHashMap dToSMap = new TIntObjectHashMap<>(100_000); + private final TIntObjectHashMap sToDMap = new TIntObjectHashMap<>(100_000); public TIntHashSet getCandidates(SparseBitVector vec) { TIntHashSet ret = new TIntHashSet(); @@ -36,39 +35,31 @@ public class AdjacenciesData { return ret; } - public AdjacenciesData(HikariDataSource dataSource, DomainAliases 
aliases) throws SQLException { + public AdjacenciesData(QueryClient queryClient, + DomainAliases aliases) { + logger.info("Loading adjacency data"); Map tmpMapDtoS = new HashMap<>(100_000); - try ( - var conn = dataSource.getConnection(); - var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0"); - var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) { - ResultSet rsp; - indexedStmt.setFetchSize(10_000); - rsp = indexedStmt.executeQuery(); - while (rsp.next()) { - indexed.add(rsp.getInt(1)); + int count = 0; + var allLinks = queryClient.getAllDomainLinks(); + for (var iter = allLinks.iterator();;count++) { + if (!iter.advance()) { + break; } + int source = aliases.deAlias(iter.source()); + int dest = aliases.deAlias(iter.dest()); - linksStmt.setFetchSize(10_000); - rsp = linksStmt.executeQuery(); - while (rsp.next()) { - int source = aliases.deAlias(rsp.getInt(1)); - int dest = aliases.deAlias(rsp.getInt(2)); - - tmpMapDtoS.computeIfAbsent(dest, this::createBitmapWithSelf).add(source); - - - RoaringBitmap sToDEntry = sToDMap.get(source); - if (sToDEntry == null) { - sToDEntry = new RoaringBitmap(); - sToDMap.put(source, sToDEntry); - sToDEntry.add(source); - } - sToDEntry.add(dest); + tmpMapDtoS.computeIfAbsent(dest, this::createBitmapWithSelf).add(source); + RoaringBitmap sToDEntry = sToDMap.get(source); + if (sToDEntry == null) { + sToDEntry = new RoaringBitmap(); + sToDMap.put(source, sToDEntry); + sToDEntry.add(source); } + sToDEntry.add(dest); } + logger.info("Links loaded: {}", count); tmpMapDtoS.entrySet().stream() .filter(e -> isEligible(e.getValue())) @@ -79,10 +70,10 @@ public class AdjacenciesData { dToSMap.put(e.getKey(), val); }); + logger.info("All adjacency dat loaded"); } private boolean isEligible(RoaringBitmap value) { -// return true; int cardinality = value.getCardinality(); return cardinality < 10000; @@ -95,10 +86,6 @@ public class AdjacenciesData { return bm; } - 
public boolean isIndexedDomain(int domainId) { - return indexed.contains(domainId); - } - public TIntList getIdsList() { return idsList; } diff --git a/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java b/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java index 258982df..fbda4856 100644 --- a/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java +++ b/code/processes/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java @@ -7,7 +7,10 @@ import nu.marginalia.db.DbDomainQueries; import nu.marginalia.model.EdgeDomain; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.query.client.QueryClient; import nu.marginalia.service.module.DatabaseModule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; @@ -23,13 +26,14 @@ public class WebsiteAdjacenciesCalculator { private final HikariDataSource dataSource; public AdjacenciesData adjacenciesData; public DomainAliases domainAliases; + private static final Logger logger = LoggerFactory.getLogger(WebsiteAdjacenciesCalculator.class); float[] weights; - public WebsiteAdjacenciesCalculator(HikariDataSource dataSource) throws SQLException { + public WebsiteAdjacenciesCalculator(QueryClient queryClient, HikariDataSource dataSource) throws SQLException { this.dataSource = dataSource; domainAliases = new DomainAliases(dataSource); - adjacenciesData = new AdjacenciesData(dataSource, domainAliases); + adjacenciesData = new AdjacenciesData(queryClient, domainAliases); weights = adjacenciesData.getWeights(); } @@ -47,7 +51,6 @@ public class WebsiteAdjacenciesCalculator { for (int domainId : domainIds) { findAdjacentDtoS(domainId, similarities -> { 
for (var similarity : similarities.similarities()) { - if (adjacenciesData.isIndexedDomain(similarity.domainId)) System.out.print("*"); System.out.println(dataStoreDao.getDomain(similarity.domainId).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value)); } }); @@ -186,8 +189,9 @@ public class WebsiteAdjacenciesCalculator { DatabaseModule dm = new DatabaseModule(); var dataSource = dm.provideConnection(); + var qc = new QueryClient(); - var main = new WebsiteAdjacenciesCalculator(dataSource); + var main = new WebsiteAdjacenciesCalculator(qc, dataSource); if (args.length == 1 && "load".equals(args[0])) { var processHeartbeat = new ProcessHeartbeatImpl( @@ -195,9 +199,16 @@ public class WebsiteAdjacenciesCalculator { dataSource ); - processHeartbeat.start(); - main.loadAll(processHeartbeat); - processHeartbeat.shutDown(); + try { + processHeartbeat.start(); + main.loadAll(processHeartbeat); + } + catch (Exception ex) { + logger.error("Failed to load", ex); + } + finally { + processHeartbeat.shutDown(); + } return; } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java index 3099c29d..aa6b19ea 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java @@ -66,7 +66,7 @@ public class BrowseCommand implements SearchCommandInterface { return browseService.getRandomEntries(set); } else { - return browseService.getRelatedEntries(word); + return browseService.getRelatedEntries(ctx, word); } } catch (Exception ex) { diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java 
b/code/services-application/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java index 71a6ad43..7ebefb55 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java @@ -2,6 +2,7 @@ package nu.marginalia.search.results; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.screenshot.ScreenshotService; @@ -18,7 +19,7 @@ public class BrowseResultCleaner { this.screenshotService = screenshotService; } - public Predicate shouldRemoveResultPredicate() { + public Predicate shouldRemoveResultPredicateBr() { Set domainHashes = new HashSet<>(100); return (res) -> !screenshotService.hasScreenshot(res.domainId()) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java index 08423a4d..187a9081 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java @@ -1,16 +1,18 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; +import nu.marginalia.assistant.client.AssistantClient; +import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.browse.DbBrowseDomainsRandom; -import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; -import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.browse.model.BrowseResultSet; +import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; import 
nu.marginalia.db.DomainBlacklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.search.results.BrowseResultCleaner; +import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -19,55 +21,60 @@ import static java.util.Collections.shuffle; public class SearchBrowseService { private final DbBrowseDomainsRandom randomDomains; - private final DbBrowseDomainsSimilarCosine similarDomains; - private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld; private final DbDomainQueries domainQueries; private final DomainBlacklist blacklist; + private final AssistantClient assistantClient; private final BrowseResultCleaner browseResultCleaner; @Inject public SearchBrowseService(DbBrowseDomainsRandom randomDomains, - DbBrowseDomainsSimilarCosine similarDomains, - DbBrowseDomainsSimilarOldAlgo similarDomainsOld, DbDomainQueries domainQueries, DomainBlacklist blacklist, + AssistantClient assistantClient, BrowseResultCleaner browseResultCleaner) { this.randomDomains = randomDomains; - this.similarDomains = similarDomains; - this.similarDomainsOld = similarDomainsOld; this.domainQueries = domainQueries; this.blacklist = blacklist; + this.assistantClient = assistantClient; this.browseResultCleaner = browseResultCleaner; } public BrowseResultSet getRandomEntries(int set) { List results = randomDomains.getRandomDomains(25, blacklist, set); - results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr()); return new BrowseResultSet(results); } - public BrowseResultSet getRelatedEntries(String word) { - var domain = domainQueries.getDomainId(new EdgeDomain(word)); + public BrowseResultSet getRelatedEntries(Context ctx, String domainName) { + var domain = domainQueries.getDomainId(new EdgeDomain(domainName)); - var neighbors = similarDomains.getDomainNeighborsAdjacentCosineRequireScreenshot(domain, blacklist, 256); - 
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + var neighbors = assistantClient.similarDomains(ctx, domain, 50).blockingFirst(); + neighbors.removeIf(sd -> !sd.screenshot()); // If the results are very few, supplement with the alternative shitty algorithm if (neighbors.size() < 25) { - Set allNeighbors = new HashSet<>(neighbors); - allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50)); + Set allNeighbors = new HashSet<>(neighbors); + allNeighbors.addAll(assistantClient.linkedDomains(ctx, domain, 50).blockingFirst()); neighbors.clear(); neighbors.addAll(allNeighbors); - neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + neighbors.removeIf(sd -> !sd.screenshot()); } + List results = new ArrayList<>(neighbors.size()); + for (SimilarDomain sd : neighbors) { + var resultDomain = domainQueries.getDomain(sd.domainId()); + if (resultDomain.isEmpty()) + continue; + + results.add(new BrowseResult(resultDomain.get().toRootUrl(), sd.domainId(), 0, sd.screenshot())); + } // shuffle the items for a less repetitive experience shuffle(neighbors); - return new BrowseResultSet(neighbors, word); + return new BrowseResultSet(results, domainName); } } diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index 8609903d..950dc359 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -24,6 +24,7 @@ java { dependencies { implementation project(':third-party:symspell') implementation project(':code:api:assistant-api') + implementation project(':code:api:query-api') implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:common:model') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java 
b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index 69c82bdd..b99c3abf 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -5,6 +5,7 @@ import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.assistant.client.model.DomainInformation; +import nu.marginalia.query.client.QueryClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,6 +21,7 @@ public class DomainInformationService { private final GeoIpDictionary geoIpDictionary; private DbDomainQueries dbDomainQueries; + private final QueryClient queryClient; private HikariDataSource dataSource; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -27,9 +29,11 @@ public class DomainInformationService { public DomainInformationService( DbDomainQueries dbDomainQueries, GeoIpDictionary geoIpDictionary, + QueryClient queryClient, HikariDataSource dataSource) { this.dbDomainQueries = dbDomainQueries; this.geoIpDictionary = geoIpDictionary; + this.queryClient = queryClient; this.dataSource = dataSource; } @@ -80,21 +84,8 @@ public class DomainInformationService { inCrawlQueue = rs.next(); builder.inCrawlQueue(inCrawlQueue); - rs = stmt.executeQuery(STR.""" - SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=\{domainId} - """); - if (rs.next()) { - builder.incomingLinks(rs.getInt(1)); - } - - rs = stmt.executeQuery(STR.""" - SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=\{domainId} - """); - if (rs.next()) { - builder.outboundLinks(rs.getInt(1)); - outboundLinks = rs.getInt(1); - } - + builder.incomingLinks(queryClient.countLinksToDomain(domainId)); + builder.outboundLinks(queryClient.countLinksFromDomain(domainId)); rs = 
stmt.executeQuery(STR.""" SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId} diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java index e409e7a2..ddcc2e98 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/SimilarDomainsService.java @@ -10,6 +10,7 @@ import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import nu.marginalia.assistant.client.model.SimilarDomain; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.query.client.QueryClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,14 +26,13 @@ public class SimilarDomainsService { private static final Logger logger = LoggerFactory.getLogger(SimilarDomainsService.class); private final HikariDataSource dataSource; + private final QueryClient queryClient; private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000); private volatile int[] domainIdxToId; public volatile TIntDoubleHashMap[] relatedDomains; public volatile TIntList[] domainNeighbors = null; - public volatile TIntList[] linkStoD = null; - public volatile TIntList[] linkDtoS = null; public volatile BitSet screenshotDomains = null; public volatile BitSet activeDomains = null; public volatile BitSet indexedDomains = null; @@ -42,8 +42,9 @@ public class SimilarDomainsService { volatile boolean isReady = false; @Inject - public SimilarDomainsService(HikariDataSource dataSource) { + public SimilarDomainsService(HikariDataSource dataSource, QueryClient queryClient) { this.dataSource = dataSource; + this.queryClient = queryClient; Executors.newSingleThreadExecutor().submit(this::init); } @@ -70,8 +71,6 @@ public class 
SimilarDomainsService { domainRanks = new double[domainIdToIdx.size()]; domainNames = new String[domainIdToIdx.size()]; domainNeighbors = new TIntList[domainIdToIdx.size()]; - linkStoD = new TIntList[domainIdToIdx.size()]; - linkDtoS = new TIntList[domainIdToIdx.size()]; screenshotDomains = new BitSet(domainIdToIdx.size()); activeDomains = new BitSet(domainIdToIdx.size()); indexedDomains = new BitSet(domainIdToIdx.size()); @@ -108,27 +107,6 @@ public class SimilarDomainsService { logger.info("Loaded {} related domains", relatedDomains.length); - rs = stmt.executeQuery(""" - SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK - """); - - while (rs.next()) { - int source = rs.getInt(1); - int dest = rs.getInt(2); - - int sourceIdx = domainIdToIdx.get(source); - int destIdx = domainIdToIdx.get(dest); - - if (linkStoD[sourceIdx] == null) - linkStoD[sourceIdx] = new TIntArrayList(32); - if (linkDtoS[destIdx] == null) - linkDtoS[destIdx] = new TIntArrayList(32); - - linkStoD[sourceIdx].add(destIdx); - linkDtoS[destIdx].add(sourceIdx); - - } - logger.info("Loaded links..."); rs = stmt.executeQuery(""" SELECT EC_DOMAIN.ID, @@ -167,7 +145,6 @@ public class SimilarDomainsService { } logger.info("Loaded {} domains", domainRanks.length); - logger.info("All done!"); isReady = true; } } @@ -272,17 +249,23 @@ public class SimilarDomainsService { } private TIntSet getLinkingIdsDToS(int domainIdx) { - var items = linkDtoS[domainIdx]; - if (items == null) - return new TIntHashSet(); - return new TIntHashSet(items); + var items = new TIntHashSet(); + + for (int id : queryClient.getLinksFromDomain(domainIdxToId[domainIdx])) { + items.add(domainIdToIdx.get(id)); + } + + return items; } private TIntSet getLinkingIdsSToD(int domainIdx) { - var items = linkStoD[domainIdx]; - if (items == null) - return new TIntHashSet(); - return new TIntHashSet(items); + var items = new TIntHashSet(); + + for (int id : queryClient.getLinksToDomain(domainIdxToId[domainIdx])) { + 
items.add(domainIdToIdx.get(id)); + } + + return items; } public List getLinkingDomains(int domainId, int count) { diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index fa69c62d..8d695415 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -26,6 +26,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:process') implementation project(':code:common:db') + implementation project(':code:common:linkdb') implementation project(':code:common:service') implementation project(':code:common:service-client') diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java index 91add3d5..4c41da78 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportDataActor.java @@ -4,9 +4,6 @@ import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; -import lombok.AllArgsConstructor; -import lombok.NoArgsConstructor; -import lombok.With; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.storage.FileStorageService; diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java index e78c5e2f..2205e5e2 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/BackupService.java @@ -19,6 +19,9 @@ import java.sql.SQLException; import 
java.time.LocalDateTime; import java.util.List; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; +import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME; + public class BackupService { private final FileStorageService storageService; @@ -26,6 +29,7 @@ public class BackupService { public enum BackupHeartbeatSteps { LINKS, + DOCS, JOURNAL, DONE } @@ -57,8 +61,11 @@ public class BackupService { try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Backup")) { + heartbeat.progress(BackupHeartbeatSteps.DOCS); + backupFileCompressed(DOCDB_FILE_NAME, linkdbStagingStorage, backupStorage.asPath()); + heartbeat.progress(BackupHeartbeatSteps.LINKS); - backupFileCompressed("links.db", linkdbStagingStorage, backupStorage.asPath()); + backupFileCompressed(DOMAIN_LINKS_FILE_NAME, linkdbStagingStorage, backupStorage.asPath()); heartbeat.progress(BackupHeartbeatSteps.JOURNAL); // This file format is already compressed @@ -79,8 +86,11 @@ public class BackupService { var linkdbStagingStorage = IndexLocations.getLinkdbWritePath(storageService); try (var heartbeat = serviceHeartbeat.createServiceTaskHeartbeat(BackupHeartbeatSteps.class, "Restore Backup")) { + heartbeat.progress(BackupHeartbeatSteps.DOCS); + restoreBackupCompressed(DOCDB_FILE_NAME, linkdbStagingStorage, backupStorage); + heartbeat.progress(BackupHeartbeatSteps.LINKS); - restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage); + restoreBackupCompressed(DOMAIN_LINKS_FILE_NAME, linkdbStagingStorage, backupStorage); heartbeat.progress(BackupHeartbeatSteps.JOURNAL); restoreJournal(indexStagingStorage, backupStorage); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index f3b409d9..179df9ec 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -4,17 +4,27 @@ import com.google.inject.AbstractModule; import com.google.inject.Provides; import com.google.inject.Singleton; import com.google.inject.name.Named; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.linkdb.DomainLinkDb; +import nu.marginalia.linkdb.FileDomainLinkDb; +import nu.marginalia.linkdb.SqlDomainLinkDb; +import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.storage.FileStorageService; import nu.marginalia.IndexLocations; import nu.marginalia.index.config.RankingSettings; import nu.marginalia.WmsaHome; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; +import static nu.marginalia.linkdb.LinkdbFileNames.*; + public class IndexModule extends AbstractModule { - - + private static final Logger logger = LoggerFactory.getLogger(IndexModule.class); public void configure() { } @@ -25,11 +35,49 @@ public class IndexModule extends AbstractModule { return RankingSettings.from(dir); } + @Provides + @Singleton + public DomainLinkDb domainLinkDb ( + FileStorageService storageService, + HikariDataSource dataSource, + ServiceConfiguration serviceConfiguration + ) throws IOException + { + Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME); + + if (Files.exists(path)) { + logger.info("Using file domain link db {}", path); + return new FileDomainLinkDb(path); + } + else { + logger.warn("Using legacy sql domain link db"); + return new SqlDomainLinkDb(path, dataSource, serviceConfiguration); + } + } @Provides @Singleton - @Named("linkdb-file") - public Path linkdbPath(FileStorageService storageService) throws SQLException { - return IndexLocations.getLinkdbLivePath(storageService).resolve("links.db"); + @Named("docdb-file") + public Path linkdbPath(FileStorageService 
storageService) throws IOException { + // Migrate from old location + Path migrationMarker = IndexLocations.getLinkdbLivePath(storageService).resolve("migrated-links.db-to-documents.db"); + Path oldPath = IndexLocations.getLinkdbLivePath(storageService).resolve(DEPRECATED_LINKDB_FILE_NAME); + Path newPath = IndexLocations.getLinkdbLivePath(storageService).resolve(DOCDB_FILE_NAME); + + if (Files.exists(oldPath) && !Files.exists(newPath) && !Files.exists(migrationMarker)) { + logger.info("Migrating {} to {}", oldPath, newPath); + + Files.move(oldPath, newPath); + Files.createFile(migrationMarker); + } + + return newPath; + } + + @Provides + @Singleton + @Named("domain-linkdb-file") + public Path domainLinkDbFile(FileStorageService storageService) throws SQLException { + return IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME); } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index c1027ad9..9602b469 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -6,12 +6,14 @@ import io.grpc.ServerBuilder; import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; +import nu.marginalia.index.svc.IndexDomainLinksService; +import nu.marginalia.linkdb.DomainLinkDb; import nu.marginalia.storage.FileStorageService; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.IndexOpsService; import nu.marginalia.index.svc.IndexQueryService; -import nu.marginalia.linkdb.LinkdbReader; +import nu.marginalia.linkdb.DocumentDbReader; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.control.ServiceEventLog; import 
nu.marginalia.service.server.*; @@ -28,6 +30,8 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.TimeUnit; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; +import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME; import static spark.Spark.get; public class IndexService extends Service { @@ -38,8 +42,9 @@ public class IndexService extends Service { private final IndexOpsService opsService; private final SearchIndex searchIndex; private final FileStorageService fileStorageService; - private final LinkdbReader linkdbReader; + private final DocumentDbReader documentDbReader; + private final DomainLinkDb domainLinkDb; private final ServiceEventLog eventLog; @@ -49,14 +54,17 @@ public class IndexService extends Service { IndexQueryService indexQueryService, SearchIndex searchIndex, FileStorageService fileStorageService, - LinkdbReader linkdbReader, + DocumentDbReader documentDbReader, + DomainLinkDb domainLinkDb, + IndexDomainLinksService indexDomainLinksService, ServiceEventLog eventLog) throws IOException { super(params); this.opsService = opsService; this.searchIndex = searchIndex; this.fileStorageService = fileStorageService; - this.linkdbReader = linkdbReader; + this.documentDbReader = documentDbReader; + this.domainLinkDb = domainLinkDb; this.eventLog = eventLog; final Gson gson = GsonFactory.get(); @@ -65,6 +73,7 @@ public class IndexService extends Service { var grpcServer = ServerBuilder.forPort(params.configuration.port() + 1) .addService(indexQueryService) + .addService(indexDomainLinksService) .build(); grpcServer.start(); @@ -99,15 +108,24 @@ public class IndexService extends Service { @SneakyThrows @MqRequest(endpoint = IndexMqEndpoints.SWITCH_LINKDB) public void switchLinkdb(String unusedArg) { - logger.info("Switching link database"); + logger.info("Switching link databases"); - Path newPath = IndexLocations + Path newPathDocs = IndexLocations 
.getLinkdbWritePath(fileStorageService) - .resolve("links.db"); + .resolve(DOCDB_FILE_NAME); - if (Files.exists(newPath)) { - eventLog.logEvent("INDEX-SWITCH-LINKDB", ""); - linkdbReader.switchInput(newPath); + if (Files.exists(newPathDocs)) { + eventLog.logEvent("INDEX-SWITCH-DOCKDB", ""); + documentDbReader.switchInput(newPathDocs); + } + + Path newPathDomains = IndexLocations + .getLinkdbWritePath(fileStorageService) + .resolve(DOMAIN_LINKS_FILE_NAME); + + if (Files.exists(newPathDomains)) { + eventLog.logEvent("INDEX-SWITCH-DOMAIN-LINKDB", ""); + domainLinkDb.switchInput(newPathDomains); } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java index 376972b8..0994fcbc 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java @@ -7,8 +7,8 @@ import gnu.trove.list.array.TLongArrayList; import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.ResultRankingContext; import nu.marginalia.index.client.model.results.SearchResultItem; -import nu.marginalia.linkdb.LinkdbReader; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.DocumentDbReader; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.ranking.ResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,13 +25,13 @@ public class IndexResultDecorator { private static final Logger logger = LoggerFactory.getLogger(IndexResultDecorator.class); - private final LinkdbReader linkdbReader; + private final DocumentDbReader documentDbReader; private final ResultValuator valuator; @Inject - public IndexResultDecorator(LinkdbReader linkdbReader, + public 
IndexResultDecorator(DocumentDbReader documentDbReader, ResultValuator valuator) { - this.linkdbReader = linkdbReader; + this.documentDbReader = documentDbReader; this.valuator = valuator; } @@ -46,9 +46,9 @@ public class IndexResultDecorator { for (var result : rawResults) idsList.add(result.getDocumentId()); - Map urlDetailsById = new HashMap<>(rawResults.size()); + Map urlDetailsById = new HashMap<>(rawResults.size()); - for (var item : linkdbReader.getUrlDetails(idsList)) + for (var item : documentDbReader.getUrlDetails(idsList)) urlDetailsById.put(item.urlId(), item); List decoratedItems = new ArrayList<>(); @@ -63,7 +63,7 @@ public class IndexResultDecorator { } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, - LdbUrlDetail linkData, + DocdbUrlDetail linkData, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java new file mode 100644 index 00000000..04b33e6c --- /dev/null +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java @@ -0,0 +1,104 @@ +package nu.marginalia.index.svc; + +import com.google.inject.Inject; +import io.grpc.stub.StreamObserver; +import nu.marginalia.index.api.*; +import nu.marginalia.linkdb.DomainLinkDb; + +import static io.grpc.stub.ServerCalls.asyncUnimplementedUnaryCall; + +/** GRPC service for interrogating domain links + */ +public class IndexDomainLinksService extends IndexDomainLinksApiGrpc.IndexDomainLinksApiImplBase { + private final DomainLinkDb domainLinkDb; + + @Inject + public IndexDomainLinksService(DomainLinkDb domainLinkDb) { + this.domainLinkDb = domainLinkDb; + } + + public void getAllLinks(nu.marginalia.index.api.Empty request, + io.grpc.stub.StreamObserver responseObserver) { + + try (var 
idsConverter = new AllIdsResponseConverter(responseObserver)) { + domainLinkDb.forEach(idsConverter::accept); + } + + responseObserver.onCompleted(); + } + + private static class AllIdsResponseConverter implements AutoCloseable { + private RpcDomainIdPairs.Builder builder; + private final io.grpc.stub.StreamObserver responseObserver; + private int n = 0; + + private AllIdsResponseConverter(io.grpc.stub.StreamObserver responseObserver) { + this.responseObserver = responseObserver; + this.builder = RpcDomainIdPairs.newBuilder(); + } + + public void accept(int source, int dest) { + builder.addSourceIds(source); + builder.addDestIds(dest); + + if (++n > 1000) { + responseObserver.onNext(builder.build()); + builder = RpcDomainIdPairs.newBuilder(); + n = 0; + } + } + + @Override + public void close() { + if (n > 0) { + responseObserver.onNext(builder.build()); + } + } + } + + @Override + public void getLinksFromDomain(RpcDomainId request, + StreamObserver responseObserver) { + + var links = domainLinkDb.findDestinations(request.getDomainId()); + + var rspBuilder = RpcDomainIdList.newBuilder(); + for (int i = 0; i < links.size(); i++) { + rspBuilder.addDomainId(links.get(i)); + } + responseObserver.onNext(rspBuilder.build()); + + responseObserver.onCompleted(); + } + + @Override + public void getLinksToDomain(RpcDomainId request, + StreamObserver responseObserver) { + + var links = domainLinkDb.findSources(request.getDomainId()); + + var rspBuilder = RpcDomainIdList.newBuilder(); + for (int i = 0; i < links.size(); i++) { + rspBuilder.addDomainId(links.get(i)); + } + responseObserver.onNext(rspBuilder.build()); + + responseObserver.onCompleted(); + } + + public void countLinksFromDomain(RpcDomainId request, + StreamObserver responseObserver) { + responseObserver.onNext(RpcDomainIdCount.newBuilder() + .setIdCount(domainLinkDb.countDestinations(request.getDomainId())) + .build()); + responseObserver.onCompleted(); + } + + public void countLinksToDomain(RpcDomainId request, 
+ StreamObserver responseObserver) { + responseObserver.onNext(RpcDomainIdCount.newBuilder() + .setIdCount(domainLinkDb.countSources(request.getDomainId())) + .build()); + responseObserver.onCompleted(); + } +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 397e124e..37d52f2e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -59,6 +59,7 @@ public class IndexOpsService { public Optional run(Callable c) throws Exception { if (!opsLock.tryLock()) return Optional.empty(); + try { return Optional.of(c.call()); } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java index 2066d59d..cd69188c 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java @@ -24,9 +24,9 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.linkdb.LinkdbReader; -import nu.marginalia.linkdb.LinkdbWriter; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.DocumentDbReader; +import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.WordFlags; @@ -51,6 
+51,7 @@ import java.sql.SQLException; import java.util.*; import java.util.stream.IntStream; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @@ -80,7 +81,7 @@ public class IndexQueryServiceIntegrationSmokeTest { DomainRankings domainRankings; @Inject - LinkdbReader linkdbReader; + DocumentDbReader documentDbReader; @Inject ProcessHeartbeat processHeartbeat; @@ -103,15 +104,15 @@ public class IndexQueryServiceIntegrationSmokeTest { @Test public void willItBlend() throws Exception { - var linkdbWriter = new LinkdbWriter( + var linkdbWriter = new DocumentDbWriter( IndexLocations.getLinkdbLivePath(fileStorageService) - .resolve("links.db") + .resolve(DOCDB_FILE_NAME) ); for (int i = 1; i < 512; i++) { loadData(linkdbWriter, i); } linkdbWriter.close(); - linkdbReader.reconnect(); + documentDbReader.reconnect(); indexJournalWriter.close(); constructIndex(); @@ -146,15 +147,15 @@ public class IndexQueryServiceIntegrationSmokeTest { @Test public void testDomainQuery() throws Exception { - var linkdbWriter = new LinkdbWriter( + var linkdbWriter = new DocumentDbWriter( IndexLocations.getLinkdbLivePath(fileStorageService) - .resolve("links.db") + .resolve(DOCDB_FILE_NAME) ); for (int i = 1; i < 512; i++) { loadDataWithDomain(linkdbWriter, i/100, i); } linkdbWriter.close(); - linkdbReader.reconnect(); + documentDbReader.reconnect(); indexJournalWriter.close(); constructIndex(); @@ -183,15 +184,15 @@ public class IndexQueryServiceIntegrationSmokeTest { @Test public void testYearQuery() throws Exception { - var linkdbWriter = new LinkdbWriter( + var linkdbWriter = new DocumentDbWriter( IndexLocations.getLinkdbLivePath(fileStorageService) - .resolve("links.db") + .resolve(DOCDB_FILE_NAME) ); for (int i = 1; i < 512; i++) { loadData(linkdbWriter, i); } linkdbWriter.close(); - linkdbReader.reconnect(); + 
documentDbReader.reconnect(); indexJournalWriter.close(); constructIndex(); @@ -283,7 +284,7 @@ public class IndexQueryServiceIntegrationSmokeTest { MurmurHash3_128 hasher = new MurmurHash3_128(); @SneakyThrows - public void loadData(LinkdbWriter ldbw, int id) { + public void loadData(DocumentDbWriter ldbw, int id) { int[] factors = IntStream .rangeClosed(1, id) .filter(v -> (id % v) == 0) @@ -299,7 +300,7 @@ public class IndexQueryServiceIntegrationSmokeTest { data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); } - ldbw.add(new LdbUrlDetail( + ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); @@ -308,7 +309,7 @@ public class IndexQueryServiceIntegrationSmokeTest { } @SneakyThrows - public void loadDataWithDomain(LinkdbWriter ldbw, int domain, int id) { + public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); long fullId = UrlIdCodec.encodeId(domain, id); var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue()); @@ -319,7 +320,7 @@ public class IndexQueryServiceIntegrationSmokeTest { data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); } - ldbw.add(new LdbUrlDetail( + ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 8224101a..17acc7c4 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -23,9 +23,9 
@@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.linkdb.LinkdbReader; -import nu.marginalia.linkdb.LinkdbWriter; -import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.DocumentDbReader; +import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.id.UrlIdCodec; @@ -53,6 +53,7 @@ import java.sql.SQLException; import java.util.*; import java.util.function.Function; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @@ -84,7 +85,7 @@ public class IndexQueryServiceIntegrationTest { @Inject ProcessHeartbeat processHeartbeat; @Inject - LinkdbReader linkdbReader; + DocumentDbReader documentDbReader; @BeforeEach public void setUp() throws IOException { @@ -566,11 +567,11 @@ public class IndexQueryServiceIntegrationTest { indexJournalWriter.put(header, entry); }); - var linkdbWriter = new LinkdbWriter( - IndexLocations.getLinkdbLivePath(fileStorageService).resolve("links.db") + var linkdbWriter = new DocumentDbWriter( + IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME) ); for (Long key : allData.keySet()) { - linkdbWriter.add(new LdbUrlDetail( + linkdbWriter.add(new DocdbUrlDetail( key, new EdgeUrl("https://www.example.com"), "test", @@ -587,7 +588,7 @@ public class IndexQueryServiceIntegrationTest { indexJournalWriter.close(); constructIndex(); - linkdbReader.reconnect(); + documentDbReader.reconnect(); searchIndex.switchIndex(); } } diff --git 
a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index d128a690..79e722a0 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -7,7 +7,7 @@ import nu.marginalia.storage.model.FileStorageBase; import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; -import nu.marginalia.linkdb.LinkdbReader; +import nu.marginalia.linkdb.DocumentDbReader; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.ranking.DomainRankings; @@ -26,6 +26,7 @@ import java.sql.SQLException; import java.util.Random; import java.util.UUID; +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; import static org.mockito.Mockito.when; public class IndexQueryServiceIntegrationTestModule extends AbstractModule { @@ -57,9 +58,9 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 0,null, fastDir.toString())); Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.STORAGE)).thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString())); - bind(LinkdbReader.class).toInstance(new LinkdbReader( + bind(DocumentDbReader.class).toInstance(new DocumentDbReader( IndexLocations.getLinkdbLivePath(fileStorageServiceMock) - .resolve("links.db") + .resolve(DOCDB_FILE_NAME) )); bind(FileStorageService.class).toInstance(fileStorageServiceMock); 
diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCDomainLinksService.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCDomainLinksService.java new file mode 100644 index 00000000..78cfc637 --- /dev/null +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCDomainLinksService.java @@ -0,0 +1,96 @@ +package nu.marginalia.query; + +import com.google.inject.Inject; +import io.grpc.ManagedChannel; +import io.grpc.stub.StreamObserver; +import nu.marginalia.index.api.IndexDomainLinksApiGrpc; +import nu.marginalia.index.api.RpcDomainIdCount; +import nu.marginalia.index.api.RpcDomainIdList; +import nu.marginalia.index.api.RpcDomainIdPairs; +import nu.marginalia.query.svc.NodeConfigurationWatcher; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class QueryGRPCDomainLinksService extends IndexDomainLinksApiGrpc.IndexDomainLinksApiImplBase { + private static final Logger logger = LoggerFactory.getLogger(QueryGRPCDomainLinksService.class); + private final NodeConfigurationWatcher nodeConfigurationWatcher; + private final QueryGrpcStubPool stubPool; + + @Inject + public QueryGRPCDomainLinksService(NodeConfigurationWatcher nodeConfigurationWatcher) { + this.nodeConfigurationWatcher = nodeConfigurationWatcher; + stubPool = new QueryGrpcStubPool<>(nodeConfigurationWatcher) { + @Override + public IndexDomainLinksApiGrpc.IndexDomainLinksApiBlockingStub createStub(ManagedChannel channel) { + return IndexDomainLinksApiGrpc.newBlockingStub(channel); + } + }; + } + + @Override + public void getAllLinks(nu.marginalia.index.api.Empty request, + StreamObserver responseObserver) { + stubPool.callEachSequential(stub -> stub.getAllLinks(request)) + .forEach( + iter -> iter.forEachRemaining(responseObserver::onNext) + ); + + responseObserver.onCompleted(); + } + + @Override + public void getLinksFromDomain(nu.marginalia.index.api.RpcDomainId request, + StreamObserver 
responseObserver) { + var rspBuilder = RpcDomainIdList.newBuilder(); + + stubPool.callEachSequential(stub -> stub.getLinksFromDomain(request)) + .map(RpcDomainIdList::getDomainIdList) + .forEach(rspBuilder::addAllDomainId); + + responseObserver.onNext(rspBuilder.build()); + responseObserver.onCompleted(); + } + + @Override + public void getLinksToDomain(nu.marginalia.index.api.RpcDomainId request, + StreamObserver responseObserver) { + var rspBuilder = RpcDomainIdList.newBuilder(); + + stubPool.callEachSequential(stub -> stub.getLinksToDomain(request)) + .map(RpcDomainIdList::getDomainIdList) + .forEach(rspBuilder::addAllDomainId); + + responseObserver.onNext(rspBuilder.build()); + responseObserver.onCompleted(); + } + + @Override + public void countLinksFromDomain(nu.marginalia.index.api.RpcDomainId request, + StreamObserver responseObserver) { + + int sum = stubPool.callEachSequential(stub -> stub.countLinksFromDomain(request)) + .mapToInt(RpcDomainIdCount::getIdCount) + .sum(); + + var rspBuilder = RpcDomainIdCount.newBuilder(); + rspBuilder.setIdCount(sum); + responseObserver.onNext(rspBuilder.build()); + responseObserver.onCompleted(); + } + + @Override + public void countLinksToDomain(nu.marginalia.index.api.RpcDomainId request, + io.grpc.stub.StreamObserver responseObserver) { + + int sum = stubPool.callEachSequential(stub -> stub.countLinksToDomain(request)) + .mapToInt(RpcDomainIdCount::getIdCount) + .sum(); + + var rspBuilder = RpcDomainIdCount.newBuilder(); + rspBuilder.setIdCount(sum); + responseObserver.onNext(rspBuilder.build()); + responseObserver.onCompleted(); + } + +} diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java index 9e14ef15..5f59ee15 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java +++ 
b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java @@ -2,7 +2,6 @@ package nu.marginalia.query; import com.google.inject.Inject; import io.grpc.ManagedChannel; -import io.grpc.ManagedChannelBuilder; import io.prometheus.client.Histogram; import lombok.SneakyThrows; import nu.marginalia.db.DomainBlacklist; @@ -10,7 +9,6 @@ import nu.marginalia.index.api.*; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.query.svc.NodeConfigurationWatcher; import nu.marginalia.query.svc.QueryFactory; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,32 +26,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { .help("QS-side query time (GRPC endpoint)") .register(); - private final Map channels - = new ConcurrentHashMap<>(); - private final Map actorRpcApis - = new ConcurrentHashMap<>(); - - private ManagedChannel getChannel(ServiceAndNode serviceAndNode) { - return channels.computeIfAbsent(serviceAndNode, - san -> ManagedChannelBuilder - .forAddress(serviceAndNode.getHostName(), 81) - .usePlaintext() - .build()); - } - - public IndexApiGrpc.IndexApiBlockingStub indexApi(int node) { - return actorRpcApis.computeIfAbsent(new ServiceAndNode("index-service", node), n -> - IndexApiGrpc.newBlockingStub( - getChannel(n) - ) - ); - } - - record ServiceAndNode(String service, int node) { - public String getHostName() { - return service+"-"+node; - } - } + private final QueryGrpcStubPool stubPool; private final QueryFactory queryFactory; private final DomainBlacklist blacklist; @@ -64,6 +37,13 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { this.queryFactory = queryFactory; this.blacklist = blacklist; this.nodeConfigurationWatcher = nodeConfigurationWatcher; + + stubPool = new QueryGrpcStubPool<>(nodeConfigurationWatcher) { + @Override + public IndexApiGrpc.IndexApiBlockingStub createStub(ManagedChannel channel) { + return 
IndexApiGrpc.newBlockingStub(channel); + } + }; } public void query(nu.marginalia.index.api.RpcQsQuery request, @@ -89,7 +69,6 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { responseBuilder.setDomain(query.domain); responseObserver.onNext(responseBuilder.build()); - responseObserver.onCompleted(); }); } catch (Exception e) { @@ -98,16 +77,13 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { } } - private final ExecutorService es = Executors.newVirtualThreadPerTaskExecutor(); - private static final Comparator comparator = Comparator.comparing(RpcDecoratedResultItem::getRankingScore); @SneakyThrows private List executeQueries(RpcIndexQuery indexRequest, int totalSize) { - List>> tasks = createTasks(indexRequest); - - return es.invokeAll(tasks).stream() + return stubPool.invokeAll(stub -> new QueryTask(stub, indexRequest)) + .stream() .filter(f -> f.state() == Future.State.SUCCESS) .map(Future::resultNow) .flatMap(List::stream) @@ -116,26 +92,30 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { .toList(); } - @NotNull - private List>> createTasks(RpcIndexQuery indexRequest) { - List>> tasks = new ArrayList<>(); + private class QueryTask implements Callable> { + private final IndexApiGrpc.IndexApiBlockingStub stub; + private final RpcIndexQuery indexRequest; - for (var node : nodeConfigurationWatcher.getQueryNodes()) { - tasks.add(() -> { - var responseIter = indexApi(node).query(indexRequest); - var ret = new ArrayList(); - while (responseIter.hasNext()) { - RpcDecoratedResultItem next = responseIter.next(); - if (isBlacklisted(next)) - continue; - ret.add(next); - } - return ret; - }); + public QueryTask(IndexApiGrpc.IndexApiBlockingStub stub, RpcIndexQuery indexRequest) { + this.stub = stub; + this.indexRequest = indexRequest; } - return tasks; - } + @Override + public List call() { + var rsp = stub.query(indexRequest); + List ret = new ArrayList<>(); + + while (rsp.hasNext()) { + 
RpcDecoratedResultItem next = rsp.next(); + if (isBlacklisted(next)) + continue; + ret.add(next); + } + + return ret; + } + } private boolean isBlacklisted(RpcDecoratedResultItem item) { return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId())); diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGrpcStubPool.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGrpcStubPool.java new file mode 100644 index 00000000..ed95b18c --- /dev/null +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGrpcStubPool.java @@ -0,0 +1,64 @@ +package nu.marginalia.query; + +import io.grpc.ManagedChannel; +import io.grpc.ManagedChannelBuilder; +import nu.marginalia.query.svc.NodeConfigurationWatcher; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.*; +import java.util.function.Function; +import java.util.stream.Stream; + +public abstract class QueryGrpcStubPool { + protected record ServiceAndNode(String service, int node) { + public String getHostName() { + return service+"-"+node; + } + } + + private final NodeConfigurationWatcher nodeConfigurationWatcher; + private final Map channels = new ConcurrentHashMap<>(); + private final Map actorRpcApis = new ConcurrentHashMap<>(); + private final ExecutorService virtualExecutorService = Executors.newVirtualThreadPerTaskExecutor(); + + QueryGrpcStubPool(NodeConfigurationWatcher nodeConfigurationWatcher) { + this.nodeConfigurationWatcher = nodeConfigurationWatcher; + } + + /** Get an API stub for the given node */ + public STUB indexApi(int node) { + var san = new ServiceAndNode("index-service", node); + return actorRpcApis.computeIfAbsent(san, n -> + createStub(channels.computeIfAbsent(san, this::createChannel)) + ); + } + + protected ManagedChannel createChannel(ServiceAndNode serviceAndNode) { + return ManagedChannelBuilder.forAddress(serviceAndNode.getHostName(), 81).usePlaintext().build(); + } 
+ + /** Invoke a function on each node, returning a list of futures in a terminal state, as per + * ExecutorService$invokeAll */ + public List> invokeAll(Function> callF) throws InterruptedException { + List> calls = nodeConfigurationWatcher.getQueryNodes().stream() + .map(id -> callF.apply(indexApi(id))) + .toList(); + + return virtualExecutorService.invokeAll(calls); + } + + /** Invoke a function on each node, returning a stream of results */ + public Stream callEachSequential(Function call) { + return nodeConfigurationWatcher.getQueryNodes().stream() + .map(id -> call.apply(indexApi(id))); + } + + + /** Create a stub for the given channel, this is an operation + * that needs to be implemented for the particular API this + * pool is intended for + */ + public abstract STUB createStub(ManagedChannel channel); + +} diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryService.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryService.java index d78b92bc..fc8bc8fb 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryService.java +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryService.java @@ -42,6 +42,7 @@ public class QueryService extends Service { public QueryService(BaseServiceParams params, IndexClient indexClient, NodeConfigurationWatcher nodeWatcher, + QueryGRPCDomainLinksService domainLinksService, QueryGRPCService queryGRPCService, Gson gson, DomainBlacklist blacklist, @@ -55,6 +56,7 @@ public class QueryService extends Service { var grpcServer = ServerBuilder.forPort(params.configuration.port() + 1) .addService(queryGRPCService) + .addService(domainLinksService) .build(); grpcServer.start(); From e49ba887e9e0a6790b8b173868b9690a8113a969 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 8 Jan 2024 19:16:49 +0100 Subject: [PATCH 53/61] (crawl data) Add compatibility layer for old crawl data format The new converter logic assumes that 
the crawl data is ordered where the domain record comes first, and then a sequence of document records. This is true for the new parquet format, but not for the old zstd/gson format. To make the new converter compatible with the old format, a specialized reader is introduced that scans for the domain record before running through the sequence of document records; and presenting them in the new order. This is slower than just reading the file beginning to end, so in order to retain performance when this ordering isn't necessary, a CompatibilityLevel flag is added to CrawledDomainReader, permitting the caller to decide how compatible the data needs to be. Down the line when all the old data is purged, this should be removed, as it amounts to technical debt. --- .../crawling/io/CrawledDomainReader.java | 26 ++++- ...ibleLegacySerializableCrawlDataStream.java | 107 ++++++++++++++++++ ...astLegacySerializableCrawlDataStream.java} | 7 +- .../src/main/java/plan/CrawlPlan.java | 2 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 2 +- .../retreival/CrawlerRetreiverTest.java | 14 +-- .../actor/task/ExportAtagsActor.java | 2 +- .../nu/marginalia/tools/CrawlDataUnfcker.java | 2 +- .../tools/ExperimentRunnerMain.java | 6 +- 9 files changed, 143 insertions(+), 25 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java rename code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/{LegacySerializableCrawlDataStream.java => FastLegacySerializableCrawlDataStream.java} (87%) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index bb79dcf0..dfd6415c 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,7 +1,8 @@ package nu.marginalia.crawling.io; import com.google.gson.Gson; -import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.CompatibleLegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.FastLegacySerializableCrawlDataStream; import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; @@ -15,11 +16,24 @@ public class CrawledDomainReader { public CrawledDomainReader() { } + public enum CompatibilityLevel { + /** Data order emulates the ordering of the new format. This is slower */ + COMPATIBLE, + /** Data order is not compatible with the new format, but the data itself is */ + FAST, + /** Alias for FAST */ + ANY + } /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ - public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + public static SerializableCrawlDataStream createDataStream(CompatibilityLevel compatibilityLevel, + Path fullPath) throws IOException + { String fileName = fullPath.getFileName().toString(); if (fileName.endsWith(".zstd")) { - return new LegacySerializableCrawlDataStream(gson, fullPath.toFile()); + if (compatibilityLevel == CompatibilityLevel.COMPATIBLE) + return new CompatibleLegacySerializableCrawlDataStream(gson, fullPath.toFile()); + else // if (compatibilityLevel == CompatibilityLevel.FAST or ANY) + return new FastLegacySerializableCrawlDataStream(gson, fullPath.toFile()); } else if (fileName.endsWith(".parquet")) { return new ParquetSerializableCrawlDataStream(fullPath); @@ -30,14 +44,14 @@ public class CrawledDomainReader { } /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! 
*/ - public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + public static SerializableCrawlDataStream createDataStream(CompatibilityLevel level, Path basePath, String domain, String id) throws IOException { Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); if (Files.exists(parquetPath)) { - return createDataStream(parquetPath); + return createDataStream(level, parquetPath); } else { - return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); + return createDataStream(level, CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java new file mode 100644 index 00000000..16b45954 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java @@ -0,0 +1,107 @@ +package nu.marginalia.crawling.io.format; + +import com.github.luben.zstd.RecyclingBufferPool; +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.*; +import java.nio.file.Path; +import java.util.Objects; + +import static java.util.Objects.*; + +/** This class is used to read the old format of crawl data, which was zstd-compressed JSON + * with type delimiters between records. It does its best to preserve the semantics of the + * new format. This is slow. 
+ */ +public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + + private CrawledDomain domain; + private SerializableCrawlData next; + + private final Path path; + public CompatibleLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + path = file.toPath(); + domain = findDomain(file); + + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); + } + + /** Scan through the file and find the domain record */ + private CrawledDomain findDomain(File file) throws IOException { + try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)))) { + for (;;) { + String identifierLine = + requireNonNull(br.readLine(), "No identifier line found"); + String dataLine = + requireNonNull(br.readLine(), "No data line found"); + + if (identifierLine.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + return gson.fromJson(dataLine, CrawledDomain.class); + } + } + } + } + + @Override + public Path path() { + return path; + } + + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + if (domain != null) { + var ret = domain; + domain = null; + return ret; + } + else { + var ret = next; + next = null; + return ret; + } + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (domain != null || next != null) { + return true; + } + + String identifier = bufferedReader.readLine(); + if (identifier == null) { + bufferedReader.close(); + return false; + } + String data = bufferedReader.readLine(); + if (data == null) { + bufferedReader.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = null; + return false; // last record 
is expected to be the domain, so we're done + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java similarity index 87% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java index bfd52b78..09871cf4 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java @@ -12,15 +12,16 @@ import java.io.*; import java.nio.file.Path; /** This class is used to read the old format of crawl data, which was zstd-compressed JSON - * with type delimiters between records. + * with type delimiters between records. It does not preserve the semantics of the new format, + * but it is faster. 
*/ -public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { +public class FastLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private final Gson gson; private final BufferedReader bufferedReader; private SerializableCrawlData next = null; private final Path path; - public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { + public FastLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { this.gson = gson; bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); path = file.toPath(); diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index cbb88772..02164b60 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -95,7 +95,7 @@ public class CrawlPlan { } try { - return Optional.of(CrawledDomainReader.createDataStream(path)); + return Optional.of(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, path)); } catch (IOException ex) { return Optional.empty(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 4b97200b..58d90950 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -272,7 +272,7 @@ public class CrawlerMain { private CrawlDataReference getReference() { try { - return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); + return new 
CrawlDataReference(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 3e8eb775..77581a3d 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -182,7 +182,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -227,7 +227,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -274,7 +274,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); doCrawlWithReferenceStream(specs, - CrawledDomainReader.createDataStream(tempFileParquet1) + CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1) ); convertToParquet(tempFileWarc2, tempFileParquet2); @@ -295,7 +295,7 @@ class CrawlerRetreiverTest { }); } - 
try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { @@ -338,7 +338,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { while (stream.hasNext()) { var doc = stream.next(); data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); @@ -347,7 +347,7 @@ class CrawlerRetreiverTest { throw new RuntimeException(e); } - var stream = CrawledDomainReader.createDataStream(tempFileParquet1); + var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1); System.out.println("---"); @@ -387,7 +387,7 @@ class CrawlerRetreiverTest { }); } - try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 35ddde89..c45adbaa 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -76,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { + try (var stream = 
CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { exportLinks(tagWriter, stream); } catch (Exception ex) { diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java index 4322d3fc..0101de12 100644 --- a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java @@ -60,7 +60,7 @@ public class CrawlDataUnfcker { return Optional.empty(); } - try (var stream = CrawledDomainReader.createDataStream(file)) { + try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, file)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDomain domain) { return Optional.of(domain); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index c5751a7a..77ee15ed 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -4,13 +4,9 @@ import com.google.inject.Guice; import com.google.inject.Injector; import nu.marginalia.converting.ConverterModule; import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.tools.experiments.*; -import plan.CrawlPlanLoader; import java.io.IOException; import java.nio.file.Path; @@ -52,7 +48,7 @@ public class ExperimentRunnerMain { Path basePath = Path.of(args[0]); 
for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) { Path crawlDataPath = basePath.resolve(item.relPath()); - try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { experiment.process(stream); } catch (Exception ex) { From fbad6251265c7d72df6c7e52205dfd40984f170a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 8 Jan 2024 19:56:33 +0100 Subject: [PATCH 54/61] (linkdb) Add delegating implementation of DomainLinkDb This facilitates switching between SQL and File-backed implementations on the fly while migrating from one to the other. --- .../linkdb/{ => dlinks}/DomainLinkDb.java | 2 +- .../{ => dlinks}/DomainLinkDbLoader.java | 2 +- .../{ => dlinks}/DomainLinkDbWriter.java | 2 +- .../linkdb/{ => dlinks}/FileDomainLinkDb.java | 7 +- .../linkdb/dlinks/SelectingDomainLinkDb.java | 104 ++++++++++++++++++ .../linkdb/{ => dlinks}/SqlDomainLinkDb.java | 16 +-- .../linkdb/{ => docs}/DocumentDbReader.java | 2 +- .../linkdb/{ => docs}/DocumentDbWriter.java | 2 +- .../linkdb/DocumentDbWriterTest.java | 2 + .../marginalia/linkdb/DomainLinkDbTest.java | 2 + .../marginalia/converting/ConverterMain.java | 1 - .../nu/marginalia/loading/LoaderMain.java | 2 +- .../nu/marginalia/loading/LoaderModule.java | 4 +- .../documents/DocumentLoaderService.java | 2 +- .../links/DomainLinksLoaderService.java | 2 +- .../java/nu/marginalia/index/IndexModule.java | 18 +-- .../nu/marginalia/index/IndexService.java | 4 +- .../index/results/IndexResultDecorator.java | 2 +- .../index/svc/IndexDomainLinksService.java | 4 +- ...IndexQueryServiceIntegrationSmokeTest.java | 4 +- .../svc/IndexQueryServiceIntegrationTest.java | 4 +- ...ndexQueryServiceIntegrationTestModule.java | 2 +- 22 files changed, 140 insertions(+), 50 deletions(-) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => dlinks}/DomainLinkDb.java (97%) rename 
code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => dlinks}/DomainLinkDbLoader.java (96%) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => dlinks}/DomainLinkDbWriter.java (95%) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => dlinks}/FileDomainLinkDb.java (96%) create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SelectingDomainLinkDb.java rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => dlinks}/SqlDomainLinkDb.java (93%) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => docs}/DocumentDbReader.java (99%) rename code/common/linkdb/src/main/java/nu/marginalia/linkdb/{ => docs}/DocumentDbWriter.java (98%) diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDb.java similarity index 97% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDb.java index b9af1dea..bb7c43c1 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDb.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDb.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.dlinks; import gnu.trove.list.array.TIntArrayList; diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbLoader.java similarity index 96% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbLoader.java index de8c6d96..83af733d 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbLoader.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbLoader.java @@ -1,4 +1,4 
@@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.dlinks; import java.io.DataInputStream; import java.io.IOException; diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbWriter.java similarity index 95% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbWriter.java index f275ba01..99830443 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DomainLinkDbWriter.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/DomainLinkDbWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.dlinks; import java.io.DataOutputStream; import java.io.IOException; diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/FileDomainLinkDb.java similarity index 96% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/FileDomainLinkDb.java index 53f53417..c548ab81 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/FileDomainLinkDb.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/FileDomainLinkDb.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.dlinks; import com.google.inject.name.Named; import gnu.trove.list.array.TIntArrayList; @@ -22,9 +22,8 @@ public class FileDomainLinkDb implements DomainLinkDb { public FileDomainLinkDb(@Named("domain-linkdb-file") Path filename) throws IOException { this.filename = filename; - if (Files.exists(filename)) { - switchInput(filename); - } + + loadInput(filename); } @Override diff --git 
a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SelectingDomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SelectingDomainLinkDb.java new file mode 100644 index 00000000..d6220336 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SelectingDomainLinkDb.java @@ -0,0 +1,104 @@ +package nu.marginalia.linkdb.dlinks; + +import com.google.inject.name.Named; +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.list.array.TIntArrayList; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +/** DomainLinkDb that delegates to either a FileDomainLinkDb or a SqlDomainLinkDb, + * depending on whether the file exists. This is part of the migration path to + * always using FileDomainLinkDb. + */ +public class SelectingDomainLinkDb implements DomainLinkDb { + private final static Logger logger = LoggerFactory.getLogger(SelectingDomainLinkDb.class); + + private volatile DomainLinkDb currentDb; + private final Path filename; + public SelectingDomainLinkDb(@Named("domain-linkdb-file") Path filename, + ServiceConfiguration serviceConfiguration, + HikariDataSource dataSource) { + this.filename = filename; + + // Load the database in a separate thread, so that the constructor can return + // immediately. This would otherwise add a lot of time to the startup of the + // index service. 
+ + Thread.ofPlatform().start(() -> { + try { + if (Files.exists(filename)) { + currentDb = new FileDomainLinkDb(filename); + } + else { + currentDb = new SqlDomainLinkDb(filename, dataSource, serviceConfiguration); + } + logger.info("Loaded linkdb"); + } catch (Exception e) { + logger.error("Failed to load linkdb", e); + } + }); + } + + @Override + public void switchInput(Path newFilename) throws Exception { + Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING); + + Thread.ofPlatform().start(() -> { + try { + currentDb = new FileDomainLinkDb(filename); + } catch (IOException e) { + logger.error("Failed to load linkdb", e); + } + }); + + } + + @Override + public TIntArrayList findDestinations(int source) { + // A race condition is not possible here, as the nullity of currentDb only changes from + // null to non-null + + if (currentDb == null) + return new TIntArrayList(); + + return currentDb.findDestinations(source); + } + + @Override + public int countDestinations(int source) { + if (currentDb == null) + return 0; + + return currentDb.countDestinations(source); + } + + @Override + public TIntArrayList findSources(int dest) { + if (currentDb == null) + return new TIntArrayList(); + + return currentDb.findSources(dest); + } + + @Override + public int countSources(int source) { + if (currentDb == null) + return 0; + + return currentDb.countSources(source); + } + + @Override + public void forEach(SourceDestConsumer consumer) { + if (currentDb == null) + throw new IllegalStateException("No linkdb loaded"); + + currentDb.forEach(consumer); + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SqlDomainLinkDb.java similarity index 93% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SqlDomainLinkDb.java index 4a98eaa9..883f8881 100644 --- 
a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/SqlDomainLinkDb.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/dlinks/SqlDomainLinkDb.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.dlinks; import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; @@ -9,6 +9,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; @@ -37,21 +38,12 @@ public class SqlDomainLinkDb implements DomainLinkDb { this.dataSource = dataSource; node = configuration.node(); - - Thread.ofPlatform().start(() -> { - try { - loadDb(); - } catch (Exception e) { - logger.error("Failed to load linkdb", e); - } - }); + loadDb(); } @Override public void switchInput(Path newFilename) throws IOException { - Files.move(newFilename, filename, StandardCopyOption.REPLACE_EXISTING); - - loadDb(); + throw new UnsupportedEncodingException(); } public void loadDb() { diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbReader.java similarity index 99% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbReader.java index 6d7aefd6..ba48f3ec 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbReader.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbReader.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.docs; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java similarity 
index 98% rename from code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java rename to code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java index 88277e9d..e843e826 100644 --- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/DocumentDbWriter.java +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.linkdb; +package nu.marginalia.linkdb.docs; import nu.marginalia.linkdb.model.DocdbUrlDetail; diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java index b28b5ed4..29f9c7fb 100644 --- a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DocumentDbWriterTest.java @@ -1,6 +1,8 @@ package nu.marginalia.linkdb; import gnu.trove.list.array.TLongArrayList; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeDomain; import org.junit.jupiter.api.Test; diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java index 1014ba73..6db4a8cf 100644 --- a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/DomainLinkDbTest.java @@ -1,5 +1,7 @@ package nu.marginalia.linkdb; +import nu.marginalia.linkdb.dlinks.DomainLinkDbLoader; +import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index b4b3f96e..6115cbb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -46,7 +46,6 @@ public class ConverterMain { private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final SideloadSourceFactory sideloadSourceFactory; - private final int node; public static void main(String... args) throws Exception { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index a91678d8..6babfa7e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -9,7 +9,7 @@ import lombok.SneakyThrows; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; import nu.marginalia.loading.domains.DomainIdRegistry; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index 1ba5d9ca..35c98fc6 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -9,9 +9,9 @@ import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; 
import nu.marginalia.IndexLocations; -import nu.marginalia.linkdb.DomainLinkDbWriter; +import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java index bed93d7e..5909a9aa 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java index 272b3936..8cf42218 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import 
nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; -import nu.marginalia.linkdb.DomainLinkDbWriter; +import nu.marginalia.linkdb.dlinks.DomainLinkDbWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.processed.DomainLinkRecord; diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index 179df9ec..d0b2dcf9 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -5,9 +5,10 @@ import com.google.inject.Provides; import com.google.inject.Singleton; import com.google.inject.name.Named; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.linkdb.DomainLinkDb; -import nu.marginalia.linkdb.FileDomainLinkDb; -import nu.marginalia.linkdb.SqlDomainLinkDb; +import nu.marginalia.linkdb.dlinks.DomainLinkDb; +import nu.marginalia.linkdb.dlinks.FileDomainLinkDb; +import nu.marginalia.linkdb.dlinks.SelectingDomainLinkDb; +import nu.marginalia.linkdb.dlinks.SqlDomainLinkDb; import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.storage.FileStorageService; import nu.marginalia.IndexLocations; @@ -41,18 +42,11 @@ public class IndexModule extends AbstractModule { FileStorageService storageService, HikariDataSource dataSource, ServiceConfiguration serviceConfiguration - ) throws IOException + ) { Path path = IndexLocations.getLinkdbLivePath(storageService).resolve(DOMAIN_LINKS_FILE_NAME); - if (Files.exists(path)) { - logger.info("Using file domain link db {}", path); - return new FileDomainLinkDb(path); - } - else { - logger.warn("Using legacy sql domain link db"); - return new SqlDomainLinkDb(path, dataSource, serviceConfiguration); - } + return new SelectingDomainLinkDb(path, serviceConfiguration, 
dataSource); } @Provides diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index 9602b469..325b132c 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -7,13 +7,13 @@ import io.reactivex.rxjava3.schedulers.Schedulers; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; import nu.marginalia.index.svc.IndexDomainLinksService; -import nu.marginalia.linkdb.DomainLinkDb; +import nu.marginalia.linkdb.dlinks.DomainLinkDb; import nu.marginalia.storage.FileStorageService; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.IndexOpsService; import nu.marginalia.index.svc.IndexQueryService; -import nu.marginalia.linkdb.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java index 0994fcbc..0c8f6acc 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDecorator.java @@ -7,7 +7,7 @@ import gnu.trove.list.array.TLongArrayList; import nu.marginalia.index.client.model.results.DecoratedSearchResultItem; import nu.marginalia.index.client.model.results.ResultRankingContext; import nu.marginalia.index.client.model.results.SearchResultItem; -import nu.marginalia.linkdb.DocumentDbReader; +import 
nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.ranking.ResultValuator; import org.slf4j.Logger; diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java index 04b33e6c..b368d289 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexDomainLinksService.java @@ -3,9 +3,7 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import io.grpc.stub.StreamObserver; import nu.marginalia.index.api.*; -import nu.marginalia.linkdb.DomainLinkDb; - -import static io.grpc.stub.ServerCalls.asyncUnimplementedUnaryCall; +import nu.marginalia.linkdb.dlinks.DomainLinkDb; /** GRPC service for interrogating domain links */ diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java index cd69188c..f8c95cf2 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java @@ -24,8 +24,8 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.linkdb.DocumentDbReader; -import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; 
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 17acc7c4..ca5cafe0 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -23,8 +23,8 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.linkdb.DocumentDbReader; -import nu.marginalia.linkdb.DocumentDbWriter; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 79e722a0..746657d8 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -7,7 +7,7 @@ import nu.marginalia.storage.model.FileStorageBase; import nu.marginalia.storage.model.FileStorageBaseType; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; -import nu.marginalia.linkdb.DocumentDbReader; +import 
nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.ranking.DomainRankings; From cb55273769fd34d350b0acc006aba89d3e1f0b06 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 8 Jan 2024 20:02:19 +0100 Subject: [PATCH 55/61] (search) When clicking asn-links, show results from the unfiltered view... --- .../templates/search/site-info/site-info-index-indexed.hdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb index e8e66b05..d138452b 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info-index-indexed.hdb @@ -8,6 +8,6 @@ Pages Indexed: {{pagesIndexed}}

IP: {{ip}} {{#if ipCountry}}{{getIpFlag}}{{/if}}
- AS: {{#if asn}}{{asn}} {{asnOrg}} {{asnCountry}}{{/if}}
+ AS: {{#if asn}}{{asn}} {{asnOrg}} {{asnCountry}}{{/if}}

\ No newline at end of file From d4b0539d397d2a8ee9cf75646ce5a6529f220890 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 8 Jan 2024 20:57:40 +0100 Subject: [PATCH 56/61] (search) Clean up search results template Rendering is very slow. Let's see if this has a measurable effect on latency. --- .../marginalia/search/model/UrlDetails.java | 20 ++++++++----------- .../templates/search/parts/search-result.hdb | 9 ++++----- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index 92c22e59..e1696950 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -111,18 +111,14 @@ public class UrlDetails { public int getProblemCount() { int numProblems = 0; - for (var problem :EnumSet.of( - HtmlFeature.JS, - HtmlFeature.TRACKING, - HtmlFeature.TRACKING_ADTECH, - HtmlFeature.AFFILIATE_LINK, - HtmlFeature.COOKIES, - HtmlFeature.ADVERTISEMENT)) { - if (HtmlFeature.hasFeature(features, problem)) { - numProblems++; - } - } - return numProblems; + int mask = HtmlFeature.JS.getFeatureBit() + | HtmlFeature.COOKIES.getFeatureBit() + | HtmlFeature.TRACKING.getFeatureBit() + | HtmlFeature.AFFILIATE_LINK.getFeatureBit() + | HtmlFeature.TRACKING_ADTECH.getFeatureBit() + | HtmlFeature.ADVERTISEMENT.getFeatureBit(); + + return Integer.bitCount(features & mask); } public String getProblems() { diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb index bd89cc59..2c9c8f8a 100644 --- 
a/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-result.hdb @@ -1,5 +1,5 @@ -
+

{{title}}

{{description}}

@@ -7,10 +7,9 @@
{{#unless focusDomain}} Info - {{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}} - {{/unless}} + {{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}}{{/unless}}
- {{#if problems}} ⚠ {{problemCount}} {{/if}} + {{#if problemCount}} ⚠ {{problemCount}} {{/if}}
From aff690f7d6a30c42a44cfe80f3d285cc4ec1b28f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 9 Jan 2024 11:28:36 +0100 Subject: [PATCH 57/61] (search) Toggle for showing recent results Will by default show results from the last 2 years. May need to tune this later. --- .../search/SearchQueryParamFactory.java | 5 ++- .../search/command/SearchParameters.java | 16 ++++++--- .../search/command/SearchRecentParameter.java | 33 +++++++++++++++++++ .../search/model/SearchFilters.java | 33 +++++++++++++++++++ .../search/svc/SearchQueryService.java | 6 ++-- .../templates/search/parts/search-filters.hdb | 8 +++++ 6 files changed, 91 insertions(+), 10 deletions(-) create mode 100644 code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchRecentParameter.java diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java index 95439273..03acc479 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -14,10 +14,13 @@ public class SearchQueryParamFactory { public QueryParams forRegularSearch(SearchParameters userParams) { SearchSubquery prototype = new SearchSubquery(); var profile = userParams.profile(); + profile.addTacitTerms(prototype); userParams.js().addTacitTerms(prototype); userParams.adtech().addTacitTerms(prototype); + SpecificationLimit yearLimit = userParams.recent().yearLimit(); + return new QueryParams( userParams.query(), null, @@ -26,7 +29,7 @@ public class SearchQueryParamFactory { prototype.searchTermsPriority, prototype.searchTermsAdvice, profile.getQualityLimit(), - profile.getYearLimit(), + yearLimit, profile.getSizeLimit(), SpecificationLimit.none(), SpecificationLimit.none(), diff 
--git a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java index 3f45f1b1..865fe785 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchParameters.java @@ -9,6 +9,7 @@ import java.nio.charset.StandardCharsets; public record SearchParameters(String query, SearchProfile profile, SearchJsParameter js, + SearchRecentParameter recent, SearchAdtechParameter adtech ) { public String profileStr() { @@ -16,22 +17,27 @@ public record SearchParameters(String query, } public SearchParameters withProfile(SearchProfile profile) { - return new SearchParameters(query, profile, js, adtech); + return new SearchParameters(query, profile, js, recent, adtech); } public SearchParameters withJs(SearchJsParameter js) { - return new SearchParameters(query, profile, js, adtech); + return new SearchParameters(query, profile, js, recent, adtech); } public SearchParameters withAdtech(SearchAdtechParameter adtech) { - return new SearchParameters(query, profile, js, adtech); + return new SearchParameters(query, profile, js, recent, adtech); + } + + public SearchParameters withRecent(SearchRecentParameter recent) { + return new SearchParameters(query, profile, js, recent, adtech); } public String renderUrl(WebsiteUrl baseUrl) { - String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s", + String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s", URLEncoder.encode(query, StandardCharsets.UTF_8), URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8), URLEncoder.encode(js.value, StandardCharsets.UTF_8), - URLEncoder.encode(adtech.value, StandardCharsets.UTF_8) + URLEncoder.encode(adtech.value, StandardCharsets.UTF_8), + 
URLEncoder.encode(recent.value, StandardCharsets.UTF_8) ); return baseUrl.withPath(path); diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchRecentParameter.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchRecentParameter.java new file mode 100644 index 00000000..c6c17453 --- /dev/null +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/SearchRecentParameter.java @@ -0,0 +1,33 @@ +package nu.marginalia.search.command; + +import nu.marginalia.index.client.model.query.SearchSubquery; +import nu.marginalia.index.query.limit.SpecificationLimit; + +import javax.annotation.Nullable; +import java.time.LocalDateTime; +import java.util.Arrays; + +public enum SearchRecentParameter { + DEFAULT("default"), + RECENT("recent"); + + public final String value; + + SearchRecentParameter(String value) { + this.value = value; + } + + public static SearchRecentParameter parse(@Nullable String value) { + if (RECENT.value.equals(value)) return RECENT; + + return DEFAULT; + } + + public SpecificationLimit yearLimit() { + if (this == RECENT) { + return SpecificationLimit.greaterThan(LocalDateTime.now().getYear() - 1); + } else { + return SpecificationLimit.none(); + } + } +} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java index 3afdef7f..1f2895af 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/SearchFilters.java @@ -5,6 +5,7 @@ import nu.marginalia.WebsiteUrl; import nu.marginalia.search.command.SearchAdtechParameter; import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.command.SearchParameters; +import 
nu.marginalia.search.command.SearchRecentParameter; import java.util.List; @@ -20,6 +21,9 @@ public class SearchFilters { public final RemoveJsOption removeJsOption; @Getter public final ReduceAdtechOption reduceAdtechOption; + @Getter + public final ShowRecentOption showRecentOption; + @Getter public final List> filterGroups; @@ -30,6 +34,8 @@ public class SearchFilters { removeJsOption = new RemoveJsOption(parameters); reduceAdtechOption = new ReduceAdtechOption(parameters); + showRecentOption = new ShowRecentOption(parameters); + currentFilter = parameters.profile().filterId; @@ -82,6 +88,7 @@ public class SearchFilters { this.url = parameters.withJs(toggledValue).renderUrl(SearchFilters.this.url); } } + public class ReduceAdtechOption { private final SearchAdtechParameter value; @@ -108,6 +115,32 @@ public class SearchFilters { } } + public class ShowRecentOption { + private final SearchRecentParameter value; + + @Getter + public final String url; + + public boolean isSet() { + return value.equals(SearchRecentParameter.RECENT); + } + + public String name() { + return "Recent Results"; + } + + public ShowRecentOption(SearchParameters parameters) { + this.value = parameters.recent(); + + var toggledValue = switch (parameters.recent()) { + case RECENT -> SearchRecentParameter.DEFAULT; + default -> SearchRecentParameter.RECENT; + }; + + this.url = parameters.withRecent(toggledValue).renderUrl(SearchFilters.this.url); + } + } + public class Filter { @Getter public final String displayName; diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java index 9593623c..5b4e5c3a 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java @@ -4,12 
+4,9 @@ import com.google.inject.Inject; import io.prometheus.client.Histogram; import lombok.SneakyThrows; import nu.marginalia.WebsiteUrl; -import nu.marginalia.search.command.SearchAdtechParameter; +import nu.marginalia.search.command.*; import nu.marginalia.search.model.SearchProfile; import nu.marginalia.client.Context; -import nu.marginalia.search.command.CommandEvaluator; -import nu.marginalia.search.command.SearchJsParameter; -import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.exceptions.RedirectException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,6 +60,7 @@ public class SearchQueryService { return new SearchParameters(queryParam.trim(), SearchProfile.getSearchProfile(request.queryParams("profile")), SearchJsParameter.parse(request.queryParams("js")), + SearchRecentParameter.parse(request.queryParams("recent")), SearchAdtechParameter.parse(request.queryParams("adtech"))); } catch (Exception ex) { diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-filters.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-filters.hdb index c3be02b3..64452d63 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-filters.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-filters.hdb @@ -16,6 +16,14 @@ {{name}} {{/with}} + {{#with showRecentOption}} + + {{/with}}

Domains

    From 41cccfd2aad49a7e2bd5b9cabc87f828f9ab773c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 9 Jan 2024 11:36:49 +0100 Subject: [PATCH 58/61] (search) Toggle for showing recent results Actually persist the value of the toggle between searches too... --- .../search/model/DecoratedSearchResults.java | 15 ++++++--------- .../templates/search/parts/search-form.hdb | 1 + 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java index 0bc86e56..bdea053e 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/model/DecoratedSearchResults.java @@ -18,14 +18,11 @@ public class DecoratedSearchResults { private final String focusDomain; private final int focusDomainId; private final SearchFilters filters; - public String getQuery() { - return params.query(); - } - public String getProfile() { - return params.profile().filterId; - } - public String getJs() { - return params.js().value; - } + + // These are used by the search form + public String getQuery() { return params.query(); } + public String getProfile() { return params.profile().filterId; } + public String getJs() { return params.js().value; } public String getAdtech() { return params.adtech().value; } + public String getRecent() { return params.recent().value; } } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-form.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-form.hdb index 79b47a99..447ff87e 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-form.hdb +++ 
b/code/services-application/search-service/src/main/resources/templates/search/parts/search-form.hdb @@ -8,6 +8,7 @@ +
From c47730f2cc8becbd57975da07b6e84184efe7326 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 9 Jan 2024 13:30:30 +0100 Subject: [PATCH 59/61] (search) Mobile UX improvements. Swipe right to show filter menu. Fix CSS bug that caused parts of the menu to not have a background. --- .../src/main/resources/static/search/menu.js | 75 ++++++++++++++++--- .../main/resources/static/search/serp.scss | 1 - 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/code/services-application/search-service/src/main/resources/static/search/menu.js b/code/services-application/search-service/src/main/resources/static/search/menu.js index a8b4c0b3..5832fd36 100644 --- a/code/services-application/search-service/src/main/resources/static/search/menu.js +++ b/code/services-application/search-service/src/main/resources/static/search/menu.js @@ -1,3 +1,26 @@ +function hideMenu() { + document.getElementById('filters').style.display = 'none'; +} +function showMenu() { + document.getElementById('filters').style.display = 'block'; + + // Defer creation of the close button until the menu is opened. This is needed because the script for creating + // the filter button is run early to avoid layout shifts. 
+ + if (document.getElementById('menu-close') === null) { + registerCloseButton(); + } + + document.getElementById('filters').style.display = 'block'; + + // scroll to the top of the page so the user can see the filters + window.scrollTo({ + top: 0, + left: 0, + behavior: "instant", + }); +} + const registerCloseButton = () => { // Add a button to close the filters for mobile; we do this in js to not pollute the DOM for text-only browsers const closeButton = document.createElement('button'); @@ -6,7 +29,7 @@ const registerCloseButton = () => { closeButton.setAttribute('aria-controls', '#filters'); closeButton.innerHTML = 'X'; closeButton.onclick = (event) => { - document.getElementById('filters').style.display = 'none'; + hideMenu(); event.stopPropagation(); return false; } @@ -20,15 +43,49 @@ filtersButton.setAttribute('aria-controls', '#filters'); filtersButton.innerHTML = 'Ξ'; filtersButton.setAttribute('title', 'Open the filters menu'); filtersButton.onclick = (event) => { - // Defer creation of the close button until the menu is opened. This is needed because the script for creating - // the filter button is run early to avoid layout shifts. 
- - if (document.getElementById('menu-close') === null) { - registerCloseButton(); - } - - document.getElementById('filters').style.display = 'block'; + showMenu(); event.stopPropagation(); return false; } + document.getElementById('search-box').getElementsByTagName('h1')[0].append(filtersButton); + +// swipe affordances for mobile +if (window.matchMedia('(pointer: coarse)').matches) { + // capture swipes to the left and right to open and close the filters + let touchStartX = 0; + let touchEndX = 0; + let touchStartY = 0; + let touchEndY = 0; + + const swipeThreshold = 100; + const maxVerticalDistance = 75; + document.addEventListener('touchstart', (event) => { + touchStartX = event.changedTouches[0].screenX; + touchStartY = event.changedTouches[0].screenY; + }); + document.addEventListener('touchend', (event) => { + touchEndX = event.changedTouches[0].screenX; + touchEndY = event.changedTouches[0].screenY; + let verticalDistance = Math.abs(touchStartY - touchEndY); + + if (verticalDistance > maxVerticalDistance) { + return; + } + + if (touchEndX - touchStartX > swipeThreshold) { + showMenu(); + event.stopPropagation(); + } else if (touchStartX - touchEndX > swipeThreshold) { + hideMenu(); + event.stopPropagation(); + } + }); + + + // Add a floating panel to the bottom of the page to show a message when the filters are hidden + const floatingPanel = document.createElement('div'); + floatingPanel.setAttribute('style', 'position: fixed; bottom: 0; left: 0; right: 0; background-color: #fff; padding: 1em; text-align: center; display: block; border-top: 1px solid #ccc; box-shadow: 0 0 -5px #eee;'); + floatingPanel.innerHTML = '← right/left open/close the filters →'; + document.body.appendChild(floatingPanel); +} \ No newline at end of file diff --git a/code/services-application/search-service/src/main/resources/static/search/serp.scss b/code/services-application/search-service/src/main/resources/static/search/serp.scss index 3ab845bc..0c44415c 100644 --- 
a/code/services-application/search-service/src/main/resources/static/search/serp.scss +++ b/code/services-application/search-service/src/main/resources/static/search/serp.scss @@ -668,7 +668,6 @@ footer { top: 0; left: 0; width: 100%; - height: 100%; margin: 0; padding: 0; z-index: 100; From bd7970fb1fac0106eb3597eda57ad72e1d147b8d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 9 Jan 2024 13:38:40 +0100 Subject: [PATCH 60/61] (search) Swap swipe direction for more consistent experience --- .../search-service/src/main/resources/static/search/menu.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/code/services-application/search-service/src/main/resources/static/search/menu.js b/code/services-application/search-service/src/main/resources/static/search/menu.js index 5832fd36..141846ad 100644 --- a/code/services-application/search-service/src/main/resources/static/search/menu.js +++ b/code/services-application/search-service/src/main/resources/static/search/menu.js @@ -74,10 +74,10 @@ if (window.matchMedia('(pointer: coarse)').matches) { } if (touchEndX - touchStartX > swipeThreshold) { - showMenu(); + hideMenu(); event.stopPropagation(); } else if (touchStartX - touchEndX > swipeThreshold) { - hideMenu(); + showMenu(); event.stopPropagation(); } }); @@ -86,6 +86,6 @@ if (window.matchMedia('(pointer: coarse)').matches) { // Add a floating panel to the bottom of the page to show a message when the filters are hidden const floatingPanel = document.createElement('div'); floatingPanel.setAttribute('style', 'position: fixed; bottom: 0; left: 0; right: 0; background-color: #fff; padding: 1em; text-align: center; display: block; border-top: 1px solid #ccc; box-shadow: 0 0 -5px #eee;'); - floatingPanel.innerHTML = '← right/left open/close the filters →'; + floatingPanel.innerHTML = '← swipe left to open filters ←'; document.body.appendChild(floatingPanel); } \ No newline at end of file From f592c9f04d776cd356a595027f56fc0af45f12a5 Mon Sep 17 
00:00:00 2001 From: Viktor Lofgren Date: Wed, 10 Jan 2024 09:26:34 +0100 Subject: [PATCH 61/61] (search) Fix acknowledgement page for domain complaints rendering as plain text This was caused by incorrect usage of the renderInto() function, which was always buggy and should never be used. This method is removed with this change. --- .../nu/marginalia/renderer/MustacheRenderer.java | 13 ------------- .../search/svc/SearchSiteInfoService.java | 6 ++---- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/code/common/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java b/code/common/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java index c558a229..0dae086c 100644 --- a/code/common/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java +++ b/code/common/renderer/src/main/java/nu/marginalia/renderer/MustacheRenderer.java @@ -49,14 +49,6 @@ public class MustacheRenderer { return template.apply(model); } - @SneakyThrows - public Object renderInto(Response response, T model) { - - response.raw().getOutputStream().write(template.apply(model).getBytes(StandardCharsets.UTF_8)); - - return ""; - } - @SneakyThrows public String render(T model, String name, List children) { Context ctx = Context.newBuilder(model).combine(name, children).build(); @@ -70,9 +62,4 @@ public class MustacheRenderer { return template.apply(ctx); } - @SneakyThrows - public void renderInto(Response response, T model, Map children) { - Context ctx = Context.newBuilder(model).combine(children).build(); - response.raw().getOutputStream().write(template.apply(ctx).getBytes(StandardCharsets.UTF_8)); - } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index 28c5740d..23ac2843 100644 --- 
a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -60,8 +60,6 @@ public class SearchSiteInfoService { String domainName = request.params("site"); String view = request.queryParamOrDefault("view", "info"); - response.type("text/html"); - if (null == domainName || domainName.isBlank()) { return null; } @@ -76,7 +74,7 @@ public class SearchSiteInfoService { default -> listInfo(ctx, domainName); }; - return renderer.renderInto(response, model); + return renderer.render(model); } public Object handlePost(Request request, Response response) throws SQLException { @@ -104,7 +102,7 @@ public class SearchSiteInfoService { var model = new ReportDomain(domainName, domainId, complaints, List.of(), true); - return renderer.renderInto(response, model); + return renderer.render(model); } private Object reportSite(Context ctx, String domainName) throws SQLException {