(converter) Optimize LSH based within-domain deduplication

2025-02-23 21:18:58 +00:00 · 2023-08-17 17:43:46 +02:00 · 2023-08-17 17:43:46 +02:00 · e296b02649
commit e296b02649
parent 2656fcfe2c
1 changed files with 28 additions and 18 deletions
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java
@ -8,8 +8,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.util.List;
-import java.util.Set;
-import java.util.stream.Collectors;

 /** Deduplicates documents based on their LSH
 *
@ -22,31 +20,43 @@ public class LshDocumentDeduplicator {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public void deduplicate(List<ProcessedDocument> documents) {
-        Set<ProcessedDocument> goodDocuments = documents.stream()
+        ProcessedDocument[] goodDocuments = documents.stream()
                .filter(ProcessedDocument::isProcessedFully)
-                .collect(Collectors.toSet());
+                .filter(doc -> doc.words.size() > 100)
+                .toArray(ProcessedDocument[]::new);

-        for (var document : documents) {
-            if (!goodDocuments.contains(document)) {
-                continue;
+        long[] hashCodes = new long[goodDocuments.length];
+        for (int i = 0; i < goodDocuments.length; i++) {
+            hashCodes[i] = goodDocuments[i].details.hashCode;
+        }
+
+        // These arrays can be fairly large (~10,000) so we need to be
+        // careful about what we do in this O(n^2) loop
+
+        for (int i = 0; i < hashCodes.length; i++) {
+            for (int j = 0; j < hashCodes.length; j++) {
+                // This is basically just a 64 bit XOR and a POPCOUNT so it's pretty fast.
+                if (EasyLSH.hammingDistance(hashCodes[i], hashCodes[j]) < DISTANCE_THRESHOLD) {
+                    if (i == j)
+                        continue;
+
+                    if (flagIfDuplicate(goodDocuments[i], goodDocuments[j])) {
+                        break;
+                    }
+                }
            }
-
-            goodDocuments.removeIf(other -> removeIfDuplicate(document, other));
        }
    }

-    private boolean removeIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) {
-        if (thisDoc == otherDoc)
+    private boolean flagIfDuplicate(ProcessedDocument thisDoc, ProcessedDocument otherDoc) {
+
+        // This document has already been disqualified as a duplicate
+        if (thisDoc.state != UrlIndexingState.OK)
            return false;

-        if (thisDoc.words.size() < 100
-        || otherDoc.words.size() < 100) {
-            return false;
-        }
-
-        if (EasyLSH.hammingDistance(thisDoc.details.hashCode, otherDoc.details.hashCode) > DISTANCE_THRESHOLD)
-            return false;

+        // We might consider using thisDoc.details.metadata.topology() here instead of the
+        // URL length to determine which document is the "better" one.
        if (thisDoc.url.path.length()
                < otherDoc.url.path.length())
        {