From 384de2e54bd79489e678f08d03c079d6a0a2fab9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 6 Mar 2023 19:32:37 +0100 Subject: [PATCH] Fixing LSH deduplication bug. --- .../processor/logic/LshDocumentDeduplicator.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java index 877c22d3..4cb4d96d 100644 --- a/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java +++ b/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/logic/LshDocumentDeduplicator.java @@ -11,10 +11,14 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +/** Deduplicates documents based on their LSH + * + * @see EasyLSH + */ @Singleton public class LshDocumentDeduplicator { - private final int DISTANCE_THRESHOLD = 4; + private final int DISTANCE_THRESHOLD = 2; private final Logger logger = LoggerFactory.getLogger(getClass()); public void deduplicate(List documents) { @@ -40,13 +44,13 @@ public class LshDocumentDeduplicator { return false; } - if (EasyLSH.hammingDistance(thisDoc.lshHash, otherDoc.lshHash) < DISTANCE_THRESHOLD) + if (EasyLSH.hammingDistance(thisDoc.details.hashCode, otherDoc.details.hashCode) > DISTANCE_THRESHOLD) return false; if (thisDoc.url.path.length() < otherDoc.url.path.length()) { - logger.info("{} duplicates {}", otherDoc.url, thisDoc.url); + logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url); otherDoc.state = EdgeUrlState.DISQUALIFIED; otherDoc.stateReason = "Duplicate";