Fixing LSH deduplication bug.

This commit is contained in:
Viktor Lofgren 2023-03-06 19:32:37 +01:00
parent 43f3380cb9
commit 384de2e54b

View File

@@ -11,10 +11,14 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/** Deduplicates documents based on the Hamming distance between their locality-sensitive hashes (LSH).
*
* @see EasyLSH
*/
@Singleton
public class LshDocumentDeduplicator {
private final int DISTANCE_THRESHOLD = 4;
private final int DISTANCE_THRESHOLD = 2;
private final Logger logger = LoggerFactory.getLogger(getClass());
public void deduplicate(List<ProcessedDocument> documents) {
@@ -40,13 +44,13 @@ public class LshDocumentDeduplicator {
return false;
}
if (EasyLSH.hammingDistance(thisDoc.lshHash, otherDoc.lshHash) < DISTANCE_THRESHOLD)
if (EasyLSH.hammingDistance(thisDoc.details.hashCode, otherDoc.details.hashCode) > DISTANCE_THRESHOLD)
return false;
if (thisDoc.url.path.length()
< otherDoc.url.path.length())
{
logger.info("{} duplicates {}", otherDoc.url, thisDoc.url);
logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);
otherDoc.state = EdgeUrlState.DISQUALIFIED;
otherDoc.stateReason = "Duplicate";