mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Fixing LSH deduplication bug.
This commit is contained in:
parent
43f3380cb9
commit
384de2e54b
@@ -11,10 +11,14 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/** Deduplicates documents based on their LSH
|
||||||
|
*
|
||||||
|
* @see EasyLSH
|
||||||
|
*/
|
||||||
@Singleton
|
@Singleton
|
||||||
public class LshDocumentDeduplicator {
|
public class LshDocumentDeduplicator {
|
||||||
|
|
||||||
private final int DISTANCE_THRESHOLD = 4;
|
private final int DISTANCE_THRESHOLD = 2;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public void deduplicate(List<ProcessedDocument> documents) {
|
public void deduplicate(List<ProcessedDocument> documents) {
|
||||||
@@ -40,13 +44,13 @@ public class LshDocumentDeduplicator {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EasyLSH.hammingDistance(thisDoc.lshHash, otherDoc.lshHash) < DISTANCE_THRESHOLD)
|
if (EasyLSH.hammingDistance(thisDoc.details.hashCode, otherDoc.details.hashCode) > DISTANCE_THRESHOLD)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (thisDoc.url.path.length()
|
if (thisDoc.url.path.length()
|
||||||
< otherDoc.url.path.length())
|
< otherDoc.url.path.length())
|
||||||
{
|
{
|
||||||
logger.info("{} duplicates {}", otherDoc.url, thisDoc.url);
|
logger.debug("{} duplicates {}", otherDoc.url, thisDoc.url);
|
||||||
|
|
||||||
otherDoc.state = EdgeUrlState.DISQUALIFIED;
|
otherDoc.state = EdgeUrlState.DISQUALIFIED;
|
||||||
otherDoc.stateReason = "Duplicate";
|
otherDoc.stateReason = "Duplicate";
|
||||||
|
Loading…
Reference in New Issue
Block a user