From 7a1edc08802e45ad909bc297215bac3c88937511 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Jul 2024 12:23:28 +0200 Subject: [PATCH] (term-freq) Reduce the number of low-relevance words in the dictionary Using a statistical trick to reduce the number of low-frequency words in the dictionary, as they are numerous and not very informative. --- .../extractor/TermFrequencyExporter.java | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 998e94a4..4283a657 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -27,6 +27,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.nio.file.attribute.PosixFilePermissions; +import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -96,8 +97,13 @@ public class TermFrequencyExporter implements ExporterIf { } - private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) { - TLongHashSet words = new TLongHashSet(10_000); + private void processFile(Path crawlDataPath, + TLongIntHashMap counts, + AtomicInteger docCount, + SentenceExtractor se) + { + TLongHashSet words = new TLongHashSet(1000); + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { while (stream.hasNext()) { if (Thread.interrupted()) @@ -119,15 +125,33 @@ public class TermFrequencyExporter implements ExporterIf { return; } - for (var sent : dld.sentences) { + for (var sent : dld) { + // Skip sentences with non-language tags, e.g. 
program code + if (sent.htmlTags.stream().anyMatch(t -> t.nonLanguage)) + continue; + for (var word : sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } } + var random = ThreadLocalRandom.current(); synchronized (counts) { words.forEach(w -> { - counts.adjustOrPutValue(w, 1, 1); + // Mathematicians hate him for this one weird trick: + // + // We generally aren't interested in low-frequency entries, + // but due to Zipf's law, there are a lot of them; in fact + // almost the entire term frequency dictionary is full of them. + // + // So we use a simple statistical trick to reduce the number + // of nearly unique entries in the dictionary, while still keeping the + // distribution of higher-frequency entries relatively intact. + + if (random.nextDouble() < 0.2) { + counts.adjustOrPutValue(w, 5, 5); + } + return true; }); }