(term-freq) Reduce the number of low-relevance words in the dictionary

Using a statistical trick to reduce the number of low-frequency words in the dictionary, as they are numerous and not very informative.
This commit is contained in:
Viktor Lofgren 2024-07-19 12:23:28 +02:00
parent b812e96c6d
commit 7a1edc0880

View File

@ -27,6 +27,7 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption; import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions; import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@ -96,8 +97,13 @@ public class TermFrequencyExporter implements ExporterIf {
} }
private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) { private void processFile(Path crawlDataPath,
TLongHashSet words = new TLongHashSet(10_000); TLongIntHashMap counts,
AtomicInteger docCount,
SentenceExtractor se)
{
TLongHashSet words = new TLongHashSet(1000);
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
while (stream.hasNext()) { while (stream.hasNext()) {
if (Thread.interrupted()) if (Thread.interrupted())
@ -119,15 +125,33 @@ public class TermFrequencyExporter implements ExporterIf {
return; return;
} }
for (var sent : dld.sentences) { for (var sent : dld) {
// Skip sentences with non-language tags, e.g. program code
if (sent.htmlTags.stream().anyMatch(t -> t.nonLanguage))
continue;
for (var word : sent) { for (var word : sent) {
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
} }
} }
var random = ThreadLocalRandom.current();
synchronized (counts) { synchronized (counts) {
words.forEach(w -> { words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1); // Mathematicians hate him for this one weird trick:
//
// We generally aren't interested in low-frequency entries,
// but due to zipf's law, there are a lot of them, in fact
// almost the entire term frequency dictionary is full of them.
//
// So we use a simple statistical trick to reduce the number
// of nearly unique entries in the dictionary, while still keeping the
// distribution of higher-frequency entries relatively intact
if (random.nextDouble() < 0.2) {
counts.adjustOrPutValue(w, 5, 5);
}
return true; return true;
}); });
} }