From 52f0c0d33649ea848f2536320ddf2a4a6b27727d Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 13 Apr 2024 19:34:16 +0200
Subject: [PATCH] (ngram) Grab titles separately when extracting ngrams from wiki data

---
 .../segmentation/NgramExtractorMain.java | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
index f6ba5b08..b0eb6916 100644
--- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
@@ -117,10 +117,9 @@ public class NgramExtractorMain {
                 32
         );
 
-        reader.forEachArticles((title, body) -> {
+        reader.forEachTitles((title) -> {
             pool.submitQuietly(() -> {
                 LongArrayList orderedHashesTitle = new LongArrayList();
-                LongArrayList orderedHashesBody = new LongArrayList();
 
                 String normalizedTitle = title.replace('_', ' ');
 
@@ -128,6 +127,18 @@ public class NgramExtractorMain {
                     String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                     orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                 }
+                synchronized (lexicon) {
+                    for (var hash : orderedHashesTitle) {
+                        lexicon.incOrderedTitle(hash);
+                    }
+                }
+            });
+
+        });
+
+        reader.forEachArticles((title, body) -> {
+            pool.submitQuietly(() -> {
+                LongArrayList orderedHashesBody = new LongArrayList();
 
                 for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                     String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
@@ -135,9 +146,6 @@ public class NgramExtractorMain {
                 }
 
                 synchronized (lexicon) {
-                    for (var hash : orderedHashesTitle) {
-                        lexicon.incOrderedTitle(hash);
-                    }
                     for (var hash : orderedHashesBody) {
                         lexicon.incOrderedBody(hash);
                     }
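
Note: the patch replaces one combined pass (titles and bodies hashed in the same article callback) with two sequential passes: a titles-only pass followed by a full-article pass, each fanning work out to the pool and merging into the shared lexicon inside a synchronized block. Below is a minimal, self-contained sketch of that general pattern using only JDK types; TwoPassSketch, Article, titleCounts and bodyCounts are placeholder names for illustration, not Marginalia's actual API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Sketch of the two-pass pattern: pass 1 touches only titles, pass 2 only
// bodies. Each task accumulates locally and takes the lexicon lock once,
// mirroring the structure of the patched code.
public class TwoPassSketch {
    record Article(String title, String body) {}

    public static void main(String[] args) throws InterruptedException {
        List<Article> articles = List.of(
                new Article("Rolling_hash", "A rolling hash is a hash function over a sliding window"),
                new Article("N-gram", "An n-gram is a contiguous sequence of n items"));

        Map<String, Integer> titleCounts = new HashMap<>();
        Map<String, Integer> bodyCounts = new HashMap<>();
        Object lexicon = new Object(); // stands in for the shared lexicon

        ExecutorService pool = Executors.newFixedThreadPool(4);

        // Pass 1: titles only; no body parsing happens here.
        for (Article a : articles) {
            pool.submit(() -> {
                String normalizedTitle = a.title().replace('_', ' ');
                synchronized (lexicon) {
                    titleCounts.merge(normalizedTitle, 1, Integer::sum);
                }
            });
        }

        // Pass 2: full articles; body terms are counted separately.
        for (Article a : articles) {
            pool.submit(() -> {
                List<String> terms = new ArrayList<>();
                for (String term : a.body().toLowerCase().split("\\W+")) {
                    if (!term.isBlank()) terms.add(term);
                }
                synchronized (lexicon) { // one lock acquisition per article
                    for (String term : terms) {
                        bodyCounts.merge(term, 1, Integer::sum);
                    }
                }
            });
        }

        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.MINUTES);
        System.out.println("titles: " + titleCounts);
        System.out.println("bodies: " + bodyCounts);
    }
}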