(ngram) Grab titles separately when extracting ngrams from wiki data

Viktor Lofgren 2024-04-13 19:34:16 +02:00
parent 0da03d4cfc
commit f3255e080d


@@ -117,10 +117,9 @@ public class NgramExtractorMain {
                 32
         );
 
-        reader.forEachArticles((title, body) -> {
+        reader.forEachTitles((title) -> {
             pool.submitQuietly(() -> {
                 LongArrayList orderedHashesTitle = new LongArrayList();
-                LongArrayList orderedHashesBody = new LongArrayList();
 
                 String normalizedTitle = title.replace('_', ' ');
 
@@ -128,6 +127,18 @@ public class NgramExtractorMain {
                     String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                     orderedHashesTitle.add(orderedHasher.rollingHash(terms));
                 }
+
+                synchronized (lexicon) {
+                    for (var hash : orderedHashesTitle) {
+                        lexicon.incOrderedTitle(hash);
+                    }
+                }
+            });
+        });
+
+        reader.forEachArticles((title, body) -> {
+            pool.submitQuietly(() -> {
+                LongArrayList orderedHashesBody = new LongArrayList();
 
                 for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                     String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
@@ -135,9 +146,6 @@ public class NgramExtractorMain {
                 }
 
                 synchronized (lexicon) {
-                    for (var hash : orderedHashesTitle) {
-                        lexicon.incOrderedTitle(hash);
-                    }
                     for (var hash : orderedHashesBody) {
                         lexicon.incOrderedBody(hash);
                     }
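
For orientation, a sketch of how the extraction loop reads after this change, pieced together from the hunks above: titles are hashed and counted in their own pass before article bodies are processed, so each worker task carries only one hash list and each synchronized section touches only one set of counters. The surrounding objects (reader, pool, lexicon, orderedHasher) come from earlier in the method, and the title loop header (getNgramTitleTerms) plus the body-side rollingHash call are filled in by symmetry rather than shown in the diff.

    // Pass 1: titles. Hash each stemmed title n-gram and count it under the title counters.
    reader.forEachTitles((title) -> {
        pool.submitQuietly(() -> {
            LongArrayList orderedHashesTitle = new LongArrayList();

            String normalizedTitle = title.replace('_', ' ');

            // Loop header assumed; the diff only shows its body.
            for (var sent : getNgramTitleTerms(normalizedTitle)) {
                String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                orderedHashesTitle.add(orderedHasher.rollingHash(terms));
            }

            synchronized (lexicon) {
                for (var hash : orderedHashesTitle) {
                    lexicon.incOrderedTitle(hash);
                }
            }
        });
    });

    // Pass 2: article bodies. Parse the HTML and count body n-grams under the body counters.
    reader.forEachArticles((title, body) -> {
        pool.submitQuietly(() -> {
            LongArrayList orderedHashesBody = new LongArrayList();

            for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
                String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
                orderedHashesBody.add(orderedHasher.rollingHash(terms)); // assumed by symmetry; not shown in the diff
            }

            synchronized (lexicon) {
                for (var hash : orderedHashesBody) {
                    lexicon.incOrderedBody(hash);
                }
            }
        });
    });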