mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(ngram) Grab titles separately when extracting ngrams from wiki data
This commit is contained in:
parent
be55f3f937
commit
52f0c0d336
@ -117,10 +117,9 @@ public class NgramExtractorMain {
|
||||
32
|
||||
);
|
||||
|
||||
reader.forEachArticles((title, body) -> {
|
||||
reader.forEachTitles((title) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
|
||||
@ -128,6 +127,18 @@ public class NgramExtractorMain {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
reader.forEachArticles((title, body) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
@ -135,9 +146,6 @@ public class NgramExtractorMain {
|
||||
}
|
||||
|
||||
synchronized (lexicon) {
|
||||
for (var hash : orderedHashesTitle) {
|
||||
lexicon.incOrderedTitle(hash);
|
||||
}
|
||||
for (var hash : orderedHashesBody) {
|
||||
lexicon.incOrderedBody(hash);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user