mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(ngram) Grab titles separately when extracting ngrams from wiki data
This commit is contained in:
parent
0da03d4cfc
commit
f3255e080d
@ -117,10 +117,9 @@ public class NgramExtractorMain {
|
|||||||
32
|
32
|
||||||
);
|
);
|
||||||
|
|
||||||
reader.forEachArticles((title, body) -> {
|
reader.forEachTitles((title) -> {
|
||||||
pool.submitQuietly(() -> {
|
pool.submitQuietly(() -> {
|
||||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||||
LongArrayList orderedHashesBody = new LongArrayList();
|
|
||||||
|
|
||||||
String normalizedTitle = title.replace('_', ' ');
|
String normalizedTitle = title.replace('_', ' ');
|
||||||
|
|
||||||
@ -128,6 +127,18 @@ public class NgramExtractorMain {
|
|||||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||||
}
|
}
|
||||||
|
synchronized (lexicon) {
|
||||||
|
for (var hash : orderedHashesTitle) {
|
||||||
|
lexicon.incOrderedTitle(hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
reader.forEachArticles((title, body) -> {
|
||||||
|
pool.submitQuietly(() -> {
|
||||||
|
LongArrayList orderedHashesBody = new LongArrayList();
|
||||||
|
|
||||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||||
@ -135,9 +146,6 @@ public class NgramExtractorMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
synchronized (lexicon) {
|
synchronized (lexicon) {
|
||||||
for (var hash : orderedHashesTitle) {
|
|
||||||
lexicon.incOrderedTitle(hash);
|
|
||||||
}
|
|
||||||
for (var hash : orderedHashesBody) {
|
for (var hash : orderedHashesBody) {
|
||||||
lexicon.incOrderedBody(hash);
|
lexicon.incOrderedBody(hash);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user