mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(WIP) Improve data extraction from Wikipedia data
This commit is contained in:
parent
8ae1f08095
commit
07e4d7ec6d
@ -22,9 +22,15 @@ public class NgramExtractorMain {
|
|||||||
public static void main(String... args) {
|
public static void main(String... args) {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<String> getNgramTerms(Document document) {
|
private static List<String> getNgramTerms(String title, Document document) {
|
||||||
List<String> terms = new ArrayList<>();
|
List<String> terms = new ArrayList<>();
|
||||||
|
|
||||||
|
// Add the title
|
||||||
|
if (title.contains(" ")) {
|
||||||
|
terms.add(title.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab all internal links
|
||||||
document.select("a[href]").forEach(e -> {
|
document.select("a[href]").forEach(e -> {
|
||||||
var href = e.attr("href");
|
var href = e.attr("href");
|
||||||
if (href.contains(":"))
|
if (href.contains(":"))
|
||||||
@ -39,6 +45,43 @@ public class NgramExtractorMain {
|
|||||||
terms.add(text);
|
terms.add(text);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Grab all italicized text
|
||||||
|
document.getElementsByTag("i").forEach(e -> {
|
||||||
|
var text = e.text().toLowerCase();
|
||||||
|
if (!text.contains(" "))
|
||||||
|
return;
|
||||||
|
|
||||||
|
terms.add(text);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Trim the discovered terms
|
||||||
|
terms.replaceAll(s -> {
|
||||||
|
|
||||||
|
// Remove trailing parentheses and their contents
|
||||||
|
if (s.endsWith(")")) {
|
||||||
|
int idx = s.lastIndexOf('(');
|
||||||
|
if (idx > 0) {
|
||||||
|
return s.substring(0, idx).trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove leading "list of "
|
||||||
|
if (s.startsWith("list of ")) {
|
||||||
|
return s.substring("list of ".length());
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Remove terms that are too short or too long
|
||||||
|
terms.removeIf(s -> {
|
||||||
|
if (!s.contains(" "))
|
||||||
|
return true;
|
||||||
|
if (s.length() > 64)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
return terms;
|
return terms;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -56,7 +99,7 @@ public class NgramExtractorMain {
|
|||||||
try (var executor = Executors.newWorkStealingPool()) {
|
try (var executor = Executors.newWorkStealingPool()) {
|
||||||
reader.forEachArticles((title, body) -> {
|
reader.forEachArticles((title, body) -> {
|
||||||
executor.submit(() -> {
|
executor.submit(() -> {
|
||||||
var terms = getNgramTerms(Jsoup.parse(body));
|
var terms = getNgramTerms(title, Jsoup.parse(body));
|
||||||
synchronized (known) {
|
synchronized (known) {
|
||||||
for (String term : terms) {
|
for (String term : terms) {
|
||||||
if (known.add(hash.hashNearlyASCII(term))) {
|
if (known.add(hash.hashNearlyASCII(term))) {
|
||||||
@ -72,7 +115,9 @@ public class NgramExtractorMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static void dumpCounts(Path zimInputFile,
|
public static void dumpCounts(Path zimInputFile,
|
||||||
Path countsOutputFile) throws IOException, InterruptedException
|
Path countsOutputFile,
|
||||||
|
Path permutationsOutputFile
|
||||||
|
) throws IOException, InterruptedException
|
||||||
{
|
{
|
||||||
ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString()));
|
ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString()));
|
||||||
|
|
||||||
@ -87,7 +132,7 @@ public class NgramExtractorMain {
|
|||||||
LongArrayList orderedHashes = new LongArrayList();
|
LongArrayList orderedHashes = new LongArrayList();
|
||||||
LongArrayList unorderedHashes = new LongArrayList();
|
LongArrayList unorderedHashes = new LongArrayList();
|
||||||
|
|
||||||
for (var sent : getNgramTerms(Jsoup.parse(body))) {
|
for (var sent : getNgramTerms(title, Jsoup.parse(body))) {
|
||||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||||
|
|
||||||
orderedHashes.add(orderedHasher.rollingHash(terms));
|
orderedHashes.add(orderedHasher.rollingHash(terms));
|
||||||
@ -108,6 +153,7 @@ public class NgramExtractorMain {
|
|||||||
}
|
}
|
||||||
|
|
||||||
lexicon.saveCounts(countsOutputFile);
|
lexicon.saveCounts(countsOutputFile);
|
||||||
|
lexicon.savePermutations(permutationsOutputFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,7 @@ import java.util.ArrayList;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
class NgramLexicon {
|
public class NgramLexicon {
|
||||||
private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
|
private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
|
||||||
100_000_000,
|
100_000_000,
|
||||||
new KeyIsAlreadyHashStrategy()
|
new KeyIsAlreadyHashStrategy()
|
||||||
|
Loading…
Reference in New Issue
Block a user