From 07e4d7ec6d2806dc65a1916bdb5cb16d8f8deced Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:00 +0100 Subject: [PATCH] (WIP) Improve data extraction from wikipedia data --- .../segmentation/NgramExtractorMain.java | 54 +++++++++++++++++-- .../segmentation/NgramLexicon.java | 2 +- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java index 0339b2c1..4cd4b296 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java @@ -22,9 +22,15 @@ public class NgramExtractorMain { public static void main(String... args) { } - private static List getNgramTerms(Document document) { + private static List getNgramTerms(String title, Document document) { List terms = new ArrayList<>(); + // Add the title + if (title.contains(" ")) { + terms.add(title.toLowerCase()); + } + + // Grab all internal links document.select("a[href]").forEach(e -> { var href = e.attr("href"); if (href.contains(":")) @@ -39,6 +45,43 @@ public class NgramExtractorMain { terms.add(text); }); + // Grab all italicized text + document.getElementsByTag("i").forEach(e -> { + var text = e.text().toLowerCase(); + if (!text.contains(" ")) + return; + + terms.add(text); + }); + + // Trim the discovered terms + terms.replaceAll(s -> { + + // Remove trailing parentheses and their contents + if (s.endsWith(")")) { + int idx = s.lastIndexOf('('); + if (idx > 0) { + return s.substring(0, idx).trim(); + } + } + + // Remove leading "list of " + if (s.startsWith("list of ")) { + return s.substring("list of ".length()); + } + + return s; + }); + + // Remove terms that are too short or too long + terms.removeIf(s -> { + if (!s.contains(" ")) + return true; + if (s.length() > 64) + return true; + return false; + }); + return terms; } @@ -56,7 +99,7 @@ public class NgramExtractorMain { try (var executor = Executors.newWorkStealingPool()) { reader.forEachArticles((title, body) -> { executor.submit(() -> { - var terms = getNgramTerms(Jsoup.parse(body)); + var terms = getNgramTerms(title, Jsoup.parse(body)); synchronized (known) { for (String term : terms) { if (known.add(hash.hashNearlyASCII(term))) { @@ -72,7 +115,9 @@ public class NgramExtractorMain { } public static void dumpCounts(Path zimInputFile, - Path countsOutputFile) throws IOException, InterruptedException + Path countsOutputFile, + Path permutationsOutputFile + ) throws IOException, InterruptedException { ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); @@ -87,7 +132,7 @@ public class NgramExtractorMain { LongArrayList orderedHashes = new LongArrayList(); LongArrayList unorderedHashes = new LongArrayList(); - for (var sent : getNgramTerms(Jsoup.parse(body))) { + for (var sent : getNgramTerms(title, Jsoup.parse(body))) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); orderedHashes.add(orderedHasher.rollingHash(terms)); @@ -108,6 +153,7 @@ public class NgramExtractorMain { } lexicon.saveCounts(countsOutputFile); + lexicon.savePermutations(permutationsOutputFile); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java index 948347bf..f8044e12 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -14,7 +14,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -class NgramLexicon { +public class NgramLexicon { private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( 100_000_000, new KeyIsAlreadyHashStrategy()