mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(ngram) Correct size value in ngram lexicon generation, trim the terms better
This commit is contained in:
parent
f064992137
commit
1329d4abd8
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.segmentation;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.*;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class NgramExtractorMain {
|
||||
public static void main(String... args) throws IOException, InterruptedException {
|
||||
dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
|
||||
Path.of("/tmp/ngram-counts.bin"));
|
||||
}
|
||||
|
||||
private static List<String> getNgramTitleTerms(String title) {
|
||||
@ -64,7 +61,6 @@ public class NgramExtractorMain {
|
||||
private static List<String> cleanTerms(List<String> terms) {
|
||||
// Trim the discovered terms
|
||||
terms.replaceAll(s -> {
|
||||
|
||||
// Remove trailing parentheses and their contents
|
||||
if (s.endsWith(")")) {
|
||||
int idx = s.lastIndexOf('(');
|
||||
@ -73,6 +69,10 @@ public class NgramExtractorMain {
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
});
|
||||
|
||||
terms.replaceAll(s -> {
|
||||
// Remove leading "list of "
|
||||
if (s.startsWith("list of ")) {
|
||||
return s.substring("list of ".length());
|
||||
@ -81,6 +81,15 @@ public class NgramExtractorMain {
|
||||
return s;
|
||||
});
|
||||
|
||||
terms.replaceAll(s -> {
|
||||
// Remove trailing punctuation
|
||||
if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
|
||||
return s.substring(0, s.length() - 1);
|
||||
}
|
||||
|
||||
return s;
|
||||
});
|
||||
|
||||
// Remove terms that are too short or too long
|
||||
terms.removeIf(s -> {
|
||||
if (!s.contains(" "))
|
||||
|
@ -21,6 +21,7 @@ import java.util.List;
|
||||
public class NgramLexicon {
|
||||
private final Long2IntOpenCustomHashMap counts;
|
||||
|
||||
private int size;
|
||||
private static final HasherGroup orderedHasher = HasherGroup.ordered();
|
||||
|
||||
@Inject
|
||||
@ -31,10 +32,16 @@ public class NgramLexicon {
|
||||
(int) size,
|
||||
new KeyIsAlreadyHashStrategy()
|
||||
);
|
||||
counts.defaultReturnValue(0);
|
||||
|
||||
try {
|
||||
for (int i = 0; i < size; i++) {
|
||||
counts.put(dis.readLong(), dis.readInt());
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@ -137,8 +144,12 @@ public class NgramLexicon {
|
||||
public void incOrderedTitle(long hashOrdered) {
|
||||
int value = counts.get(hashOrdered);
|
||||
|
||||
if (value < 0) value = -value + 1;
|
||||
else value ++;
|
||||
if (value <= 0) {
|
||||
size ++;
|
||||
value = -value;
|
||||
}
|
||||
|
||||
value ++;
|
||||
|
||||
counts.put(hashOrdered, value);
|
||||
}
|
||||
@ -147,7 +158,7 @@ public class NgramLexicon {
|
||||
int value = counts.get(hashOrdered);
|
||||
|
||||
if (value <= 0) value --;
|
||||
else value ++;
|
||||
else value++;
|
||||
|
||||
counts.put(hashOrdered, value);
|
||||
}
|
||||
@ -157,7 +168,8 @@ public class NgramLexicon {
|
||||
StandardOpenOption.CREATE,
|
||||
StandardOpenOption.TRUNCATE_EXISTING,
|
||||
StandardOpenOption.WRITE))) {
|
||||
dos.writeInt(counts.size());
|
||||
|
||||
dos.writeInt(size);
|
||||
|
||||
counts.forEach((k, v) -> {
|
||||
try {
|
||||
|
Loading…
Reference in New Issue
Block a user