Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 05:18:58 +00:00
(ngram) Correct size value in ngram lexicon generation, trim the terms better

parent f064992137
commit 1329d4abd8

NgramExtractorMain.java

@@ -1,7 +1,6 @@
 package nu.marginalia.segmentation;
 
 import it.unimi.dsi.fastutil.longs.*;
-import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit;
 
 public class NgramExtractorMain {
     public static void main(String... args) throws IOException, InterruptedException {
-        dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
-                Path.of("/tmp/ngram-counts.bin"));
     }
 
     private static List<String> getNgramTitleTerms(String title) {
@@ -64,7 +61,6 @@ public class NgramExtractorMain {
     private static List<String> cleanTerms(List<String> terms) {
         // Trim the discovered terms
         terms.replaceAll(s -> {
-
             // Remove trailing parentheses and their contents
             if (s.endsWith(")")) {
                 int idx = s.lastIndexOf('(');
@@ -73,6 +69,10 @@ public class NgramExtractorMain {
                 }
             }
 
+            return s;
+        });
+
+        terms.replaceAll(s -> {
             // Remove leading "list of "
             if (s.startsWith("list of ")) {
                 return s.substring("list of ".length());
@@ -81,6 +81,15 @@ public class NgramExtractorMain {
             return s;
         });
 
+        terms.replaceAll(s -> {
+            // Remove trailing punctuation
+            if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
+                return s.substring(0, s.length() - 1);
+            }
+
+            return s;
+        });
+
         // Remove terms that are too short or too long
         terms.removeIf(s -> {
             if (!s.contains(" "))
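
The cleanTerms() changes above split the trimming into separate replaceAll passes and add a pass for trailing punctuation. A rough, self-contained illustration of the effect (not part of the commit: the sample titles are made up, and the body of the parenthesis-stripping branch is approximated since the diff only shows its opening lines):

import java.util.ArrayList;
import java.util.List;

class TrimTermsExample {
    public static void main(String... args) {
        // Made-up sample titles, only for illustrating the passes above
        List<String> terms = new ArrayList<>(List.of(
                "foo bar (disambiguation)",
                "list of lighthouses in sweden",
                "foo bar,"));

        // Remove trailing parentheses and their contents
        // (approximated; the diff does not show the full branch)
        terms.replaceAll(s -> {
            if (s.endsWith(")")) {
                int idx = s.lastIndexOf('(');
                if (idx > 0)
                    return s.substring(0, idx).trim();
            }
            return s;
        });

        // Remove leading "list of "
        terms.replaceAll(s -> s.startsWith("list of ")
                ? s.substring("list of ".length())
                : s);

        // Remove trailing punctuation
        terms.replaceAll(s -> {
            if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";"))
                return s.substring(0, s.length() - 1);
            return s;
        });

        // Prints: [foo bar, lighthouses in sweden, foo bar]
        System.out.println(terms);
    }
}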

NgramLexicon.java

@@ -21,6 +21,7 @@ import java.util.List;
 public class NgramLexicon {
     private final Long2IntOpenCustomHashMap counts;
 
+    private int size;
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
 
     @Inject
@@ -31,9 +32,15 @@ public class NgramLexicon {
                     (int) size,
                     new KeyIsAlreadyHashStrategy()
             );
+            counts.defaultReturnValue(0);
 
-            for (int i = 0; i < size; i++) {
-                counts.put(dis.readLong(), dis.readInt());
+            try {
+                for (int i = 0; i < size; i++) {
+                    counts.put(dis.readLong(), dis.readInt());
+                }
+            }
+            catch (IOException ex) {
+                ex.printStackTrace();
             }
         } catch (IOException e) {
             throw new RuntimeException(e);
@@ -137,8 +144,12 @@ public class NgramLexicon {
     public void incOrderedTitle(long hashOrdered) {
         int value = counts.get(hashOrdered);
 
-        if (value < 0) value = -value + 1;
-        else value ++;
+        if (value <= 0) {
+            size ++;
+            value = -value;
+        }
+
+        value ++;
 
         counts.put(hashOrdered, value);
     }
@@ -147,7 +158,7 @@ public class NgramLexicon {
         int value = counts.get(hashOrdered);
 
         if (value <= 0) value --;
-        else value ++;
+        else value++;
 
         counts.put(hashOrdered, value);
     }
@@ -157,7 +168,8 @@ public class NgramLexicon {
                 StandardOpenOption.CREATE,
                 StandardOpenOption.TRUNCATE_EXISTING,
                 StandardOpenOption.WRITE))) {
-            dos.writeInt(counts.size());
+            dos.writeInt(size);
+
             counts.forEach((k, v) -> {
                 try {
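
Read together, the hunks above suggest the lexicon stores plain n-gram occurrences as non-positive values in counts and only flips an entry positive the first time it is seen as a title term, incrementing the new size field at that point; saveCounts() then writes that tracked size instead of counts.size(), which appears to be the "correct size value" part of the commit message. A minimal sketch of that bookkeeping under this reading (the class name SignedCounts and the method name incOrdered are placeholders; only incOrderedTitle is named in the diff):

import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

class SignedCounts {
    private final Long2IntOpenHashMap counts = new Long2IntOpenHashMap();
    private int size; // number of entries flipped positive by a title occurrence

    SignedCounts() {
        counts.defaultReturnValue(0); // absent keys read as 0, as in the constructor hunk
    }

    // Plain occurrence: keep the value non-positive until a title occurrence arrives
    void incOrdered(long hash) {
        int value = counts.get(hash);
        if (value <= 0) value--;
        else value++;
        counts.put(hash, value);
    }

    // Title occurrence: flip a non-positive value positive and count the entry once
    void incOrderedTitle(long hash) {
        int value = counts.get(hash);
        if (value <= 0) {
            size++;
            value = -value;
        }
        value++;
        counts.put(hash, value);
    }

    // The value saveCounts() would write as the record count
    int size() {
        return size;
    }
}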