(keywords) Clean up leaky abstractions, clean up tests

commit 5f427d2b4c
parent 8c0ce4fc1d
nu/marginalia/keyword/model/DocumentKeywords.java

@@ -3,26 +3,32 @@ package nu.marginalia.keyword.model;
 import nu.marginalia.model.idx.WordMetadata;
 
 import java.io.Serializable;
-import java.util.Arrays;
 
-public final class DocumentKeywords {
-    final String[] keywords;
-    final long[] metadata;
-
+public record DocumentKeywords(String[] keywords,
+                               long[] metadata)
+implements Serializable
+{
     public DocumentKeywords(String[] keywords,
                             long[] metadata)
     {
         this.keywords = keywords;
         this.metadata = metadata;
     }
 
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
         sb.append(getClass().getSimpleName());
         sb.append('[');
-        for (int i = 0; i < keywords.length; i++) {
+        var pointer = newPointer();
+        while (pointer.advancePointer()) {
             sb.append("\n\t ");
-            if (metadata[i] != 0) {
-                sb.append(keywords[i]).append("/").append(new WordMetadata(metadata[i]));
-            }
-            else {
-                sb.append(keywords[i]);
+
+            long metadata = pointer.getMetadata();
+            String keyword = pointer.getKeyword();
+            sb.append(keyword);
+
+            if (metadata != 0) {
+                sb.append("/").append(new WordMetadata(metadata));
             }
         }
         return sb.append("\n]").toString();

@@ -36,7 +42,11 @@ implements Serializable
         return keywords.length;
     }
 
-    public DocumentKeywords subList(int start, int end) {
-        return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
+    /** Return a pointer for traversing this structure */
+    public DocumentKeywordsPointer newPointer() {
+        return new DocumentKeywordsPointer(this);
     }
 
 }
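With DocumentKeywords converted to a record, the keywords() and metadata() accessors are compiler-generated from the record components. A minimal sketch of what a caller sees (the array contents here are invented for illustration):

    var dk = new DocumentKeywords(new String[] { "example" }, new long[] { 42L });
    System.out.println(dk.keywords()[0]);   // "example", via the generated accessor
    System.out.println(dk.metadata()[0]);   // 42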
nu/marginalia/keyword/model/DocumentKeywordsPointer.java (new file)

@@ -0,0 +1,41 @@
+package nu.marginalia.keyword.model;
+
+/** Pointer into a {@link DocumentKeywords}.  It starts out before the first position;
+ *  advance it forward with advancePointer().
+ * */
+public class DocumentKeywordsPointer {
+    private int pos = -1;
+
+    private final DocumentKeywords keywords;
+
+    DocumentKeywordsPointer(DocumentKeywords keywords) {
+        this.keywords = keywords;
+    }
+
+    /** Number of positions remaining */
+    public int remaining() {
+        return keywords.size() - Math.max(0, pos);
+    }
+
+    /** Return the keyword associated with the current position */
+    public String getKeyword() {
+        return keywords.keywords()[pos];
+    }
+
+    /** Return the metadata associated with the current position */
+    public long getMetadata() {
+        return keywords.metadata()[pos];
+    }
+
+    /** Advance the current position,
+     *  returns false if this was the
+     *  last position */
+    public boolean advancePointer() {
+        return ++pos < keywords.size();
+    }
+
+    /** Returns true unless the pointer is beyond the last position in the keyword set */
+    public boolean hasMore() {
+        return pos + 1 < keywords.size();
+    }
+}
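A minimal usage sketch of the new pointer API, with a hand-built DocumentKeywords (the keyword strings and metadata values are invented for illustration):

    var keywords = new DocumentKeywords(
            new String[] { "marginalia", "search" },
            new long[]   { 0L, 5L });

    var pointer = keywords.newPointer();
    while (pointer.advancePointer()) {
        // getKeyword()/getMetadata() read the position the pointer was advanced to
        System.out.println(pointer.getKeyword() + "/" + pointer.getMetadata());
    }

Note that the pointer starts one step before the first position, so advancePointer() must succeed before the first getKeyword()/getMetadata() call.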
DocumentKeywordExtractorTest.java

@@ -17,10 +17,11 @@ import java.util.Objects;
 
 class DocumentKeywordExtractorTest {
 
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
     @Test
     public void testWordPattern() {
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
-
         Assertions.assertTrue(extractor.matchesWordPattern("test"));
         Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
         Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));

@@ -34,6 +35,24 @@ class DocumentKeywordExtractorTest {
         Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
     }
 
+
+    @Test
+    public void testEmptyMetadata() throws URISyntaxException {
+        var dld = se.extractSentences("""
+                Some sample text, I'm not sure what even triggers this
+                """, "A title perhaps?");
+        var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
+        var keywords = keywordBuilder.build();
+
+        var pointer = keywords.newPointer();
+        while (pointer.advancePointer()) {
+            if (pointer.getMetadata() == 0L) {
+                System.out.println("Aha! " + pointer.getKeyword());
+            }
+        }
+
+    }
+
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),

@@ -42,9 +61,6 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
         System.out.println(keywords.getMetaForWord("mechanical"));
         System.out.println(keywords.getMetaForWord("keyboard"));
SentenceExtractorTest.java

@@ -26,17 +26,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("slow")
 class SentenceExtractorTest {
-    SentenceExtractor newSe;
-    SentenceExtractor legacySe;
     final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    @BeforeEach
-    public void setUp() {
-
-        newSe = new SentenceExtractor(lm);
-        legacySe = new SentenceExtractor(lm);
-    }
-
+    SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {

@@ -65,69 +57,16 @@ class SentenceExtractorTest {
         }
     }
 
-    @SneakyThrows
-    @Test
-    void testExtractSubject() {
-        var data = WmsaHome.getHomePath().resolve("test-data/");
-
-        System.out.println("Running");
-
-        SentenceExtractor se = new SentenceExtractor(lm);
-        KeywordExtractor keywordExtractor = new KeywordExtractor();
-
-        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
-            System.out.println(file);
-            var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
-            Map<String, Integer> counts = new HashMap<>();
-            for (var sentence : dld.sentences) {
-                for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
-                    if (kw.end + 2 >= sentence.length()) {
-                        continue;
-                    }
-                    if (sentence.separators[kw.end] == WordSeparator.COMMA
-                            || sentence.separators[kw.end + 1] == WordSeparator.COMMA)
-                        break;
-
-                    if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
-                            && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
-                    ) {
-                        counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)).word, -1, Integer::sum);
-                    }
-                }
-            }
-
-            int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
-
-            counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                    .filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
-                    .forEach(System.out::println);
-        }
-
-    }
-
-
-    @SneakyThrows
-    @Test
-    @Disabled
-    public void testSE() {
-        var result = newSe.extractSentences(
-                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
-
-        var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
-    }
-
     @Test
     public void separatorExtraction() {
         seprateExtractor("Cookies, cream and shoes");
         seprateExtractor("Cookies");
         seprateExtractor("");
 
     }
 
     @Test
     public void testACDC() {
-        var ret = newSe.extractSentence("AC/DC is a rock band.");
+        var ret = se.extractSentence("AC/DC is a rock band.");
         assertEquals("AC/DC", ret.words[0]);
     }
 

@@ -139,7 +78,6 @@ class SentenceExtractorTest {
         List<String> words = new ArrayList<>();
         List<String> separators = new ArrayList<>();
 
-        int start = 0;
         int wordStart = 0;
         while (wordStart <= sentence.length()) {
             if (!matcher.find(wordStart)) {
nu/marginalia/loading/loader/KeywordListChunker.java (deleted)

@@ -1,34 +0,0 @@
-package nu.marginalia.loading.loader;
-
-
-import nu.marginalia.keyword.model.DocumentKeywords;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-public class KeywordListChunker {
-
-    /** Chops data into a list of lists of max length size
-     *
-     * Caveat: Relies on subList and does not clone "data", so
-     * changes to the original list may affect the sub-lists
-     * in unspecified ways
-     *
-     * @see List#subList
-     */
-    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
-        if (data.isEmpty())
-            return Collections.emptyList();
-        else if (data.size() < size)
-            return List.of(data);
-
-        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
-
-        for (int i = 0; i < data.size(); i+=size) {
-            ret.add(data.subList(i, Math.min(data.size(), i+size)));
-        }
-
-        return ret;
-    }
-}
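For reference, the removed helper split the index range [0, data.size()) into windows of at most size elements. A stand-alone sketch of the same arithmetic, with an invented total of 5 keywords and a chunk size of 2:

    int size = 2, total = 5;
    for (int i = 0; i < total; i += size) {
        // prints [0, 2) then [2, 4) then [4, 5)
        System.out.println("[" + i + ", " + Math.min(total, i + size) + ")");
    }

The same batching now happens inline in LoaderIndexJournalWriter below, driven by DocumentKeywordsPointer rather than subList().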
LoaderIndexJournalWriter.java

@@ -9,7 +9,6 @@ import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.model.idx.DocumentMetadata;

@@ -20,7 +19,6 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.sql.SQLException;
-import java.util.Arrays;
 
 import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
 

@@ -30,6 +28,10 @@ public class LoaderIndexJournalWriter {
     private final IndexJournalWriter indexWriter;
     private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
 
+    private final MurmurHash3_128 hasher = new MurmurHash3_128();
+    private final long[] buffer = new long[MAX_LENGTH * 2];
+
+
     @Inject
     public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
         var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);

@@ -42,14 +44,13 @@ public class LoaderIndexJournalWriter {
         indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath());
     }
 
-    MurmurHash3_128 hasher = new MurmurHash3_128();
-    long[] buffer = new long[MAX_LENGTH * 2];
     @SneakyThrows
     public void putWords(long combinedId,
                          int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
-        if (wordSet.keywords().length == 0) {
+
+        if (wordSet.isEmpty()) {
             logger.info("Skipping zero-length word set for {}", combinedId);
             return;
         }

@@ -59,23 +60,24 @@ public class LoaderIndexJournalWriter {
             return;
         }
 
-        String[] words = wordSet.keywords();
-        long[] meta = wordSet.metadata();
+        var pointer = wordSet.newPointer();
 
-        for (int start = 0; start < words.length; ) {
-            int end = Math.min(start + MAX_LENGTH, words.length);
+        while (pointer.hasMore()) {
+            int i = 0;
 
-            for (int i = 0; i < end - start; i++) {
-                buffer[2*i] = hasher.hashNearlyASCII(words[start+i]);
-                buffer[2*i + 1] = meta[start+i];
+            while (i < buffer.length
+                && pointer.advancePointer())
+            {
+                final long hashedKeyword = hasher.hashNearlyASCII(pointer.getKeyword());
+
+                buffer[i++] = hashedKeyword;
+                buffer[i++] = pointer.getMetadata();
             }
 
-            var entry = new IndexJournalEntryData(2 * (end-start), buffer);
+            var entry = new IndexJournalEntryData(i, buffer);
             var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode());
 
             indexWriter.put(header, entry);
-
-            start = end;
         }
 
    }
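The rewritten putWords pages a keyword set into journal entries without materializing sub-arrays: hasMore() drives the outer per-entry loop, and the inner loop advances the pointer until the buffer is full. A condensed sketch of the same pattern, with an invented 4-slot buffer and stand-ins (String.hashCode(), System.out.println) for hasher.hashNearlyASCII and indexWriter.put:

    var keywords = new DocumentKeywords(
            new String[] { "a", "b", "c" },
            new long[]   { 1L, 2L, 3L });

    long[] buffer = new long[4];   // MAX_LENGTH * 2 in the real code
    var pointer = keywords.newPointer();

    while (pointer.hasMore()) {            // one journal entry per iteration
        int i = 0;
        while (i < buffer.length && pointer.advancePointer()) {
            buffer[i++] = pointer.getKeyword().hashCode();  // stand-in for the murmur hash
            buffer[i++] = pointer.getMetadata();
        }
        System.out.println("entry with " + (i / 2) + " words");  // stand-in for indexWriter.put(...)
    }

Because i counts the longs actually written, a final partially-filled buffer still produces a correctly sized entry.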