(keywords) Clean up leaky abstractions, clean up tests

Viktor Lofgren 2023-09-01 13:52:00 +02:00
parent 8c0ce4fc1d
commit 5f427d2b4c
6 changed files with 105 additions and 132 deletions

nu/marginalia/keyword/model/DocumentKeywords.java

@@ -3,26 +3,32 @@ package nu.marginalia.keyword.model;
 import nu.marginalia.model.idx.WordMetadata;
 import java.io.Serializable;
-import java.util.Arrays;
-public record DocumentKeywords(String[] keywords,
-                               long[] metadata)
-implements Serializable
-{
+public final class DocumentKeywords {
+    final String[] keywords;
+    final long[] metadata;
+
     public DocumentKeywords(String[] keywords,
                             long[] metadata)
     {
         this.keywords = keywords;
         this.metadata = metadata;
     }
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
         sb.append(getClass().getSimpleName());
         sb.append('[');
-        for (int i = 0; i < keywords.length; i++) {
+        var pointer = newPointer();
+        while (pointer.advancePointer()) {
             sb.append("\n\t ");
-            if (metadata[i] != 0) {
-                sb.append(keywords[i]).append("/").append(new WordMetadata(metadata[i]));
-            }
-            else {
-                sb.append(keywords[i]);
+            long metadata = pointer.getMetadata();
+            String keyword = pointer.getKeyword();
+            sb.append(keyword);
+            if (metadata != 0) {
+                sb.append("/").append(new WordMetadata(metadata));
             }
         }
         return sb.append("\n]").toString();
@@ -36,7 +42,11 @@ implements Serializable
         return keywords.length;
     }
-    public DocumentKeywords subList(int start, int end) {
-        return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
-    }
+    /** Return a pointer for traversing this structure */
+    public DocumentKeywordsPointer newPointer() {
+        return new DocumentKeywordsPointer(this);
+    }
 }
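
Note on the direction of this change, as read from the diff: the record's generated keywords() and metadata() accessors handed the backing arrays to any caller, which is the leaky abstraction of the commit message. With package-private fields, the arrays are only reachable from within nu.marginalia.keyword.model, and everything else traverses the structure through the pointer introduced below.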

nu/marginalia/keyword/model/DocumentKeywordsPointer.java

@@ -0,0 +1,41 @@
+package nu.marginalia.keyword.model;
+
+/** Pointer into a {@link DocumentKeywords}. It starts out before the first position;
+ *  step forward with advancePointer().
+ */
+public class DocumentKeywordsPointer {
+    private int pos = -1;
+
+    private final DocumentKeywords keywords;
+
+    DocumentKeywordsPointer(DocumentKeywords keywords) {
+        this.keywords = keywords;
+    }
+
+    /** Number of positions remaining */
+    public int remaining() {
+        return keywords.size() - Math.max(0, pos);
+    }
+
+    /** Return the keyword associated with the current position */
+    public String getKeyword() {
+        return keywords.keywords[pos];
+    }
+
+    /** Return the metadata associated with the current position */
+    public long getMetadata() {
+        return keywords.metadata[pos];
+    }
+
+    /** Advance the current position; returns false if the pointer was already at the last position */
+    public boolean advancePointer() {
+        return ++pos < keywords.size();
+    }
+
+    /** Returns true if the pointer can be advanced to another position */
+    public boolean hasMore() {
+        return pos + 1 < keywords.size();
+    }
+}
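
For orientation, a minimal usage sketch of the pointer API added above. The keyword strings and metadata values are invented for illustration; the types and methods are the ones from this commit:

    // Hypothetical data; any DocumentKeywords built by the extractor works the same way.
    var keywords = new DocumentKeywords(
            new String[] { "marginalia", "search" },
            new long[]   { 0L, 5L });

    var pointer = keywords.newPointer();  // starts out before the first position
    while (pointer.advancePointer()) {    // false once the last position is consumed
        System.out.println(pointer.getKeyword() + "/" + pointer.getMetadata());
    }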

DocumentKeywordExtractorTest.java

@@ -17,10 +17,11 @@ import java.util.Objects;
 class DocumentKeywordExtractorTest {
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
     @Test
     public void testWordPattern() {
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
         Assertions.assertTrue(extractor.matchesWordPattern("test"));
         Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
         Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
@@ -34,6 +35,24 @@ class DocumentKeywordExtractorTest {
         Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
     }
 
+    @Test
+    public void testEmptyMetadata() throws URISyntaxException {
+        var dld = se.extractSentences("""
+                Some sample text, I'm not sure what even triggers this
+                """, "A title perhaps?");
+        var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
+        var keywords = keywordBuilder.build();
+
+        var pointer = keywords.newPointer();
+        while (pointer.advancePointer()) {
+            if (pointer.getMetadata() == 0L) {
+                System.out.println("Aha! " + pointer.getKeyword());
+            }
+        }
+    }
+
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -42,9 +61,6 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
         System.out.println(keywords.getMetaForWord("mechanical"));
         System.out.println(keywords.getMetaForWord("keyboard"));

SentenceExtractorTest.java

@@ -26,17 +26,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 @Tag("slow")
 class SentenceExtractorTest {
-    SentenceExtractor newSe;
-    SentenceExtractor legacySe;
     final LanguageModels lm = TestLanguageModels.getLanguageModels();
-
-    @BeforeEach
-    public void setUp() {
-        newSe = new SentenceExtractor(lm);
-        legacySe = new SentenceExtractor(lm);
-    }
+    SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {
@@ -65,69 +57,16 @@ class SentenceExtractorTest {
         }
     }
 
-    @SneakyThrows
-    @Test
-    void testExtractSubject() {
-        var data = WmsaHome.getHomePath().resolve("test-data/");
-
-        System.out.println("Running");
-
-        SentenceExtractor se = new SentenceExtractor(lm);
-        KeywordExtractor keywordExtractor = new KeywordExtractor();
-
-        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
-            System.out.println(file);
-            var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
-            Map<String, Integer> counts = new HashMap<>();
-            for (var sentence : dld.sentences) {
-                for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
-                    if (kw.end + 2 >= sentence.length()) {
-                        continue;
-                    }
-                    if (sentence.separators[kw.end] == WordSeparator.COMMA
-                     || sentence.separators[kw.end + 1] == WordSeparator.COMMA)
-                        break;
-
-                    if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
-                     && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
-                    ) {
-                        counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)).word, -1, Integer::sum);
-                    }
-                }
-            }
-
-            int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
-
-            counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                    .filter(e -> e.getValue() < -2 && e.getValue() < best * 0.75)
-                    .forEach(System.out::println);
-        }
-    }
-
-    @SneakyThrows
-    @Test
-    @Disabled
-    public void testSE() {
-        var result = newSe.extractSentences(
-                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
-
-        var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
-    }
-
     @Test
     public void separatorExtraction() {
         seprateExtractor("Cookies, cream and shoes");
         seprateExtractor("Cookies");
         seprateExtractor("");
     }
 
     @Test
     public void testACDC() {
-        var ret = newSe.extractSentence("AC/DC is a rock band.");
+        var ret = se.extractSentence("AC/DC is a rock band.");
         assertEquals("AC/DC", ret.words[0]);
     }
@@ -139,7 +78,6 @@ class SentenceExtractorTest {
         List<String> words = new ArrayList<>();
         List<String> separators = new ArrayList<>();
-        int start = 0;
         int wordStart = 0;
 
         while (wordStart <= sentence.length()) {
             if (!matcher.find(wordStart)) {

nu/marginalia/loading/loader/KeywordListChunker.java

@@ -1,34 +0,0 @@
-package nu.marginalia.loading.loader;
-
-import nu.marginalia.keyword.model.DocumentKeywords;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-public class KeywordListChunker {
-
-    /** Chops data into a list of lists of max length size
-     *
-     * Caveat: Relies on subList and does not clone "data", so
-     * changes to the original list may affect the sub-lists
-     * in unspecified ways
-     *
-     * @see List#subList
-     */
-    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
-        if (data.isEmpty())
-            return Collections.emptyList();
-        else if (data.size() < size)
-            return List.of(data);
-
-        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
-
-        for (int i = 0; i < data.size(); i+=size) {
-            ret.add(data.subList(i, Math.min(data.size(), i+size)));
-        }
-
-        return ret;
-    }
-}
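
Nothing replaces this helper one-to-one; the chunking moves into LoaderIndexJournalWriter below, whose rewritten loop fills a fixed buffer with at most MAX_LENGTH keyword/metadata pairs per journal entry and loops until the pointer is exhausted, so oversized keyword sets are still split, just without materializing intermediate sub-lists via subList.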

LoaderIndexJournalWriter.java

@@ -9,7 +9,6 @@ import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.model.idx.DocumentMetadata;
@@ -20,7 +19,6 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.sql.SQLException;
-import java.util.Arrays;
 
 import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
@@ -30,6 +28,10 @@ public class LoaderIndexJournalWriter {
     private final IndexJournalWriter indexWriter;
     private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
 
+    private final MurmurHash3_128 hasher = new MurmurHash3_128();
+    private final long[] buffer = new long[MAX_LENGTH * 2];
+
     @Inject
     public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
         var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
@@ -42,14 +44,13 @@ public class LoaderIndexJournalWriter {
         indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath());
     }
 
-    MurmurHash3_128 hasher = new MurmurHash3_128();
-    long[] buffer = new long[MAX_LENGTH * 2];
-
     @SneakyThrows
     public void putWords(long combinedId,
                          int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
-        if (wordSet.keywords().length == 0) {
+        if (wordSet.isEmpty()) {
             logger.info("Skipping zero-length word set for {}", combinedId);
             return;
         }
@@ -59,23 +60,24 @@ public class LoaderIndexJournalWriter {
             return;
         }
 
-        String[] words = wordSet.keywords();
-        long[] meta = wordSet.metadata();
+        var pointer = wordSet.newPointer();
 
-        for (int start = 0; start < words.length; ) {
-            int end = Math.min(start + MAX_LENGTH, words.length);
+        while (pointer.hasMore()) {
+            int i = 0;
 
-            for (int i = 0; i < end - start; i++) {
-                buffer[2*i] = hasher.hashNearlyASCII(words[start+i]);
-                buffer[2*i + 1] = meta[start+i];
+            while (i < buffer.length
+                && pointer.advancePointer())
+            {
+                final long hashedKeyword = hasher.hashNearlyASCII(pointer.getKeyword());
+
+                buffer[i++] = hashedKeyword;
+                buffer[i++] = pointer.getMetadata();
             }
 
-            var entry = new IndexJournalEntryData(2 * (end-start), buffer);
+            var entry = new IndexJournalEntryData(i, buffer);
             var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode());
 
             indexWriter.put(header, entry);
-            start = end;
         }
     }
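
To make the rewritten loop concrete: each keyword occupies two consecutive longs in the buffer, hash first and metadata second, and i counts valid longs rather than keywords. A small sketch of the resulting layout, with invented values and String.hashCode standing in for MurmurHash3_128.hashNearlyASCII:

    long[] buffer = new long[8];
    int i = 0;

    var pointer = new DocumentKeywords(
            new String[] { "w0", "w1" },
            new long[]   { 10L, 20L }).newPointer();

    while (i < buffer.length && pointer.advancePointer()) {
        buffer[i++] = pointer.getKeyword().hashCode(); // stand-in for hashNearlyASCII
        buffer[i++] = pointer.getMetadata();
    }
    // i == 4; buffer == [hash("w0"), 10, hash("w1"), 20, 0, 0, 0, 0],
    // and new IndexJournalEntryData(i, buffer) consumes exactly the valid prefix.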