(converter) Add AnchorTextKeywords to EncyclopediaMarginaliaNuSideloader processing

The commit updates EncyclopediaMarginaliaNuSideloader to include the AnchorTextKeywords when processing documents, improving the relevance of search results.

It also removes old test-related functionality and a large but fairly useless test previously used to debug a specific problem, to the benefit of the overall code quality.
This commit is contained in:
Viktor Lofgren 2023-12-09 13:23:21 +01:00
parent eccb12b366
commit b6511fbfe2
3 changed files with 22 additions and 144 deletions

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
import com.google.gson.Gson;
import com.google.inject.Inject;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
@ -20,6 +21,7 @@ public class SideloadSourceFactory {
private final SideloaderProcessing sideloaderProcessing;
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
private final DocumentKeywordExtractor documentKeywordExtractor;
private final AnchorTextKeywords anchorTextKeywords;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
@ -27,19 +29,20 @@ public class SideloadSourceFactory {
public SideloadSourceFactory(Gson gson,
SideloaderProcessing sideloaderProcessing,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor documentKeywordExtractor,
DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
AnchorTagsSourceFactory anchorTagsSourceFactory,
DirtreeSideloaderFactory dirtreeSideloaderFactory) {
this.gson = gson;
this.sideloaderProcessing = sideloaderProcessing;
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.documentKeywordExtractor = documentKeywordExtractor;
this.anchorTextKeywords = anchorTextKeywords;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
}
public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
}
public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.DisqualifiedException;
@ -35,7 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
/** This is an experimental sideloader for encyclopedia.marginalia.nu's database;
* (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code)
*
* <p>
* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
*/
public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
@ -43,6 +44,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
private final Connection connection;
private final EdgeUrl baseUrl;
private final Gson gson;
private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
@ -51,9 +53,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
String baseUrl,
Gson gson,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) throws SQLException {
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
this.gson = gson;
this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
@ -103,7 +107,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
try {
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
} catch (URISyntaxException | DisqualifiedException e) {
e.printStackTrace();
logger.warn("Problem converting encyclopedia article " + url, e);
} finally {
sem.release();
}
@ -113,7 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
stmt.close();
}
catch (Exception e) {
e.printStackTrace();
logger.warn("Problem converting encyclopedia article", e);
}
finally {
isFinished.set(true);
@ -142,30 +146,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
logger.error("Failed to create anchor tags source", ex);
return new DomainLinks();
}
}
ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
var stmt = connection.prepareStatement("""
SELECT url,title,html
FROM articles
WHERE url=?
""");
stmt.setFetchSize(100);
stmt.setString(1, url);
var rs = stmt.executeQuery();
if (rs.next()) {
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
String title = rs.getString("title");
return convertDocument(articleParts.parts,
title,
URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
);
}
return null;
}
private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
@ -180,13 +160,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
}
fullHtml.append("</body></html>");
return sideloaderProcessing
var doc = sideloaderProcessing
.processDocument(fullUrl,
fullHtml.toString(),
List.of("encyclopedia", "wiki"),
domainLinks,
GeneratorType.WIKI,
10_000_000);
// Add anchor text keywords
if (doc.isProcessedFully()) {
doc.words.addAnchorTerms(
anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
);
}
return doc;
}
private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {

View File

@ -1,38 +1,12 @@
package nu.marginalia.converting.sideload.encyclopedia;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.processed.DocumentRecord;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.when;
class EncyclopediaMarginaliaNuSideloaderTest {
Path tempFile;
@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest {
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
System.out.printf("%64s\n", Long.toBinaryString(0x10L));
}
@Test
public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException {
Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db");
if (!Files.exists(pathToDbFile)) {
// not really practical to ship a 40 Gb sqlite files on github
// be @vlofgren to run this test
return;
}
var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
var processing = Guice.createInjector(new ConverterModule(),
new AbstractModule() {
public void configure() {
bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
}
}
)
.getInstance(SideloaderProcessing.class);
var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
var sideloader = new EncyclopediaMarginaliaNuSideloader(
pathToDbFile,
"https://en.wikipedia.org/wiki/",
GsonFactory.get(),
atagsFactory,
processing
);
var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)");
System.out.println(document);
var keywordsBuilt = document.words.build();
var ptr = keywordsBuilt.newPointer();
Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
while (ptr.advancePointer()) {
String word = ptr.getKeyword();
System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata())));
if (Set.of("dirty", "blues").contains(word)) {
WordMetadata meta = new WordMetadata(ptr.getMetadata());
Assertions.assertNull(
dirtyAndBlues.put(word, meta)
);
}
}
Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
Assertions.assertNotEquals(
dirtyAndBlues.get("dirty"),
dirtyAndBlues.get("blues")
);
try (var dw = new DocumentRecordParquetFileWriter(tempFile)) {
dw.write(new DocumentRecord(
"encyclopedia.marginalia.nu",
document.url.toString(),
0,
document.state.toString(),
document.stateReason,
document.details.title,
document.details.description,
HtmlFeature.encode(document.details.features),
document.details.standard.name(),
document.details.length,
document.details.hashCode,
(float) document.details.quality,
document.details.metadata.encode(),
document.details.pubYear,
List.of(keywordsBuilt.keywords),
new TLongArrayList(keywordsBuilt.metadata)
));
}
var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get();
String[] words = record.words.toArray(String[]::new);
long[] meta = record.metas.toArray();
assertArrayEquals(keywordsBuilt.keywords, words);
assertArrayEquals(keywordsBuilt.metadata, meta);
}
}