diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index debc460f..60f81d19 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; @@ -20,6 +21,7 @@ public class SideloadSourceFactory { private final SideloaderProcessing sideloaderProcessing; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor documentKeywordExtractor; + private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; @@ -27,19 +29,20 @@ public class SideloadSourceFactory { public SideloadSourceFactory(Gson gson, SideloaderProcessing sideloaderProcessing, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, - DocumentKeywordExtractor documentKeywordExtractor, + DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, DirtreeSideloaderFactory dirtreeSideloaderFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; + this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { - return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing); + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing); } public Collection extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index aab62ef9..fc1f5015 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import lombok.SneakyThrows; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.DisqualifiedException; @@ -35,7 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean; /** This is an experimental sideloader for encyclopedia.marginalia.nu's database; * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) - * + *
* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
*/
public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
@@ -43,6 +44,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
private final Connection connection;
private final EdgeUrl baseUrl;
private final Gson gson;
+ private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
@@ -51,9 +53,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
String baseUrl,
Gson gson,
AnchorTagsSourceFactory anchorTagsSourceFactory,
+ AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) throws SQLException {
this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
this.gson = gson;
+ this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
@@ -103,7 +107,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
try {
docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
} catch (URISyntaxException | DisqualifiedException e) {
- e.printStackTrace();
+ logger.warn("Problem converting encyclopedia article " + url, e);
} finally {
sem.release();
}
@@ -113,7 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
stmt.close();
}
catch (Exception e) {
- e.printStackTrace();
+ logger.warn("Problem converting encyclopedia article", e);
}
finally {
isFinished.set(true);
@@ -142,30 +146,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
logger.error("Failed to create anchor tags source", ex);
return new DomainLinks();
}
-
- }
-
- ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
- var stmt = connection.prepareStatement("""
- SELECT url,title,html
- FROM articles
- WHERE url=?
- """);
- stmt.setFetchSize(100);
- stmt.setString(1, url);
-
- var rs = stmt.executeQuery();
- if (rs.next()) {
- var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
- String title = rs.getString("title");
-
- return convertDocument(articleParts.parts,
- title,
- URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
- new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
- );
- }
- return null;
}
private ProcessedDocument convertDocument(List