From e7b4ac0d34237637022d8be830f02a115ad1e758 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 27 May 2022 23:45:29 +0200 Subject: [PATCH 1/5] WIP: Killing off Archive service, adding new Encyclopedia service consisting largely of what Archive was and a few features from Assistant. --- .../wmsa/configuration/ServiceDescriptor.java | 5 +- .../wmsa/edge/archive/EdgeArchiveMain.java | 33 -- .../wmsa/edge/archive/EdgeArchiveModule.java | 15 - .../wmsa/edge/archive/EdgeArchiveService.java | 180 -------- .../archive/archiver/ArchiveExtractor.java | 65 --- .../edge/archive/archiver/ArchivedFile.java | 5 - .../wmsa/edge/archive/archiver/Archiver.java | 113 ------ .../edge/archive/client/ArchiveClient.java | 56 --- .../request/EdgeArchiveSubmissionReq.java | 13 - .../edge/assistant/EdgeAssistantService.java | 84 +--- .../assistant/client/AssistantClient.java | 4 - .../assistant/dict/DictionaryService.java | 136 ------- .../wmsa/edge/search/EdgeSearchOperator.java | 6 +- .../wmsa/edge/tools/ConverterMain.java | 384 ------------------ .../edge/tools/TermFrequencyCounterMain.java | 142 ------- .../wmsa/edge/tools/ZimConverterMain.java | 10 +- .../wmsa/encyclopedia/EncyclopediaClient.java | 34 ++ .../wmsa/encyclopedia/EncyclopediaDao.java | 160 ++++++++ .../wmsa/encyclopedia/EncyclopediaMain.java | 26 ++ .../encyclopedia/EncyclopediaService.java | 202 +++++++++ .../configuration/server/ServiceTest.java | 2 - .../wmsa/edge/archive/ArchiveTest.java | 72 ---- .../edge/archive/archiver/ArchiverTest.java | 17 - .../wmsa/edge/assistant/AssistantTest.java | 7 - 24 files changed, 438 insertions(+), 1333 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveMain.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveModule.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveService.java delete mode 100644 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiveExtractor.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchivedFile.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/Archiver.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/client/ArchiveClient.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/request/EdgeArchiveSubmissionReq.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterMain.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/TermFrequencyCounterMain.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/ArchiveTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiverTest.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index fcc88260..eb239f8b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -6,11 +6,11 @@ import nu.marginalia.wmsa.configuration.command.Command; import nu.marginalia.wmsa.configuration.command.ListCommand; import nu.marginalia.wmsa.configuration.command.StartCommand; import nu.marginalia.wmsa.configuration.command.VersionCommand; -import 
nu.marginalia.wmsa.edge.archive.EdgeArchiveMain; import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain; import nu.marginalia.wmsa.edge.dating.DatingMain; import nu.marginalia.wmsa.edge.index.EdgeIndexMain; import nu.marginalia.wmsa.edge.search.EdgeSearchMain; +import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain; import nu.marginalia.wmsa.memex.MemexMain; import nu.marginalia.wmsa.podcasts.PodcastScraperMain; import nu.marginalia.wmsa.renderer.RendererMain; @@ -33,11 +33,12 @@ public enum ServiceDescriptor { EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class), EDGE_SEARCH("edge-search", 5023, EdgeSearchMain.class), - EDGE_ARCHIVE("edge-archive", 5024, EdgeArchiveMain.class), EDGE_ASSISTANT("edge-assistant", 5025, EdgeAssistantMain.class), EDGE_MEMEX("memex", 5030, MemexMain.class), + ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class), + DATING("dating", 5070, DatingMain.class), TEST_1("test-1", 0, null), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveMain.java deleted file mode 100644 index 64ef35a3..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveMain.java +++ /dev/null @@ -1,33 +0,0 @@ -package nu.marginalia.wmsa.edge.archive; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.configuration.server.Initialization; - -public class EdgeArchiveMain extends MainClass { - private final EdgeArchiveService service; - - @Inject - public EdgeArchiveMain(EdgeArchiveService service) { - this.service = service; - } - - public static void main(String... 
args) { - init(ServiceDescriptor.EDGE_ARCHIVE, args); - - Injector injector = Guice.createInjector( - new EdgeArchiveModule(), - new ConfigurationModule(), - new DatabaseModule() - ); - - injector.getInstance(EdgeArchiveMain.class); - injector.getInstance(Initialization.class).setReady(); - - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveModule.java deleted file mode 100644 index 1d3c8215..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveModule.java +++ /dev/null @@ -1,15 +0,0 @@ -package nu.marginalia.wmsa.edge.archive; - -import com.google.inject.AbstractModule; -import com.google.inject.name.Names; - -import java.nio.file.Path; - -public class EdgeArchiveModule extends AbstractModule { - public void configure() { - bind(Path.class).annotatedWith(Names.named("archive-path")).toInstance(Path.of("/var/lib/wmsa/archive/webpage/")); - bind(Path.class).annotatedWith(Names.named("wiki-path")).toInstance(Path.of("/var/lib/wmsa/archive.fast/wiki/")); - bind(Integer.class).annotatedWith(Names.named("archive-size")).toInstance(10_000); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveService.java deleted file mode 100644 index edc9c71a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/EdgeArchiveService.java +++ /dev/null @@ -1,180 +0,0 @@ -package nu.marginalia.wmsa.edge.archive; - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.inject.Inject; -import com.google.inject.name.Named; -import io.prometheus.client.Histogram; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import 
nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.archive.archiver.ArchivedFile; -import nu.marginalia.wmsa.edge.archive.archiver.Archiver; -import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.stream.Collectors; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -public class EdgeArchiveService extends Service { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Gson gson = new GsonBuilder().create(); - - private static final Histogram wmsa_archive_store_time = Histogram.build().name("wmsa_archive_store_time").help("-").register(); - private static final Histogram wmsa_archive_fetch_time = Histogram.build().name("wmsa_archive_fetch_time").help("-").register(); - - private final Path wikiPath; - private final Archiver archiver; - - @SneakyThrows - @Inject - public EdgeArchiveService(@Named("service-host") String ip, - @Named("service-port") Integer port, - @Named("wiki-path") Path wikiPath, - Archiver archiver, - Initialization initialization, - MetricsServer metricsServer) - { - super(ip, port, initialization, metricsServer); - this.wikiPath = wikiPath; - this.archiver = archiver; - - Spark.staticFiles.expireTime(600); - - Spark.post("/page/submit", this::pathPageSubmit); - - Spark.post("/wiki/submit", this::pathWikiSubmit); - Spark.get("/wiki/has", this::pathWikiHas); - Spark.get("/wiki/get", this::pathWikiGet); - - Spark.awaitInitialization(); - } - - @SneakyThrows - private Object pathPageSubmit(Request request, Response response) { - var timer = wmsa_archive_store_time.startTimer(); - try { - var body = request.body(); - var data = gson.fromJson(body, 
EdgeArchiveSubmissionReq.class); - - String domainNamePart = data.getUrl().domain.domain.length() > 32 ? data.getUrl().domain.domain.substring(0, 32) : data.getUrl().domain.domain; - String fileName = String.format("%s-%10d", domainNamePart, data.getUrl().hashCode()); - - archiver.writeData(new ArchivedFile(fileName, body.getBytes())); - - return "ok"; - } finally { - timer.observeDuration(); - } - - } - - - @SneakyThrows - private Object pathWikiSubmit(Request request, Response response) { - var timer = wmsa_archive_store_time.startTimer(); - - try { - byte[] data = request.bodyAsBytes(); - - String wikiUrl = request.queryParams("url"); - Path filename = getWikiFilename(wikiPath, wikiUrl); - - Files.createDirectories(filename.getParent()); - - System.out.println(new String(data)); - logger.debug("Writing {} to {}", wikiUrl, filename); - - try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) { - gos.write(data); - gos.flush(); - } - - return "ok"; - } finally { - timer.observeDuration(); - } - - } - - - private Path getWikiFilename(Path base, String url) { - Path p = base; - - int urlHash = url.hashCode(); - - p = p.resolve(Integer.toString(urlHash & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF)); - p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF)); - - String fileName = url.chars() - .mapToObj(this::encodeUrlChar) - .collect(Collectors.joining()); - - if (fileName.length() > 128) { - fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL); - } - - return p.resolve(fileName + ".gz"); - } - - - private String encodeUrlChar(int i) { - if (i >= 'a' && i <= 'z') { - return Character.toString(i); - } - if (i >= 'A' && i <= 'Z') { - return Character.toString(i); - } - if (i >= '0' && i <= '9') { - return Character.toString(i); - } - if (i == '.') { - return Character.toString(i); - } - else { - return String.format("%%%2X", i); - } - } - - @SneakyThrows 
- private Object pathWikiHas(Request request, Response response) { - return Files.exists(getWikiFilename(wikiPath, request.queryParams("url"))); - } - - - @SneakyThrows - private String pathWikiGet(Request request, Response response) { - var timer = wmsa_archive_fetch_time.startTimer(); - - try { - String url = request.queryParams("url"); - - var filename = getWikiFilename(wikiPath, url); - - if (Files.exists(filename)) { - try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) { - return new String(stream.readAllBytes()); - } - } else { - Spark.halt(404); - return null; - } - } - finally { - timer.observeDuration(); - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiveExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiveExtractor.java deleted file mode 100644 index e8d920ef..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiveExtractor.java +++ /dev/null @@ -1,65 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.archiver; - -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq; -import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents; -import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedInputStream; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.function.Consumer; - -public class ArchiveExtractor { - private final Path archivePath; - private final String arhivePattern = "archive-%04d.tar.gz"; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Gson 
gson = new GsonBuilder().create(); - - public ArchiveExtractor(Path archivePath) { - this.archivePath = archivePath; - - } - - public void forEach(Consumer contents) { - for (int i = 0; ; ++i) { - var fn = getArchiveFile(i); - logger.info("{}", fn); - if (!Files.exists(fn)) { - break; - } - try (var stream = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(fn.toFile()))))) { - TarArchiveEntry entry; - while ((entry = stream.getNextTarEntry()) != null) { - if (entry.isFile()) { - try { - var obj = gson.fromJson(new InputStreamReader(stream), EdgeArchiveSubmissionReq.class); - if (obj != null) { - contents.accept(obj.getData()); - } - } - catch (Exception ex) { - logger.error("Could not unpack {} - {} {}", entry.getName(), ex.getClass().getSimpleName(), ex.getMessage()); - } - } - } - } catch (Exception e) { - e.printStackTrace(); - } - } - } - - private Path getArchiveFile(int number) { - final String fileName = String.format(arhivePattern, number); - return archivePath.resolve(fileName); - } -} - diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchivedFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchivedFile.java deleted file mode 100644 index fd8f2ca1..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/ArchivedFile.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.archiver; - - -public record ArchivedFile(String filename,byte[] data ) { -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/Archiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/Archiver.java deleted file mode 100644 index 784ce8e7..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/archiver/Archiver.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.archiver; - -import com.google.inject.name.Named; -import 
org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.inject.Inject; -import javax.inject.Singleton; -import java.io.ByteArrayInputStream; -import java.io.FileOutputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.concurrent.LinkedBlockingDeque; -import java.util.concurrent.TimeUnit; - -@Singleton -public class Archiver implements AutoCloseable { - private final Path archivePath; - private final int filesPerArchive; - private final String arhivePattern = "archive-%04d.tar.gz"; - - private final LinkedBlockingDeque writeQueue = new LinkedBlockingDeque<>(10); - private final Thread writeThread; - - private volatile int archiveNumber; - private volatile boolean running; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public Archiver(@Named("archive-path") Path archivePath, @Named("archive-size") Integer filesPerArchive) { - this.archivePath = archivePath; - this.filesPerArchive = filesPerArchive; - - if (!Files.exists(archivePath)) { - throw new IllegalArgumentException("Archive path does not exist"); - } - for (int i = 0;; ++i) { - if (!Files.exists(getArchiveFile(i))) { - archiveNumber = i; - break; - } - } - - running = true; - writeThread = new Thread(this::writeThreadMain, "ArchiveWriteThread"); - writeThread.start(); - } - - private Path getArchiveFile(int number) { - final String fileName = String.format(arhivePattern, number); - return archivePath.resolve(fileName); - } - - public void writeData(ArchivedFile file) throws InterruptedException { - if (!running) throw new IllegalStateException("Archiver is closing or closed"); - writeQueue.put(file); - } - - private void writeThreadMain() { - try { - while (running || 
!writeQueue.isEmpty()) { - writeToFile(archiveNumber); - archiveNumber++; - } - running = false; - } - catch (Exception ex) { - logger.error("Uncaught exception in writer thread!!"); - } - } - - private void writeToFile(int archiveNumber) { - var archiveFile = getArchiveFile(archiveNumber); - - logger.info("Switching to file {}", archiveFile); - - try (TarArchiveOutputStream taos = new TarArchiveOutputStream(new GzipCompressorOutputStream(new FileOutputStream(archiveFile.toFile())))) { - for (int i = 0; i < filesPerArchive; i++) { - - ArchivedFile writeJob = null; - while (writeJob == null) { - writeJob = writeQueue.poll(1, TimeUnit.SECONDS); - if (!running) return; - } - - var entry = new TarArchiveEntry(String.format("%06d-%s", i, writeJob.filename())); - entry.setSize(writeJob.data().length); - taos.putArchiveEntry(entry); - logger.debug("Writing {} to {}", writeJob.filename(), archiveFile); - try (var bais = new ByteArrayInputStream(writeJob.data())) { - IOUtils.copy(bais, taos); - } - taos.closeArchiveEntry(); - } - taos.finish(); - logger.debug("Finishing {}", archiveFile); - } catch (Exception e) { - logger.error("Error", e); - } - - } - - @Override - public void close() throws Exception { - running = false; - writeThread.join(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/client/ArchiveClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/client/ArchiveClient.java deleted file mode 100644 index 0e56ac53..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/client/ArchiveClient.java +++ /dev/null @@ -1,56 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.client; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import io.reactivex.rxjava3.core.Observable; -import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.HttpStatusCode; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import 
nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.archive.request.EdgeArchiveSubmissionReq; -import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import okhttp3.MediaType; -import org.eclipse.jetty.util.UrlEncoded; - -import javax.annotation.CheckReturnValue; -import java.util.concurrent.Semaphore; - -@Singleton -public class ArchiveClient extends AbstractDynamicClient { - - private final Semaphore submitPageSem = new Semaphore(3, true); - - @Inject - public ArchiveClient() { - super(ServiceDescriptor.EDGE_ARCHIVE); - } - - @CheckReturnValue - public void submitPage(Context ctx, EdgeUrl url, EdgeRawPageContents data) throws InterruptedException { - try { - submitPageSem.acquire(); - super.post(ctx, "/page/submit", new EdgeArchiveSubmissionReq(url, data)).blockingSubscribe(); - } - finally { - submitPageSem.release(); - } - - } - - @CheckReturnValue - public Observable submitWiki(Context ctx, String url, String data) { - return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8")); - } - - @CheckReturnValue - public Observable hasWiki(Context ctx, String url) { - return super.get(ctx, "/wiki/has?url="+UrlEncoded.encodeString(url), Boolean.class); - } - - @CheckReturnValue - public Observable getWiki(Context ctx, String url) { - return super.get(ctx, "/wiki/get?url="+UrlEncoded.encodeString(url)); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/request/EdgeArchiveSubmissionReq.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/request/EdgeArchiveSubmissionReq.java deleted file mode 100644 index fbd97452..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/archive/request/EdgeArchiveSubmissionReq.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.request; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import 
lombok.ToString; -import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents; -import nu.marginalia.wmsa.edge.model.EdgeUrl; - -@AllArgsConstructor @Getter @ToString -public class EdgeArchiveSubmissionReq { - EdgeUrl url; - EdgeRawPageContents data; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java index 3fbdbd42..02b9b44f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/EdgeAssistantService.java @@ -4,36 +4,27 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.inject.Inject; import com.google.inject.name.Named; -import io.reactivex.rxjava3.core.Observable; import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.*; -import nu.marginalia.wmsa.edge.archive.client.ArchiveClient; +import nu.marginalia.wmsa.configuration.server.Initialization; +import nu.marginalia.wmsa.configuration.server.MetricsServer; +import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.edge.assistant.dict.DictionaryService; import nu.marginalia.wmsa.edge.assistant.eval.MathParser; import nu.marginalia.wmsa.edge.assistant.eval.Units; import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService; import nu.marginalia.wmsa.edge.assistant.suggest.Suggestions; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; import spark.Response; import spark.Spark; -import java.util.Map; - public class EdgeAssistantService extends Service { private final Logger logger = LoggerFactory.getLogger(getClass()); private final Gson gson = new GsonBuilder().create(); private final Units units; - private final 
DictionaryService dictionaryService; private final MathParser mathParser; - private final ArchiveClient archiveClient; - private final ScreenshotService screenshotService; - private final MustacheRenderer wikiErrorPageRenderer; - private final MustacheRenderer wikiSearchResultRenderer; private final Suggestions suggestions; @SneakyThrows @@ -45,40 +36,22 @@ public class EdgeAssistantService extends Service { DictionaryService dictionaryService, MathParser mathParser, Units units, - ArchiveClient archiveClient, - RendererFactory rendererFactory, ScreenshotService screenshotService, Suggestions suggestions ) { super(ip, port, initialization, metricsServer); - this.dictionaryService = dictionaryService; this.mathParser = mathParser; this.units = units; - this.archiveClient = archiveClient; - this.screenshotService = screenshotService; this.suggestions = suggestions; Spark.staticFiles.expireTime(600); - if (rendererFactory != null) { - wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error"); - wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search"); - } - else { - wikiErrorPageRenderer = null; - wikiSearchResultRenderer = null; - } - - Spark.get("/public/wiki/*", this::getWikiPage); - Spark.get("/public/wiki-search", this::searchWikiPage); - Spark.get("/public/screenshot/:id", screenshotService::serveScreenshotRequest); Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest); Spark.get("/dictionary/:word", (req, rsp) -> dictionaryService.define(req.params("word")), this::convertToJson); Spark.get("/spell-check/:term", (req, rsp) -> dictionaryService.spellCheck(req.params("term").toLowerCase()), this::convertToJson); - Spark.get("/encyclopedia/:term", (req, rsp) -> dictionaryService.encyclopedia(req.params("term")), this::convertToJson); Spark.get("/unit-conversion", (req, rsp) -> unitConversion( rsp, req.queryParams("value"), @@ -106,57 +79,6 @@ public class EdgeAssistantService extends Service { return 
suggestions.getSuggestions(10, param); } - @SneakyThrows - private Object getWikiPage(Request req, Response rsp) { - final var ctx = Context.fromRequest(req); - - final String[] splats = req.splat(); - if (splats.length == 0) - rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); - - - final String s = splats[0]; - - String pageName = dictionaryService.resolveEncylopediaRedirect(s).orElse(s); - logger.info("Resolved {} -> {}", s, pageName); - return archiveClient.getWiki(ctx, pageName) - .onErrorResumeWith(resolveWikiPageNameWrongCase(ctx, s)) - .blockingFirst(); - } - - private Observable resolveWikiPageNameWrongCase(Context ctx, String s) { - var rsp = dictionaryService.findEncyclopediaPageDirect(s); - if (rsp.isEmpty()) { - return renderSearchPage(s); - } - return archiveClient.getWiki(ctx, rsp.get().getInternalName()) - .onErrorResumeWith(renderSearchPage(s)); - } - - private Observable renderSearchPage(String s) { - return Observable.fromCallable(() -> wikiSearchResultRenderer.render( - Map.of("query", s, - "error", "true", - "results", dictionaryService.findEncyclopediaPages(s)))); - } - - @SneakyThrows - private Object searchWikiPage(Request req, Response rsp) { - final var ctx = Context.fromRequest(req); - - String term = req.queryParams("query"); - if (null == term) { - rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); - return ""; - } - - return wikiSearchResultRenderer.render( - Map.of("query", term, - "results", - dictionaryService.findEncyclopediaPages(term)) - ); - } - private Object evalExpression(Response rsp, String value) { try { var val = mathParser.evalFormatted(value); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java index 891a2cc0..de0b9313 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java @@ -24,10 +24,6 @@ public class AssistantClient extends AbstractDynamicClient { return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class); } - public Observable encyclopediaLookup(Context ctx, String word) { - return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class); - } - @SuppressWarnings("unchecked") public Observable> spellCheck(Context ctx, String word) { return (Observable>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java index 2147c297..572c96fa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/DictionaryService.java @@ -43,142 +43,6 @@ public class DictionaryService { return response; } - public WikiArticles encyclopedia(String term) { - WikiArticles response = new WikiArticles(); - response.entries = new ArrayList<>(); - - try (var connection = dataSource.getConnection()) { - var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?"); - stmt.setString(1, term); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - response.entries.add(capitalizeWikiString(rsp.getString(1))); - } - } - catch (Exception ex) { - logger.error("Failed to fetch articles", ex); - return new WikiArticles(); - } - - return response; - } - - public Optional resolveEncylopediaRedirect(String term) { - final List matches = new ArrayList<>(); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { - 
stmt.setString(1, term); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - if (term.equals(rsp.getString(1)) - || rsp.getString(2) == null) { - return Optional.ofNullable(rsp.getString(2)); - } else { - matches.add(rsp.getString(2)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - if (!matches.isEmpty()) { - return Optional.of(matches.get(0)); - } - return Optional.empty(); - } - - - public Optional findEncyclopediaPageDirect(String term) { - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { - stmt.setString(1, term.replace(' ', '_')); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - return Optional.of(new WikiSearchResult(name, null)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - return Optional.empty(); - } - - public List findEncyclopediaPages(String term) { - final List directMatches = new ArrayList<>(); - final Set directSearchMatches = new HashSet<>(); - final Set indirectMatches = new HashSet<>(); - - try (var connection = dataSource.getConnection()) { - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) { - stmt.setString(1, term.replace(' ', '_')); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - directMatches.add(new WikiSearchResult(name, null)); - } else { - indirectMatches.add(new WikiSearchResult(name, refName)); - } - } - } - - try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? 
LIMIT 10")) { - stmt.setString(1, term.replace(' ', '_').replaceAll("%", "\\%").toLowerCase() + "%"); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - String name = rsp.getString(1); - String refName = rsp.getString(2); - - if (refName == null) { - directSearchMatches.add(new WikiSearchResult(name, null)); - } else { - indirectMatches.add(new WikiSearchResult(name, refName)); - } - } - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - - directMatches.forEach(indirectMatches::remove); - indirectMatches.removeAll(directSearchMatches); - directMatches.forEach(directSearchMatches::remove); - directMatches.addAll(indirectMatches); - directMatches.addAll(directSearchMatches); - return directMatches; - } - - private String capitalizeWikiString(String string) { - if (string.contains("_")) { - return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_")); - } - if (string.length() < 2) { - return string.toUpperCase(); - } - return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase(); - } - public List spellCheck(String word) { return spellChecker.correct(word); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index eaa1b4f4..5e999371 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.results.SearchResultValuator; import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults; import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator; import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator; +import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; import org.apache.logging.log4j.util.Strings; import 
org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -33,6 +34,7 @@ public class EdgeSearchOperator { private static final Logger logger = LoggerFactory.getLogger(EdgeSearchOperator.class); private final AssistantClient assistantClient; + private final EncyclopediaClient encyclopediaClient; private final EdgeDataStoreDao edgeDataStoreDao; private final EdgeIndexClient indexClient; private final QueryFactory queryFactory; @@ -42,6 +44,7 @@ public class EdgeSearchOperator { @Inject public EdgeSearchOperator(AssistantClient assistantClient, + EncyclopediaClient encyclopediaClient, EdgeDataStoreDao edgeDataStoreDao, EdgeIndexClient indexClient, QueryFactory queryFactory, @@ -50,6 +53,7 @@ public class EdgeSearchOperator { ) { this.assistantClient = assistantClient; + this.encyclopediaClient = encyclopediaClient; this.edgeDataStoreDao = edgeDataStoreDao; this.indexClient = indexClient; this.queryFactory = queryFactory; @@ -220,7 +224,7 @@ public class EdgeSearchOperator { @NotNull private Observable getWikiArticle(Context ctx, String humanQuery) { - return assistantClient + return encyclopediaClient .encyclopediaLookup(ctx, humanQuery.replaceAll("\\s+", "_") .replaceAll("\"", "") diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterMain.java deleted file mode 100644 index a6041300..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterMain.java +++ /dev/null @@ -1,384 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.map.hash.TObjectIntHashMap; -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; -import 
nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; -import nu.marginalia.util.language.LanguageFilter; -import nu.marginalia.util.language.WordPatterns; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; -import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; -import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; -import nu.marginalia.wmsa.edge.model.*; -import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents; -import org.apache.commons.lang3.tuple.Pair; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.mariadb.jdbc.Driver; - -import java.io.File; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.LinkedBlockingQueue; - -import static nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard.UNKNOWN; - -public class ConverterMain { - static final LinkedBlockingQueue processQueue = new LinkedBlockingQueue<>(20); - static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(2); - - static final TObjectIntHashMap urlToIdMap = new TObjectIntHashMap<>(50_000_000, 0.5f, -1); - static final TObjectIntHashMap domainToIdMap = new TObjectIntHashMap<>(5_000_000, 0.5f, -1); - static final TIntObjectHashMap idToDomainMap = new TIntObjectHashMap<>(5_000_000, 0.5f, -1); - static HikariDataSource conn; - - private static SearchIndexWriterImpl indexWriter; - private static DictionaryWriter dictionaryWriter; - - 
@AllArgsConstructor - static class UploadJob { - EdgeId domainId; - EdgeId urlId; - EdgePageWordSet words; - int wordCount; - } - - static volatile boolean running = true; - - public static void main(String... args) { - org.mariadb.jdbc.Driver driver = new Driver(); - - dictionaryWriter = new DictionaryWriter(new File(args[0]), 1L << 30, true); - indexWriter = new SearchIndexWriterImpl(dictionaryWriter, new File(args[1])); - - new Thread(ConverterMain::uploadThread, "Uploader").start(); - - for (int i = 0; i < 24; i++) { - new Thread(ConverterMain::processorThread, "Processor-"+i).start(); - } - - conn = new DatabaseModule().provideConnection(); - - System.out.println("Loading URLs and domains"); - try (var c = conn.getConnection(); - var getUrlsStmt = c.prepareStatement("SELECT EC_URL.ID, DOMAIN_ID, PROTO, URL FROM EC_URL WHERE VISITED"); - var getDomainsStmt = c.prepareStatement("SELECT ID, URL_PART FROM EC_DOMAIN WHERE INDEXED>0") - ) { - getUrlsStmt.setFetchSize(10_000); - getDomainsStmt.setFetchSize(10_000); - - System.out.println("Fetch domains"); - var domainRsp = getDomainsStmt.executeQuery(); - while (domainRsp.next()) { - domainToIdMap.put(domainRsp.getString(2), domainRsp.getInt(1)); - idToDomainMap.put(domainRsp.getInt(1), domainRsp.getString(2)); - } - - System.out.println("Fetch URLs"); - var urlRsp = getUrlsStmt.executeQuery(); - while (urlRsp.next()) { - String urlStr = urlRsp.getString(3) + "://" + idToDomainMap.get(urlRsp.getInt(2)) + urlRsp.getString(4); - urlToIdMap.put(urlStr, urlRsp.getInt(1)); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - -// new Thread(ConverterMain::uploadThread, "Uploader").start(); -// -// for (int i = 0; i < 24; i++) { -// new Thread(ConverterMain::processorThread, "Processor-"+i).start(); -// } - - System.out.println("Loaded URLs and domains"); - - new ArchiveExtractor(Path.of(args[2])).forEach( - page -> { - if (page.contentType.contentType.startsWith("application/xhtml") - || 
page.contentType.contentType.startsWith("text/html")) { - try { - int domainId = domainToIdMap.get(page.url.domain.toString()); - if (domainId >= 0 && page.redirectUrl == null) { - int urlId = urlToIdMap.get(page.url.toString()); - int dataHash = page.data.hashCode(); - try (var c = conn.getConnection(); - var updateHash = c.prepareStatement("UPDATE EC_URL SET DATA_HASH=? WHERE ID=?")) - { - updateHash.setInt(1, dataHash); - updateHash.setInt(2, urlId); - updateHash.executeUpdate(); - } - catch (Exception ex) { - ex.printStackTrace(); - } - } - } catch (Exception e) { - e.printStackTrace(); - } - } - }); - - running = false; - } - - static final LanguageModels lm = new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-new-algo3.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - ); - static final NGramDict dict = new NGramDict(lm); - - private static final LanguageFilter languageFilter = new LanguageFilter(); - private static final LinkParser linkParser = new LinkParser(); - public static void processorThread() { - SentenceExtractor newSe = new SentenceExtractor(lm); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); - - try { - while (running || !processQueue.isEmpty()) { - var job = processQueue.take(); - if (job.data.length() > 512*1024) { - System.out.println(job.url + " too big, skipping"); - } - - var parsed = Jsoup.parse(job.data); - var text = parsed.text(); - - if (languageFilter.isBlockedUnicodeRange(text)) { - continue; - } - - var dld = newSe.extractSentences(parsed.clone()); - var keywords = documentKeywordExtractor.extractKeywords(dld); - int wc = dld.totalNumWords(); - - if (wc > 100) { - double languageAgreement = languageFilter.dictionaryAgreement(dld); - if (languageAgreement < 0.05) { - 
continue; - } - } - - - EdgeHtmlStandard htmlStandard = HtmlStandardExtractor.parseDocType(parsed.documentType()); - if (UNKNOWN.equals(htmlStandard)) { - htmlStandard = HtmlStandardExtractor.sniffHtmlStandard(parsed); - } - - int scriptTags = getScriptPenalty(parsed); - var featureSet = getFeatureSet(parsed, scriptTags, job.hasCookies); - addTags(keywords, htmlStandard, job.url, featureSet); - - extractLinkWords(keywords, job.getUrl(), parsed); - - uploadQueue.put(new UploadJob( - new EdgeId<>(domainToIdMap.get(job.url.domain.toString())), - new EdgeId<>(urlToIdMap.get(job.url.toString())), - keywords, - 0 - )); - - } - } - catch (InterruptedException ex) { - ex.printStackTrace(); - } - } - - - private static Map> extractLinkWords(EdgePageWordSet keywords, EdgeUrl pageUrl, Document parsed) { - - List> urls = new ArrayList<>(); - Set linkKeywords = new HashSet<>(); - Map> linkTextWords = new ConcurrentHashMap<>(); - - for (var tag : parsed.getElementsByTag("a")) { - if (!tag.hasAttr("href")) { - continue; - } - if (urls.size() > 100) { - break; - } - - var linkOpt = linkParser.parseLink(pageUrl, tag); - if (linkOpt.isEmpty()) - continue; - - var link = linkOpt.get(); - - urls.add(Pair.of(link, tag.text())); - - if (!Objects.equals(link.domain.domain, pageUrl.domain.domain) - && linkKeywords.size() <= 25) - { - linkKeywords.add("links:" + link.domain.domain); - } -// -// Set words = new HashSet<>(); -// -// for (var sent : sentenceExtractor.extractSentencesFromString(tag.text())) { -// for (var keyword : keywordExtractor.getWordsFromSentence(sent)) { -// words.add(sent.constructWordFromSpan(keyword)); -// } -// } -// -// linkTextWords.compute(link, (k, set) -> { -// if (set == null) return words; -// else { set.addAll(words); return set; } -// }); - - } - - keywords.get(IndexBlock.Meta).addAll(linkKeywords); - - if (WordPatterns.wordQualitiesPredicate.test(pageUrl.domain.domain.toLowerCase())) { - 
keywords.get(IndexBlock.Link).addJust(pageUrl.domain.domain.toLowerCase()); - } - - return linkTextWords; - } - - private static int getScriptPenalty(Document parsed) { - var scriptTags = parsed.getElementsByTag("script"); - String scriptText = scriptTags.html(); - int badScript = 0; - if (scriptText.contains(".createElement(")) { - badScript = 1; - } - return scriptTags.size() + badScript + (scriptText.length())/1000; - } - - static final List trackers = List.of("adform.net", - "connect.facebook", - "googletagmanager.com", - "googlesyndication.com", - "google.com", - "twitter.com", - "smartadserver.com", - "doubleclick.com", - "2mdn.com", - "dmtry.com", - "bing.com", - "msn.com", - "amazon-adsystem.com", - "alexametrics.com", - "rubiconproject.com", - "chango.com", - "d5nxst8fruw4z.cloudfront.net", - "d31qbv1cthcecs.cloudfront.net", - "linkedin.com"); - - private static Set getFeatureSet(Document parsed, int scriptTags, boolean cookies) { - Set features = new HashSet<>(); - - if (scriptTags > 0) { - features.add(HtmlFeature.JS); - } - if (!parsed.getElementsByTag("object").isEmpty() - || !parsed.getElementsByTag("audio").isEmpty() - || !parsed.getElementsByTag("video").isEmpty()) { - features.add(HtmlFeature.MEDIA); - } - if (parsed.getElementsByTag("script").stream() - .filter(tag -> tag.attr("src") != null) - .anyMatch(tag -> trackers.stream().anyMatch(tracker -> tag.attr("src").contains(tracker)))) { - features.add(HtmlFeature.TRACKING); - } - if (parsed.getElementsByTag("script").html().contains("google-analytics.com")) { - features.add(HtmlFeature.TRACKING); - } - if (parsed.getElementsByTag("a").stream().map(e -> e.attr("href")) - .filter(Objects::nonNull) - .map(String::toLowerCase) - .anyMatch(href -> - href.contains("amzn.to/") || href.contains("amazon.com/"))) { - features.add(HtmlFeature.AFFILIATE_LINK); - } - if (cookies) { - features.add(HtmlFeature.COOKIES); - } - - return features; - } - - private static void addTags(EdgePageWordSet wordSet, 
EdgeHtmlStandard htmlStandard, EdgeUrl url, Set features) { - List tagWords = new ArrayList<>(); - tagWords.add("format:"+htmlStandard.toString().toLowerCase()); - tagWords.add("site:"+url.domain.toString().toLowerCase()); - tagWords.add("proto:"+url.proto.toLowerCase()); - tagWords.add("js:" + Boolean.toString(features.contains(HtmlFeature.JS)).toLowerCase()); - if (features.contains(HtmlFeature.MEDIA)) { - tagWords.add("special:media"); - } - if (features.contains(HtmlFeature.TRACKING)) { - tagWords.add("special:tracking"); - } - if (features.contains(HtmlFeature.AFFILIATE_LINK)) { - tagWords.add("special:affiliate"); - } - if (features.contains(HtmlFeature.COOKIES)) { - tagWords.add("special:cookies"); - } - wordSet.append(IndexBlock.Meta, tagWords); - wordSet.append(IndexBlock.Words, tagWords); - } - - @SneakyThrows - public static void uploadThread() { - - while (running || !processQueue.isEmpty() || !uploadQueue.isEmpty()) { - var data = uploadQueue.take(); - - if (!data.words.isEmpty()) { - for (var words : data.words.values()) { - if (!words.getWords().isEmpty()) { - if (words.size() < 1000) { - indexWriter.put(data.domainId, data.urlId, words.block, words.words); - } else { - chunks(words.words, 1000).forEach(chunk -> { - indexWriter.put(data.domainId, data.urlId, words.block, chunk); - }); - } - } - } - } - } - - System.out.println("Closing"); - dictionaryWriter.commitToDisk(); - indexWriter.forceWrite(); - dictionaryWriter.close(); - indexWriter.close(); - System.out.println("Done"); - } - - private static List> chunks(Collection coll, int size) { - List> ret = new ArrayList<>(); - List data = List.copyOf(coll); - - for (int i = 0; i < data.size(); i+=size) { - ret.add(data.subList(i, Math.min(data.size(), i+size))); - } - - return ret; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/TermFrequencyCounterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/TermFrequencyCounterMain.java deleted file mode 
100644 index c0ffb663..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/TermFrequencyCounterMain.java +++ /dev/null @@ -1,142 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - - -import gnu.trove.set.hash.TLongHashSet; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.archive.archiver.ArchiveExtractor; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; -import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; -import nu.marginalia.wmsa.edge.model.crawl.EdgeRawPageContents; -import opennlp.tools.stemmer.PorterStemmer; -import org.jsoup.Jsoup; - -import java.io.BufferedOutputStream; -import java.io.DataOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Path; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.atomic.AtomicLong; - -public class TermFrequencyCounterMain { - - static final LinkedBlockingQueue processQueue = new LinkedBlockingQueue<>(20); - - public static final String OUTPUT_FILE = "/var/lib/wmsa/archive/tfreq-2022-04-04.bin"; - public static final String ARCHIVE_PATH = "/var/lib/wmsa/archive/webpage"; // "/mnt/storage/wmsa/archive/webpage/" - - @SneakyThrows - public static void main(String... 
args) { - - List pt = new ArrayList<>(); - for (int i = 0; i < 20; i++) { - pt.add(new Thread(TermFrequencyCounterMain::processorThread)); - } - pt.forEach(Thread::start); - - AtomicLong docsTotal = new AtomicLong(); - new ArchiveExtractor(Path.of(ARCHIVE_PATH)).forEach( - page -> { - if (page.contentType.contentType.contains("html") - && page.isAfter("2022-03-15T")) { - try { - long dt = docsTotal.incrementAndGet(); - if (dt == 0) { - System.out.println(docsTotal.get() + " - " + termFreq.size()); - } - if ((dt % 5) != 0) { - processQueue.put(page); - } - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - }); - running = false; - - - System.out.println("Waiting for wrap-up"); - - Thread.sleep(36000); - - for (Thread thread : pt) { - thread.interrupt(); - } - for (Thread thread : pt) { - thread.join(); - } - System.out.println("Total documents = " + docsTotal.get()); - - System.out.println("Writing Frequencies"); - - try (var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(OUTPUT_FILE))) - ) { - synchronized (termFreq) { - for (var entry : termFreq.entrySet()) { - - if (entry.getValue() > 5) { - dos.writeLong(entry.getKey()); - dos.writeLong(entry.getValue()); - } - } - } - } catch (IOException e) { - e.printStackTrace(); - } - - - System.out.println("All done!"); - } - - public static final ConcurrentHashMap termFreq = new ConcurrentHashMap<>(); - - public static final LanguageModels lm = new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - ); - public static volatile boolean running = true; - - public static void processorThread() { - var ke = new KeywordExtractor(); - var se = new SentenceExtractor(lm); - var ps = new PorterStemmer(); - try { 
- TLongHashSet words = new TLongHashSet(10000); - while (running || !processQueue.isEmpty()) { - var job = processQueue.take(); - var sentence = se.extractSentences(Jsoup.parse(job.data)); - - for (var sent : sentence.sentences) { - var keywords = ke.getKeywordsFromSentence(sent); - for (int i = 0; i < keywords.length; i++) { - if (keywords[i].size() > 1) { - words.add(NGramDict.longHash(sent.constructStemmedWordFromSpan(keywords[i]).getBytes())); - } - } - - for (String word : sent.wordsLowerCase) { - words.add(NGramDict.longHash(ps.stem(word).getBytes())); - } - - words.forEach(l -> { - termFreq.merge(l, 1, Integer::sum); - return true; - }); - words.clear(); - } - } - } - catch (InterruptedException ex) { - ex.printStackTrace(); - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java index 3e9424d2..1b35dc12 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java @@ -2,10 +2,10 @@ package nu.marginalia.wmsa.edge.tools; import lombok.AllArgsConstructor; import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.archive.client.ArchiveClient; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; import org.jsoup.Jsoup; import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; @@ -25,7 +25,7 @@ public class ZimConverterMain { static final LinkedBlockingQueue jobQueue = new LinkedBlockingQueue<>(100); static final LinkedBlockingQueue analysisQueue = new LinkedBlockingQueue<>(100); static boolean hasData = true; - static final ArchiveClient archiveClient = new ArchiveClient(); + static final EncyclopediaClient 
encyclopediaClient = new EncyclopediaClient(); static NGramDict dict = new NGramDict(new LanguageModels( Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"), @@ -60,7 +60,7 @@ public class ZimConverterMain { // convertJust("Plotinus"); // convertJust("C++"); convertAll(args); - archiveClient.close(); + encyclopediaClient.close(); } @SneakyThrows @@ -108,7 +108,7 @@ public class ZimConverterMain { } private static void convertAll(String[] args) throws IOException { - archiveClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0])); + encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0])); var zr = new ZIMReader(new ZIMFile(args[1])); // var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim")); @@ -142,7 +142,7 @@ public class ZimConverterMain { }, p -> true); hasData = false; - archiveClient.close(); + encyclopediaClient.close(); } @SneakyThrows diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java new file mode 100644 index 00000000..dd382220 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java @@ -0,0 +1,34 @@ +package nu.marginalia.wmsa.encyclopedia; + +import io.reactivex.rxjava3.core.Observable; +import nu.marginalia.wmsa.client.AbstractDynamicClient; +import nu.marginalia.wmsa.client.HttpStatusCode; +import nu.marginalia.wmsa.configuration.ServiceDescriptor; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; +import okhttp3.MediaType; +import org.eclipse.jetty.util.UrlEncoded; + +import javax.annotation.CheckReturnValue; + +public class EncyclopediaClient extends AbstractDynamicClient { + public EncyclopediaClient() { + super(ServiceDescriptor.ENCYCLOPEDIA); + } + + @CheckReturnValue + public 
Observable submitWiki(Context ctx, String url, String data) {
+        return super.post(ctx, "/wiki/submit?url="+UrlEncoded.encodeString(url), data, MediaType.parse("text/plain; charset=UTF-8"));
+    }
+
+    @CheckReturnValue
+    public Observable hasWiki(Context ctx, String url) {
+        return super.get(ctx, "/wiki/has?url="+ UrlEncoded.encodeString(url), Boolean.class);
+    }
+
+    @CheckReturnValue
+    public Observable encyclopediaLookup(Context ctx, String word) {
+        return super.get(ctx,"/encyclopedia/" + UrlEncoded.encodeString(word), WikiArticles.class);
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java
new file mode 100644
index 00000000..771f29d3
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaDao.java
@@ -0,0 +1,160 @@
+package nu.marginalia.wmsa.encyclopedia;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
+import nu.marginalia.wmsa.edge.assistant.dict.WikiSearchResult;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Title lookups against the {@code REF_WIKI_TITLE} table: exact matches,
+ * redirect resolution and prefix search.
+ */
+public class EncyclopediaDao {
+
+    private final HikariDataSource dataSource;
+    private static final Logger logger = LoggerFactory.getLogger(EncyclopediaDao.class);
+
+    @Inject
+    public EncyclopediaDao(HikariDataSource dataSource) {
+        this.dataSource = dataSource;
+    }
+
+    /**
+     * Lists the distinct {@code NAME_LOWER} values equal to {@code term}, each
+     * re-capitalized part by part.
+     *
+     * @param term title to look up; bound to the query as given
+     * @return the matching titles. If the query fails, the error is logged and a
+     *         freshly constructed {@link WikiArticles} is returned instead
+     *         (its {@code entries} is whatever the no-arg constructor leaves it as).
+     */
+    public WikiArticles encyclopedia(String term) {
+        WikiArticles response = new WikiArticles();
+        response.entries = new ArrayList<>();
+
+        // The statement is a managed resource as well, so it is closed even if the query throws.
+        try (var connection = dataSource.getConnection();
+             var stmt = connection.prepareStatement("SELECT DISTINCT(NAME_LOWER) FROM REF_WIKI_TITLE WHERE NAME_LOWER=?")) {
+            stmt.setString(1, term);
+
+            var rsp = stmt.executeQuery();
+            while (rsp.next()) {
+                response.entries.add(capitalizeWikiString(rsp.getString(1)));
+            }
+        }
+        catch (Exception ex) {
+            logger.error("Failed to fetch articles", ex);
+            return new WikiArticles();
+        }
+
+        return response;
+    }
+
+    /**
+     * Resolves {@code term} to the {@code REF_NAME} stored for it.
+     * <p>
+     * Rows are selected with {@code NAME_LOWER=LOWER(term)}. A row whose
+     * {@code NAME} equals {@code term} exactly, or whose {@code REF_NAME} is
+     * null, ends the search at once with that row's {@code REF_NAME} (a null
+     * {@code REF_NAME} therefore yields an empty result). Otherwise the
+     * {@code REF_NAME} of the first row read is used.
+     *
+     * @return the redirect target, or empty when there is none
+     * @throws RuntimeException wrapping any failure while querying
+     */
+    public Optional<String> resolveEncylopediaRedirect(String term) {
+        final List<String> matches = new ArrayList<>();
+
+        try (var connection = dataSource.getConnection();
+             var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
+            stmt.setString(1, term);
+
+            var rsp = stmt.executeQuery();
+            while (rsp.next()) {
+                final String name = rsp.getString(1);
+                final String refName = rsp.getString(2);
+
+                if (term.equals(name) || refName == null) {
+                    return Optional.ofNullable(refName);
+                }
+                matches.add(refName);
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+
+        if (!matches.isEmpty()) {
+            return Optional.of(matches.get(0));
+        }
+        return Optional.empty();
+    }
+
+
+    /**
+     * Finds a row matching {@code term} case-insensitively (spaces in
+     * {@code term} are replaced by underscores first) that has no
+     * {@code REF_NAME}. Rows with a {@code REF_NAME} are skipped.
+     *
+     * @return the first such row, or empty
+     * @throws RuntimeException wrapping any failure while querying
+     */
+    public Optional<WikiSearchResult> findEncyclopediaPageDirect(String term) {
+
+        try (var connection = dataSource.getConnection();
+             var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
+            stmt.setString(1, term.replace(' ', '_'));
+
+            var rsp = stmt.executeQuery();
+            while (rsp.next()) {
+                String name = rsp.getString(1);
+                String refName = rsp.getString(2);
+
+                if (refName == null) {
+                    return Optional.of(new WikiSearchResult(name, null));
+                }
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+
+        return Optional.empty();
+    }
+
+    /**
+     * Searches titles for {@code term}: an exact case-insensitive match first,
+     * then a prefix match limited to 10 rows. Spaces in {@code term} are
+     * replaced by underscores for both queries.
+     *
+     * @return exact matches without a {@code REF_NAME}, then rows that have a
+     *         {@code REF_NAME}, then the remaining prefix matches; an entry
+     *         equal to one in an earlier group is not repeated
+     * @throws RuntimeException wrapping any failure while querying
+     */
+    public List<WikiSearchResult> findEncyclopediaPages(String term) {
+        final List<WikiSearchResult> directMatches = new ArrayList<>();
+        final Set<WikiSearchResult> directSearchMatches = new HashSet<>();
+        final Set<WikiSearchResult> indirectMatches = new HashSet<>();
+
+        try (var connection = dataSource.getConnection()) {
+
+            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER=LOWER(?)")) {
+                stmt.setString(1, term.replace(' ', '_'));
+
+                var rsp = stmt.executeQuery();
+                while (rsp.next()) {
+                    String name = rsp.getString(1);
+                    String refName = rsp.getString(2);
+
+                    if (refName == null) {
+                        directMatches.add(new WikiSearchResult(name, null));
+                    } else {
+                        indirectMatches.add(new WikiSearchResult(name, refName));
+                    }
+                }
+            }
+
+            try (var stmt = connection.prepareStatement("SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER LIKE ? LIMIT 10")) {
+                // String.replace is literal. The earlier replaceAll("%", "\\%") used "\\%" as a
+                // regex replacement, where the backslash merely escapes the '%', so the
+                // wildcard came out unchanged and was never escaped.
+                // NOTE(review): '_' is a LIKE wildcard too and is still passed through as before;
+                // this also assumes backslash is the active LIKE escape character -- confirm.
+                stmt.setString(1, term.replace(' ', '_').replace("%", "\\%").toLowerCase(Locale.ROOT) + "%");
+
+                var rsp = stmt.executeQuery();
+                while (rsp.next()) {
+                    String name = rsp.getString(1);
+                    String refName = rsp.getString(2);
+
+                    if (refName == null) {
+                        directSearchMatches.add(new WikiSearchResult(name, null));
+                    } else {
+                        indirectMatches.add(new WikiSearchResult(name, refName));
+                    }
+                }
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+
+        // Drop entries already present in an earlier group, then append the groups in order.
+        directMatches.forEach(indirectMatches::remove);
+        indirectMatches.removeAll(directSearchMatches);
+        directMatches.forEach(directSearchMatches::remove);
+        directMatches.addAll(indirectMatches);
+        directMatches.addAll(directSearchMatches);
+        return directMatches;
+    }
+
+    /**
+     * Capitalizes every underscore-separated part of {@code string}: first
+     * character upper-cased, the rest lower-cased; a part shorter than two
+     * characters is upper-cased whole. Because {@code String.split} discards
+     * trailing empty parts, trailing underscores are not kept.
+     */
+    private String capitalizeWikiString(String string) {
+        if (string.contains("_")) {
+            return Arrays.stream(string.split("_")).map(this::capitalizeWikiString).collect(Collectors.joining("_"));
+        }
+        if (string.length() < 2) {
+            // Locale.ROOT keeps the result independent of the JVM's default locale.
+            return string.toUpperCase(Locale.ROOT);
+        }
+        return Character.toUpperCase(string.charAt(0)) + string.substring(1).toLowerCase(Locale.ROOT);
+    }
+
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java
new file mode 100644
index 00000000..eb51915a
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java
@@ -0,0 +1,26 @@
+package nu.marginalia.wmsa.encyclopedia;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import com.google.inject.Injector;
+import nu.marginalia.wmsa.configuration.MainClass;
+import nu.marginalia.wmsa.configuration.ServiceDescriptor;
+import
nu.marginalia.wmsa.configuration.module.ConfigurationModule;
+
+/**
+ * Process entry point for the encyclopedia service.
+ */
+public class EncyclopediaMain extends MainClass {
+    // Assigned in the constructor; not read anywhere in this class.
+    private final EncyclopediaService service;
+
+    /**
+     * Calls {@code init} with {@link ServiceDescriptor#ENCYCLOPEDIA}, builds a Guice
+     * injector from {@link ConfigurationModule} and requests an
+     * {@code EncyclopediaMain} from it, which requires an {@link EncyclopediaService}.
+     */
+    public static void main(String... args) {
+        init(ServiceDescriptor.ENCYCLOPEDIA, args);
+
+        // NOTE(review): ConfigurationModule is the only module installed here; confirm it
+        // supplies every binding EncyclopediaService asks for (e.g. @Named("wiki-path")).
+        Injector injector = Guice.createInjector(
+                new ConfigurationModule());
+        injector.getInstance(EncyclopediaMain.class);
+    }
+
+    @Inject
+    public EncyclopediaMain(EncyclopediaService service) {
+        this.service = service;
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java
new file mode 100644
index 00000000..a0492793
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java
@@ -0,0 +1,202 @@
+package nu.marginalia.wmsa.encyclopedia;
+
+import com.google.inject.name.Named;
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.configuration.server.Initialization;
+import nu.marginalia.wmsa.configuration.server.MetricsServer;
+import nu.marginalia.wmsa.configuration.server.Service;
+import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
+import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * HTTP service for the encyclopedia: registers Spark routes for wiki pages,
+ * wiki search, and the internal has/submit endpoints.
+ */
+public class EncyclopediaService extends Service {
+
+    private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class);
+    private final MustacheRenderer wikiErrorPageRenderer;
+    private final MustacheRenderer
wikiSearchResultRenderer;
+    private Path wikiPath;
+    private EncyclopediaDao encyclopediaDao;
+
+    /**
+     * Stores the wiki directory and DAO, creates the two Mustache renderers
+     * (both left null when {@code rendererFactory} is null) and registers the
+     * Spark routes served by this class.
+     *
+     * @param wikiPath base directory under which wiki pages are looked up
+     * @param rendererFactory source of the page renderers; may be null
+     */
+    // Guice needs an @Inject-annotated constructor when there is no no-arg one; the
+    // annotation is fully qualified because this file only imports com.google.inject.name.Named.
+    @com.google.inject.Inject
+    public EncyclopediaService(@Named("service-host") String ip,
+                             @Named("service-port") Integer port,
+                             @Named("wiki-path") Path wikiPath,
+                             EncyclopediaDao encyclopediaDao,
+                             RendererFactory rendererFactory,
+                             Initialization initialization,
+                             MetricsServer metricsServer)
+            throws IOException {
+        super(ip, port, initialization, metricsServer);
+        this.wikiPath = wikiPath;
+        this.encyclopediaDao = encyclopediaDao;
+
+        if (rendererFactory != null) {
+            wikiErrorPageRenderer = rendererFactory.renderer("encyclopedia/wiki-error");
+            wikiSearchResultRenderer = rendererFactory.renderer("encyclopedia/wiki-search");
+        }
+        else {
+            wikiErrorPageRenderer = null;
+            wikiSearchResultRenderer = null;
+        }
+
+
+        // NOTE(review): EncyclopediaClient.encyclopediaLookup requests "/encyclopedia/{word}",
+        // but no such route is registered below -- confirm whether it is still to be added.
+        Spark.get("/public/wiki/*", this::getWikiPage);
+        Spark.get("/public/wiki-search", this::searchWikiPage);
+
+        Spark.get("/wiki/has", this::pathWikiHas);
+        Spark.post("/wiki/submit", this::pathWikiSubmit);
+    }
+
+
+    /**
+     * Serves {@code /public/wiki/*}. Without a splat the client is redirected
+     * to the start page. Otherwise the name is run through redirect
+     * resolution and the stored page is returned; if none is stored, a
+     * case-insensitive title lookup is tried, and failing that the search
+     * page is rendered.
+     */
+    @SneakyThrows
+    private Object getWikiPage(Request req, Response rsp) {
+        final String[] splats = req.splat();
+        if (splats.length == 0) {
+            rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html");
+            // Stop after redirecting; falling through would index splats[0] on an empty array.
+            return "";
+        }
+
+        final String name = splats[0];
+
+        String pageName = encyclopediaDao.resolveEncylopediaRedirect(name).orElse(name);
+
+        logger.info("Resolved {} -> {}", name, pageName);
+
+        return wikiGet(pageName)
+                .or(() -> resolveWikiPageNameWrongCase(name))
+                .orElseGet(() -> renderSearchPage(name));
+    }
+
+    /**
+     * Fallback for a name that has no stored page as written: looks the title
+     * up through {@code findEncyclopediaPageDirect} and loads the page under
+     * the internal name found. When the lookup finds nothing, the rendered
+     * search page is returned instead.
+     */
+    private Optional<String> resolveWikiPageNameWrongCase(String name) {
+        var rsp = encyclopediaDao.findEncyclopediaPageDirect(name);
+
+        if (rsp.isEmpty()) {
+            return Optional.of(renderSearchPage(name));
+        }
+
+        name = rsp.get().getInternalName();
+        return wikiGet(name);
+    }
+
+    /**
+     * Renders the search template for {@code s} with {@code "error"} set to
+     * {@code "true"} and the DAO's search results.
+     */
+    private String renderSearchPage(String s) {
+        return wikiSearchResultRenderer.render(
+                Map.of("query", s,
+                        "error", "true",
+                        "results", encyclopediaDao.findEncyclopediaPages(s)));
+    }
+
+
@SneakyThrows + private Object searchWikiPage(Request req, Response rsp) { + final var ctx = Context.fromRequest(req); + + String term = req.queryParams("query"); + if (null == term) { + rsp.redirect("https://encyclopedia.marginalia.nu/wiki-start.html"); + return ""; + } + + return wikiSearchResultRenderer.render( + Map.of("query", term, + "results", + encyclopediaDao.findEncyclopediaPages(term)) + ); + } + + + + private Path getWikiFilename(Path base, String url) { + Path p = base; + + int urlHash = url.hashCode(); + + p = p.resolve(Integer.toString(urlHash & 0xFF)); + p = p.resolve(Integer.toString((urlHash>>>8) & 0xFF)); + p = p.resolve(Integer.toString((urlHash>>>16) & 0xFF)); + p = p.resolve(Integer.toString((urlHash>>>24) & 0xFF)); + + String fileName = url.chars() + .mapToObj(this::encodeUrlChar) + .collect(Collectors.joining()); + + if (fileName.length() > 128) { + fileName = fileName.substring(0, 128) + (((long)urlHash)&0xFFFFFFFFL); + } + + return p.resolve(fileName + ".gz"); + } + + + private String encodeUrlChar(int i) { + if (i >= 'a' && i <= 'z') { + return Character.toString(i); + } + if (i >= 'A' && i <= 'Z') { + return Character.toString(i); + } + if (i >= '0' && i <= '9') { + return Character.toString(i); + } + if (i == '.') { + return Character.toString(i); + } + else { + return String.format("%%%2X", i); + } + } + + @SneakyThrows + private Object pathWikiHas(Request request, Response response) { + return Files.exists(getWikiFilename(wikiPath, request.queryParams("url"))); + } + + + @SneakyThrows + private Optional wikiGet(String name) { + + var filename = getWikiFilename(wikiPath, name); + + if (Files.exists(filename)) { + try (var stream = new GZIPInputStream(new FileInputStream(filename.toFile()))) { + return Optional.of(new String(stream.readAllBytes())); + } + } else { + return Optional.empty(); + } + } + + + @SneakyThrows + private Object pathWikiSubmit(Request request, Response response) { + byte[] data = request.bodyAsBytes(); + + String 
wikiUrl = request.queryParams("url"); + Path filename = getWikiFilename(wikiPath, wikiUrl); + + Files.createDirectories(filename.getParent()); + + System.out.println(new String(data)); + logger.debug("Writing {} to {}", wikiUrl, filename); + + try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) { + gos.write(data); + gos.flush(); + } + + return "ok"; + + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java index 2ae44237..17dd7472 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/server/ServiceTest.java @@ -50,8 +50,6 @@ class ServiceTest { new DictionaryService(dataSource, new SpellChecker()), new MathParser(), new Units(new MathParser()), - null, - null, new ScreenshotService(null), null); Spark.awaitInitialization(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/ArchiveTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/ArchiveTest.java deleted file mode 100644 index 2dc15183..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/ArchiveTest.java +++ /dev/null @@ -1,72 +0,0 @@ -package nu.marginalia.wmsa.edge.archive; - -import lombok.SneakyThrows; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.archive.archiver.Archiver; -import nu.marginalia.wmsa.edge.archive.client.ArchiveClient; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; -import spark.Spark; - -import java.io.IOException; -import java.nio.file.Files; 
-import java.nio.file.Path; - -import static nu.marginalia.util.TestUtil.getPort; -import static nu.marginalia.util.test.TestUtil.clearTempDir; - -@Execution(ExecutionMode.SAME_THREAD) -public class ArchiveTest { - static EdgeArchiveService service; - - static final int testPort = getPort(); - private static Path tempPath; - private static Path tempPath2; - private static ArchiveClient archiveClient; - private static Archiver archiver; - - @BeforeAll - public static void setUpClass() throws IOException { - Spark.port(testPort); - System.setProperty("service-name", "edge-archive"); - archiveClient = new ArchiveClient(); - archiveClient.setServiceRoute("127.0.0.1", testPort); - - tempPath = Files.createTempDirectory("archiveTest"); - tempPath2 = Files.createTempDirectory("wikiTest"); - - archiver = new Archiver(tempPath, 10); - service = new EdgeArchiveService("127.0.0.1", testPort, - tempPath, - archiver, - new Initialization(), null); - - Spark.awaitInitialization(); - } - - @AfterAll - public static void tearDown() throws Exception { - archiver.close(); - archiveClient.close(); - clearTempDir(tempPath); - clearTempDir(tempPath2); - } - - @SneakyThrows - @Test - public void testWiki() { - var url = "Plato_(Disambiguation)"; - - Assertions.assertFalse(archiveClient.hasWiki(Context.internal(), url).blockingFirst()); - - archiveClient.submitWiki(Context.internal(), url, "

Hello

").blockingFirst(); - Assertions.assertTrue(archiveClient.hasWiki(Context.internal(), url).blockingFirst()); - Assertions.assertEquals("

Hello

", archiveClient.getWiki(Context.internal(), url).blockingFirst()); - } - -} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiverTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiverTest.java deleted file mode 100644 index 65a2c8e7..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/archive/archiver/ArchiverTest.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.wmsa.edge.archive.archiver; -import org.junit.jupiter.api.*; - -import java.nio.file.Path; - -public class ArchiverTest { - - @Test - public void testArchiver() throws Exception { - Archiver archiver = new Archiver(Path.of("/tmp/"), 3); - archiver.writeData(new ArchivedFile("file1", "Hey".getBytes())); - archiver.writeData(new ArchivedFile("file2", "Hey".getBytes())); - archiver.writeData(new ArchivedFile("file3", "Hey".getBytes())); - archiver.writeData(new ArchivedFile("file4", "Hey".getBytes())); - archiver.close(); - } -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java index 8787cf51..392dbca3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/assistant/AssistantTest.java @@ -60,7 +60,6 @@ class AssistantTest { new DictionaryService(dataSource, new SpellChecker()), new MathParser(), new Units(new MathParser()), - null, null, new ScreenshotService(null), null); Spark.awaitInitialization(); @@ -77,12 +76,6 @@ class AssistantTest { Spark.awaitStop(); } - @Test - public void testEncyclopedia() { - var result = client.encyclopediaLookup(Context.internal(), "plato").blockingFirst(); - System.out.println(result); - assertTrue(result.entries.size() >= 1); - } @Test public void testSpellCheck() { var result = client.spellCheck(Context.internal(), 
"plato").blockingFirst(); From ad4521da9e0cb7f174c41a3b5e5963b067879f5b Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 28 May 2022 00:16:31 +0200 Subject: [PATCH 2/5] WIP: Killing off Archive service, adding new Encyclopedia service consisting largely of what Archive was and a few features from Assistant. --- .../nu/marginalia/wmsa/edge/E2ETestBase.java | 70 +++++++++++++++++++ .../wmsa/edge/EdgeSearchE2ETest.java | 59 ++-------------- .../wmsa/edge/EncyclopediaE2ETest.java | 27 +++++++ marginalia_nu/src/e2e/resources/crawl.sh | 2 +- marginalia_nu/src/e2e/resources/init.sh | 9 ++- .../wmsa/configuration/WmsaHome.java | 24 +++++++ .../configuration/module/DatabaseModule.java | 2 +- .../wmsa/encyclopedia/EncyclopediaMain.java | 1 + .../wmsa/encyclopedia/EncyclopediaModule.java | 18 +++++ .../encyclopedia/EncyclopediaService.java | 2 + 10 files changed, 157 insertions(+), 57 deletions(-) create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaModule.java diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java new file mode 100644 index 00000000..eb534cf9 --- /dev/null +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java @@ -0,0 +1,70 @@ +package nu.marginalia.wmsa.edge; + +import nu.marginalia.wmsa.configuration.ServiceDescriptor; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.BindMode; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.utility.MountableFile; + 
+import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; + +public abstract class E2ETestBase { + public Network network = Network.newNetwork(); + + public GenericContainer getMariaDBContainer() { + return new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetwork(network) + .withNetworkAliases("mariadb"); + } + + public GenericContainer forService(ServiceDescriptor service, GenericContainer mariaDB) { + return new GenericContainer<>("openjdk:17-alpine") + .dependsOn(mariaDB) + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") + .withExposedPorts(service.port) + .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withNetwork(network) + .withNetworkAliases(service.name) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) + .withCommand("sh", "init.sh", service.name) + .waitingFor(Wait.forHttp("/internal/ping") + .forPort(service.port) + .withReadTimeout(Duration.ofSeconds(15))) + ; + } + + public static MountableFile jarFile() { + Path cwd = Path.of(System.getProperty("user.dir")); + + cwd = cwd.resolve(".."); + var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar"); + if (!Files.exists(jarFile)) { + System.err.println("Could not find jarFile " + jarFile); + throw new RuntimeException(); + } + else { + System.out.println("jar file = " + jarFile); + } + return MountableFile.forHostPath(jarFile); + } + + public static String modelsPath() { + Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models"); + if (!Files.isDirectory(modelsPath)) { + System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); + throw new RuntimeException(); + } + return modelsPath.toString(); + } +} diff --git 
a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 8c68b272..5ff1249a 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -32,24 +32,16 @@ import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordi @Tag("e2e") @Testcontainers -public class EdgeSearchE2ETest { - Network network = Network.newNetwork(); +public class EdgeSearchE2ETest extends E2ETestBase { + @Container + public GenericContainer mariaDB = getMariaDBContainer(); @Container - public GenericContainer mariaDB = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("sql/edge-crawler-cache.sql") - .withNetwork(network) - .withNetworkAliases("mariadb"); - + public GenericContainer searchContainer = forService(EDGE_SEARCH, mariaDB); @Container - public GenericContainer searchContainer = forService(EDGE_SEARCH); + public GenericContainer assistantContainer = forService(EDGE_ASSISTANT, mariaDB); @Container - public GenericContainer assistantContainer = forService(EDGE_ASSISTANT); - @Container - public GenericContainer indexContainer = forService(EDGE_INDEX); + public GenericContainer indexContainer = forService(EDGE_INDEX, mariaDB); @Container public NginxContainer mockWikipedia = new NginxContainer<>("nginx:stable") @@ -88,46 +80,7 @@ public class EdgeSearchE2ETest { .withNetwork(network) .withNetworkAliases("proxyNginx"); ; - public GenericContainer forService(ServiceDescriptor service) { - return new GenericContainer<>("openjdk:17-alpine") - .dependsOn(mariaDB) - .withCopyFileToContainer(jarFile(), "/WMSA.jar") - .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") - .withExposedPorts(service.port) - .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", 
BindMode.READ_ONLY) - .withNetwork(network) - .withNetworkAliases(service.name) - .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) - .withCommand("sh", "init.sh", service.name) - .waitingFor(Wait.forHttp("/internal/ping") - .forPort(service.port) - .withReadTimeout(Duration.ofSeconds(15))) - ; - } - public static MountableFile jarFile() { - Path cwd = Path.of(System.getProperty("user.dir")); - - cwd = cwd.resolve(".."); - var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar"); - if (!Files.exists(jarFile)) { - System.err.println("Could not find jarFile " + jarFile); - throw new RuntimeException(); - } - else { - System.out.println("jar file = " + jarFile); - } - return MountableFile.forHostPath(jarFile); - } - - public static String modelsPath() { - Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models"); - if (!Files.isDirectory(modelsPath)) { - System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); - throw new RuntimeException(); - } - return modelsPath.toString(); - } public static MountableFile ipDatabasePath() { Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); if (!Files.isRegularFile(modelsPath)) { diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java new file mode 100644 index 00000000..ea18b2a8 --- /dev/null +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java @@ -0,0 +1,27 @@ +package nu.marginalia.wmsa.edge; + + +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; + +import static 
nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA; + +@Tag("e2e") +@Testcontainers +public class EncyclopediaE2ETest extends E2ETestBase { + @Container + public GenericContainer mariaDB = getMariaDBContainer(); + + @Container + public GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); + + + @Test + public void run() { + } +} diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh index bf503759..3a0e4b01 100644 --- a/marginalia_nu/src/e2e/resources/crawl.sh +++ b/marginalia_nu/src/e2e/resources/crawl.sh @@ -3,7 +3,7 @@ mkdir -p /var/lib/wmsa/conf/ mkdir -p /var/lib/wmsa/data/ -cat > /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/db.properties < /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/disks.properties < /var/lib/wmsa/conf/db.properties < Date: Sat, 28 May 2022 13:51:29 +0200 Subject: [PATCH 3/5] Rewrote Encyclopedia loader, added functioning E2E test for new encyclopedia service --- .../wmsa/edge/EdgeSearchE2ETest.java | 3 - .../wmsa/edge/EncyclopediaE2ETest.java | 47 +++- .../src/e2e/resources/load-encyclopedia.sh | 32 +++ .../src/e2e/resources/nginx/encyclopedia.conf | 40 ++++ .../edge/tools/EncyclopediaLoaderTool.java | 59 +++++ .../wmsa/edge/tools/ZimConverterMain.java | 211 ------------------ .../wmsa/encyclopedia/EncyclopediaMain.java | 2 + .../encyclopedia/EncyclopediaService.java | 3 +- .../main/resources/sql/edge-crawler-cache.sql | 27 ++- .../src/main/resources/sql/reference-data.sql | 1 - 10 files changed, 206 insertions(+), 219 deletions(-) create mode 100644 marginalia_nu/src/e2e/resources/load-encyclopedia.sh create mode 100644 marginalia_nu/src/e2e/resources/nginx/encyclopedia.conf create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java diff --git 
a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index 5ff1249a..20103f15 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge; import nu.marginalia.util.test.TestUtil; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain; import org.jsoup.Jsoup; import org.junit.jupiter.api.Tag; @@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.utility.MountableFile; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -28,7 +26,6 @@ import java.util.ArrayList; import java.util.List; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; -import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL; @Tag("e2e") @Testcontainers diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java index ea18b2a8..03bc774e 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java @@ -3,11 +3,21 @@ package nu.marginalia.wmsa.edge; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.openqa.selenium.By; +import org.openqa.selenium.chrome.ChromeOptions; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.BindMode; +import org.testcontainers.containers.BrowserWebDriverContainer; import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.MariaDBContainer; -import 
org.testcontainers.containers.Network; +import org.testcontainers.containers.NginxContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.MountableFile; + +import java.nio.file.Path; +import java.time.Duration; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA; @@ -19,9 +29,42 @@ public class EncyclopediaE2ETest extends E2ETestBase { @Container public GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); + @Container + public GenericContainer encyclopediaLoader = new GenericContainer<>("openjdk:17-alpine") + .dependsOn(encyclopediaContainer) + .dependsOn(mariaDB) + .withNetwork(network) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader"))) + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh") + .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY) + .withCommand("sh", "load-encyclopedia.sh") + .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); + @Container + public NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") + .dependsOn(encyclopediaLoader) + .dependsOn(encyclopediaContainer) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) + .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf") + .withNetwork(network) + .withNetworkAliases("proxyNginx"); + + @Container + public BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() + .withNetwork(network) + .withCapabilities(new ChromeOptions()); + + private Path getModelData() { + return 
Path.of(System.getProperty("user.dir")).resolve("data/test"); + } @Test public void run() { + var driver = chrome.getWebDriver(); + + driver.get("http://proxyNginx/wiki/Frog"); + System.out.println(driver.getTitle()); + System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); } } diff --git a/marginalia_nu/src/e2e/resources/load-encyclopedia.sh b/marginalia_nu/src/e2e/resources/load-encyclopedia.sh new file mode 100644 index 00000000..9700f0de --- /dev/null +++ b/marginalia_nu/src/e2e/resources/load-encyclopedia.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +mkdir -p /var/lib/wmsa/conf/ +mkdir -p /var/lib/wmsa/data/ +mkdir -p /data + +cat > /var/lib/wmsa/conf/db.properties < /var/lib/wmsa/conf/hosts < { + if (art != null) { + try { + sem.acquire(); + + pool.execute(() -> { + try { + convert(url, art); + } finally { + sem.release(); + } + }); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + }, p -> true); + + sem.acquire(12); + + encyclopediaClient.close(); + } + + private static void convert(String url, String art) { + String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art); + + if (null != newData) { + encyclopediaClient.submitWiki(Context.internal(), url, newData) + .retry(5) + .blockingSubscribe(); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java deleted file mode 100644 index 1b35dc12..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java +++ /dev/null @@ -1,211 +0,0 @@ -package nu.marginalia.wmsa.edge.tools; - -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; -import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner; -import nu.marginalia.util.language.conf.LanguageModels; -import 
nu.marginalia.wmsa.encyclopedia.EncyclopediaClient; -import org.jsoup.Jsoup; -import org.openzim.ZIMTypes.ZIMFile; -import org.openzim.ZIMTypes.ZIMReader; - -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.LinkedBlockingQueue; - -public class ZimConverterMain { - - static final LinkedBlockingQueue jobQueue = new LinkedBlockingQueue<>(100); - static final LinkedBlockingQueue analysisQueue = new LinkedBlockingQueue<>(100); - static boolean hasData = true; - static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient(); - static NGramDict dict = new NGramDict(new LanguageModels( - Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"), - Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"), - Path.of("/var/lib/wmsa/model/English.RDR"), - Path.of("/var/lib/wmsa/model/English.DICT"), - Path.of("/var/lib/wmsa/model/opennlp-tok.bin") - ) - ); - public void extractUrlList() throws IOException { - var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim")); - - var urlList = zr.getURLListByURL(); - - try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) { - zr.forEachTitles( - ae -> { - pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\"")); - }, - re -> { - pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\"")); - } - ); - } - } - - public static void main(String[] args) throws IOException { -// convertJust("Aleph_number"); -// 
convertJust("Floyd–Steinberg_dithering"); -// convertJust("Laplace's_equation"); -// convertJust("John_Fahey"); -// convertJust("Plotinus"); -// convertJust("C++"); - convertAll(args); - encyclopediaClient.close(); - } - - @SneakyThrows - private static void convertJust(String url) { - String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, - Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html"))); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData); - } - - private static void extractOne(String which, int clusterId) throws IOException { -// var zr = new ZIMReader(new ZIMFile(args[1])); - var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim")); - - int[] cluster = new int[] { clusterId }; - if (clusterId == -1) { - zr.forEachTitles(ae -> { - if (ae.getUrl().equals(which)) { - System.err.print(ae.getUrl() + " " + ae.getClusterNumber()); - cluster[0] = ae.getClusterNumber(); - } - }, re -> { - }); - } - - System.err.println("Extracting cluster " + cluster[0] ); - if (cluster[0] == -1) { - return; - } - zr.forEachArticles((url, art) -> { - if (art != null) { - if (which.equals(url)) { - try { - Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/","in-" + url + ".html"), art); - String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art); - Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData); - } catch (IOException e) { - e.printStackTrace(); - } - - } - scheduleJob(url, art); - } - }, p -> p == cluster[0]); - - } - - private static void convertAll(String[] args) throws IOException { - encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0])); - var zr = new ZIMReader(new ZIMFile(args[1])); -// var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim")); - - for (int i = 0; i < 8; i++) 
{ - Thread t = new Thread(ZimConverterMain::jobExecutor); - t.setName("Converter"); - t.start(); - - Thread t2 = new Thread(() -> { - for (; ; ) { - String pt; - try { - pt = analysisQueue.take(); - } catch (InterruptedException e) { - e.printStackTrace(); - return; - } -// var topic = new TopicWordExtractor().extractWords(pt); -// var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt); -// System.out.println(Strings.join(words, ',')); - } - }); - t2.setName("Analysis"); - t2.start(); - } - - zr.forEachArticles((url, art) -> { - if (art != null) { - scheduleJob(url, art); - } - }, p -> true); - - hasData = false; - encyclopediaClient.close(); - } - - @SneakyThrows - private static void jobExecutor() { - while (hasData || !jobQueue.isEmpty()) { - var job = jobQueue.take(); - try { - job.convert(); - } - catch (Exception ex) { - System.err.println("Error in " + job.url); - ex.printStackTrace(); - } - } - } - - @SneakyThrows - private static void scheduleJob(String url, String art) { - jobQueue.put(new ConversionJob(art, url)); - } - - static final Map wordCount = new ConcurrentHashMap<>(); - static boolean isKeyword(String word) { - - int limit = 100_000; - long n = word.chars().filter(c -> c=='_').count(); - if (n == 0) limit = 2; - if (n == 1) limit = 1; - if (n == 2) limit = 1; - if (n >= 3) limit = 1; - - long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count(); - if (c-2 <= n) { - return false; - } - int hashA = word.hashCode(); - int hashB = Objects.hash(n, c, word.length(), word.charAt(0)); - long hash = (long) hashA + ((long) hashB << 32); - - return wordCount.compute(hash, (k, v) -> v == null ? 
1 : v+1) == limit; - } - @AllArgsConstructor - private static class ConversionJob { - private final String data; - private final String url; - - - public void convert() throws InterruptedException { - var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data); - String pt = Jsoup.parse(page).text(); - analysisQueue.put(pt); - - /* - - String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data); - - - if (null != newData) { - archiveClient.submitWiki(Context.internal(), url, newData) - .retry(5) - .blockingSubscribe(); - - }*/ - } - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java index 7f84e6c1..ee364dcc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java @@ -6,6 +6,7 @@ import com.google.inject.Injector; import nu.marginalia.wmsa.configuration.MainClass; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.module.ConfigurationModule; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; public class EncyclopediaMain extends MainClass { private final EncyclopediaService service; @@ -15,6 +16,7 @@ public class EncyclopediaMain extends MainClass { Injector injector = Guice.createInjector( new EncyclopediaModule(), + new DatabaseModule(), new ConfigurationModule()); injector.getInstance(EncyclopediaMain.class); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java index 5f3b8519..15219021 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java @@ -62,6 +62,8 @@ public class EncyclopediaService extends Service { Spark.get("/wiki/has", this::pathWikiHas); Spark.post("/wiki/submit", this::pathWikiSubmit); + + Spark.awaitInitialization(); } @@ -190,7 +192,6 @@ public class EncyclopediaService extends Service { Files.createDirectories(filename.getParent()); - System.out.println(new String(data)); logger.debug("Writing {} to {}", wikiUrl, filename); try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) { diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 1460375c..25df8be4 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED); CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE); -CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP); \ No newline at end of file +CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP); + +---; + +DROP TABLE IF EXISTS REF_DICTIONARY; + +CREATE TABLE IF NOT EXISTS REF_DICTIONARY( + TYPE VARCHAR(16), + WORD VARCHAR(255), + DEFINITION VARCHAR(255) +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; + +CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD); + +CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE( + NAME VARCHAR(255), + NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)), + REF_NAME VARCHAR(255) +) +CHARACTER SET utf8mb4 +COLLATE utf8mb4_unicode_ci; + +CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER); +CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME); \ No newline at end of file diff --git a/marginalia_nu/src/main/resources/sql/reference-data.sql 
b/marginalia_nu/src/main/resources/sql/reference-data.sql index 52d9abbb..733504ac 100644 --- a/marginalia_nu/src/main/resources/sql/reference-data.sql +++ b/marginalia_nu/src/main/resources/sql/reference-data.sql @@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE( CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; - CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER); CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME); \ No newline at end of file From 0acdd5b6604a044760e57d27b9de54ba33168890 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 28 May 2022 13:59:50 +0200 Subject: [PATCH 4/5] Switch to beefier docker image to fix 'Could not initialize class sun.awt.X11FontManager' for math rendering in Encyclopedia test. --- .../e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java index 03bc774e..e244b44d 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java @@ -30,7 +30,7 @@ public class EncyclopediaE2ETest extends E2ETestBase { @Container public GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); @Container - public GenericContainer encyclopediaLoader = new GenericContainer<>("openjdk:17-alpine") + public GenericContainer encyclopediaLoader = new GenericContainer<>("openjdk:17") .dependsOn(encyclopediaContainer) .dependsOn(mariaDB) .withNetwork(network) From 5a1ec53a84d51e4177c6762809536188fb31d209 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sat, 28 May 2022 14:35:32 +0200 Subject: [PATCH 5/5] WIP: Encyclopedia service --- .../nu/marginalia/wmsa/edge/E2ETestBase.java | 2 +- .../wmsa/edge/EncyclopediaE2ETest.java | 95 +++++++++++++++++-- 
.../encyclopedia/EncyclopediaService.java | 6 +- .../main/resources/sql/edge-crawler-cache.sql | 2 +- 4 files changed, 95 insertions(+), 10 deletions(-) diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java index eb534cf9..d86e85e6 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/E2ETestBase.java @@ -17,7 +17,7 @@ import java.time.Duration; public abstract class E2ETestBase { public Network network = Network.newNetwork(); - public GenericContainer getMariaDBContainer() { + public MariaDBContainer getMariaDBContainer() { return new MariaDBContainer<>("mariadb") .withDatabaseName("WMSA_prod") .withUsername("wmsa") diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java index e244b44d..69170aa3 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java @@ -1,31 +1,44 @@ package nu.marginalia.wmsa.edge; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.mariadb.jdbc.Driver; import org.openqa.selenium.By; import org.openqa.selenium.chrome.ChromeOptions; import org.slf4j.LoggerFactory; -import org.testcontainers.containers.BindMode; -import org.testcontainers.containers.BrowserWebDriverContainer; -import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.NginxContainer; +import org.testcontainers.containers.*; import org.testcontainers.containers.output.Slf4jLogConsumer; import 
org.testcontainers.containers.wait.strategy.Wait; import org.testcontainers.junit.jupiter.Container; import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.utility.MountableFile; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.file.Path; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Types; import java.time.Duration; +import java.util.concurrent.TimeUnit; import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; @Tag("e2e") @Testcontainers public class EncyclopediaE2ETest extends E2ETestBase { @Container - public GenericContainer mariaDB = getMariaDBContainer(); + public MariaDBContainer mariaDB = getMariaDBContainer(); @Container public GenericContainer encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB); @@ -55,16 +68,84 @@ public class EncyclopediaE2ETest extends E2ETestBase { .withNetwork(network) .withCapabilities(new ChromeOptions()); + private Gson gson = new GsonBuilder().create(); + private OkHttpClient httpClient = new OkHttpClient.Builder() + .connectTimeout(100, TimeUnit.MILLISECONDS) + .readTimeout(6000, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) + .followRedirects(true) + .build(); + private Path getModelData() { return Path.of(System.getProperty("user.dir")).resolve("data/test"); } @Test - public void run() { + public void run() throws MalformedURLException { + new Driver(); + + try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa"); + var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) { + + stmt.setString(1, "Forg"); + stmt.setString(2, "Frog"); + stmt.executeUpdate(); + + stmt.setString(1, "Frog"); + stmt.setNull(2, Types.VARCHAR); + stmt.executeUpdate(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } + var driver 
= chrome.getWebDriver(); driver.get("http://proxyNginx/wiki/Frog"); System.out.println(driver.getTitle()); - System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + driver.get("http://proxyNginx/wiki-search?query=Forg"); + System.out.println(driver.getTitle()); + + assertTrue(get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/wiki/has?url=Frog", Boolean.class)); + + assertFalse(get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/wiki/has?url=Marginalia", Boolean.class)); + + assertFalse(get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/wiki/has?url=Marginalia", Boolean.class)); + + + + var resultsForMarginalia = get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/encyclopedia/Marginalia", WikiArticles.class); + Assertions.assertTrue(resultsForMarginalia.getEntries().isEmpty()); + + var resultsForFrog = get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/encyclopedia/Frog", WikiArticles.class); + Assertions.assertFalse(resultsForFrog.getEntries().isEmpty()); + + var resultsForFoRg = get(encyclopediaContainer.getHost(), + encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port), + "/encyclopedia/Forg", WikiArticles.class); + Assertions.assertFalse(resultsForFoRg.getEntries().isEmpty()); + + + } + + + private T get(String host, Integer mappedPort, String path, Class clazz) throws MalformedURLException { + var req = new Request.Builder().get().url(new URL("http", host, mappedPort, path)).build(); + var call = httpClient.newCall(req); + try (var rsp = call.execute()) { + return gson.fromJson(rsp.body().charStream(), clazz); + } catch (IOException e) { + throw new RuntimeException(e); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java index 15219021..156e2215 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java @@ -1,5 +1,7 @@ package nu.marginalia.wmsa.encyclopedia; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; import com.google.inject.Inject; import com.google.inject.name.Named; import lombok.SneakyThrows; @@ -31,6 +33,8 @@ public class EncyclopediaService extends Service { private static final Logger logger = LoggerFactory.getLogger(EncyclopediaService.class); private final MustacheRenderer wikiErrorPageRenderer; private final MustacheRenderer wikiSearchResultRenderer; + private final Gson gson = new GsonBuilder().create(); + private Path wikiPath; private EncyclopediaDao encyclopediaDao; @@ -62,11 +66,11 @@ public class EncyclopediaService extends Service { Spark.get("/wiki/has", this::pathWikiHas); Spark.post("/wiki/submit", this::pathWikiSubmit); + Spark.get("/encyclopedia/:term", (rq, rsp) -> encyclopediaDao.encyclopedia(rq.params("term")), gson::toJson); Spark.awaitInitialization(); } - @SneakyThrows private Object getWikiPage(Request req, Response rsp) { final String[] splats = req.splat(); diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 25df8be4..fc9e515d 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -270,7 +270,7 @@ COLLATE utf8mb4_unicode_ci; CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD); -CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE( +CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE ( NAME VARCHAR(255), NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)), REF_NAME VARCHAR(255)