diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java deleted file mode 100644 index d05be9b5..00000000 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeWordWordConsineSimilarityMain.java +++ /dev/null @@ -1,245 +0,0 @@ -package nu.marginalia.browse.experimental; - -import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -import it.unimi.dsi.fastutil.ints.IntSet; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import lombok.SneakyThrows; -import org.roaringbitmap.RoaringBitmap; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.*; -import java.util.function.Consumer; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static nu.marginalia.browse.experimental.AndCardIntSet.andCardinality; -import static nu.marginalia.browse.experimental.AndCardIntSet.weightedProduct; - -public class EdgeWordWordConsineSimilarityMain { - final Object2IntOpenHashMap stringIds; - final AndCardIntSet[] dToSMap; - final float[] weights; - final boolean useWeights = false; - - enum Direction { - S_TO_D, - D_TO_S - } - - final Direction direction = Direction.D_TO_S; - - public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException { - System.out.println("String IDs"); - stringIds = mapStringsToIds(dataFile); - - System.out.println("DtoS Map"); - dToSMap = constructDtoSMap(dataFile, stringIds); - - System.out.println("Weights"); - - if (useWeights) { - weights = new float[stringIds.size()]; - for (int i = 0; i < stringIds.size(); i++) { - weights[i] = getWeight(i); - } - } - else { - weights = null; - } - - System.out.println("Ready"); - } - - private Object2IntOpenHashMap mapStringsToIds(Path dataFile) throws IOException { - Object2IntOpenHashMap stringIds = new Object2IntOpenHashMap<>(15_000_000); - - try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) { - lines.forEach(line -> { - int tab = line.indexOf('\t'); - if (tab <= 0) - return; - - // direction doesn't matter here - String from = line.substring(0, tab); - String to = line.substring(tab + 1); - - stringIds.putIfAbsent(from, stringIds.size()); - stringIds.putIfAbsent(to, stringIds.size()); - }); - } - return stringIds; - } - - private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap stringIds) throws IOException { - Map tmpMap = new HashMap<>(15_000_000); - - try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) { - lines.forEach(line -> { - int tab = line.indexOf('\t'); - if (tab <= 0) return; - - String from, to; - if (direction == Direction.S_TO_D) { - from = line.substring(0, tab); - to = line.substring(tab + 1); - } - else { - from = line.substring(tab + 1); - to = line.substring(0, tab); - } - - tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from)); - }); - } - - AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()]; - tmpMap.entrySet().stream() - .filter(e -> isEligible(e.getValue())) - .forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue())); - - return dToSMap; - } - - private boolean isEligible(RoaringBitmap value) { - int cardinality = value.getCardinality(); - - return cardinality > 50; - } - - @SneakyThrows - public void tryDomains(String... word) { - - System.out.println(Arrays.toString(word)); - - int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray(); - - long start = System.currentTimeMillis(); - findAdjacentDtoS(new IntOpenHashSet(domainIds), similarities -> { - Set ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet()); - - Map reveseIds = new HashMap<>(similarities.similarities.size()); - - stringIds.forEach((str, id) -> { - if (ids.contains(id)) { - reveseIds.put(id, str); - } - }); - - for (var similarity : similarities.similarities()) { - System.out.println(reveseIds.get(similarity.id) + "\t" + dToSMap[similarity.id].getCardinality() + "\t" + prettyPercent(similarity.value)); - } - }); - - System.out.println(System.currentTimeMillis() - start); - } - - private String prettyPercent(double val) { - return String.format("%2.2f%%", 100. * val); - } - - - public RoaringBitmap createBitmapWithSelf(int val) { - var bm = new RoaringBitmap(); - bm.add(val); - return bm; - } - - double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) { - double andCardinality = andCardinality(a, b); - andCardinality /= Math.sqrt(a.getCardinality()); - andCardinality /= Math.sqrt(b.getCardinality()); - return andCardinality; - } - - double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) { - return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights)); - } - - float getWeight(int i) { - var vector = dToSMap[i]; - - if (vector == null) return 1.0f; - return 1.0f / (float) Math.log(2+vector.getCardinality()); - } - - record Similarities(int id, List similarities) {}; - record Similarity(int id, double value) {}; - - @SneakyThrows - private void findAdjacentDtoS(IntSet ids, Consumer andThen) { - - - AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new); - for (var vector : vectors) { - if (null == vector) - return; - } - - var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow(); - - List similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj( - id -> vectorSimilarity(ids, vector, id)) - .filter(Objects::nonNull) - .sorted(Comparator.comparing(Similarity::value)) - .toList(); - - - andThen.accept(new Similarities(0, similarities)); - } - - double cardinalityLimit = 0.1; - - private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) { - - /* The minimum cardinality a vector can have so that - * - * a (x) b - * ------- < k is given by k^2 - * |a||b| - * - */ - - final double cardMin = Math.min(2, cardinalityLimit * cardinalityLimit * vector.getCardinality()); - - if (ids.contains(id) || id >= dToSMap.length) - return null; - - var otherVec = dToSMap[id]; - if (otherVec == null || otherVec.getCardinality() < cardMin) - return null; - - double similarity = cosineSimilarity(vector, otherVec); - - if (similarity > 0.1) { - if (useWeights) { - var recalculated = expensiveCosineSimilarity(vector, otherVec); - if (recalculated > 0.1) { - return new Similarity(id, recalculated); - } - } - else { - return new Similarity(id, similarity); - } - } - - return null; - } - - public static void main(String[] args) throws IOException { - - var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0])); - - for (;;) { - String line = System.console().readLine("Words> "); - if (line == null || line.isBlank()) { - break; - } - - main.tryDomains(line.split("\\s+")); - } - } - -} diff --git a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java index cfbaeec6..17600cb9 100644 --- a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java +++ b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java @@ -29,7 +29,7 @@ public class LoadTestMain { for (int i = 0; i < 10000; i++) { String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted( - Strings.join(pickNCommonWords(4), '+') + Strings.join(pickNCommonWords(3), '+') ); HttpRequest req = HttpRequest.newBuilder(new URI(uri)) diff --git a/tools/screenshot/build.gradle b/code/tools/screenshot-capture-tool/build.gradle similarity index 100% rename from tools/screenshot/build.gradle rename to code/tools/screenshot-capture-tool/build.gradle diff --git a/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java b/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java similarity index 96% rename from tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java rename to code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java index 24c771a8..e52be486 100644 --- a/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java +++ b/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java @@ -50,9 +50,8 @@ public class ScreenshotCaptureToolMain { @NotNull private static ChromeDriver initChromeDriver() { - System.setProperty("webdriver.chrome.driver", "/chromedriver"); + System.setProperty("webdriver.chrome.driver", "./chromedriver"); ChromeOptions options = new ChromeOptions(); - options.setBinary("/usr/bin/chromium-browser"); options.setPageLoadStrategy(PageLoadStrategy.NORMAL); options.setPageLoadTimeout(Duration.ofSeconds(30)); @@ -66,7 +65,8 @@ public class ScreenshotCaptureToolMain { "high-dpi-support=0.5", "disable-gpu", "disable-dev-shm-usage", - "disable-software-rasterizer"); + "disable-software-rasterizer" + ); return new ChromeDriver(options); } diff --git a/tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java b/code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java similarity index 100% rename from tools/screenshot/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java rename to code/tools/screenshot-capture-tool/src/main/java/nu/marginalia/screenshot/ScreenshotLoaderMain.java diff --git a/code/tools/website-adjacencies-calculator/build.gradle b/code/tools/website-adjacencies-calculator/build.gradle new file mode 100644 index 00000000..99fca87e --- /dev/null +++ b/code/tools/website-adjacencies-calculator/build.gradle @@ -0,0 +1,56 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'application' + id 'jvm-test-suite' +} + +application { + mainClass = 'nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator' + applicationName = 'website-adjacencies-calculator' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation project(':code:common:model') + implementation project(':code:common:db') + implementation project(':code:common:service') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.roaringbitmap + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.mariadb + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +configurations { + e2eTestImplementation.extendsFrom(testImplementation) + +} + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/tools/website-adjacencies-calculator/readme.md b/code/tools/website-adjacencies-calculator/readme.md new file mode 100644 index 00000000..baeec469 --- /dev/null +++ b/code/tools/website-adjacencies-calculator/readme.md @@ -0,0 +1,8 @@ +# Website Adjacencies Calculator + +This job updates the website similarity table based on the data in the domain and links-tables in the URL database. + +It performs a brute force cosine similarity calculation across the entire link graph. + +These adjacencies power the [explorer service](../../services-satellite/explorer-service) and +[random websites](../../features-search/random-websites)-functionality. \ No newline at end of file diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AndCardIntSet.java similarity index 99% rename from code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java rename to code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AndCardIntSet.java index 645618aa..08fa8c82 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/AndCardIntSet.java +++ b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/AndCardIntSet.java @@ -1,4 +1,4 @@ -package nu.marginalia.browse.experimental; +package nu.marginalia.adjacencies; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java similarity index 91% rename from code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java rename to code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java index 77b52e8c..81262708 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/experimental/EdgeDomainLinkConsineSimilarityMain.java +++ b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java @@ -1,4 +1,4 @@ -package nu.marginalia.browse.experimental; +package nu.marginalia.adjacencies; import com.zaxxer.hikari.HikariDataSource; import gnu.trove.map.hash.TIntIntHashMap; @@ -18,9 +18,9 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; -import static nu.marginalia.browse.experimental.AndCardIntSet.*; +import static nu.marginalia.adjacencies.AndCardIntSet.*; -public class EdgeDomainLinkConsineSimilarityMain { +public class WebsiteAdjacenciesCalculator { ArrayList idsList = new ArrayList<>(100_000); ArrayList itemsList = new ArrayList<>(100_000); TIntObjectHashMap dToSMap = new TIntObjectHashMap<>(100_000); @@ -31,7 +31,7 @@ public class EdgeDomainLinkConsineSimilarityMain { private HikariDataSource dataSource; - public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException { + public WebsiteAdjacenciesCalculator(HikariDataSource dataSource) throws SQLException { this.dataSource = dataSource; Map tmpMap = new HashMap<>(100_000); @@ -140,14 +140,23 @@ public class EdgeDomainLinkConsineSimilarityMain { public void insertThreadRun() { try (var conn = dataSource.getConnection(); + var s = conn.createStatement(); var stmt = conn.prepareStatement( """ - INSERT INTO EC_DOMAIN_NEIGHBORS_2 + INSERT INTO EC_DOMAIN_NEIGHBORS_TMP (DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS) VALUES (?, ?, ?) - ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS)) + ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_TMP.RELATEDNESS, VALUES(RELATEDNESS)) """) ) { + + s.execute(""" + DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_TMP + """); + s.execute(""" + CREATE TABLE EC_DOMAIN_NEIGHBORS_TMP LIKE EC_DOMAIN_NEIGHBORS_2 + """); + while (running || !similaritiesLinkedBlockingDeque.isEmpty()) { var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS); if (item == null) continue; @@ -160,6 +169,14 @@ public class EdgeDomainLinkConsineSimilarityMain { } stmt.executeBatch(); } + + s.execute(""" + DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_2 + """); + s.execute(""" + RENAME TABLE EC_DOMAIN_NEIGHBORS_TMP TO EC_DOMAIN_NEIGHBORS_2 + """); + } catch (SQLException | InterruptedException e) { throw new RuntimeException(e); } @@ -285,7 +302,7 @@ public class EdgeDomainLinkConsineSimilarityMain { public static void main(String[] args) throws SQLException { DatabaseModule dm = new DatabaseModule(); - var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection()); + var main = new WebsiteAdjacenciesCalculator(dm.provideConnection()); if (args.length == 0) { main.loadAll(); } diff --git a/code/features-search/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java b/code/tools/website-adjacencies-calculator/src/test/java/nu/marginalia/adjacencies/AndCardIntSetTest.java similarity index 87% rename from code/features-search/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java rename to code/tools/website-adjacencies-calculator/src/test/java/nu/marginalia/adjacencies/AndCardIntSetTest.java index 65f83952..535e9123 100644 --- a/code/features-search/random-websites/src/test/java/nu/marginalia/experimental/AndCardIntSetTest.java +++ b/code/tools/website-adjacencies-calculator/src/test/java/nu/marginalia/adjacencies/AndCardIntSetTest.java @@ -1,7 +1,5 @@ -package nu.marginalia.experimental; +package nu.marginalia.adjacencies; -import nu.marginalia.browse.experimental.AndCardIntSet; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/settings.gradle b/settings.gradle index 0868e447..68ce9842 100644 --- a/settings.gradle +++ b/settings.gradle @@ -63,6 +63,8 @@ include 'code:process-models:crawling-model' include 'code:tools:term-frequency-extractor' include 'code:tools:crawl-job-extractor' include 'code:tools:experiment-runner' +include 'code:tools:website-adjacencies-calculator' +include 'code:tools:screenshot-capture-tool' include 'code:tools:load-test' include 'third-party:porterstemmer' @@ -74,7 +76,6 @@ include 'third-party:openzim' include 'third-party:count-min-sketch' include 'third-party:monkey-patch-opennlp' -include 'tools:screenshot' dependencyResolutionManagement { @@ -163,8 +164,8 @@ dependencyResolutionManagement { library('junit.jupiter.engine','org.junit.jupiter','junit-jupiter-engine').version('') library('mockito','org.mockito','mockito-junit-jupiter').version('4.5.1') - library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.5.3') - library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.5.3') + library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.8.2') + library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.8.2') library('handlebars','com.github.jknack','handlebars').version('4.3.1') library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1')