Refactor website screenshot tool and website adjacencies calculator into code/tools.

This commit is contained in:
Viktor Lofgren 2023-04-11 16:20:27 +02:00
parent 502713f7a8
commit 3e9b37c264
11 changed files with 98 additions and 263 deletions

View File

@ -1,245 +0,0 @@
package nu.marginalia.browse.experimental;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import lombok.SneakyThrows;
import org.roaringbitmap.RoaringBitmap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static nu.marginalia.browse.experimental.AndCardIntSet.andCardinality;
import static nu.marginalia.browse.experimental.AndCardIntSet.weightedProduct;
/**
 * Interactive console experiment that looks up strings related to a set of
 * query strings using cosine similarity over bit-set vectors.
 *
 * <p>The input is a UTF-8 text file with one tab-separated pair per line.
 * Every distinct string on either side of the tab is assigned a dense integer
 * id. For each string on the "to" side of a pair (which column that is depends
 * on {@link #direction}) a bitmap is built that contains its own id plus the
 * ids of every string it was paired with. Only bitmaps with more than 50
 * entries are kept.
 */
public class EdgeWordWordConsineSimilarityMain {
    /** String to dense id; ids are assigned in order of first appearance in the data file. */
    final Object2IntOpenHashMap<String> stringIds;

    /** Vector per string id; {@code null} for ids whose bitmap was not eligible. */
    final AndCardIntSet[] dToSMap;

    /** Per-id weights, or {@code null} when {@link #useWeights} is false. */
    final float[] weights;

    final boolean useWeights = false;

    enum Direction {
        S_TO_D,
        D_TO_S
    }

    final Direction direction = Direction.D_TO_S;

    /**
     * Similarity cut-off k. A candidate is reported only when its similarity is
     * strictly greater than this value; the same value drives the cardinality
     * pre-filter in {@link #vectorSimilarity}.
     */
    double cardinalityLimit = 0.1;

    /**
     * Loads the data file twice: once to assign ids and once to build the vectors.
     * Progress markers are printed to standard out between the phases.
     *
     * @param dataFile tab-separated pair file, read as UTF-8
     * @throws IOException if the file cannot be read
     */
    public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException {
        System.out.println("String IDs");
        stringIds = mapStringsToIds(dataFile);

        System.out.println("DtoS Map");
        dToSMap = constructDtoSMap(dataFile, stringIds);

        System.out.println("Weights");
        if (useWeights) {
            weights = new float[stringIds.size()];
            for (int i = 0; i < stringIds.size(); i++) {
                weights[i] = getWeight(i);
            }
        }
        else {
            weights = null;
        }

        System.out.println("Ready");
    }

    /**
     * Assigns an id to every distinct string found on either side of the tab.
     * Lines without a tab, or with the tab as the first character, are skipped.
     */
    private Object2IntOpenHashMap<String> mapStringsToIds(Path dataFile) throws IOException {
        Object2IntOpenHashMap<String> stringIds = new Object2IntOpenHashMap<>(15_000_000);

        try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
            lines.forEach(line -> {
                int tab = line.indexOf('\t');
                if (tab <= 0) {
                    return;
                }

                // direction doesn't matter here
                String from = line.substring(0, tab);
                String to = line.substring(tab + 1);

                // The current size is the next free id since ids are dense from 0
                stringIds.putIfAbsent(from, stringIds.size());
                stringIds.putIfAbsent(to, stringIds.size());
            });
        }

        return stringIds;
    }

    /**
     * Builds the id-indexed vector array from the pair file.
     *
     * @return an array of length {@code stringIds.size()}; entries whose bitmap
     *         fails {@link #isEligible} are left {@code null}
     */
    private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap<String> stringIds) throws IOException {
        Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(15_000_000);

        try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
            lines.forEach(line -> {
                int tab = line.indexOf('\t');
                if (tab <= 0) {
                    return;
                }

                String from, to;
                if (direction == Direction.S_TO_D) {
                    from = line.substring(0, tab);
                    to = line.substring(tab + 1);
                }
                else {
                    from = line.substring(tab + 1);
                    to = line.substring(0, tab);
                }

                // Each bitmap is seeded with its own key, then collects the "from" ids
                tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from));
            });
        }

        AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()];
        tmpMap.entrySet().stream()
                .filter(e -> isEligible(e.getValue()))
                .forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue()));
        return dToSMap;
    }

    /** A bitmap is kept only when it has more than 50 entries. */
    private boolean isEligible(RoaringBitmap value) {
        int cardinality = value.getCardinality();
        return cardinality > 50;
    }

    /**
     * Prints the strings most similar to the combination of the given words,
     * followed by the elapsed time in milliseconds.
     *
     * <p>Words that are not present in the data file are reported and abort the
     * lookup; previously they were silently resolved to id 0 because
     * {@code getInt} returns the map's default value for missing keys.
     *
     * @param word the query strings; the lookup uses the intersection of their vectors
     */
    public void tryDomains(String... word) {
        System.out.println(Arrays.toString(word));

        if (word.length == 0) {
            return;
        }
        for (String w : word) {
            if (!stringIds.containsKey(w)) {
                System.out.println("Unknown: " + w);
                return;
            }
            if (dToSMap[stringIds.getInt(w)] == null) {
                System.out.println("No vector for: " + w);
                return;
            }
        }

        int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray();

        long start = System.currentTimeMillis();
        findAdjacentDtoS(new IntOpenHashSet(domainIds), this::printSimilarities);
        System.out.println(System.currentTimeMillis() - start);
    }

    /** Prints one line per hit: string, cardinality of its vector, similarity as a percentage. */
    private void printSimilarities(Similarities similarities) {
        Set<Integer> ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet());

        // There is no id -> string index, so resolve the hits with one pass over the map
        Map<Integer, String> reverseIds = new HashMap<>(similarities.similarities().size());
        stringIds.forEach((str, id) -> {
            if (ids.contains(id)) {
                reverseIds.put(id, str);
            }
        });

        for (var similarity : similarities.similarities()) {
            System.out.println(reverseIds.get(similarity.id) + "\t" + dToSMap[similarity.id].getCardinality() + "\t" + prettyPercent(similarity.value));
        }
    }

    private String prettyPercent(double val) {
        return String.format("%2.2f%%", 100. * val);
    }

    /** Creates a bitmap that initially contains only {@code val}. */
    public RoaringBitmap createBitmapWithSelf(int val) {
        var bm = new RoaringBitmap();
        bm.add(val);
        return bm;
    }

    /** Unweighted cosine similarity: |a AND b| / (sqrt(|a|) * sqrt(|b|)). */
    double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
        double result = andCardinality(a, b);
        result /= Math.sqrt(a.getCardinality());
        result /= Math.sqrt(b.getCardinality());
        return result;
    }

    /** Weighted variant of {@link #cosineSimilarity}; requires {@link #weights} to be populated. */
    double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
        return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
    }

    /** Weight for id {@code i}: 1/ln(2 + cardinality of its vector), or 1 when it has no vector. */
    float getWeight(int i) {
        var vector = dToSMap[i];

        if (vector == null) {
            return 1.0f;
        }
        return 1.0f / (float) Math.log(2 + vector.getCardinality());
    }

    record Similarities(int id, List<Similarity> similarities) {}

    record Similarity(int id, double value) {}

    /**
     * Intersects the vectors of {@code ids} and scores every other vector against
     * the result, in parallel.
     *
     * <p>Does nothing when {@code ids} is empty or when any of the ids has no vector.
     *
     * @param ids     the query ids
     * @param andThen receives the hits sorted by ascending similarity; the
     *                {@code id} component of the result is always 0
     */
    private void findAdjacentDtoS(IntSet ids, Consumer<Similarities> andThen) {
        AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new);

        // An empty query would make the reduction below throw
        if (vectors.length == 0) {
            return;
        }
        for (var vector : vectors) {
            if (null == vector) {
                return;
            }
        }

        var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow();

        List<Similarity> similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj(
                        id -> vectorSimilarity(ids, vector, id))
                .filter(Objects::nonNull)
                .sorted(Comparator.comparing(Similarity::value))
                .toList();

        andThen.accept(new Similarities(0, similarities));
    }

    /**
     * Scores the vector of {@code id} against {@code vector}.
     *
     * @return the similarity, or {@code null} when {@code id} is one of the query
     *         ids, has no vector, or does not exceed {@link #cardinalityLimit}
     */
    private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) {
        if (ids.contains(id) || id >= dToSMap.length) {
            return null;
        }

        var otherVec = dToSMap[id];
        if (otherVec == null) {
            return null;
        }

        final double cutoff = cardinalityLimit;

        /* Cheap pre-filter. Since |a AND b| <= |b|,
         *
         *    |a AND b|             |b|
         *  -------------- <= sqrt( --- )
         *  sqrt(|a| |b|)           |a|
         *
         * so b can only exceed the cut-off k when |b| >= k^2 |a|. This is a lower
         * bound on |b|, hence max() and not min(): min(2, ...) capped the bound
         * at 2 and let every candidate through to the intersection.
         */
        final double cardMin = Math.max(2, cutoff * cutoff * vector.getCardinality());
        if (otherVec.getCardinality() < cardMin) {
            return null;
        }

        double similarity = cosineSimilarity(vector, otherVec);
        // Written as a negated '>' so that NaN (empty intersection vector) is rejected too
        if (!(similarity > cutoff)) {
            return null;
        }

        if (!useWeights) {
            return new Similarity(id, similarity);
        }

        // Only pay for the weighted calculation once the cheap one has passed
        double recalculated = expensiveCosineSimilarity(vector, otherVec);
        if (recalculated > cutoff) {
            return new Similarity(id, recalculated);
        }
        return null;
    }

    /**
     * Loads the data file named by the first argument and answers queries typed
     * at the "Words> " prompt until an empty line or end of input.
     *
     * @param args {@code args[0]} is the path of the tab-separated data file
     * @throws IOException if the data file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: EdgeWordWordConsineSimilarityMain <data-file>");
            System.exit(1);
            return;
        }

        // System.console() is null when stdin/stdout is not an interactive terminal;
        // check before the expensive load instead of failing with an NPE after it
        var console = System.console();
        if (console == null) {
            System.err.println("No interactive console available");
            System.exit(1);
            return;
        }

        var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0]));

        for (;;) {
            String line = console.readLine("Words> ");
            if (line == null || line.isBlank()) {
                break;
            }

            // Strip first: split() keeps a leading empty token when the line starts with whitespace
            main.tryDomains(line.strip().split("\\s+"));
        }
    }
}

View File

@ -29,7 +29,7 @@ public class LoadTestMain {
for (int i = 0; i < 10000; i++) { for (int i = 0; i < 10000; i++) {
String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted( String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
Strings.join(pickNCommonWords(4), '+') Strings.join(pickNCommonWords(3), '+')
); );
HttpRequest req = HttpRequest.newBuilder(new URI(uri)) HttpRequest req = HttpRequest.newBuilder(new URI(uri))

View File

@ -50,9 +50,8 @@ public class ScreenshotCaptureToolMain {
@NotNull @NotNull
private static ChromeDriver initChromeDriver() { private static ChromeDriver initChromeDriver() {
System.setProperty("webdriver.chrome.driver", "/chromedriver"); System.setProperty("webdriver.chrome.driver", "./chromedriver");
ChromeOptions options = new ChromeOptions(); ChromeOptions options = new ChromeOptions();
options.setBinary("/usr/bin/chromium-browser");
options.setPageLoadStrategy(PageLoadStrategy.NORMAL); options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
options.setPageLoadTimeout(Duration.ofSeconds(30)); options.setPageLoadTimeout(Duration.ofSeconds(30));
@ -66,7 +65,8 @@ public class ScreenshotCaptureToolMain {
"high-dpi-support=0.5", "high-dpi-support=0.5",
"disable-gpu", "disable-gpu",
"disable-dev-shm-usage", "disable-dev-shm-usage",
"disable-software-rasterizer"); "disable-software-rasterizer"
);
return new ChromeDriver(options); return new ChromeDriver(options);
} }

View File

@ -0,0 +1,56 @@
// Build script for the website-adjacencies-calculator tool.
plugins {
id 'java'
id "io.freefair.lombok" version "5.3.3.3"
id 'application'
id 'jvm-test-suite'
}
// Main class and application name used by the 'application' plugin.
application {
mainClass = 'nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator'
applicationName = 'website-adjacencies-calculator'
}
// Build with a Java 17 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
}
}
dependencies {
// Sibling projects from this repository
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:service')
// Third-party libraries, resolved through the 'libs' version catalog
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.roaringbitmap
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
// NOTE(review): no e2eTest source set or test suite is declared in this script;
// presumably the e2eTestImplementation configuration is created implicitly by
// this block -- confirm it is still needed.
configurations {
e2eTestImplementation.extendsFrom(testImplementation)
}
// Run tests on the JUnit Platform with up to half the available processors as
// parallel forks; '?: 1' falls back to a single fork when the division yields 0.
test {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform()
}
// Same settings as 'test', but skips tests tagged "slow".
task fastTests(type: Test) {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,8 @@
# Website Adjacencies Calculator
This job updates the website similarity table based on the data in the domain and link tables of the URL database.
It performs a brute force cosine similarity calculation across the entire link graph.
These adjacencies power the [explorer service](../../services-satellite/explorer-service) and the
[random websites](../../features-search/random-websites) functionality.

View File

@ -1,4 +1,4 @@
package nu.marginalia.browse.experimental; package nu.marginalia.adjacencies;
import com.google.common.hash.HashFunction; import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;

View File

@ -1,4 +1,4 @@
package nu.marginalia.browse.experimental; package nu.marginalia.adjacencies;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntIntHashMap;
@ -18,9 +18,9 @@ import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.function.Consumer; import java.util.function.Consumer;
import static nu.marginalia.browse.experimental.AndCardIntSet.*; import static nu.marginalia.adjacencies.AndCardIntSet.*;
public class EdgeDomainLinkConsineSimilarityMain { public class WebsiteAdjacenciesCalculator {
ArrayList<Integer> idsList = new ArrayList<>(100_000); ArrayList<Integer> idsList = new ArrayList<>(100_000);
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000); ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000); TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
@ -31,7 +31,7 @@ public class EdgeDomainLinkConsineSimilarityMain {
private HikariDataSource dataSource; private HikariDataSource dataSource;
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException { public WebsiteAdjacenciesCalculator(HikariDataSource dataSource) throws SQLException {
this.dataSource = dataSource; this.dataSource = dataSource;
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000); Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
@ -140,14 +140,23 @@ public class EdgeDomainLinkConsineSimilarityMain {
public void insertThreadRun() { public void insertThreadRun() {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var s = conn.createStatement();
var stmt = conn.prepareStatement( var stmt = conn.prepareStatement(
""" """
INSERT INTO EC_DOMAIN_NEIGHBORS_2 INSERT INTO EC_DOMAIN_NEIGHBORS_TMP
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS) (DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
VALUES (?, ?, ?) VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS)) ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_TMP.RELATEDNESS, VALUES(RELATEDNESS))
""") """)
) { ) {
s.execute("""
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_TMP
""");
s.execute("""
CREATE TABLE EC_DOMAIN_NEIGHBORS_TMP LIKE EC_DOMAIN_NEIGHBORS_2
""");
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) { while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS); var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
if (item == null) continue; if (item == null) continue;
@ -160,6 +169,14 @@ public class EdgeDomainLinkConsineSimilarityMain {
} }
stmt.executeBatch(); stmt.executeBatch();
} }
s.execute("""
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_2
""");
s.execute("""
RENAME TABLE EC_DOMAIN_NEIGHBORS_TMP TO EC_DOMAIN_NEIGHBORS_2
""");
} catch (SQLException | InterruptedException e) { } catch (SQLException | InterruptedException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
@ -285,7 +302,7 @@ public class EdgeDomainLinkConsineSimilarityMain {
public static void main(String[] args) throws SQLException { public static void main(String[] args) throws SQLException {
DatabaseModule dm = new DatabaseModule(); DatabaseModule dm = new DatabaseModule();
var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection()); var main = new WebsiteAdjacenciesCalculator(dm.provideConnection());
if (args.length == 0) { if (args.length == 0) {
main.loadAll(); main.loadAll();
} }

View File

@ -1,7 +1,5 @@
package nu.marginalia.experimental; package nu.marginalia.adjacencies;
import nu.marginalia.browse.experimental.AndCardIntSet;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;

View File

@ -63,6 +63,8 @@ include 'code:process-models:crawling-model'
include 'code:tools:term-frequency-extractor' include 'code:tools:term-frequency-extractor'
include 'code:tools:crawl-job-extractor' include 'code:tools:crawl-job-extractor'
include 'code:tools:experiment-runner' include 'code:tools:experiment-runner'
include 'code:tools:website-adjacencies-calculator'
include 'code:tools:screenshot-capture-tool'
include 'code:tools:load-test' include 'code:tools:load-test'
include 'third-party:porterstemmer' include 'third-party:porterstemmer'
@ -74,7 +76,6 @@ include 'third-party:openzim'
include 'third-party:count-min-sketch' include 'third-party:count-min-sketch'
include 'third-party:monkey-patch-opennlp' include 'third-party:monkey-patch-opennlp'
include 'tools:screenshot'
dependencyResolutionManagement { dependencyResolutionManagement {
@ -163,8 +164,8 @@ dependencyResolutionManagement {
library('junit.jupiter.engine','org.junit.jupiter','junit-jupiter-engine').version('') library('junit.jupiter.engine','org.junit.jupiter','junit-jupiter-engine').version('')
library('mockito','org.mockito','mockito-junit-jupiter').version('4.5.1') library('mockito','org.mockito','mockito-junit-jupiter').version('4.5.1')
library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.5.3') library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.8.2')
library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.5.3') library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.8.2')
library('handlebars','com.github.jknack','handlebars').version('4.3.1') library('handlebars','com.github.jknack','handlebars').version('4.3.1')
library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1') library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1')