mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
Refactor website screenshot tool and website adjacencies calculator into code/tools.
This commit is contained in:
parent
502713f7a8
commit
3e9b37c264
@ -1,245 +0,0 @@
|
||||
package nu.marginalia.browse.experimental;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.IntSet;
|
||||
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
|
||||
import lombok.SneakyThrows;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import static nu.marginalia.browse.experimental.AndCardIntSet.andCardinality;
|
||||
import static nu.marginalia.browse.experimental.AndCardIntSet.weightedProduct;
|
||||
|
||||
public class EdgeWordWordConsineSimilarityMain {
|
||||
final Object2IntOpenHashMap<String> stringIds;
|
||||
final AndCardIntSet[] dToSMap;
|
||||
final float[] weights;
|
||||
final boolean useWeights = false;
|
||||
|
||||
enum Direction {
|
||||
S_TO_D,
|
||||
D_TO_S
|
||||
}
|
||||
|
||||
final Direction direction = Direction.D_TO_S;
|
||||
|
||||
public EdgeWordWordConsineSimilarityMain(Path dataFile) throws IOException {
|
||||
System.out.println("String IDs");
|
||||
stringIds = mapStringsToIds(dataFile);
|
||||
|
||||
System.out.println("DtoS Map");
|
||||
dToSMap = constructDtoSMap(dataFile, stringIds);
|
||||
|
||||
System.out.println("Weights");
|
||||
|
||||
if (useWeights) {
|
||||
weights = new float[stringIds.size()];
|
||||
for (int i = 0; i < stringIds.size(); i++) {
|
||||
weights[i] = getWeight(i);
|
||||
}
|
||||
}
|
||||
else {
|
||||
weights = null;
|
||||
}
|
||||
|
||||
System.out.println("Ready");
|
||||
}
|
||||
|
||||
private Object2IntOpenHashMap<String> mapStringsToIds(Path dataFile) throws IOException {
|
||||
Object2IntOpenHashMap<String> stringIds = new Object2IntOpenHashMap<>(15_000_000);
|
||||
|
||||
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab <= 0)
|
||||
return;
|
||||
|
||||
// direction doesn't matter here
|
||||
String from = line.substring(0, tab);
|
||||
String to = line.substring(tab + 1);
|
||||
|
||||
stringIds.putIfAbsent(from, stringIds.size());
|
||||
stringIds.putIfAbsent(to, stringIds.size());
|
||||
});
|
||||
}
|
||||
return stringIds;
|
||||
}
|
||||
|
||||
private AndCardIntSet[] constructDtoSMap(Path dataFile, Object2IntOpenHashMap<String> stringIds) throws IOException {
|
||||
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(15_000_000);
|
||||
|
||||
try (var lines = Files.lines(dataFile, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
int tab = line.indexOf('\t');
|
||||
if (tab <= 0) return;
|
||||
|
||||
String from, to;
|
||||
if (direction == Direction.S_TO_D) {
|
||||
from = line.substring(0, tab);
|
||||
to = line.substring(tab + 1);
|
||||
}
|
||||
else {
|
||||
from = line.substring(tab + 1);
|
||||
to = line.substring(0, tab);
|
||||
}
|
||||
|
||||
tmpMap.computeIfAbsent(stringIds.getInt(to), this::createBitmapWithSelf).add(stringIds.getInt(from));
|
||||
});
|
||||
}
|
||||
|
||||
AndCardIntSet[] dToSMap = new AndCardIntSet[stringIds.size()];
|
||||
tmpMap.entrySet().stream()
|
||||
.filter(e -> isEligible(e.getValue()))
|
||||
.forEach(e -> dToSMap[e.getKey()] = AndCardIntSet.of(e.getValue()));
|
||||
|
||||
return dToSMap;
|
||||
}
|
||||
|
||||
private boolean isEligible(RoaringBitmap value) {
|
||||
int cardinality = value.getCardinality();
|
||||
|
||||
return cardinality > 50;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void tryDomains(String... word) {
|
||||
|
||||
System.out.println(Arrays.toString(word));
|
||||
|
||||
int[] domainIds = Arrays.stream(word).mapToInt(stringIds::getInt).toArray();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
findAdjacentDtoS(new IntOpenHashSet(domainIds), similarities -> {
|
||||
Set<Integer> ids = similarities.similarities().stream().map(Similarity::id).collect(Collectors.toSet());
|
||||
|
||||
Map<Integer, String> reveseIds = new HashMap<>(similarities.similarities.size());
|
||||
|
||||
stringIds.forEach((str, id) -> {
|
||||
if (ids.contains(id)) {
|
||||
reveseIds.put(id, str);
|
||||
}
|
||||
});
|
||||
|
||||
for (var similarity : similarities.similarities()) {
|
||||
System.out.println(reveseIds.get(similarity.id) + "\t" + dToSMap[similarity.id].getCardinality() + "\t" + prettyPercent(similarity.value));
|
||||
}
|
||||
});
|
||||
|
||||
System.out.println(System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
private String prettyPercent(double val) {
|
||||
return String.format("%2.2f%%", 100. * val);
|
||||
}
|
||||
|
||||
|
||||
public RoaringBitmap createBitmapWithSelf(int val) {
|
||||
var bm = new RoaringBitmap();
|
||||
bm.add(val);
|
||||
return bm;
|
||||
}
|
||||
|
||||
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||
double andCardinality = andCardinality(a, b);
|
||||
andCardinality /= Math.sqrt(a.getCardinality());
|
||||
andCardinality /= Math.sqrt(b.getCardinality());
|
||||
return andCardinality;
|
||||
}
|
||||
|
||||
double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
|
||||
}
|
||||
|
||||
float getWeight(int i) {
|
||||
var vector = dToSMap[i];
|
||||
|
||||
if (vector == null) return 1.0f;
|
||||
return 1.0f / (float) Math.log(2+vector.getCardinality());
|
||||
}
|
||||
|
||||
/** Result of one similarity query: an id together with the list of matches found. */
record Similarities(int id, List<Similarity> similarities) {}
|
||||
/** A single match: the id of the matching entry and its similarity value. */
record Similarity(int id, double value) {}
|
||||
|
||||
@SneakyThrows
|
||||
private void findAdjacentDtoS(IntSet ids, Consumer<Similarities> andThen) {
|
||||
|
||||
|
||||
AndCardIntSet[] vectors = ids.intStream().mapToObj(id -> dToSMap[id]).toArray(AndCardIntSet[]::new);
|
||||
for (var vector : vectors) {
|
||||
if (null == vector)
|
||||
return;
|
||||
}
|
||||
|
||||
var vector = Arrays.stream(vectors).reduce(AndCardIntSet::and).orElseThrow();
|
||||
|
||||
List<Similarity> similarities = IntStream.range(0, dToSMap.length).parallel().mapToObj(
|
||||
id -> vectorSimilarity(ids, vector, id))
|
||||
.filter(Objects::nonNull)
|
||||
.sorted(Comparator.comparing(Similarity::value))
|
||||
.toList();
|
||||
|
||||
|
||||
andThen.accept(new Similarities(0, similarities));
|
||||
}
|
||||
|
||||
double cardinalityLimit = 0.1;
|
||||
|
||||
private Similarity vectorSimilarity(IntSet ids, AndCardIntSet vector, int id) {
|
||||
|
||||
/* The minimum cardinality a vector can have so that
|
||||
*
|
||||
* a (x) b
|
||||
* ------- < k is given by k^2
|
||||
* |a||b|
|
||||
*
|
||||
*/
|
||||
|
||||
final double cardMin = Math.min(2, cardinalityLimit * cardinalityLimit * vector.getCardinality());
|
||||
|
||||
if (ids.contains(id) || id >= dToSMap.length)
|
||||
return null;
|
||||
|
||||
var otherVec = dToSMap[id];
|
||||
if (otherVec == null || otherVec.getCardinality() < cardMin)
|
||||
return null;
|
||||
|
||||
double similarity = cosineSimilarity(vector, otherVec);
|
||||
|
||||
if (similarity > 0.1) {
|
||||
if (useWeights) {
|
||||
var recalculated = expensiveCosineSimilarity(vector, otherVec);
|
||||
if (recalculated > 0.1) {
|
||||
return new Similarity(id, recalculated);
|
||||
}
|
||||
}
|
||||
else {
|
||||
return new Similarity(id, similarity);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
var main = new EdgeWordWordConsineSimilarityMain(Path.of(args[0]));
|
||||
|
||||
for (;;) {
|
||||
String line = System.console().readLine("Words> ");
|
||||
if (line == null || line.isBlank()) {
|
||||
break;
|
||||
}
|
||||
|
||||
main.tryDomains(line.split("\\s+"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -29,7 +29,7 @@ public class LoadTestMain {
|
||||
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
|
||||
Strings.join(pickNCommonWords(4), '+')
|
||||
Strings.join(pickNCommonWords(3), '+')
|
||||
);
|
||||
|
||||
HttpRequest req = HttpRequest.newBuilder(new URI(uri))
|
||||
|
@ -50,9 +50,8 @@ public class ScreenshotCaptureToolMain {
|
||||
|
||||
@NotNull
|
||||
private static ChromeDriver initChromeDriver() {
|
||||
System.setProperty("webdriver.chrome.driver", "/chromedriver");
|
||||
System.setProperty("webdriver.chrome.driver", "./chromedriver");
|
||||
ChromeOptions options = new ChromeOptions();
|
||||
options.setBinary("/usr/bin/chromium-browser");
|
||||
|
||||
options.setPageLoadStrategy(PageLoadStrategy.NORMAL);
|
||||
options.setPageLoadTimeout(Duration.ofSeconds(30));
|
||||
@ -66,7 +65,8 @@ public class ScreenshotCaptureToolMain {
|
||||
"high-dpi-support=0.5",
|
||||
"disable-gpu",
|
||||
"disable-dev-shm-usage",
|
||||
"disable-software-rasterizer");
|
||||
"disable-software-rasterizer"
|
||||
);
|
||||
|
||||
return new ChromeDriver(options);
|
||||
}
|
56
code/tools/website-adjacencies-calculator/build.gradle
Normal file
56
code/tools/website-adjacencies-calculator/build.gradle
Normal file
@ -0,0 +1,56 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id "io.freefair.lombok" version "5.3.3.3"
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator'
|
||||
applicationName = 'website-adjacencies-calculator'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(17))
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.guice
|
||||
implementation libs.roaringbitmap
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
configurations {
|
||||
e2eTestImplementation.extendsFrom(testImplementation)
|
||||
|
||||
}
|
||||
|
||||
test {
|
||||
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
|
||||
maxHeapSize = "8G"
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
||||
task fastTests(type: Test) {
|
||||
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
|
||||
maxHeapSize = "8G"
|
||||
useJUnitPlatform {
|
||||
excludeTags "slow"
|
||||
}
|
||||
}
|
8
code/tools/website-adjacencies-calculator/readme.md
Normal file
8
code/tools/website-adjacencies-calculator/readme.md
Normal file
@ -0,0 +1,8 @@
|
||||
# Website Adjacencies Calculator
|
||||
|
||||
This job updates the website similarity table based on the data in the domain and link tables of the URL database.
|
||||
|
||||
It performs a brute force cosine similarity calculation across the entire link graph.
|
||||
|
||||
These adjacencies power the [explorer service](../../services-satellite/explorer-service) and
|
||||
[random websites](../../features-search/random-websites) functionality.
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.browse.experimental;
|
||||
package nu.marginalia.adjacencies;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.browse.experimental;
|
||||
package nu.marginalia.adjacencies;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
@ -18,9 +18,9 @@ import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static nu.marginalia.browse.experimental.AndCardIntSet.*;
|
||||
import static nu.marginalia.adjacencies.AndCardIntSet.*;
|
||||
|
||||
public class EdgeDomainLinkConsineSimilarityMain {
|
||||
public class WebsiteAdjacenciesCalculator {
|
||||
ArrayList<Integer> idsList = new ArrayList<>(100_000);
|
||||
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
|
||||
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
|
||||
@ -31,7 +31,7 @@ public class EdgeDomainLinkConsineSimilarityMain {
|
||||
|
||||
private HikariDataSource dataSource;
|
||||
|
||||
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException {
|
||||
public WebsiteAdjacenciesCalculator(HikariDataSource dataSource) throws SQLException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
|
||||
@ -140,14 +140,23 @@ public class EdgeDomainLinkConsineSimilarityMain {
|
||||
|
||||
public void insertThreadRun() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var s = conn.createStatement();
|
||||
var stmt = conn.prepareStatement(
|
||||
"""
|
||||
INSERT INTO EC_DOMAIN_NEIGHBORS_2
|
||||
INSERT INTO EC_DOMAIN_NEIGHBORS_TMP
|
||||
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
|
||||
VALUES (?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS))
|
||||
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_TMP.RELATEDNESS, VALUES(RELATEDNESS))
|
||||
""")
|
||||
) {
|
||||
|
||||
s.execute("""
|
||||
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_TMP
|
||||
""");
|
||||
s.execute("""
|
||||
CREATE TABLE EC_DOMAIN_NEIGHBORS_TMP LIKE EC_DOMAIN_NEIGHBORS_2
|
||||
""");
|
||||
|
||||
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
|
||||
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
|
||||
if (item == null) continue;
|
||||
@ -160,6 +169,14 @@ public class EdgeDomainLinkConsineSimilarityMain {
|
||||
}
|
||||
stmt.executeBatch();
|
||||
}
|
||||
|
||||
s.execute("""
|
||||
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS_2
|
||||
""");
|
||||
s.execute("""
|
||||
RENAME TABLE EC_DOMAIN_NEIGHBORS_TMP TO EC_DOMAIN_NEIGHBORS_2
|
||||
""");
|
||||
|
||||
} catch (SQLException | InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@ -285,7 +302,7 @@ public class EdgeDomainLinkConsineSimilarityMain {
|
||||
public static void main(String[] args) throws SQLException {
|
||||
DatabaseModule dm = new DatabaseModule();
|
||||
|
||||
var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection());
|
||||
var main = new WebsiteAdjacenciesCalculator(dm.provideConnection());
|
||||
if (args.length == 0) {
|
||||
main.loadAll();
|
||||
}
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.experimental;
|
||||
package nu.marginalia.adjacencies;
|
||||
|
||||
import nu.marginalia.browse.experimental.AndCardIntSet;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
@ -63,6 +63,8 @@ include 'code:process-models:crawling-model'
|
||||
include 'code:tools:term-frequency-extractor'
|
||||
include 'code:tools:crawl-job-extractor'
|
||||
include 'code:tools:experiment-runner'
|
||||
include 'code:tools:website-adjacencies-calculator'
|
||||
include 'code:tools:screenshot-capture-tool'
|
||||
include 'code:tools:load-test'
|
||||
|
||||
include 'third-party:porterstemmer'
|
||||
@ -74,7 +76,6 @@ include 'third-party:openzim'
|
||||
include 'third-party:count-min-sketch'
|
||||
include 'third-party:monkey-patch-opennlp'
|
||||
|
||||
include 'tools:screenshot'
|
||||
|
||||
dependencyResolutionManagement {
|
||||
|
||||
@ -163,8 +164,8 @@ dependencyResolutionManagement {
|
||||
library('junit.jupiter.engine','org.junit.jupiter','junit-jupiter-engine').version('')
|
||||
library('mockito','org.mockito','mockito-junit-jupiter').version('4.5.1')
|
||||
|
||||
library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.5.3')
|
||||
library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.5.3')
|
||||
library('selenium.chrome','org.seleniumhq.selenium','selenium-chrome-driver').version('4.8.2')
|
||||
library('selenium.java','org.seleniumhq.selenium','selenium-java').version('4.8.2')
|
||||
|
||||
library('handlebars','com.github.jknack','handlebars').version('4.3.1')
|
||||
library('handlebars.markdown','com.github.jknack','handlebars-markdown').version('4.2.1')
|
||||
|
Loading…
Reference in New Issue
Block a user