Merge pull request 'master' (#35) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/35
Viktor Lofgren 2022-07-22 00:34:56 +02:00
commit 8c6a8fb7aa
189 changed files with 4476 additions and 4322 deletions

View File

@ -59,12 +59,12 @@ dependencies {
implementation "com.sparkjava:spark-core:2.9.3"
implementation 'com.opencsv:opencsv:5.6'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
implementation 'org.slf4j:slf4j-api:1.7.36'
@ -76,7 +76,6 @@ dependencies {
implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1'
implementation 'org.jsoup:jsoup:1.14.3'
implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
@ -86,7 +85,7 @@ dependencies {
implementation 'com.zaxxer:HikariCP:5.0.1'
implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
@ -123,15 +122,19 @@ dependencies {
testImplementation 'org.projectlombok:lombok:1.18.24'
testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
testImplementation 'org.testcontainers:mariadb:1.17.2'
testImplementation "org.testcontainers:junit-jupiter:1.17.2"
e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
e2eTestImplementation "org.testcontainers:selenium:1.17.1"
e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
e2eTestImplementation "org.testcontainers:selenium:1.17.2"
e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
}

View File

@ -12,7 +12,10 @@ import org.openqa.selenium.chrome.ChromeOptions;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.BrowserWebDriverContainer;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.NginxContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
@ -28,6 +31,7 @@ import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("e2e")
@Testcontainers
@ -40,8 +44,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
@Container
public static GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT, mariaDB);
@Container
public static GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
@Container
public static GenericContainer<?> indexContainer = forService(EDGE_INDEX, mariaDB);
@Container
@ -156,6 +158,16 @@ public class EdgeSearchE2ETest extends E2ETestBase {
return wikipediaFiles.toString();
}
private List<String> getTitlesFromSearchResults(String html) {
List<String> ret = new ArrayList<>();
for (var title : Jsoup.parse(html).select(".card.search-result > h2")) {
ret.add(title.text());
}
return ret;
}
@Test
public void testFrontPage() throws IOException {
var driver = chrome.getWebDriver();
@ -173,8 +185,9 @@ public class EdgeSearchE2ETest extends E2ETestBase {
driver.get("http://proxyNginx/search?query=bird&profile=corpo");
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
}
@ -187,20 +200,24 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info"));
}
@Test
public void testSiteSearch() throws IOException {
var driver = chrome.getWebDriver();
driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog");
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
}
@Test
public void testBrowse() throws IOException {
var driver = chrome.getWebDriver();
@ -209,7 +226,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
}
@Test
@ -220,7 +236,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
}
@Test

View File

@ -10,7 +10,7 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.mariadb.jdbc.Driver;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
@ -23,16 +23,16 @@ import org.testcontainers.utility.MountableFile;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.concurrent.TimeUnit;
import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@Tag("e2e")
@Testcontainers
@ -80,12 +80,23 @@ public class EncyclopediaE2ETest extends E2ETestBase {
return Path.of(System.getProperty("user.dir")).resolve("data/test");
}
private static Path screenshotFilename(String operation) throws IOException {
var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/");
Files.createDirectories(path);
String name = String.format("test-encyclopedia-%s-%s.png", operation, LocalDateTime.now());
path = path.resolve(name);
System.out.println("Screenshot in " + path);
return path;
}
@Test
public void run() throws MalformedURLException {
public void run() throws IOException {
new Driver();
try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa");
var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) {
var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_ARTICLE(NAME,REF_NAME) VALUES (?,?)")) {
stmt.setString(1, "Forg");
stmt.setString(2, "Frog");
@ -102,24 +113,16 @@ public class EncyclopediaE2ETest extends E2ETestBase {
var driver = chrome.getWebDriver();
driver.get("http://proxyNginx/wiki/Frog");
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article"));
driver.get("http://proxyNginx/wiki/Forg");
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article-redir"));
System.out.println(driver.getTitle());
driver.get("http://proxyNginx/wiki-search?query=Forg");
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("disambig"));
System.out.println(driver.getTitle());
assertTrue(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Frog", Boolean.class));
assertFalse(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Marginalia", Boolean.class));
assertFalse(get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/wiki/has?url=Marginalia", Boolean.class));
var resultsForMarginalia = get(encyclopediaContainer.getHost(),
encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
"/encyclopedia/Marginalia", WikiArticles.class);

View File

@ -70,4 +70,4 @@ dating dating
EOF
echo "*** Starting $1"
WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

View File

@ -0,0 +1,37 @@
package nu.marginalia.util;
import java.nio.ByteBuffer;
public class DenseBitMap {
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
public final long cardinality;
private final ByteBuffer buffer;
public DenseBitMap(long cardinality) {
this.cardinality = cardinality;
boolean misaligned = (cardinality & 7) > 0;
this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
}
public boolean get(long pos) {
return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
}
/** Set the bit indexed by pos, returning
 * its previous value.
 */
public boolean set(long pos) {
int offset = (int) (pos >>> 3);
int oldVal = buffer.get(offset);
int mask = (byte) 1 << (int) (pos & 7);
buffer.put(offset, (byte) (oldVal | mask));
return (oldVal & mask) != 0;
}
public void clear(long pos) {
int offset = (int)(pos >>> 3);
buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
}
}
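A minimal usage sketch of the class above (hypothetical values): the backing direct buffer is zero-filled, so all bits start cleared, and set() reports the bit's previous state.

DenseBitMap visited = new DenseBitMap(10_000);
boolean wasSet = visited.set(1_234);   // false: the bit was not previously set
boolean isSet = visited.get(1_234);    // true
visited.clear(1_234);                  // cleared again; get(1_234) is false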

View File

@ -0,0 +1,31 @@
package nu.marginalia.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class ListChunker {
/** Chops data into a list of lists of max length size
*
* Caveat: Relies on subList and does not clone "data", so
* changes to the original list may affect the sub-lists
* in unspecified ways
*
* @see List#subList
*/
public static <T> List<List<T>> chopList(List<T> data, int size) {
if (data.isEmpty())
return Collections.emptyList();
else if (data.size() < size)
return List.of(data);
final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
for (int i = 0; i < data.size(); i+=size) {
ret.add(data.subList(i, Math.min(data.size(), i+size)));
}
return ret;
}
}
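A minimal usage sketch; as the caveat above notes, the chunks are subList views of the original list rather than copies.

List<List<Integer>> chunks = ListChunker.chopList(List.of(1, 2, 3, 4, 5), 2);
// chunks -> [[1, 2], [3, 4], [5]]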

View File

@ -1,6 +1,6 @@
package nu.marginalia.util;
import io.prometheus.client.Gauge;
import lombok.SneakyThrows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -18,10 +18,6 @@ import java.nio.file.Path;
* */
public class RandomWriteFunnel implements AutoCloseable {
private final static Gauge write_rate = Gauge.build("wmsa_rwf_write_bytes", "Bytes/s")
.register();
private final static Gauge transfer_rate = Gauge.build("wmsa_rwf_transfer_bytes", "Bytes/s")
.register();
private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
private final DataBin[] bins;
@ -34,7 +30,7 @@ public class RandomWriteFunnel implements AutoCloseable {
int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
bins = new DataBin[binCount];
for (int i = 0; i < binCount; i++) {
bins[i] = new DataBin(tempDir, (int) Math.min(size - binSize * i, binSize));
bins[i] = new DataBin(tempDir, Math.min((int) (size - binSize * i), binSize));
}
}
else {
@ -42,25 +38,25 @@ public class RandomWriteFunnel implements AutoCloseable {
}
}
public void put(long address, long data) throws IOException {
bins[((int)(address / binSize))].put((int)(address%binSize), data);
@SneakyThrows
public void put(long address, long data) {
int bin = (int)(address / binSize);
int offset = (int)(address%binSize);
bins[bin].put(offset, data);
}
public void write(FileChannel o) throws IOException {
ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);
logger.debug("Writing from RWF");
for (int i = 0; i < bins.length; i++) {
var bin = bins[i];
for (var bin : bins) {
buffer.clear();
bin.eval(buffer);
while (buffer.hasRemaining()) {
int wb = o.write(buffer);
write_rate.set(wb);
o.write(buffer);
}
}
logger.debug("Done");
}
@Override
@ -84,12 +80,12 @@ public class RandomWriteFunnel implements AutoCloseable {
}
void put(int address, long data) throws IOException {
buffer.putInt(address);
buffer.putLong(data);
if (buffer.capacity() - buffer.position() < 12) {
if (buffer.remaining() < 12) {
flushBuffer();
}
buffer.putInt(address);
buffer.putLong(data);
}
private void flushBuffer() throws IOException {
@ -97,12 +93,15 @@ public class RandomWriteFunnel implements AutoCloseable {
return;
buffer.flip();
while (channel.write(buffer) > 0);
while (buffer.hasRemaining())
channel.write(buffer);
buffer.clear();
}
private void eval(ByteBuffer dest) throws IOException {
flushBuffer();
channel.force(false);
channel.position(0);
buffer.clear();
@ -117,14 +116,17 @@ public class RandomWriteFunnel implements AutoCloseable {
if (rb < 0) {
break;
}
else {
transfer_rate.set(rb);
}
buffer.flip();
while (buffer.limit() - buffer.position() >= 12) {
int addr = buffer.getInt();
int addr = 8 * buffer.getInt();
long data = buffer.getLong();
dest.putLong(8*addr, data);
try {
dest.putLong(addr, data);
}
catch (IndexOutOfBoundsException ex) {
logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
}
}
buffer.compact();
}
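A hedged usage sketch of the funnel as a whole: puts arrive in random address order, are buffered into on-disk bins, and write(FileChannel) replays them as largely sequential output. The constructor parameters (temp directory, size in longs, bin size) and the paths tempDir/outFile are assumptions for illustration, not confirmed by this diff.

try (var funnel = new RandomWriteFunnel(tempDir, 1L << 24, 1 << 16);   // assumed signature
     var out = FileChannel.open(outFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
    funnel.put(1_000_000L, 42L);   // addresses are long-indexed positions in the destination
    funnel.put(17L, 43L);
    funnel.write(out);             // drains every bin into the destination channel
}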

View File

@ -4,101 +4,80 @@ import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.lang.Math.min;
public class BTreeReader {
private final MultimapFileLong file;
private final BTreeContext ctx;
private final Logger logger = LoggerFactory.getLogger(BTreeReader.class);
private final long mask;
private final MultimapSearcher searcher;
private final MultimapSearcher indexSearcher;
private final MultimapSearcher dataSearcher;
public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
this.file = file;
this.searcher = file.createSearcher();
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
this.ctx = ctx;
this.mask = ctx.equalityMask();
}
public long fileSize() {
return file.size();
public BTreeHeader getHeader(long fileOffset) {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}
public BTreeHeader getHeader(long offset) {
return new BTreeHeader(file.get(offset), file.get(offset+1), file.get(offset+2));
}
/**
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(BTreeHeader header, final long keyRaw) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
public long offsetForEntry(BTreeHeader header, final long keyRaw) {
final long key = keyRaw & mask;
final long key = keyRaw & ctx.equalityMask();
final long dataAddress = header.dataOffsetLongs();
if (header.layers() == 0) {
return trivialSearch(header, key);
}
final long searchStart;
final long numEntries;
long p = searchEntireTopLayer(header, key);
if (p < 0) return -1;
long cumOffset = p * ctx.BLOCK_SIZE_WORDS();
for (int i = header.layers() - 2; i >= 0; --i) {
long offsetBase = header.indexOffsetLongs() + header.relativeLayerOffset(ctx, i);
p = searchLayerBlock(key, offsetBase+cumOffset);
if (p < 0)
return -1;
cumOffset = ctx.BLOCK_SIZE_WORDS()*(p + cumOffset);
}
long dataMax = header.dataOffsetLongs() + (long) header.numEntries() * ctx.entrySize();
return searchDataBlock(key,
header.dataOffsetLongs() + ctx.entrySize()*cumOffset,
dataMax);
}
private long searchEntireTopLayer(BTreeHeader header, long key) {
long offset = header.indexOffsetLongs();
return searcher.binarySearchUpperBound(key, offset, offset + ctx.BLOCK_SIZE_WORDS()) - offset;
}
private long searchLayerBlock(long key, long blockOffset) {
if (blockOffset < 0)
return blockOffset;
return searcher.binarySearchUpperBound(key, blockOffset, blockOffset + ctx.BLOCK_SIZE_WORDS()) - blockOffset;
}
private long searchDataBlock(long key, long blockOffset, long dataMax) {
if (blockOffset < 0)
return blockOffset;
long lastOffset = Math.min(blockOffset+ctx.BLOCK_SIZE_WORDS()*(long)ctx.entrySize(), dataMax);
int length = (int)(lastOffset - blockOffset);
if (ctx.entrySize() == 1) {
if (mask == ~0L) return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length);
return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length, mask);
}
return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, ctx.entrySize(), length/ctx.entrySize(), mask);
}
private long trivialSearch(BTreeHeader header, long key) {
long offset = header.dataOffsetLongs();
if (ctx.entrySize() == 1) {
if (mask == ~0L) {
return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries());
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
searchStart = dataAddress;
numEntries = header.numEntries();
}
else {
return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries(), mask);
}
long dataLayerOffset = searchIndex(header, key);
if (dataLayerOffset < 0) {
return dataLayerOffset;
}
return searcher.binarySearchUpperBoundNoMiss(key, offset, ctx.entrySize(), header.numEntries(), mask);
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
}
return dataSearcher.binarySearch(key, searchStart, numEntries);
}
private long searchIndex(BTreeHeader header, long key) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
final long indexAddress = header.indexOffsetLongs();
long layerOffset = 0;
for (int i = header.layers() - 1; i >= 0; --i) {
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
if (nextLayerOffset < 0)
return nextLayerOffset;
layerOffset = blockSize * (nextLayerOffset + layerOffset);
}
return layerOffset;
}
private long relativePositionInIndex(long key, long start, long n) {
return indexSearcher.binarySearchUpper(key, start, n) - start;
}
}
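A hedged sketch of reading a value back through the new entry point; it assumes a context where entrySize() == 2 and each entry is a key word followed by one payload word, which this diff does not confirm.

BTreeReader reader = new BTreeReader(file, ctx);
BTreeHeader header = reader.getHeader(treeOffset);     // treeOffset: where the tree was written
long entryOffset = reader.findEntry(header, key);      // negative if the key is absent
if (entryOffset >= 0) {
    long value = file.get(entryOffset + 1);            // payload after the key, under the entrySize() == 2 assumption
}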

View File

@ -2,19 +2,16 @@ package nu.marginalia.util.btree;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import java.io.IOException;
public class BTreeWriter {
private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
private final BTreeContext ctx;
private final MultimapFileLong map;
private final MultimapFileLongSlice map;
public BTreeWriter(MultimapFileLong map, BTreeContext ctx) {
public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
this.map = map;
this.ctx = ctx;
}
@ -26,30 +23,35 @@ public class BTreeWriter {
long size = 0;
for (int layer = 0; layer < numLayers; layer++) {
size += ctx.layerSize(numWords, layer);
size += ctx.indexLayerSize(numWords, layer);
}
return size;
}
public long write(long offset, int numEntries, WriteCallback writeIndex)
/** Construct a BTree with numEntries entries at offset in the associated map
*
* @return The size of the written data
*/
public long write(long offset, int numEntries, WriteCallback writeIndexCallback)
throws IOException
{
var header = makeHeader(offset, numEntries);
BTreeHeader header = makeHeader(offset, numEntries);
header.write(map, offset);
writeIndex.write(header.dataOffsetLongs());
if (header.layers() < 1) {
writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
if (header.layers() < 1) { // The data is too small to benefit from indexing
return ctx.calculateSize(numEntries);
}
else {
writeIndex(header);
return ctx.calculateSize(numEntries);
}
}
public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
final int numLayers = ctx.numLayers(numEntries);
final int numLayers = ctx.numIndexLayers(numEntries);
final int padding = BTreeHeader.getPadding(ctx, offset, numLayers);
@ -65,46 +67,50 @@ public class BTreeWriter {
private void writeIndex(BTreeHeader header) {
var layerOffsets = getRelativeLayerOffsets(header);
var layerOffsets = header.getRelativeLayerOffsets(ctx);
long stride = ctx.BLOCK_SIZE_WORDS();
long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS();
/* Index layer 0 indexes the data itself
Index layer 1 indexes layer 0
Index layer 2 indexes layer 1
And so on
*/
for (int layer = 0; layer < header.layers(); layer++,
stride*=ctx.BLOCK_SIZE_WORDS()) {
indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) {
writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer);
}
}
private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
final long indexedDataStepSize,
final int layer) {
final long indexOffsetBase = layerOffsets[layer] + header.indexOffsetLongs();
final long dataOffsetBase = header.dataOffsetLongs();
final long dataEntriesMax = header.numEntries();
final int entrySize = ctx.entrySize();
final long lastDataEntryOffset = indexedDataStepSize - 1;
long indexWord = 0;
long offsetBase = layerOffsets[layer] + header.indexOffsetLongs();
long numEntries = header.numEntries();
for (long idx = 0; idx < numEntries; idx += stride, indexWord++) {
long dataOffset = header.dataOffsetLongs() + (idx + (stride-1)) * ctx.entrySize();
long val;
if (idx + (stride-1) < numEntries) {
val = map.get(dataOffset) & ctx.equalityMask();
}
else {
val = Long.MAX_VALUE;
}
if (offsetBase + indexWord < 0) {
logger.error("bad put @ {}", offsetBase + indexWord);
logger.error("layer{}", layer);
logger.error("layer offsets {}", layerOffsets);
logger.error("offsetBase = {}", offsetBase);
logger.error("numEntries = {}", numEntries);
logger.error("indexWord = {}", indexWord);
}
map.put(offsetBase + indexWord, val);
}
for (; (indexWord % ctx.BLOCK_SIZE_WORDS()) != 0; indexWord++) {
map.put(offsetBase + indexWord, Long.MAX_VALUE);
}
for (long dataPtr = 0;
dataPtr + lastDataEntryOffset < dataEntriesMax;
dataPtr += indexedDataStepSize)
{
long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
}
// Fill the remaining block with LONG_MAX
map.setRange(indexOffsetBase+indexWord,
(int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
Long.MAX_VALUE);
}
private long[] getRelativeLayerOffsets(BTreeHeader header) {
long[] layerOffsets = new long[header.layers()];
for (int i = 0; i < header.layers(); i++) {
layerOffsets[i] = header.relativeLayerOffset(ctx, i);
}
return layerOffsets;
}
}
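A hedged usage sketch of write(): the callback is handed a MultimapFileLongSlice positioned at the data block and is expected to fill it with numEntries entries in sorted key order (entrySize() == 1 assumed here).

BTreeWriter writer = new BTreeWriter(map, ctx);
long sizeLongs = writer.write(treeOffset, sortedKeys.length, slice -> {
    for (int i = 0; i < sortedKeys.length; i++) {
        slice.put(i, sortedKeys[i]);   // keys must already be sorted for the index and binary search to work
    }
});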

View File

@ -1,7 +1,9 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import java.io.IOException;
public interface WriteCallback {
void write(long offset) throws IOException;
void write(MultimapFileLongSlice slice) throws IOException;
}

View File

@ -10,7 +10,6 @@ public record BTreeContext(int MAX_LAYERS,
public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
}
public long calculateSize(int numEntries) {
@ -19,7 +18,7 @@ public record BTreeContext(int MAX_LAYERS,
return header.dataOffsetLongs() + (long)numEntries * entrySize;
}
public int numLayers(int numEntries) {
public int numIndexLayers(int numEntries) {
if (numEntries <= BLOCK_SIZE_WORDS*2) {
return 0;
}
@ -36,21 +35,14 @@ public record BTreeContext(int MAX_LAYERS,
return MAX_LAYERS;
}
public long layerSize(int numEntries, int level) {
return BLOCK_SIZE_WORDS * numBlocks(numEntries, level);
}
public long indexLayerSize(int numWords, int level) {
final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
final long numBlocks = numWords / layerSize;
private long numBlocks(int numWords, int level) {
long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
int numBlocks = 0;
numBlocks += numWords / layerSize;
if (numWords % layerSize != 0) {
numBlocks++;
return BLOCK_SIZE_WORDS * (numBlocks + 1);
}
return numBlocks;
return BLOCK_SIZE_WORDS * numBlocks;
}
}
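A worked example of the sizing above, assuming BLOCK_SIZE_BITS = 10 (BLOCK_SIZE_WORDS = 1024): with numWords = 1,000,000, level 0 needs ceil(1,000,000 / 1024) = 977 blocks, so indexLayerSize = 977 * 1024 = 1,000,448 words; level 1 needs ceil(1,000,000 / 1,048,576) = 1 block of 1024 words. Any tree with at most 2 * 1024 = 2048 entries gets numIndexLayers = 0 and is searched as a flat data block.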

View File

@ -1,6 +1,6 @@
package nu.marginalia.util.btree.model;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
public BTreeHeader {
@ -28,19 +28,27 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
return padding;
}
public void write(MultimapFileLong dest, long offset) {
public void write(MultimapFileLongSlice dest, long offset) {
dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
dest.put(offset+1, indexOffsetLongs);
dest.put(offset+2, dataOffsetLongs);
}
public long relativeLayerOffset(BTreeContext ctx, int n) {
public long relativeIndexLayerOffset(BTreeContext ctx, int n) {
long offset = 0;
for (int i = n+1; i < layers; i++) {
offset += ctx.layerSize( numEntries, i);
offset += ctx.indexLayerSize( numEntries, i);
}
return offset;
}
public long[] getRelativeLayerOffsets(BTreeContext ctx) {
long[] layerOffsets = new long[layers()];
for (int i = 0; i < layers(); i++) {
layerOffsets[i] = relativeIndexLayerOffset(ctx, i);
}
return layerOffsets;
}
}

View File

@ -5,7 +5,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.nio.LongBuffer;
public class DictionaryData {
@ -17,22 +17,22 @@ public class DictionaryData {
public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
banks.add(new DictionaryDataBank(0));
banks.add(new DictionaryDataBank(0, bankSize));
}
public int size() {
return banks.end();
}
public int add(byte[] data, int value) {
public int add(long key) {
var activeBank = banks.last();
int rb = activeBank.add(data, value);
int rb = activeBank.add(key);
if (rb == -1) {
int end = activeBank.getEnd();
logger.debug("Switching bank @ {}", end);
var newBank = new DictionaryDataBank(end);
rb = newBank.add(data, value);
var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
rb = newBank.add(key);
banks.add(newBank);
}
@ -41,33 +41,32 @@ public class DictionaryData {
}
public byte[] getBytes(int offset) {
return banks.bankForOffset(offset).getBytes(offset);
public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
}
public boolean keyEquals(int offset, byte[] data) {
return banks.bankForOffset(offset).keyEquals(offset, data);
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
}
public int getValue(int offset) {
return banks.bankForOffset(offset).getValue(offset);
}
public class DictionaryDataBank {
private static class DictionaryDataBank {
private final int start_idx;
private final ByteBuffer data;
// Humongous long-lived arrays seem to sometimes yield considerable memory overhead and
// can make the GC behave poorly. Using off-heap memory seems preferred when their
// lifetime is "forever"
private final LongBuffer keys;
private int size;
private int[] offset;
private int[] value;
private final int capacity;
public DictionaryDataBank(int start_idx) {
public DictionaryDataBank(int start_idx, int sz) {
this.start_idx = start_idx;
this.capacity = sz;
data = ByteBuffer.allocateDirect(DICTIONARY_BANK_SIZE);
offset = new int[DICTIONARY_BANK_SIZE/16];
value = new int[DICTIONARY_BANK_SIZE/16];
keys = ByteBuffer.allocateDirect(8*capacity).asLongBuffer();
size = 0;
}
@ -79,102 +78,26 @@ public class DictionaryData {
return start_idx + size;
}
public byte[] getBytes(int idx) {
public long getKey(int idx) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
return keys.get(idx - start_idx);
}
public boolean keyEquals(int idx, long other) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
idx = idx - start_idx;
final int start;
final int end = offset[idx];
if (idx == 0) start = 0;
else start = offset[idx-1];
byte[] dst = new byte[end-start];
data.get(start, dst);
return dst;
return keys.get(idx - start_idx) == other;
}
public int getValue(int idx) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
return value[idx - start_idx];
}
public int add(long newKey) {
if (size >= capacity)
return -1;
public boolean keyEquals(int idx, byte[] data) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
idx = idx - start_idx;
int start;
int end = offset[idx];
if (idx == 0) {
start = 0;
}
else {
start = offset[idx-1];
}
if (data.length != end - start) {
return false;
}
for (int i = 0; i < data.length; i++) {
if (this.data.get(start + i) != data[i]) {
return false;
}
}
return true;
}
public long longHashCode(int idx) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
idx = idx - start_idx;
int start;
int end = offset[idx];
if (idx == 0) {
start = 0;
}
else {
start = offset[idx-1];
}
long result = 1;
for (int i = start; i < end; i++)
result = 31 * result + data.get(i);
return result;
}
public int add(byte[] newData, int newValue) {
if (size == offset.length) {
logger.debug("Growing bank from {} to {}", offset.length, offset.length*2);
offset = Arrays.copyOf(offset, offset.length*2);
value = Arrays.copyOf(value, value.length*2);
}
if (size > 0 && offset[size-1]+newData.length >= DICTIONARY_BANK_SIZE) {
if (offset.length > size+1) {
logger.debug("Shrinking bank from {} to {}", offset.length, size - 1);
offset = Arrays.copyOf(offset, size + 1);
value = Arrays.copyOf(value, size + 1);
}
return -1; // Full
}
int dataOffset = size > 0 ? offset[size-1] : 0;
data.put(dataOffset, newData);
offset[size] = dataOffset + newData.length;
value[size] = newValue;
keys.put(size, newKey);
return start_idx + size++;
}

View File

@ -66,8 +66,7 @@ public class DictionaryHashMap {
logger.debug("Buffer size sanity checked passed");
}
dictionaryData = new DictionaryData(Math.min(1<<30, Math.max(32, (int)(sizeMemory/4))));
dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4)));
initializeBuffers();
}
@ -82,9 +81,6 @@ public class DictionaryHashMap {
}
}
public int memSz() {
return dictionaryData.size();
}
public int size() {
return sz.get();
}
@ -101,20 +97,20 @@ public class DictionaryHashMap {
buffers[buffer].put(bufferIdx, val);
}
public int put(byte[] data, int value) {
public int put(long key) {
long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
long idx = hash % hashTableSize;
if (getCell(idx) == NO_VALUE) {
return setValue(data, value, idx);
return setValue(key, idx);
}
return putRehash(data, value, idx, hash);
return putRehash(key, idx, hash);
}
private int putRehash(byte[] data, int value, long idx, long hash) {
private int putRehash(long key, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
@ -129,9 +125,9 @@ public class DictionaryHashMap {
if (val == NO_VALUE) {
probe_count_metrics.set(j);
return setValue(data, value, idx);
return setValue(key, idx);
}
else if (dictionaryData.keyEquals(val, data)) {
else if (dictionaryData.keyEquals(val, key)) {
return val;
}
}
@ -139,16 +135,16 @@ public class DictionaryHashMap {
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
}
private int setValue(byte[] data, int value, long cell) {
private int setValue(long key, long cell) {
sz.incrementAndGet();
int di = dictionaryData.add(data, value);
int di = dictionaryData.add(key);
setCell(cell, di);
return di;
}
public int get(byte[] data) {
final long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
public int get(long key) {
final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
final long cell = hash % hashTableSize;
if (getCell(cell) == NO_VALUE) {
@ -157,15 +153,15 @@ public class DictionaryHashMap {
else {
int val = getCell(cell);
if (dictionaryData.keyEquals(val, data)) {
return dictionaryData.getValue(val);
if (dictionaryData.keyEquals(val, key)) {
return val;
}
}
return getRehash(data, cell, hash);
return getRehash(key, cell, hash);
}
private int getRehash(byte[] data, long idx, long hash) {
private int getRehash(long key, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
@ -180,29 +176,12 @@ public class DictionaryHashMap {
if (val == NO_VALUE) {
return NO_VALUE;
}
else if (dictionaryData.keyEquals(val, data)) {
return dictionaryData.getValue(val);
else if (dictionaryData.keyEquals(val, key)) {
return val;
}
}
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
}
private long longHash(byte[] bytes) {
if (bytes == null)
return 0;
// https://cp-algorithms.com/string/string-hashing.html
int p = 127;
long m = (1L<<61)-1;
long p_power = 1;
long hash_val = 0;
for (byte element : bytes) {
hash_val = (hash_val + (element+1) * p_power) % m;
p_power = (p_power * p) % m;
}
return hash_val;
}
}
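A hedged note on the probing scheme: the statement that advances idx is elided from this diff, but pStride = 1 + (hash % (hashTableSize - 2)) is never zero, and if hashTableSize is prime every stride is coprime to it, so a step of the assumed form below cycles through all cells before repeating.

long idx = hash % hashTableSize;
long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
    idx = (idx + pStride) % hashTableSize;   // assumed probe step, not shown in this diff
    // ... examine getCell(idx) as putRehash/getRehash do above
}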

View File

@ -1,9 +1,7 @@
package nu.marginalia.util.hash;
import io.prometheus.client.Gauge;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger;
@ -17,9 +15,7 @@ import static java.lang.Math.round;
*/
public class LongPairHashMap {
private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
private static final Gauge probe_count_metrics
= Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count")
.register();
private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police
private final long hashTableSize;
private final MultimapFileLong data;
@ -27,26 +23,37 @@ public class LongPairHashMap {
private int sz = 0;
private static final int HEADER_SIZE = 2;
public LongPairHashMap(MultimapFileLong data, long size) {
private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
this.data = data;
// Actually use a prime size for Donald Knuth reasons
hashTableSize = PrimeUtil.nextPrime(size, 1);
maxProbeLength = hashTableSize / 2;
this.hashTableSize = hashTableSize;
this.maxProbeLength = maxProbeLength;
}
logger.debug("Table size = " + hashTableSize);
public static LongPairHashMap createNew(MultimapFileLong data, long size) {
var tableSize = PrimeUtil.nextPrime(size, 1);
var ret = new LongPairHashMap(data, tableSize, tableSize/2);
data.put(0, IndexWordsTable.Strategy.HASH.ordinal());
data.put(1, hashTableSize);
for (int i = 2; i < hashTableSize; i++) {
data.put(0, MAGIC_WORD);
data.put(1, tableSize);
for (int i = 2; i < tableSize; i++) {
data.put(HEADER_SIZE + 2L*i, 0);
}
}
public LongPairHashMap(MultimapFileLong data) {
this.data = data;
hashTableSize = data.get(1);
maxProbeLength = hashTableSize / 10;
logger.debug("Table size = " + hashTableSize);
return ret;
}
public static LongPairHashMap loadExisting(MultimapFileLong data) {
long key = data.get(0);
if (key != MAGIC_WORD) {
logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
}
var hashTableSize = data.get(1);
var maxProbeLength = hashTableSize / 10;
return new LongPairHashMap(data, hashTableSize, maxProbeLength);
}
public int size() {
@ -91,8 +98,6 @@ public class LongPairHashMap {
final var val = getCell(idx);
if (!val.isSet()) {
probe_count_metrics.set(j);
return setValue(data, idx);
}
else if (val.getKey() == data.getKey()) {

View File

@ -69,7 +69,7 @@ public class DocumentDebugger {
Set<String> reps = new HashSet<>();
// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

View File

@ -3,7 +3,9 @@ package nu.marginalia.util.language;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@ -13,21 +15,13 @@ public class WordPatterns {
public static final String WORD_TOKEN_JOINER = "_";
public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
public static final Pattern wordPatternRestrictive = Pattern.compile("[#]?[@a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
public static final Pattern keyWordPattern = Pattern.compile("[A-Z\\u00C0-\\u00D6][_a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{0,32}('[a-zA-Z])?");
public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?");
public static final Pattern joinWord = Pattern.compile("(as|an|the|of|in|a)");
public static final Pattern keywordAppendixPattern = Pattern.compile("([0-9A-Z][A-Z0-9]{0,3})");
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
public static final Predicate<String> restrictivePredicate = wordPatternRestrictive.asMatchPredicate();
public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> keywordPredicate = keyWordPattern.asMatchPredicate();
public static final Predicate<String> keywordAppendixPredicate = keywordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> keywordPredicateEither = keywordPredicate.or(keywordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
public static final Set<String> topWords;
@ -88,16 +82,6 @@ public class WordPatterns {
return true;
}
public static boolean filterStrict(String word) {
int numDigits = (int) word.chars().filter(Character::isDigit).count();
if (numDigits == word.length()) {
return false;
}
return true;
}
public static boolean isStopWord(String s) {
if (s.length() < MIN_WORD_LENGTH) {
return true;

View File

@ -39,13 +39,12 @@ public class DocumentKeywordExtractor {
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
var titleWords = extractTitleWords(documentLanguageData);
var wordsTfIdf = tfIdfCounter.count(documentLanguageData, 0.75);
var wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
var wordsNamesAll = nameCounter.count(documentLanguageData, 1);
var subjects = subjectCounter.count(documentLanguageData);
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);
int totalSize = wordsTfIdf.size();
@ -55,8 +54,8 @@ public class DocumentKeywordExtractor {
List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
for(var v : wordsTfIdf) {
if (topKeywords.size() < totalSize / 10) topKeywords.add(v);
else if (midKeywords.size() < totalSize / 5) midKeywords.add(v);
if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
else lowKeywords.add(v);
}
@ -125,18 +124,19 @@ public class DocumentKeywordExtractor {
}
}
return counts.entrySet().stream().filter(c2 -> c2.getValue()>=1)
.sorted(Comparator.comparing(this::value))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toSet());
}
private double value(Map.Entry<String, Integer> e) {
return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary
// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
}

View File

@ -1,15 +1,12 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class KeywordCounter {
private final KeywordExtractor keywordExtractor;
@ -20,58 +17,29 @@ public class KeywordCounter {
this.keywordExtractor = keywordExtractor;
}
public List<WordRep> count(DocumentLanguageData dld, double cutoff) {
public List<WordRep> count(DocumentLanguageData dld) {
HashMap<String, Double> counts = new HashMap<>(1000);
HashMap<String, HashSet<String>> instances = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
double value = 1.0 / Math.log(1+i);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
var stemmed = sent.constructStemmedWordFromSpan(span);
if (stemmed.isBlank())
continue;
counts.merge(stemmed, value, Double::sum);
String stemmed = sent.constructStemmedWordFromSpan(span);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(sent.constructWordFromSpan(span));
counts.merge(stemmed, 1., Double::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
var topWords = counts.entrySet().stream()
.filter(w -> w.getValue() > cutoff)
return counts.entrySet().stream()
.filter(e -> e.getValue() > 1)
.sorted(Comparator.comparing(this::getTermValue))
.limit(Math.min(100, counts.size()/2))
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream())
.filter(w -> w.word.length() > 1)
.limit(150)
.collect(Collectors.toList());
var topWordsSet = new HashSet<>(topWords);
final Set<WordRep> keywords = new HashSet<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getKeywordsFromSentence(sentence)) {
String stemmedWord = sentence.constructStemmedWordFromSpan(kw);
if (topWords.contains(stemmedWord)) {
keywords.add(new WordRep(sentence, kw));
}
}
}
for (var sentence : dld.sentences) {
for (var kw : keywordExtractor.getKeywordsFromSentenceStrict(sentence, topWordsSet, true)) {
keywords.add(new WordRep(sentence, kw));
}
}
Map<String, Integer> sortOrder = IntStream.range(0, topWords.size()).boxed().collect(Collectors.toMap(topWords::get, i->i));
Comparator<WordRep> comp = Comparator.comparing(wr -> sortOrder.getOrDefault(wr.stemmed, topWords.size()));
var ret = new ArrayList<>(keywords);
ret.sort(comp);
return ret;
}
private static final Pattern separator = Pattern.compile("_");
@ -86,7 +54,11 @@ public class KeywordCounter {
}
double value(String key, double value) {
return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 10;
}
return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
}
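A worked example of the score above, which behaves like a negated tf-idf (more negative sorts first under Comparator.comparing): a stem seen 3 times in the document with a stemmed corpus frequency of 1,000 scores (1 + ln 3) * ln((1.1 + 1000) / 11,820,118) ≈ 2.10 * (-9.38) ≈ -19.7, while a rarer stem with frequency 10 scores ≈ 2.10 * (-13.9) ≈ -29.1 and therefore ranks ahead of it; a stem missing from the dictionary falls back to freq = 10.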

View File

@ -56,7 +56,7 @@ public class LongNameCounter {
}
double value(String key, double value) {
return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
}

View File

@ -5,7 +5,9 @@ import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class SubjectCounter {
@ -15,6 +17,14 @@ public class SubjectCounter {
this.keywordExtractor = keywordExtractor;
}
// Seeks out subjects in a sentence by constructs like
//
// [Name] (Verbs) (the|a|Adverb|Verb) ...
// e.g.
//
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public List<WordRep> count(DocumentLanguageData dld) {
Map<WordRep, Integer> counts = new HashMap<>();
@ -27,9 +37,10 @@ public class SubjectCounter {
|| sentence.separators[kw.end + 1] == WordSeparator.COMMA)
break;
if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
&& ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
) {
String nextTag = sentence.posTags[kw.end];
String nextNextTag = sentence.posTags[kw.end+1];
if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
}
}
@ -43,4 +54,16 @@ public class SubjectCounter {
.collect(Collectors.toList());
}
private boolean isDetOrAdverbOrVerb(String posTag) {
return "DT".equals(posTag) // determinant
|| "RB".equals(posTag) // adverb
|| posTag.startsWith("VB") // verb
|| posTag.startsWith("JJ"); // adjective
}
boolean isVerb(String posTag) {
return posTag.startsWith("VB")
&& !posTag.equals("VB"); // not interested in the infinitive
}
}

View File

@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;
import nu.marginalia.util.language.WordPatterns;
import org.jetbrains.annotations.NotNull;
import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
public class DocumentSentence {
public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public final String originalSentence;
public final String[] words;
public final int[] separators;
@ -85,4 +87,37 @@ public class DocumentSentence {
public String toString() {
return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
}
@NotNull
@Override
public Iterator<SentencePos> iterator() {
return new Iterator<>() {
int i = -1;
@Override
public boolean hasNext() {
return i+1 < length();
}
@Override
public SentencePos next() {
return new SentencePos(++i);
}
};
}
public class SentencePos {
public final int pos;
public SentencePos(int pos) {
this.pos = pos;
}
public String word() { return words[pos]; }
public String wordLowerCase() { return wordsLowerCase[pos]; }
public String posTag() { return posTags[pos]; }
public String stemmed() { return stemmedWords[pos]; }
public int separator() { return separators[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
}
}
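A minimal usage sketch of the new per-position iteration:

for (DocumentSentence.SentencePos pos : sentence) {
    if (pos.isStopWord()) continue;
    System.out.println(pos.word() + " [" + pos.posTag() + "] -> " + pos.stemmed());
}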

View File

@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
import static nu.marginalia.util.FileSizeUtil.readableSize;
public class MultimapFileLong implements AutoCloseable {
public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
private final ArrayList<LongBuffer> buffers = new ArrayList<>();
private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();
@ -36,9 +36,7 @@ public class MultimapFileLong implements AutoCloseable {
private long mappedSize;
final static long WORD_SIZE = 8;
private boolean loadAggressively;
private final NativeIO.Advice advice = null;
private NativeIO.Advice defaultAdvice = null;
public static MultimapFileLong forReading(Path file) throws IOException {
long fileSize = Files.size(file);
@ -70,12 +68,7 @@ public class MultimapFileLong implements AutoCloseable {
long mapSize,
int bufferSize) throws IOException {
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize, false);
}
public MultimapFileLong loadAggressively(boolean v) {
this.loadAggressively = v;
return this;
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize);
}
private static String translateToRAFMode(FileChannel.MapMode mode) {
@ -91,13 +84,11 @@ public class MultimapFileLong implements AutoCloseable {
public MultimapFileLong(RandomAccessFile file,
FileChannel.MapMode mode,
long mapSizeBytes,
int bufferSizeWords,
boolean loadAggressively) throws IOException {
int bufferSizeWords) throws IOException {
this.mode = mode;
this.bufferSize = bufferSizeWords;
this.mapSize = mapSizeBytes;
this.fileLength = file.length();
this.loadAggressively = loadAggressively;
channel = file.getChannel();
mappedSize = 0;
@ -106,8 +97,8 @@ public class MultimapFileLong implements AutoCloseable {
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
}
public MultimapSearcher createSearcher() {
return new MultimapSearcher(this);
public MultimapSearcherBase createSearcher() {
return new MultimapSearcherBase(this);
}
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
return new MultimapSorter(this, tmpFile, internalSortLimit);
@ -115,6 +106,7 @@ public class MultimapFileLong implements AutoCloseable {
@SneakyThrows
public void advice(NativeIO.Advice advice) {
this.defaultAdvice = advice;
for (var buffer : mappedByteBuffers) {
NativeIO.madvise(buffer, advice);
}
@ -157,7 +149,7 @@ public class MultimapFileLong implements AutoCloseable {
}
@SneakyThrows
private void grow(long posIdxRequired) {
public void grow(long posIdxRequired) {
if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) {
throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")");
}
@ -182,11 +174,8 @@ public class MultimapFileLong implements AutoCloseable {
var buffer = channel.map(mode, posBytes, bzBytes);
if (loadAggressively)
buffer.load();
if (advice != null) {
NativeIO.madvise(buffer, advice);
if (defaultAdvice != null) {
NativeIO.madvise(buffer, defaultAdvice);
}
buffers.add(buffer.asLongBuffer());
@ -196,10 +185,12 @@ public class MultimapFileLong implements AutoCloseable {
}
}
@Override
public long size() {
return fileLength;
}
@Override
public void put(long idx, long val) {
if (idx >= mappedSize)
grow(idx);
@ -214,6 +205,7 @@ public class MultimapFileLong implements AutoCloseable {
}
}
@Override
public long get(long idx) {
if (idx >= mappedSize)
grow(idx);
@ -229,10 +221,12 @@ public class MultimapFileLong implements AutoCloseable {
}
@Override
public void read(long[] vals, long idx) {
read(vals, vals.length, idx);
}
@Override
public void read(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
@ -257,10 +251,38 @@ public class MultimapFileLong implements AutoCloseable {
}
@Override
public void read(LongBuffer vals, long idx) {
int n = vals.limit() - vals.position();
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
vals.put(vals.position() + i, buffer, bufferOffset, l);
i+=l;
}
}
@Override
public void write(long[] vals, long idx) {
write(vals, vals.length, idx);
}
@Override
public void write(long[] vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
@ -285,6 +307,7 @@ public class MultimapFileLong implements AutoCloseable {
}
@Override
public void write(LongBuffer vals, long idx) {
int n = vals.limit() - vals.position();
if (idx+n >= mappedSize) {
@ -309,7 +332,36 @@ public class MultimapFileLong implements AutoCloseable {
}
@Override
public void setRange(long idx, int n, long val) {
if (n == 0) return;
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
for (int p = 0; p < l; p++) {
buffer.put(bufferOffset + p, val);
}
i+=l;
}
}
@Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
int length = (int)(sourceEnd - sourceStart);
@ -354,8 +406,10 @@ public class MultimapFileLong implements AutoCloseable {
@Override
public void close() throws IOException {
force();
mappedByteBuffers.clear();
buffers.clear();
channel.close();
// I want to believe

View File

@ -0,0 +1,78 @@
package nu.marginalia.util.multimap;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
private final long off;
private final MultimapFileLongSlice map;
public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
this.off = off;
this.map = map;
}
@Override
public long size() {
return map.size() - off;
}
@Override
public void put(long idx, long val) {
map.put(off+idx, val);
}
@Override
public void setRange(long idx, int n, long val) {
map.setRange(off+idx, n, val);
}
@Override
public long get(long idx) {
return map.get(off+idx);
}
@Override
public void read(long[] vals, long idx) {
map.read(vals, idx+off);
}
@Override
public void read(long[] vals, int n, long idx) {
map.read(vals, n, idx+off);
}
@Override
public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); }
@Override
public void write(long[] vals, long idx) {
map.write(vals, idx+off);
}
@Override
public void write(long[] vals, int n, long idx) {
map.write(vals, n, idx+off);
}
@Override
public void write(LongBuffer vals, long idx) {
map.write(vals, idx+off);
}
@Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
throws IOException {
map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
}
@Override
public MultimapFileLongSlice atOffset(long off) {
// If we don't override this, the default implementation would build a pyramid of
// MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
// wrappers if this is called iteratively (e.g. to walk over a file)
return new MultimapFileLongOffsetSlice(map, this.off + off);
}
}
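
The override above keeps repeated slicing flat: every call wraps the underlying map once with an accumulated offset instead of nesting wrapper inside wrapper. A minimal sketch of how that plays out when walking a file record by record (the helper and record size are illustrative, not part of the change):

// Advance a cursor over the file by repeatedly re-slicing; with the override above the
// cursor stays a single wrapper around the original map, no matter how many steps are taken.
static void walkRecords(MultimapFileLongSlice file, int recordSizeLongs) {
    MultimapFileLongSlice cursor = file;
    while (cursor.size() >= recordSizeLongs) {
        long firstWord = cursor.get(0);            // first long of the current record
        cursor = cursor.atOffset(recordSizeLongs); // offsets accumulate, wrappers don't
    }
}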

View File

@ -0,0 +1,33 @@
package nu.marginalia.util.multimap;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.channels.FileChannel;
public interface MultimapFileLongSlice {
long size();
void put(long idx, long val);
void setRange(long idx, int n, long val);
long get(long idx);
void read(long[] vals, long idx);
void read(long[] vals, int n, long idx);
void read(LongBuffer vals, long idx);
void write(long[] vals, long idx);
void write(long[] vals, int n, long idx);
void write(LongBuffer vals, long idx);
void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
default MultimapFileLongSlice atOffset(long off) {
return new MultimapFileLongOffsetSlice(this, off);
}
}
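
The interface gives callers a zero-based window into a larger file; atOffset produces such a window, and every index passed to it is translated by the slice. A small hedged example of the relative addressing (positions and values are illustrative):

// Index 0 of the window corresponds to absolute position 1024 of the underlying file.
static void fillWindow(MultimapFileLongSlice file) {
    MultimapFileLongSlice window = file.atOffset(1024);
    window.put(0, 42L);         // stored at absolute position 1024
    window.setRange(1, 7, 0L);  // zeroes absolute positions 1025..1031
    long value = window.get(0); // reads back 42L
}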

View File

@ -1,128 +1,80 @@
package nu.marginalia.util.multimap;
import lombok.experimental.Delegate;
public interface MultimapSearcher {
long binarySearchUpper(long key, long fromIndex, long n);
long binarySearch(long key, long fromIndex, long n);
public class MultimapSearcher {
@Delegate
private final MultimapFileLong mmf;
public MultimapSearcher(MultimapFileLong mmf) {
this.mmf = mmf;
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
if (mask == ~0L && stepSize == 1) {
return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
}
public boolean binarySearch(long key, long fromIndex, long toIndex) {
long low = fromIndex;
long high = toIndex - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return true; // key found
else if (stepSize == 1) {
return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
}
return false; // key not found.
}
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
long low = fromIndex;
long high = toIndex - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return mid;
}
return low;
}
public long binarySearchUpperBound(long key, long fromIndex, long toIndex, long mask) {
long low = fromIndex;
long high = toIndex - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(mid) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return mid;
}
return low;
}
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex) {
long low = fromIndex;
long high = toIndex - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return mid;
}
return -1;
}
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex, long mask) {
long low = fromIndex;
long high = toIndex - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(mid) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return mid;
}
return -1;
}
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long step, long steps, long mask) {
long low = 0;
long high = steps - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid*step) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
return -1;
else {
return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
}
}
}
class SimpleMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
SimpleMultimapSearcher(MultimapSearcherBase base) {
this.base = base;
}
@Override
public long binarySearchUpper(long key, long fromIndex, long n) {
return base.binarySearchUpper(key, fromIndex, n);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, n);
}
}
class MaskedMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
private final long mask;
MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
this.base = base;
this.mask = mask;
}
@Override
public long binarySearchUpper(long key, long fromIndex, long n) {
return base.binarySearchUpper(key, fromIndex, n, mask);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, n, mask);
}
}
class SteppingMaskedMultimapSearcher implements MultimapSearcher {
private final MultimapSearcherBase base;
private final long mask;
private final int step;
SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
this.base = base;
this.mask = mask;
this.step = step;
}
@Override
public long binarySearchUpper(long key, long fromIndex, long n) {
return base.binarySearchUpper(key, fromIndex, step, n, mask);
}
@Override
public long binarySearch(long key, long fromIndex, long n) {
return base.binarySearch(key, fromIndex, step, n, mask);
}
}
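
forContext picks the cheapest searcher that still honors the key layout: a plain searcher when the whole long is the key and entries are one word wide, a masked one when only part of the word is compared, and a stepping masked one when each entry spans several longs. A hedged usage sketch (the mask, stride and range values are made up for illustration):

// Entries are two longs wide; only the low 40 bits of the first long carry the key.
static long findEntry(MultimapFileLongSlice slice, long key, long fromIndex, long entryCount) {
    long mask = (1L << 40) - 1;
    MultimapSearcher searcher = MultimapSearcher.forContext(slice, mask, 2);
    return searcher.binarySearch(key, fromIndex, entryCount); // -1 if the key is absent
}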

View File

@ -0,0 +1,143 @@
package nu.marginalia.util.multimap;
import lombok.experimental.Delegate;
public class MultimapSearcherBase {
@Delegate
private final MultimapFileLongSlice mmf;
public MultimapSearcherBase(MultimapFileLongSlice mmf) {
this.mmf = mmf;
}
public boolean binarySearchTest(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return true;
}
return false;
}
public long binarySearchUpper(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return fromIndex + low;
}
public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return fromIndex + low;
}
public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid*step) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
return fromIndex + low;
}
public long binarySearch(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid);
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return -1;
}
public long binarySearch(long key, long fromIndex, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
return -1;
}
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
long low = 0;
long high = n - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
long midVal = get(fromIndex + mid*step) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
return -1;
}
}
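
The two families differ in what a miss returns: binarySearch reports -1, while binarySearchUpper returns the insertion point, i.e. the index just past the values smaller than the key, which is what range lookups need. A worked example over an assumed sorted run:

// Suppose positions 10..14 hold the sorted values 3, 7, 7, 9, 12 (illustrative data).
// base.binarySearch(9, 10, 5)       -> 13   (exact hit)
// base.binarySearch(8, 10, 5)       -> -1   (miss)
// base.binarySearchUpper(8, 10, 5)  -> 13   (insertion point: first value >= 8 sits at 13)
// base.binarySearchUpper(20, 10, 5) -> 15   (one past the end of the run)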

View File

@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
public class MultimapSorter {
private final Path tmpFileDir;
private final int internalSortLimit;
private final MultimapFileLong multimapFileLong;
private final MultimapFileLongSlice multimapFileLong;
private final long[] buffer;
public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) {
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
this.multimapFileLong = multimapFileLong;
this.tmpFileDir = tmpFileDir;
this.internalSortLimit = internalSortLimit;
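// (Sketch, not part of the change.) Taking the slice interface means a sorter can be
// pointed at a sub-range of a mapped file directly, e.g.
//   new MultimapSorter(file.atOffset(regionStart), tmpFileDir, 1 << 16)
// where 'file' is a MultimapFileLong and 'regionStart' is an illustrative offset.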

View File

@ -1,49 +0,0 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
public class AcademiaRank {
private final TIntArrayList result;
private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
for (int i = 0; i < rankingResults.size(); i++) {
idToRanking.put(rankingResults.get(i), i);
}
result = new TIntArrayList(10000);
try (var conn = ds.getConnection();
var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
stmt.setFetchSize(1000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
result.add(rsp.getInt(1));
}
}
catch (SQLException ex) {
logger.error("SQL error", ex);
}
int[] internalArray = result.toArray();
IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
result.set(0, internalArray);
}
public TIntArrayList getResult() {
return result;
}
}

View File

@ -1,15 +1,11 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BetterReversePageRank extends RankingAlgorithm {
public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,14 +1,10 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BetterStandardPageRank extends RankingAlgorithm {
public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,15 +1,11 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BuggyReversePageRank extends RankingAlgorithm {
public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,14 +1,10 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import java.io.IOException;
public class BuggyStandardPageRank extends RankingAlgorithm {
public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
super(dataSource, origins);
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override

View File

@ -1,111 +1,57 @@
package nu.marginalia.util.ranking;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.Data;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import it.unimi.dsi.fastutil.ints.IntArrays;
public abstract class RankingAlgorithm {
final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
private final TIntHashSet spamDomains;
private final HikariDataSource dataSource;
TIntArrayList[] linkDataSrc2Dest;
TIntArrayList[] linkDataDest2Src;
protected TIntArrayList[] linkDataSrc2Dest;
protected TIntArrayList[] linkDataDest2Src;
public final Set<String> originDomains = new HashSet<>();
public final Set<Integer> originDomainIds = new HashSet<>();
private int maxKnownUrls = Integer.MAX_VALUE;
private static final boolean getNames = true;
private final Logger logger = LoggerFactory.getLogger(getClass());
public static void main(String... args) throws IOException {
var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
private final RankingDomainFetcher domains;
var rankVector = spr.pageRankVector();
var norm = rankVector.norm();
rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> {
System.out.println(spr.domainNameFromId(i));
return true;
});
}
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
this.domains = domains;
public String domainNameFromId(int id) {
return domainsById.get(id).name;
}
public boolean isPeripheral(int id) {
return domainsById.get(id).peripheral;
}
public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
this.dataSource = dataSource;
var blacklist = new EdgeDomainBlacklistImpl(dataSource);
spamDomains = blacklist.getSpamDomains();
originDomains.addAll(Arrays.asList(origins));
try (var conn = dataSource.getConnection()) {
domains.getDomains(domainData -> {
int id = domainData.id;
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
domainsById.put(id, domainData);
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
}
}
});
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
if (src == dst) continue;
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
if (domainsById.contains(src) && domainsById.contains(dst)) {
@ -122,15 +68,10 @@ public abstract class RankingAlgorithm {
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
}
}
});
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int i = rsp.getInt(1);
for (var namePattern : this.originDomains) {
domains.domainsByPattern(namePattern, i -> {
int ival = domainIdToIndex.get(i);
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
originDomainIds.add(ival);
@ -138,71 +79,39 @@ public abstract class RankingAlgorithm {
else {
logger.debug("No value for {}", i);
}
});
}
logger.debug("{} -> {}", seed, originDomainIds.size());
}
}
logger.info("Origin Domains: {}", originDomainIds.size());
} catch (SQLException throwables) {
logger.error("SQL error", throwables);
}
}
public void addPeripheralNodes(boolean includeErrorStates) {
public void addPeripheralNodes() {
int newNodesIdxCutoff = domainIdToIndex.size();
logger.info("Inserting peripheral nodes");
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!spamDomains.contains(id)) {
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
domains.getPeripheralDomains(domainData -> {
int id = domainData.id;
if (domainsById.put(id, domainData) == null) { // true if id was not already present
domainIndexToId.put(domainIndexToId.size(), id);
domainIdToIndex.put(id, domainIdToIndex.size());
}
}
}
});
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
if (src == dst) continue;
domains.eachDomainLink((src, dst) -> {
if (src == dst) return;
if (domainsById.contains(src) && domainsById.contains(dst)) {
int srcIdx = domainIdToIndex.get(src);
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
// This looks like a bug, but it improves the results
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
continue;
return;
if (linkDataSrc2Dest[srcIdx] == null) {
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
@ -214,11 +123,7 @@ public abstract class RankingAlgorithm {
}
linkDataDest2Src[dstIdx].add(srcIdx);
}
}
}
} catch (SQLException throwables) {
logger.error("SQL error", throwables);
}
});
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
}
@ -271,14 +176,14 @@ public abstract class RankingAlgorithm {
return rank.getRanking(resultCount);
}
public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
public TIntList pageRankWithPeripheralNodes(int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
if (i == iter_max-1) {
addPeripheralNodes(includeErrorStates);
addPeripheralNodes();
}
RankVector newRank = createNewRankVector(rank);
@ -323,7 +228,7 @@ public abstract class RankingAlgorithm {
abstract RankVector createNewRankVector(RankVector rank);
public boolean includeInRanking(DomainData data) {
public boolean includeInRanking(RankingDomainData data) {
if (data.isAlias())
return false;
if (data.isSpecial())
@ -445,32 +350,4 @@ public abstract class RankingAlgorithm {
}
}
@Data
@AllArgsConstructor
static class DomainData {
public final int id;
public final String name;
private int alias;
private int state;
public final int knownUrls;
public boolean peripheral;
public int resolveAlias() {
if (alias == 0) return id;
return alias;
}
public boolean isAlias() {
return alias != 0;
}
public boolean isSpecial() {
return EdgeDomainIndexingState.SPECIAL.code == state;
}
public boolean isSocialMedia() {
return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
}
}
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.util.ranking;
import lombok.AllArgsConstructor;
import lombok.Data;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@Data
@AllArgsConstructor
class RankingDomainData {
public final int id;
public final String name;
private int alias;
private EdgeDomainIndexingState state;
public final int knownUrls;
public boolean peripheral;
public int resolveAlias() {
if (alias == 0) return id;
return alias;
}
public boolean isAlias() {
return alias != 0;
}
public boolean isSpecial() {
return EdgeDomainIndexingState.SPECIAL == state;
}
public boolean isSocialMedia() {
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
}
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.util.ranking;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
public class RankingDomainFetcher {
private final HikariDataSource dataSource;
private final EdgeDomainBlacklistImpl blacklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final boolean getNames = false;
@Inject
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
this.dataSource = dataSource;
this.blacklist = blacklist;
}
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
else {
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
}
getDomains(query, consumer);
}
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
else {
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
}
getDomains(query, consumer);
}
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!blacklist.isBlacklisted(id)) {
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
}
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domains", ex);
}
}
public void eachDomainLink(DomainLinkConsumer consumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
{
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
consumer.accept(src, dst);
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domain links", ex);
}
}
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
stmt.setString(1, pattern);
var rsp = stmt.executeQuery();
while (rsp.next()) {
idConsumer.accept(rsp.getInt(1));
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domains by pattern", ex);
}
}
public interface DomainLinkConsumer {
void accept(int from, int to);
}
}
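
The fetcher concentrates every SQL query behind three callbacks, so ranking code only ever sees domain records, link pairs and matched ids. A hedged sketch of driving it directly (the data-source wiring and counting logic are illustrative):

// Count eligible domains and links without any SQL in the caller.
static void summarize(com.zaxxer.hikari.HikariDataSource ds) {
    var fetcher = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
    var domains = new java.util.concurrent.atomic.AtomicInteger();
    var links = new java.util.concurrent.atomic.AtomicInteger();
    fetcher.getDomains(d -> domains.incrementAndGet());
    fetcher.eachDomainLink((src, dst) -> links.incrementAndGet());
    System.out.println(domains + " domains, " + links + " links");
}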

View File

@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
originDomains.add("memex.marginalia.nu");
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setFetchSize(10000);
for (var seed : this.originDomains) {

View File

@ -48,7 +48,7 @@ public class StandardPageRank {
originDomains.addAll(Arrays.asList(origins));
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -78,7 +78,7 @@ public class StandardPageRank {
}
}
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
for (var seed : this.originDomains) {
stmt.setString(1, seed);
var rsp = stmt.executeQuery();

View File

@ -50,7 +50,7 @@ public class DedupTool {
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
try (var conn = ds.getConnection();
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
) {

View File

@ -112,10 +112,10 @@ public class PerusePageRankV2 {
try (var conn = dataSource.getConnection()) {
String s;
if (getNames) {
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
}
else {
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
}
try (var stmt = conn.prepareStatement(s)) {
stmt.setFetchSize(10000);

View File

@ -1,30 +0,0 @@
package nu.marginalia.util.ranking.tool;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.AcademiaRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import java.io.IOException;
public class TestAcademiaRankTool {
@SneakyThrows
public static void main(String... args) {
Driver driver = new Driver();
var conn = new DatabaseModule().provideConnection();
var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu");
var res = rank.getResult();
try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
for (int i = 0; i < Math.min(res.size(), 100); i++) {
stmt.setInt(1, res.getQuick(i));
var rsp = stmt.executeQuery();
while (rsp.next())
System.out.println(rsp.getString(1));
}
}
}
}

View File

@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@ -43,12 +44,14 @@ public class UpdateDomainRanksTool {
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
logger.info("Ranking");
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
rankMax = spr.size()*2;
uploader.start();
spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
@ -83,11 +86,6 @@ public class UpdateDomainRanksTool {
}
}
logger.info("Recalculating quality");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
stmt.executeUpdate();
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
}

View File

@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 {
rankMax = rpr.size();
rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
@ -94,9 +97,6 @@ public class UpdateDomainRanksTool2 {
}
logger.info("Recalculating quality");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
stmt.executeUpdate();
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();

View File

@ -6,6 +6,7 @@ import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
import lombok.SneakyThrows;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.client.exception.LocalException;
import nu.marginalia.wmsa.client.exception.NetworkException;
import nu.marginalia.wmsa.client.exception.RemoteException;
@ -30,9 +31,12 @@ import java.util.zip.GZIPOutputStream;
public abstract class AbstractClient implements AutoCloseable {
public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";
private final Gson gson = new GsonBuilder().create();
private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
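// (Illustrative note, not part of the change.) The record adapter factory lets Gson handle Java
// records, and allowMissingComponentValues() means a record component absent from the JSON is
// filled with its default (null / 0) instead of causing an error. For a hypothetical
// `record Health(String status, int uptime) {}`, parsing {"status":"ok"} would yield
// Health[status=ok, uptime=0].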
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
private final OkHttpClient client;

View File

@ -4,10 +4,10 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import org.eclipse.jetty.util.UrlEncoded;
import java.util.List;
@ -21,18 +21,38 @@ public class AssistantClient extends AbstractDynamicClient {
}
public Observable<DictionaryResponse> dictionaryLookup(Context ctx, String word) {
try {
return super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
@SuppressWarnings("unchecked")
public Observable<List<String>> spellCheck(Context ctx, String word) {
try {
return (Observable<List<String>>) (Object) super.get(ctx, "/spell-check/" + UrlEncoded.encodeString(word), List.class);
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
public Observable<String> unitConversion(Context ctx, String value, String from, String to) {
try {
return super.get(ctx, "/unit-conversion?value=" + value + "&from=" + from + "&to=" + to);
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
public Observable<String> evalMath(Context ctx, String expression) {
try {
return super.get(ctx, "/eval-expression?value=" + UrlEncoded.encodeString(expression));
}
catch (RouteNotConfiguredException ex) {
return Observable.empty();
}
}
}
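
Each endpoint is now wrapped so that a route that is not configured produces an empty Observable rather than an exception, letting callers treat a missing assistant service as simply "no result". A hedged consumption sketch (the context and fallback text are illustrative):

// Falls back to a placeholder string when the assistant route is unavailable.
static String convertOrFallback(AssistantClient client, Context ctx) {
    return client.unitConversion(ctx, "100", "km", "mi")
                 .blockingFirst("conversion unavailable");
}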

View File

@ -79,10 +79,7 @@ public class WikiCleaner {
}
});
Optional.ofNullable(doc.getElementsByTag("cite")).ifPresent(cite -> cite.forEach(c -> {
c.tagName("span");
}));
doc.getElementsByTag("cite").tagName("span");
removeIds(doc, "toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav");
removeByClass(doc, "mw-references-wrap", "references", "reference", "siteSub", "refbegin");
@ -205,7 +202,7 @@ public class WikiCleaner {
}
});
doc.getAllElements().forEach(elem -> {
var classes = elem.classNames().stream().filter(this::isWikiClass).collect(Collectors.toList());
var classes = elem.classNames().stream().filter(this::isWikiClass).toList();
classes.forEach(elem::removeClass);
elem.removeAttr("lang");
elem.removeAttr("dir");
@ -251,9 +248,8 @@ public class WikiCleaner {
var formula = math.getElementsByTag("math");
var converter = net.sourceforge.jeuclid.converter.Converter.getInstance();
var sos = new ByteArrayOutputStream();
var alt = Optional.ofNullable(formula.attr("alttext"))
.or(() -> Optional.ofNullable(math.getElementsByTag("annotation").text()))
.orElse("");
var alt = Optional.of(formula.attr("alttext")).filter(s -> !s.isBlank())
.orElseGet(() -> math.getElementsByTag("annotation").text());
var layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext());
@ -309,16 +305,16 @@ public class WikiCleaner {
@NotNull
private List<Pair<String, String>> getWikiPageLinks(Document doc) {
List<Pair<String,String>> topLinks = new ArrayList<>();
Optional.ofNullable(doc.select("p a")).ifPresent(links -> links.forEach(atag -> {
doc.select("p a").forEach(atag -> {
String href = atag.attr("href");
if (href != null && !href.isBlank()
if (!href.isBlank()
&& !href.contains(":")
&& !href.startsWith("#")
) {
topLinks.add(Pair.of(href, atag.attr("title")));
}
}));
});
return topLinks;
}
@ -336,19 +332,16 @@ public class WikiCleaner {
private List<Pair<String, String>> getDisambiguationLinks(Document doc) {
List<Pair<String,String>> disambig = new ArrayList<>();
Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(hatnotes -> {
hatnotes.forEach(note -> {
Optional.ofNullable(note.getElementsByTag("a"))
.ifPresent(links -> links.forEach(atag -> {
for (var note: doc.getElementsByClass("hatnote")) {
for (var atag : note.getElementsByTag("a")) {
String href = atag.attr("href");
if (atag.hasClass("mw-disambig") && href != null) {
if (atag.hasClass("mw-disambig") && !href.isBlank()) {
disambig.add(Pair.of(href, atag.attr("title")));
}
}));
});
});
Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(Elements::remove);
}
}
doc.getElementsByClass("hatnote").remove();
return disambig;
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.wmsa.edge.assistant.screenshot;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.mariadb.jdbc.Driver;
import java.io.FileInputStream;
import java.io.IOException;
import java.sql.SQLException;
import java.util.zip.GZIPInputStream;
public class ScreenshotLoaderMain {
public static void main(String... args) throws IOException {
org.mariadb.jdbc.Driver driver = new Driver();
var ds = new DatabaseModule().provideConnection();
try (var tis = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(args[0])));
var conn = ds.getConnection();
var ps = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)")
) {
for (TarArchiveEntry entry = tis.getNextTarEntry(); entry != null; entry = tis.getNextTarEntry()) {
if (entry.isFile()) {
String fileName = entry.getName();
String domainName = fileName.substring(fileName.indexOf('/')+1, fileName.lastIndexOf('.'));
ps.setString(1, domainName);
ps.setString(2, "image/webp");
ps.setBlob(3, tis);
ps.executeUpdate();
System.out.println(domainName);
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
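
Judging by the substring logic above, the single program argument is a gzipped tar archive whose entries are named like screenshots/www.example.com.webp: everything up to the first '/' and from the last '.' onward is stripped to recover the domain name, and the bytes are stored as image/webp through the REPLACE INTO statement. An entry without a file extension would make the substring call fail, so the archive layout matters.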

View File

@ -2,48 +2,50 @@ package nu.marginalia.wmsa.edge.assistant.screenshot;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.utils.IOUtils;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.NoSuchElementException;
import java.sql.SQLException;
import static java.lang.Integer.parseInt;
public class ScreenshotService {
private final Path screenshotsRoot = Path.of("/var/lib/wmsa/archive/screenshots/screenshots/");
private final Path screenshotsRootWebp = Path.of("/var/lib/wmsa/archive.fast/screenshots/");
private final EdgeDataStoreDao edgeDataStoreDao;
private final long MIN_FILE_SIZE = 4096;
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao) {
public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao, HikariDataSource dataSource) {
this.edgeDataStoreDao = edgeDataStoreDao;
this.dataSource = dataSource;
}
public boolean hasScreenshot(EdgeId<EdgeDomain> domainId) {
EdgeDomain domain = edgeDataStoreDao.getDomain(domainId);
Path p = getScreenshotPath(screenshotsRootWebp, domain, ".webp");
if (p == null) {
p = getScreenshotPath(screenshotsRoot, domain, ".png");
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT TRUE
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, domainId.id());
var rs = ps.executeQuery();
return rs.next();
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
try {
return p != null && Files.size(p) >= MIN_FILE_SIZE;
} catch (IOException e) {
return false;
}
}
@SneakyThrows
public Object serveScreenshotRequest(Request request, Response response) {
@ -54,28 +56,30 @@ public class ScreenshotService {
int id = parseInt(request.params("id"));
Path p = null;
if (id == 0) {
p = screenshotsRootWebp.resolve("dummy-snapshot.webp");
} else {
EdgeDomain domain;
try {
domain = edgeDataStoreDao.getDomain(new EdgeId<>(id));
p = getScreenshotPath(screenshotsRootWebp, domain, ".webp");
if (p == null) {
p = getScreenshotPath(screenshotsRoot, domain, ".png");
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT CONTENT_TYPE, DATA
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, id);
var rsp = ps.executeQuery();
if (rsp.next()) {
response.type(rsp.getString(1));
rsp.getBlob(2).getBinaryStream().transferTo(response.raw().getOutputStream());
return "";
}
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
if (p != null && Files.size(p) <= MIN_FILE_SIZE) {
p = null;
}
} catch (NoSuchElementException ex) {
domain = new EdgeDomain("error.example.com");
return serveSvgPlaceholder(response, id);
}
if (p == null) {
private Object serveSvgPlaceholder(Response response, int id) {
response.type("image/svg+xml");
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
"<svg\n" +
" xmlns=\"http://www.w3.org/2000/svg\"\n" +
@ -101,32 +105,6 @@ public class ScreenshotService {
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
" </g>\n" +
"</svg>\n", domain);
"</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
}
}
response.status(200);
response.header("Cache-control", "public,max-age=3600");
if (p.toString().endsWith("webp")) {
response.type("image/webp");
} else {
response.type("image/png");
}
IOUtils.copy(new ByteArrayInputStream(Files.readAllBytes(p)), response.raw().getOutputStream());
return "";
}
private Path getScreenshotPath(Path root, EdgeDomain domain, String ending) {
var p = root.resolve(domain.toString() + ending);
if (!p.normalize().startsWith(root)) {
return null;
}
if (!Files.exists(p)) {
return null;
}
return p;
}
}

View File

@ -45,7 +45,7 @@ public class ConvertedDomainReader {
try {
ret.add(gson.fromJson(parts[1], type));
}
catch (JsonParseException ex) {
catch (NullPointerException|JsonParseException ex) {
logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255));
logger.warn("Json error", ex);
}

View File

@ -1,18 +1,19 @@
package nu.marginalia.wmsa.edge.converting;
import com.google.gson.*;
import com.google.common.base.Strings;
import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.ParallelPipe;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.util.ParallelPipe;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -52,13 +53,6 @@ public class ConverterMain {
injector.getInstance(ConverterMain.class);
}
private static void requireArgs(String[] args, String... help) {
if (args.length != help.length) {
System.out.println("Usage: " + String.join(", ", help));
System.exit(255);
}
}
@Inject
public ConverterMain(
EdgeCrawlPlan plan,
@ -103,7 +97,12 @@ public class ConverterMain {
domainToId.forEach((domain, id) -> {
String fileName = idToFileName.get(id);
Path dest = getFilePath(plan.crawl.getDir(), fileName);
if (Strings.isNullOrEmpty(fileName))
return;
Path dest = plan.getCrawledFilePath(fileName);
logger.info("{} - {} - {}", domain, id, dest);
if (!processLog.isJobFinished(id)) {
@ -128,10 +127,4 @@ public class ConverterMain {
record ProcessingInstructions(String id, List<Instruction> instructions) {}
private Path getFilePath(Path dir, String fileName) {
String sp1 = fileName.substring(0, 2);
String sp2 = fileName.substring(2, 4);
return dir.resolve(sp1).resolve(sp2).resolve(fileName);
}
}

View File

@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.net.URISyntaxException;
import java.nio.file.Path;
public class ConverterModule extends AbstractModule {
@ -27,7 +26,8 @@ public class ConverterModule extends AbstractModule {
bind(Gson.class).toInstance(createGson());
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

View File

@ -0,0 +1,194 @@
package nu.marginalia.wmsa.edge.converting;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
public static void main(String... args) throws IOException, InterruptedException {
if (args.length < 2) {
System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
System.exit(0);
}
String command = args[0];
var plan = new CrawlPlanLoader().load(Path.of(args[1]));
switch (command) {
case "crawl": getKeywordsFromCrawl(plan); break;
case "so": getKeywordsFromSo(plan, args[2]); break;
case "wiki": getKeywordsFromWiki(plan, args[2]); break;
default: System.err.println("Unrecognized command");
}
}
private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
HashSet<String> crawledDomains = new HashSet<>();
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> { crawledDomains.add(spec.domain); });
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
&& !domain.contains("wiki")
&& !domain.contains("isni")
&& !domain.contains("wiktionary"),
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}
private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
logger.info("Loading input spec");
HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
crawledDomains.remove("jsbin.com");
crawledDomains.remove("codepad.org");
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}
public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
logger.info("Loading input spec");
HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
List<String> fileNames = new ArrayList<>();
logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path()));
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> url.param != null,
//url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
logger.info("Reading files");
for (var fn : fileNames) {
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
if (crawledDomain.doc == null) continue;
System.out.println("# " + crawledDomain.domain);
for (var doc : crawledDomain.doc) {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
}
}
}
}
}
private static class UrlKeywordTsvWriter implements AutoCloseable {
private final OutputStream stream;
UrlKeywordTsvWriter(Path outputFile) throws IOException {
this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
}
void write(EdgeUrl url, String keyword) {
try {
stream.write(url.toString().getBytes());
stream.write('\t');
stream.write(keyword.getBytes());
stream.write('\n');
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
stream.close();
}
}
}
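
The writer emits one line per (url, keyword) pair, with a tab between the two fields and no header. A hedged illustration of what a resulting links.tsv might contain (URLs and keywords invented for the example):

https://www.example.com/retro-computers	keyboard
https://www.example.com/retro-computers	beige
https://memex.example.net/log/01-post	search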

View File

@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
@ -25,14 +24,15 @@ import java.util.concurrent.atomic.AtomicInteger;
public class LoaderMain {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final Path processDir;
private final EdgeCrawlPlan plan;
private final ConvertedDomainReader instructionsReader;
private final HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final LoaderFactory loaderFactory;
private final EdgeIndexClient indexClient;
private volatile boolean running = true;
final Thread processorThread = new Thread(this::processor, "Processor Thread");
@ -56,14 +56,12 @@ public class LoaderMain {
@Inject
public LoaderMain(EdgeCrawlPlan plan,
ConvertedDomainReader instructionsReader,
HikariDataSource dataSource,
LoaderFactory loaderFactory,
EdgeIndexClient indexClient) {
this.processDir = plan.process.getDir();
this.plan = plan;
this.instructionsReader = instructionsReader;
this.dataSource = dataSource;
this.loaderFactory = loaderFactory;
this.indexClient = indexClient;
@ -79,7 +77,7 @@ public class LoaderMain {
LoaderMain.loadTotal = loadTotal.get();
WorkLog.readLog(logFile, entry -> {
load(entry.path(), entry.cnt());
load(plan, entry.path(), entry.cnt());
});
running = false;
@ -90,15 +88,9 @@ public class LoaderMain {
}
private volatile static int loadTotal;
private static final int loaded = 0;
private void load(String path, int cnt) {
String first = path.substring(0, 2);
String second = path.substring(2, 4);
Path destDir = processDir.resolve(first).resolve(second).resolve(path);
private void load(EdgeCrawlPlan plan, String path, int cnt) {
Path destDir = plan.getProcessedFilePath(path);
try {
var loader = loaderFactory.create(cnt);
var instructions = instructionsReader.read(destDir, cnt);
@ -120,7 +112,8 @@ public class LoaderMain {
loader.finish();
long loadTime = System.currentTimeMillis() - startTime;
taskStats.observe(loadTime);
logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime());
logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(),
loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime());
}
}

View File

@ -29,7 +29,7 @@ public class ReindexTriggerMain {
.build();
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
while (rs.next()) {
System.out.printf("%d %s %s %d\n",
rs.getInt(1),
@ -38,7 +38,7 @@ public class ReindexTriggerMain {
rs.getInt(4));
}
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
while (rs.next()) {
System.out.printf("%d %d %s %d %s\n",
rs.getInt(1),

View File

@ -0,0 +1,149 @@
package nu.marginalia.wmsa.edge.converting.atags;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class AnchorTextExtractor {
private final Predicate<String> includeDomainPredicate;
private final Predicate<EdgeUrl> includeUrlPredicate;
private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;
private final LinkParser linkParser = new LinkParser();
private final HashFunction hashFunction = Hashing.murmur3_128();
// This bit map is used as a bloom filter to deduplicate url-keyword combinations.
// False positives are expected, but that's an acceptable trade-off compared with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware.
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
Predicate<EdgeUrl> includeUrlPredicate,
BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
this.includeDomainPredicate = includeDomainPredicate;
this.includeUrlPredicate = includeUrlPredicate;
this.linkKeywordConsumer = linkKeywordConsumer;
}
@SneakyThrows
public void processDocument(String docUrl, String documentBody) {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);
for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);
processAnchor(documentUrl, href, text);
}
}
}
private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");
private String getLinkText(Element link) {
String text = link.text();
if (link.text().isBlank()) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
text = img.attr("alt");
break;
}
}
}
return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
var linkUrl = optLinkUrl.get();
if (!isInterestingAnchorLink(linkUrl)) {
return;
}
for (String word: anchorTextNoise.split(text)) {
if (WordPatterns.isStopWord(word))
continue;
word = word.toLowerCase();
if (!WordPatterns.filter(word)) {
continue;
}
if (linkUrl.domain.equals(documentUrl.domain)) {
continue;
}
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
}
}
// This pattern doesn't need to perfectly capture all anchor texts that are URLs; if it catches 95% of them, that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;
// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;
if (text.startsWith("http:") || text.startsWith("https:")) return false;
if (looksLikeAnURL.test(text)) return false;
return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}
private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}
if (!includeUrlPredicate.test(linkUrl)) {
return false;
}
return includeDomainPredicate.test(linkUrl.domain.toString());
}
private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
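The deduplication above hashes each (link URL, keyword) pair down to a single bit in a fixed-size bit map and treats an already-set bit as "seen before"; a false positive merely drops one keyword, which is an acceptable loss at this scale. A minimal, self-contained sketch of the same idea, using java.util.BitSet in place of the project's DenseBitMap (class name and capacity here are illustrative, not taken from the codebase):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class SeenPairFilter {
    private static final HashFunction murmur3 = Hashing.murmur3_128();

    private final BitSet bits;
    private final long capacity;

    SeenPairFilter(int capacityBits) {
        // More bits means fewer false positives; DenseBitMap scales this up to ~16 bn bits.
        this.bits = new BitSet(capacityBits);
        this.capacity = capacityBits;
    }

    // Returns true the first time a (url, keyword) pair is seen; may rarely return false for an unseen pair.
    boolean markNew(String url, String keyword) {
        long hash = murmur3.hashString(url, StandardCharsets.UTF_8).padToLong()
                  ^ murmur3.hashString(keyword, StandardCharsets.UTF_8).padToLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;   // clear the sign bit so the index below is non-negative
        int idx = (int) (hash % capacity);
        if (bits.get(idx)) {
            return false;                 // probably seen before (or a false positive)
        }
        bits.set(idx);
        return true;
    }
}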

View File

@ -14,7 +14,7 @@ public interface Interpreter {
void loadRssFeed(EdgeUrl[] rssFeed);
void loadDomainLink(DomainLink[] links);
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

View File

@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {
@Override
public void apply(Interpreter interpreter) {
interpreter.loadProcessedDomain(domain, state, quality);
interpreter.loadProcessedDomain(domain, state, ip);
}
@Override

View File

@ -76,9 +76,9 @@ public class Loader implements Interpreter {
}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
sqlLoadProcessedDomain.load(data, domain, state, quality);
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
sqlLoadProcessedDomain.load(data, domain, state, ip);
}
@Override

View File

@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
SELECT SOURCE.ID,DEST.ID
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
END
""");
}
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
}
}
}
catch (SQLException sql) {
sql.printStackTrace();
catch (SQLException ex) {
logger.warn("SQL error inserting domain links", ex);
}
}

View File

@ -25,15 +25,9 @@ public class SqlLoadDomains {
stmt.execute("""
CREATE PROCEDURE INSERT_DOMAIN (
IN DOMAIN_NAME VARCHAR(255),
IN SUB_DOMAIN VARCHAR(255),
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
BEGIN
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
FROM EC_TOP_DOMAIN
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
END
""");
}
@ -46,10 +40,9 @@ public class SqlLoadDomains {
public void load(LoaderData data, EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain);
insertCall.setString(3, domain.domain);
insertCall.setString(2, domain.domain);
insertCall.addBatch();
var ret = insertCall.executeUpdate();
@ -57,12 +50,11 @@ public class SqlLoadDomains {
logger.warn("load({}) -- bad row count {}", domain, ret);
}
connection.commit();
findIdForTargetDomain(connection, data);
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domain", ex);
}
@ -73,12 +65,11 @@ public class SqlLoadDomains {
try (var connection = dataSource.getConnection()) {
connection.setAutoCommit(false);
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
for (var domain : domains) {
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain);
insertCall.setString(3, domain.domain);
insertCall.setString(2, domain.domain);
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@ -95,7 +86,7 @@ public class SqlLoadDomains {
findIdForTargetDomain(connection, data);
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domains", ex);
}
}
@ -104,7 +95,7 @@ public class SqlLoadDomains {
return;
}
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
{
var targetDomain = data.getTargetDomain();
@ -118,7 +109,7 @@ public class SqlLoadDomains {
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error finding id for domain", ex);
}
}
}

View File

@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument {
IN TITLE VARCHAR(255),
IN DESCRIPTION VARCHAR(255),
IN LENGTH INT,
IN QUALITY_MEASURE DOUBLE,
IN FEATURES INT,
IN STANDARD VARCHAR(32),
IN QUALITY DOUBLE,
IN HASH INT)
BEGIN
SET FOREIGN_KEY_CHECKS=0;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
SET FOREIGN_KEY_CHECKS=1;
END
""");
@ -47,7 +47,8 @@ public class SqlLoadProcessedDocument {
IN URL_ID INT,
IN STATE VARCHAR(32))
BEGIN
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID;
END
""");
@ -61,6 +62,7 @@ public class SqlLoadProcessedDocument {
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
conn.setAutoCommit(false);
for (var doc : documents) {
int urlId = data.getUrlId(doc.url());
@ -74,9 +76,9 @@ public class SqlLoadProcessedDocument {
stmt.setString(3, doc.title());
stmt.setString(4, doc.description());
stmt.setInt(5, doc.length());
stmt.setDouble(6, doc.quality());
stmt.setInt(7, doc.htmlFeatures());
stmt.setString(8, doc.standard().name());
stmt.setInt(6, doc.htmlFeatures());
stmt.setString(7, doc.standard().name());
stmt.setDouble(8, doc.quality());
stmt.setInt(9, (int) doc.hash());
stmt.addBatch();
}
@ -89,11 +91,9 @@ public class SqlLoadProcessedDocument {
}
conn.commit();
} catch (SQLException e) {
e.printStackTrace();
} catch (SQLException ex) {
logger.warn("SQL error inserting document", ex);
}
}
public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {
@ -117,8 +117,8 @@ public class SqlLoadProcessedDocument {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
}
}
} catch (SQLException e) {
e.printStackTrace();
} catch (SQLException ex) {
logger.warn("SQL error inserting failed document", ex);
}
}

View File

@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
stmt.execute("""
CREATE PROCEDURE INITIALIZE_DOMAIN (
IN ST INT,
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
IN IDX INT,
IN QUAL DOUBLE,
IN DID INT)
IN DID INT,
IN IP VARCHAR(32))
BEGIN
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
END
""");
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
}
}
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
data.setTargetDomain(domain);
loadDomains.load(data, domain);
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
{
initCall.setInt(1, state.code);
initCall.setString(1, state.name());
initCall.setInt(2, 1 + data.sizeHint / 100);
initCall.setDouble(3, quality);
initCall.setInt(4, data.getDomainId(domain));
initCall.setInt(3, data.getDomainId(domain));
initCall.setString(4, ip);
int rc = initCall.executeUpdate();
if (rc < 1) {
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
}
conn.commit();
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error initializing domain", ex);
}
}
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
UPDATE EC_DOMAIN TARGET
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
WHERE TARGET.URL_PART=?
WHERE TARGET.DOMAIN_NAME=?
""")) {
stmt.setString(1, link.to().toString());
stmt.setString(2, link.from().toString());
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting domain alias", ex);
}
}
}

View File

@ -1,11 +1,14 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.sql.Types;
@ -25,12 +28,14 @@ public class SqlLoadUrls {
stmt.execute("""
CREATE PROCEDURE INSERT_URL (
IN PROTO VARCHAR(255),
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
IN URL VARCHAR(255)
IN PATH VARCHAR(255),
IN PARAM VARCHAR(255),
IN PATH_HASH BIGINT
)
BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END
""");
}
@ -42,12 +47,16 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
for (var url : urls) {
if (url.path.length() >= 255) {
logger.warn("Skipping bad URL {}", url);
continue;
}
insertCall.setString(1, url.proto);
insertCall.setString(2, url.domain.toString());
@ -58,10 +67,12 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
insertCall.setString(5, url.param);
insertCall.setLong(6, hashPath(url.path, url.param));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
for (int rv = 0; rv < urls.length; rv++) {
for (int rv = 0; rv < ret.length; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
}
@ -80,13 +91,26 @@ public class SqlLoadUrls {
int urlId = rsp.getInt(1);
String proto = rsp.getString(2);
String path = rsp.getString(3);
String param = rsp.getString(4);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
}
}
catch (SQLException ex) {
ex.printStackTrace();
logger.warn("SQL error inserting URLs", ex);
}
}
private static final HashFunction murmur3_128 = Hashing.murmur3_128();
private long hashPath(String path, String queryParam) {
long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();
if (queryParam == null) {
return pathHash;
}
else {
return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();
}
}
}
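The PATH_HASH computed above gives each (path, sanitized query string) pair a stable 64-bit fingerprint, so the same path with different permitted parameters maps to distinct EC_URL rows. hashPath is private to SqlLoadUrls; the calls below are only a hypothetical illustration of its behaviour:

long withParam    = hashPath("/viewtopic.php", "t=42"); // murmur3(path) + murmur3(param)
long withoutParam = hashPath("/viewtopic.php", null);   // murmur3(path) only
// The two values differ, so both URL variants get their own PATH_HASH.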

View File

@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception {
}
public enum DisqualificationReason {
LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY
LENGTH,
CONTENT_TYPE,
LANGUAGE,
STATUS,
QUALITY,
ACCEPTABLE_ADS
}
}

View File

@ -0,0 +1,22 @@
package nu.marginalia.wmsa.edge.converting.processor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import org.jsoup.nodes.Document;
public class AcceptableAds {
/* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers.
 *
 * In practice, the only sites in the crawled corpus that seem to follow this standard
 * are domain squatters and other nuisance sites.
 */
public static boolean hasAcceptableAdsTag(Document parsedDocument) {
return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey");
}
public static boolean hasAcceptableAdsHeader(CrawledDocument document) {
return document.headers.contains("X-Adblock-Key");
}
}

View File

@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.hash.HashCode;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@ -81,6 +77,10 @@ public class DocumentProcessor {
if (ret.state == EdgeUrlState.OK) {
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
if (isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);
@ -101,7 +101,7 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
logger.info("Disqualified {}: {}", ret.url, ex.reason);
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
@ -113,7 +113,19 @@ public class DocumentProcessor {
}
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase());
if (crawledDocument.contentType == null) {
return false;
}
var ct = crawledDocument.contentType;
if (acceptedContentTypes.contains(ct))
return true;
if (ct.contains(";")) {
return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';')));
}
return false;
}
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
@ -128,6 +140,11 @@ public class DocumentProcessor {
throws DisqualifiedException, URISyntaxException {
var doc = Jsoup.parse(crawledDocument.documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
var dld = sentenceExtractor.extractSentences(doc.clone());
checkDocumentLanguage(dld);
@ -158,7 +175,6 @@ public class DocumentProcessor {
var edgeDomain = url.domain;
tagWords.add("format:"+ret.standard.toString().toLowerCase());
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
@ -167,19 +183,12 @@ public class DocumentProcessor {
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
if (ret.features.contains(HtmlFeature.MEDIA)) {
tagWords.add("special:media");
}
if (ret.features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (ret.features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
if (domain.ip != null) {
tagWords.add("ip:" + domain.ip.toLowerCase()); // lower case because IPv6 is hexadecimal
}
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
}
@ -196,7 +205,9 @@ public class DocumentProcessor {
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var frame : doc.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var link : doc.select("link[rel=alternate]")) {
feedExtractor
.getFeedFromAlternateTag(baseUrl, link)

View File

@ -1,21 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class DomainProcessor {
private final DocumentProcessor documentProcessor;
private final Double minAvgDocumentQuality;
@Inject
public DomainProcessor(DocumentProcessor documentProcessor) {
public DomainProcessor(DocumentProcessor documentProcessor,
@Named("min-avg-document-quality") Double minAvgDocumentQuality
) {
this.documentProcessor = documentProcessor;
this.minAvgDocumentQuality = minAvgDocumentQuality;
}
public ProcessedDomain process(CrawledDomain crawledDomain) {
@ -37,17 +45,37 @@ public class DomainProcessor {
ret.documents.add(processedDoc);
}
}
}
else {
ret.documents = Collections.emptyList();
}
double averageQuality = getAverageQuality(ret.documents);
if (averageQuality < minAvgDocumentQuality) {
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
}
ret.state = getState(crawledDomain.crawlerStatus);
return ret;
}
private double getAverageQuality(List<ProcessedDocument> documents) {
int n = 0;
double q = 0.;
for (var doc : documents) {
if (doc.quality().isPresent()) {
n++;
q += doc.quality().getAsDouble();
}
}
if (n > 0) {
return q / n;
}
return -5.;
}
private EdgeDomainIndexingState getState(String crawlerStatus) {
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
case OK -> EdgeDomainIndexingState.ACTIVE;

View File

@ -15,7 +15,7 @@ public class InstructionsCompiler {
public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4);
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
if (domain.documents != null) {
compileUrls(ret, domain.documents);
@ -42,16 +42,17 @@ public class InstructionsCompiler {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
documents.stream().map(doc -> doc.url).forEach(seenUrls::add);
for (var doc : documents) {
if (doc.details == null) continue;
seenUrls.add(doc.url);
if (doc.details != null) {
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}
}
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));

View File

@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import crawlercommons.utils.Strings;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
@ -35,7 +35,7 @@ public class DocumentValuator {
throw new DisqualifiedException(LENGTH);
}
return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty
- smutCoefficient;
@ -52,17 +52,13 @@ public class DocumentValuator {
double scriptPenalty = 0;
for (var tag : scriptTags) {
String srcTag = tag.attr("src");
if (Strings.isBlank(srcTag)) {
scriptPenalty += 1;
}
else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
String srcAttr = tag.attr("src");
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
scriptPenalty += 0.49;
}
else {
else if (!Strings.isBlank(srcAttr)) {
scriptPenalty += 1;
}
}
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
}
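To make the quality expression above concrete, here is a rough worked example with made-up numbers (scale, offset and the penalties depend on the detected HTML standard and the page contents, so treat these values as placeholders):

// A 50 kB page with 5 kB of visible text, scale = 1.0, offset = 0.0,
// scriptPenalty = 2 (a couple of external scripts), smutCoefficient = 0:
double quality = Math.log(5_000 / (double) (1 + 50_000)) * 1.0 + 0.0 - 2.0 - 0.0;
// quality ≈ -4.3: pages that are mostly markup and scripts score well below pages that are mostly text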

View File

@ -3,26 +3,32 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection;
public enum HtmlFeature {
MEDIA(0),
JS(1),
AFFILIATE_LINK(2),
TRACKING(3),
COOKIES(4)
MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
COOKIES("special:cookies")
;
public final int bit;
private final String keyword;
HtmlFeature(int bit) {
this.bit = bit;
HtmlFeature(String keyword) {
this.keyword = keyword;
}
public String getKeyword() {
return keyword;
}
public static int encode(Collection<HtmlFeature> featuresAll) {
return featuresAll.stream().mapToInt(f -> 1 << f.bit).reduce(0, (l, r) -> (l|r));
int ret = 0;
for (var feature : featuresAll) {
ret |= (1 << (feature.ordinal()));
}
return ret;
}
public static boolean hasFeature(int value, HtmlFeature feature) {
return (value & (1<< feature.bit)) != 0;
}
public static int addFeature(int value, HtmlFeature feature) {
return (value | (1<< feature.bit));
return (value & (1<< feature.ordinal())) != 0;
}
}
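With the keyword-per-feature change, the bit packing now derives from the enum ordinal rather than an explicit bit field. A short usage sketch of encode and hasFeature with hypothetical inputs:

int flags = HtmlFeature.encode(java.util.EnumSet.of(HtmlFeature.JS, HtmlFeature.COOKIES));

boolean usesJs   = HtmlFeature.hasFeature(flags, HtmlFeature.JS);       // true
boolean tracking = HtmlFeature.hasFeature(flags, HtmlFeature.TRACKING); // false

// A packed int like this is the sort of value stored in the FEATURES column of EC_PAGE_DATA.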

View File

@ -102,26 +102,34 @@ public class LinkParser {
return url;
}
private static final Pattern paramRegex = Pattern.compile("\\?.*$");
private static final Pattern spaceRegex = Pattern.compile(" ");
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
s = paramRegex.matcher(s).replaceAll("");
// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
return s;
}
// url looks like /my-page
if (s.startsWith("/")) {
return baseUrl.withPath(s).toString();
String[] parts = s.split("\\?", 2);
String path = parts[0];
String param;
if (parts.length > 1) {
param = QueryParams.queryParamsSanitizer(parts[0], parts[1]);
}
else {
param = null;
}
final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
// url looks like /my-page
if (path.startsWith("/")) {
return baseUrl.withPathAndParam(path, param).toString();
}
return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");
return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
}
// for a relative url that looks like /foo or /foo/bar; return / or /foo
@ -145,13 +153,8 @@ public class LinkParser {
}
private boolean isRelRelevant(String rel) {
if (null == rel) {
return true;
}
return switch (rel) {
case "noindex" -> false;
default -> true;
};
// this is null safe
return !"noindex".equalsIgnoreCase(rel);
}
private boolean isUrlRelevant(String href) {
@ -188,4 +191,5 @@ public class LinkParser {
return documentUrl;
}
}

View File

@ -72,7 +72,7 @@ public class LinkProcessor {
return false;
}
if (urlBlocklist.isForumLink(link)) {
if (urlBlocklist.isMailingListLink(link)) {
return false;
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class QueryParams {
private static final Pattern paramSplitterPattern = Pattern.compile("&");
@Nullable
public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
if (queryParams == null) {
return null;
}
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
.filter(param -> QueryParams.isPermittedParam(path, param))
.sorted()
.collect(Collectors.joining("&"));
if (ret.isBlank())
return null;
return ret;
}
public static boolean isPermittedParam(String path, String param) {
if (path.endsWith("index.php")) {
if (param.startsWith("showtopic"))
return true;
if (param.startsWith("showforum"))
return true;
}
if (path.endsWith("viewtopic.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("viewforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("showthread.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
return false;
}
}
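A couple of hypothetical calls to show the sanitizer's behaviour: parameters not on the per-path whitelist are dropped, survivors are sorted and re-joined with '&', and an empty result becomes null:

String kept = QueryParams.queryParamsSanitizer("/forum/viewtopic.php", "sid=abc123&t=42");
// kept == "t=42"   -- the session id is dropped, the topic id is kept

String none = QueryParams.queryParamsSanitizer("/about.html", "utm_source=newsletter");
// none == null     -- no parameters are permitted for this path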

View File

@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {
private static final String domainsSql =
"""
SELECT ID, LOWER(EC_DOMAIN.URL_PART)
SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
FROM EC_DOMAIN
WHERE QUALITY_RAW>-100
AND INDEXED>0
AND STATE<2
WHERE INDEXED>0
AND STATE='ACTIVE' OR STATE='EXHAUSTED'
ORDER BY
INDEX_DATE ASC,
DISCOVER_DATE ASC,
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {
private static final String urlsSql =
"""
SELECT CONCAT(PROTO, "://", ?, URL)
FROM EC_URL
SELECT URL
FROM EC_URL_VIEW
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,

View File

@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@ -30,19 +31,19 @@ public class CrawlJobExtractorPageRankMain {
"""
SELECT ID
FROM EC_DOMAIN
WHERE URL_PART=?
WHERE DOMAIN_NAME=?
""";
private static final String specificDomainSqlFromId =
"""
SELECT LOWER(URL_PART)
SELECT LOWER(DOMAIN_NAME)
FROM EC_DOMAIN
WHERE ID=?
""";
private static final String urlsSql =
"""
SELECT CONCAT(PROTO, "://", ?, URL)
FROM EC_URL
SELECT URL
FROM EC_URL_VIEW
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
Gson gson = new GsonBuilder().create();
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
rpr.setMaxKnownUrls(750);
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
@ -102,7 +105,7 @@ public class CrawlJobExtractorPageRankMain {
try (var domainQuery = conn.prepareStatement(specificDomainSqlFromId);
var urlQuery = conn.prepareStatement(urlsSql))
{
domainQuery.setInt(1, domainId.getId());
domainQuery.setInt(1, domainId.id());
ResultSet rsp = domainQuery.executeQuery();
domainName = rsp.next() ? rsp.getString(1) : "";
@ -110,10 +113,10 @@ public class CrawlJobExtractorPageRankMain {
spec.id = createId(new EdgeDomain(domainName));
spec.urls = new ArrayList<>(1000);
spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.getId()));
spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.id()));
urlQuery.setString(1, domainName.toString());
urlQuery.setInt(2, domainId.getId());
urlQuery.setInt(2, domainId.id());
urlQuery.setFetchSize(1000);
rsp = urlQuery.executeQuery();

View File

@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class UrlBlocklist {
private final List<Predicate<String>> patterns = new ArrayList<>();
// domains that have a lot of links but that we know we don't want to crawl
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
"instagram.com", "youtube.com",
"youtu.be", "amzn.to");
public UrlBlocklist() {
patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate());
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
// Don't deep-crawl git repos
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
// long hexadecimal strings in URLs are typically git hashes or the like, rarely worth crawling
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
// link farms &c
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
@ -22,34 +33,33 @@ public class UrlBlocklist {
public boolean isUrlBlocked(EdgeUrl url) {
try {
if (badDomains.contains(url.domain.domain)) {
return true;
}
if ("github.com".equals(url.domain.domain)) {
return url.path.chars().filter(c -> c == '/').count() > 2;
}
return patterns.stream().anyMatch(p -> p.test(url.path));
for (var p : patterns) {
if (p.test(url.path))
return true;
}
}
catch (StackOverflowError ex) {
return true;
}
}
public boolean isForumLink(EdgeUrl linkUrl) {
var path = linkUrl.path;
if (path.startsWith("/forum")) {
return true;
}
if (path.startsWith("/lists/")) {
return true;
}
if (path.startsWith("mailinglist")) {
return true;
}
if (path.contains("phpbb")) {
return true;
}
return false;
}
public boolean isMailingListLink(EdgeUrl linkUrl) {
var path = linkUrl.path;
if (path.startsWith("/lists/")) {
return true;
}
if (path.contains("mailinglist")) {
return true;
}
return false;
}
}
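A hypothetical illustration of the split between the two checks (EdgeUrl's string constructor can throw a checked exception, which is elided here for brevity):

UrlBlocklist blocklist = new UrlBlocklist();

// github.com URLs nested deeper than /user/repo are treated as blocked:
blocklist.isUrlBlocked(new EdgeUrl("https://github.com/user/repo/blob/master/README.md"));  // true

// Mailing-list archives are filtered separately by the crawler and link processor:
blocklist.isMailingListLink(new EdgeUrl("https://example.com/lists/announce/2022-July/"));  // true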

View File

@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/");
var root = fst.domain.toRootUrl();
if (known.add(root))
queue.addFirst(root);
}
@ -110,7 +110,7 @@ public class CrawlerRetreiver {
.build());
}
var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/"));
var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
if (!fetchResult.ok()) {
logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult));
@ -121,6 +121,8 @@ public class CrawlerRetreiver {
private CrawledDomain crawlDomain() {
String ip = findIp(domain);
assert !queue.isEmpty();
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
long crawlDelay = robotsRules.getCrawlDelay();
@ -209,7 +211,7 @@ public class CrawlerRetreiver {
linkParser.parseLink(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@ -217,7 +219,7 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@ -225,14 +227,14 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
.filter(u -> !urlBlocklist.isForumLink(u))
.filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
}
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
baseUrl = baseUrl.withPath("/");
baseUrl = baseUrl.domain.toRootUrl();
for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(baseUrl, link);

View File

@ -109,7 +109,7 @@ public class HttpFetcher {
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
.url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString())
.url(url.domain.toRootUrl().toString())
.build();
var call = client.newCall(head);
@ -293,7 +293,7 @@ public class HttpFetcher {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt");
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url)));
}
catch (Exception ex) {

View File

@ -13,44 +13,14 @@ import java.util.Optional;
@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
boolean isBlacklisted(EdgeDomain domain);
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
int getPagesKnown(EdgeId<EdgeDomain> domainId);
int getPagesVisited(EdgeId<EdgeDomain> domainId);
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
double getDomainQuality(EdgeId<EdgeDomain> domainId);
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
double getRank(EdgeId<EdgeDomain> domainId);
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
}

View File

@ -17,13 +17,8 @@ import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Types;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@ -33,7 +28,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private static final String DEFAULT_PROTOCOL = "http";
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
@Inject
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
@ -48,30 +42,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
domainIdCache.invalidateAll();
}
@SneakyThrows
@Override
public boolean isBlacklisted(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
stmt.setString(1, domain.domain);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return true;
} else {
return false;
}
}
}
}
@SneakyThrows
@Override
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
@ -86,103 +63,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
@Override
@SneakyThrows
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
try (var connection = dataSource.getConnection()) {
return urlIdCache.get(url, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
stmt.setString(1, url.path);
stmt.setString(2, url.domain.toString());
stmt.setString(3, url.proto);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
private <T> String idList(List<EdgeId<T>> ids) {
StringJoiner j = new StringJoiner(",", "(", ")");
for (var id : ids) {
j.add(Integer.toString(id.id()));
}
return j.toString();
}
// Lenient mode for http->https upgrades etc
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
stmt.setString(1, url.path);
stmt.setString(2, url.domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
}
}
throw new NoSuchElementException(url.toString());
});
}
catch (UncheckedExecutionException ex) {
throw ex.getCause();
}
}
@SneakyThrows
@Override
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
if (urlIds.isEmpty())
return results;
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
.stream()
.map(EdgeId::getId)
.map(Object::toString)
.collect(Collectors.joining(",", "(", ")"))))
{
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return results;
}
static final Pattern badChars = Pattern.compile("[';\\\\]");
private String saneString(String s) {
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
}
@SneakyThrows
@Override
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
if (rsp.next()) {
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows
@Override
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
if (rsp.next()) {
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows
@Override
@ -193,16 +80,39 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
try (var connection = dataSource.getConnection()) {
// This is SQL-injection safe; the IDs are of type int
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
String idString = idList(ids);
try (var stmt = connection.prepareStatement(
"""
SELECT ID, URL,
TITLE, DESCRIPTION,
QUALITY,
WORDS_TOTAL, FORMAT, FEATURES,
IP, DOMAIN_STATE,
DATA_HASH
FROM EC_URL_VIEW WHERE ID IN
""" + idString)) {
stmt.setFetchSize(ids.size());
var rsp = stmt.executeQuery();
while (rsp.next()) {
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
EdgeUrl url = new EdgeUrl(rsp.getString(2));
var val = new EdgeUrlDetails(rsp.getInt(1), url,
rsp.getString(3), // title
rsp.getString(4), // description
rsp.getDouble(5), // quality
rsp.getInt(6), // wordsTotal
rsp.getString(7), // format
rsp.getInt(8), // features
rsp.getString(9), // ip
EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState
rsp.getInt(11), // dataHash
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
Integer.MAX_VALUE, // rankingId
Double.MAX_VALUE, // termScore
0 // queryLength
);
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val);
}
@ -214,82 +124,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return result;
}
@Override
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
try (var stmt = connection.prepareStatement(q2)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
try (var stmt = connection.prepareStatement(q3)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return new ArrayList<>(domains);
}
@Override
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = """
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
FROM EC_DOMAIN_NEIGHBORS
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
@ -308,7 +149,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(1, domainId.id());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -316,16 +157,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
if (domains.size() < count/2) {
final String q2 = """
SELECT EC_DOMAIN.ID, URL_PART
SELECT EC_DOMAIN.ID, DOMAIN_NAME
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
@ -339,7 +178,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var stmt = connection.prepareStatement(q2)) {
stmt.setFetchSize(count/2);
stmt.setInt(1, domainId.getId());
stmt.setInt(1, domainId.id());
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
while (rsp.next() && domains.size() < count/2) {
@ -347,9 +186,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -357,7 +194,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
if (domains.size() < count/2) {
final String q3 = """
SELECT EC_DOMAIN.ID, URL_PART
SELECT EC_DOMAIN.ID, DOMAIN_NAME
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
@ -372,7 +209,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
LIMIT ?""";
try (var stmt = connection.prepareStatement(q3)) {
stmt.setFetchSize(count/2);
stmt.setInt(1, domainId.getId());
stmt.setInt(1, domainId.id());
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
@ -381,9 +218,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -399,7 +234,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@Override
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
final String q = """
SELECT DOMAIN_ID, DOMAIN_NAME
FROM EC_RANDOM_DOMAINS
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
WHERE STATE<2
AND DOMAIN_ALIAS IS NULL
ORDER BY RAND()
LIMIT ?
""";
List<BrowseResult> domains = new ArrayList<>(count);
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement(q)) {
@ -410,9 +253,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
}
}
}
@ -428,8 +269,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.getId());
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeDomain(rsp.getString(1));
@ -439,330 +280,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt =
connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
stmt.setFetchSize(limit);
stmt.setInt(1, id.getId());
stmt.setInt(2, limit);
var rsp = stmt.executeQuery();
while (rsp.next()) {
ret.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return ret;
}
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt =
connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
stmt.setFetchSize(limit);
stmt.setInt(1, id.getId());
stmt.setInt(2, limit);
var rsp = stmt.executeQuery();
while (rsp.next()) {
ret.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return ret;
}
@Override
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "https://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "http://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "https://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "http://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
} catch (SQLException throwables) {
logger.info("Could not resolve domain id for {}", name);
}
return Optional.empty();
}
@SneakyThrows
@Override
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return -5;
}
}
@Override
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return EdgeDomainIndexingState.ERROR;
}
@Override
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
}
return results;
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return Collections.emptyList();
}
@Override
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
stmt.setFetchSize(500);
stmt.setInt(1, domainId.getId());
var rs = stmt.executeQuery();
while (rs.next()) {
edgeUrlByPath.remove(rs.getString(1));
}
}
}
catch (Exception ex) {
return Collections.emptyList();
}
return new ArrayList<>(edgeUrlByPath.values());
}
@Override
public double getRank(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return 1;
}
@Override
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) {
stmt.setInt(1, state.code);
if (null == alias) {
stmt.setNull(2, Types.INTEGER);
}
else {
stmt.setInt(2, getDomainId(alias).getId());
}
stmt.setInt(3, minIndexed);
stmt.setInt(4, getDomainId(domain).getId());
stmt.executeUpdate();
connection.commit();
}
catch (SQLException throwables) {
logger.error("SQL error", throwables);
}
}
@SneakyThrows
private double getDomainQuality(Connection connection, EdgeDomain src) {
try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, src.toString());
var res = stmt.executeQuery();
if (res.next()) {
var q = res.getDouble(1);
if (q > 0.5) {
logger.warn("gDQ({}) -> 1", src);
}
return 0;
}
}
catch (SQLException ex) {
logger.error("DB error", ex);
}
return -5;
}
}

View File

@ -9,7 +9,7 @@ import nu.marginalia.wmsa.edge.model.EdgeId;
public interface EdgeDomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
return isBlacklisted(domainId.getId());
return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();

View File

@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
final TIntHashSet result = new TIntHashSet(1_000_000);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
stmt.setFetchSize(1000);
var rsp = stmt.executeQuery();
while (rsp.next()) {

View File

@ -1,13 +1,11 @@
package nu.marginalia.wmsa.edge.index.radix;
package nu.marginalia.wmsa.edge.index;
import nu.marginalia.wmsa.edge.index.EdgeIndexControl;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter;
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.service.query.Query;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -31,7 +29,7 @@ public class EdgeIndexBucket {
@NotNull
private final IndexServicesFactory servicesFactory;
private final EdgeIndexControl indexControl;
private final SearchIndexWriter writer;
private final SearchIndexJournalWriter writer;
private final int id;

View File

@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import java.io.IOException;
public class EdgeIndexControl {
@ -21,13 +23,16 @@ public class EdgeIndexControl {
for (IndexBlock block : IndexBlock.values()) {
try {
servicesFactory.getIndexConverter(id, block);
servicesFactory.convertIndex(id, block);
System.runFinalization();
System.gc();
}
catch (ConversionUnnecessaryException unnecessary) {
// swallow quietly
}
catch (IOException e) {
e.printStackTrace();
}
}
@ -35,10 +40,6 @@ public class EdgeIndexControl {
System.gc();
}
public long wordCount(int id) {
return servicesFactory.wordCount(id);
}
public void switchIndexFiles(int id) throws Exception {
servicesFactory.switchFilesJob(id).call();
}

View File

@ -11,15 +11,24 @@ import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.model.*;
import nu.marginalia.wmsa.edge.index.service.SearchIndexes;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
@ -48,8 +57,11 @@ public class EdgeIndexService extends Service {
@NotNull
private final Initialization init;
private final SearchIndexes indexes;
private final KeywordLexicon keywordLexicon;
private final Gson gson = new GsonBuilder().create();
private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
@ -66,12 +78,13 @@ public class EdgeIndexService extends Service {
@Named("service-port") Integer port,
Initialization init,
MetricsServer metricsServer,
SearchIndexes indexes
) {
SearchIndexes indexes,
IndexServicesFactory servicesFactory) {
super(ip, port, init, metricsServer);
this.init = init;
this.indexes = indexes;
this.keywordLexicon = servicesFactory.getKeywordLexicon();
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
@ -173,29 +186,22 @@ public class EdgeIndexService extends Service {
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
) {
SearchIndexWriterImpl indexWriter = indexes.getIndexWriter(idx);
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
if (!words.words.isEmpty()) {
if (words.size() < 1000) {
indexWriter.put(domainId, urlId, words.block, words.words);
} else {
chunks(words.words, 1000).forEach(chunk -> {
indexWriter.put(domainId, urlId, words.block, chunk);
});
}
}
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
indexWriter.put(header, entry);
};
}
private <T> List<List<T>> chunks(Collection<T> coll, int size) {
List<List<T>> ret = new ArrayList<>();
List<T> data = List.copyOf(coll);
for (int i = 0; i < data.size(); i+=size) {
ret.add(data.subList(i, Math.min(data.size(), i+size)));
}
return ret;
private long[] getOrInsertWordIds(List<String> words) {
return words.stream()
.filter(w -> w.getBytes().length < Byte.MAX_VALUE)
.mapToLong(keywordLexicon::getOrInsert)
.toArray();
}
private Object search(Request request, Response response) {
@ -341,7 +347,7 @@ public class EdgeIndexService extends Service {
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
.filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri))
.filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
.limit(specs.limitTotal * 3L)
.distinct()
.limit(Math.min(specs.limitByBucket
@ -350,7 +356,7 @@ public class EdgeIndexService extends Service {
for (var result : resultsForBucket) {
seenResults.add(result.url.getId());
seenResults.add(result.url.id());
}
for (var result : resultsForBucket) {
for (var searchTerm : sq.searchTermsInclude) {
@ -401,7 +407,7 @@ public class EdgeIndexService extends Service {
public boolean filterRawValue(int bucket, long value) {
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
if (domain.getId() == Integer.MAX_VALUE) {
if (domain.id() == Integer.MAX_VALUE) {
return true;
}
@ -409,11 +415,11 @@ public class EdgeIndexService extends Service {
}
long getKey(int bucket, EdgeId<EdgeDomain> id) {
return ((long)bucket) << 32 | id.getId();
return ((long)bucket) << 32 | id.id();
}
public boolean test(int bucket, EdgeSearchResultItem item) {
if (item.domain.getId() == Integer.MAX_VALUE) {
if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
@ -431,7 +437,7 @@ public class EdgeIndexService extends Service {
}
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
if (item.domain.getId() == Integer.MAX_VALUE) {
if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;

View File

@ -4,13 +4,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.service.index.*;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -32,15 +38,16 @@ public class IndexServicesFactory {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final PartitionedDataFile writerIndexFile;
private final RootDataFile writerDictionaryFile;
private final RootDataFile keywordLexiconFile;
private final PartitionedDataFile preconverterOutputFile;
private final DoublePartitionedDataFile indexReadWordsFile;
private final DoublePartitionedDataFile indexReadUrlsFile;
private final DoublePartitionedDataFile indexWriteWordsFile;
private final DoublePartitionedDataFile indexWriteUrlsFile;
private volatile static DictionaryWriter dictionaryWriter;
private volatile static KeywordLexicon keywordLexicon;
private final Long dictionaryHashMapSize;
private final SearchIndexPartitioner partitoner;
private final SearchIndexPartitioner partitioner;
@Inject
public IndexServicesFactory(
@Named("tmp-file-dir") Path tmpFileDir,
@ -48,14 +55,14 @@ public class IndexServicesFactory {
@Named("partition-root-slow-tmp") Path partitionRootSlowTmp,
@Named("partition-root-fast") Path partitionRootFast,
@Named("edge-writer-page-index-file") String writerIndexFile,
@Named("edge-writer-dictionary-file") String writerDictionaryFile,
@Named("edge-writer-dictionary-file") String keywordLexiconFile,
@Named("edge-index-read-words-file") String indexReadWordsFile,
@Named("edge-index-read-urls-file") String indexReadUrlsFile,
@Named("edge-index-write-words-file") String indexWriteWordsFile,
@Named("edge-index-write-urls-file") String indexWriteUrlsFile,
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
EdgeDomainBlacklist domainBlacklist,
SearchIndexPartitioner partitoner
SearchIndexPartitioner partitioner
) {
this.tmpFileDir = tmpFileDir;
@ -63,41 +70,46 @@ public class IndexServicesFactory {
this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, writerIndexFile);
this.writerDictionaryFile = new RootDataFile(partitionRootSlow, writerDictionaryFile);
this.keywordLexiconFile = new RootDataFile(partitionRootSlow, keywordLexiconFile);
this.indexReadWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadWordsFile);
this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
this.partitoner = partitoner;
this.partitioner = partitioner;
}
public SearchIndexWriterImpl getIndexWriter(int idx) {
return new SearchIndexWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx));
}
public DictionaryWriter getDictionaryWriter() {
if (dictionaryWriter == null) {
dictionaryWriter = new DictionaryWriter(writerDictionaryFile.get(), dictionaryHashMapSize, true);
}
return dictionaryWriter;
public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
return new SearchIndexJournalWriterImpl(getKeywordLexicon(), writerIndexFile.get(idx));
}
@SneakyThrows
public DictionaryReader getDictionaryReader() {
return new DictionaryReader(getDictionaryWriter());
public KeywordLexicon getKeywordLexicon() {
if (keywordLexicon == null) {
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
keywordLexicon = new KeywordLexicon(journal,
new DictionaryHashMap(dictionaryHashMapSize));
}
return keywordLexicon;
}
@SneakyThrows
public KeywordLexiconReadOnlyView getDictionaryReader() {
return new KeywordLexiconReadOnlyView(getKeywordLexicon());
}
public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
return new SearchIndexConverter(block, id, tmpFileDir,
public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
var converter = new SearchIndexConverter(block, id, tmpFileDir,
preconverterOutputFile.get(id),
indexWriteWordsFile.get(id, block.id),
indexWriteUrlsFile.get(id, block.id),
partitoner,
partitioner,
domainBlacklist
);
converter.convert();
}
@SneakyThrows
public SearchIndexPreconverter getIndexPreconverter() {
File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1];
@ -106,7 +118,7 @@ public class IndexServicesFactory {
}
return new SearchIndexPreconverter(writerIndexFile.get(0),
outputFiles,
partitoner,
partitioner,
domainBlacklist
);
}
@ -115,10 +127,6 @@ public class IndexServicesFactory {
return preconverterOutputFile.get(i);
}
public long wordCount(int id) {
return SearchIndexConverter.wordCount(writerIndexFile.get(0));
}
@SneakyThrows
public SearchIndexReader getIndexReader(int id) {
EnumMap<IndexBlock, SearchIndex> indexMap = new EnumMap<>(IndexBlock.class);

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.index;
package nu.marginalia.wmsa.edge.index.conversion;
public class ConversionUnnecessaryException extends Exception {
public ConversionUnnecessaryException() {

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service;
package nu.marginalia.wmsa.edge.index.conversion;
import gnu.trove.list.TIntList;
import gnu.trove.map.hash.TIntIntHashMap;

View File

@ -0,0 +1,213 @@
package nu.marginalia.wmsa.edge.index.conversion;
import nu.marginalia.util.RandomWriteFunnel;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
public class SearchIndexConverter {
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
private final Path tmpFileDir;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexBlock block;
private final int bucketId;
private final File inputFile;
private final File outputFileWords;
private final File outputFileUrls;
private final SearchIndexPartitioner partitioner;
private final EdgeDomainBlacklist blacklist;
private final static int internalSortLimit =
Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256;
public SearchIndexConverter(IndexBlock block,
int bucketId,
Path tmpFileDir,
File inputFile,
File outputFileWords,
File outputFileUrls,
SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist)
{
this.block = block;
this.bucketId = bucketId;
this.tmpFileDir = tmpFileDir;
this.inputFile = inputFile;
this.outputFileWords = outputFileWords;
this.outputFileUrls = outputFileUrls;
this.partitioner = partitioner;
this.blacklist = blacklist;
}
public void convert() throws IOException {
Files.deleteIfExists(outputFileWords.toPath());
Files.deleteIfExists(outputFileUrls.toPath());
SearchIndexJournalReader journalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) {
return;
}
logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
var lock = partitioner.getReadLock();
try {
lock.lock();
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
Files.delete(tmpUrlsFile);
}
catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
}
finally {
lock.unlock();
}
}
private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
File outputFileWords) throws IOException
{
final int topWord = (int) journalReader.fileHeader.wordCount();
WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
for (var entry : journalReader) {
if (!isRelevantEntry(entry)) {
continue;
}
final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
for (int i = 0; i < entryData.size(); i++) {
int wordId = (int) entryData.get(i);
if (wordId < 0 || wordId >= topWord) {
logger.warn("Bad wordId {}", wordId);
}
wordsTableWriter.acceptWord(wordId);
}
}
wordsTableWriter.write(outputFileWords);
return wordsTableWriter.getTable();
}
private void createUrlTable(SearchIndexJournalReader journalReader,
Path tmpUrlsFile,
WordIndexOffsetsTable wordOffsetsTable) throws IOException
{
long numberOfWordsTotal = 0;
for (var entry : journalReader) {
if (isRelevantEntry(entry))
numberOfWordsTotal += entry.wordCount();
}
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
int[] wordWriteOffset = new int[wordOffsetsTable.length()];
for (var entry : journalReader) {
if (!isRelevantEntry(entry)) continue;
var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
for (int i = 0; i < entryData.size(); i++) {
int wordId = (int) entryData.get(i);
if (wordId >= wordWriteOffset.length)
continue;
if (wordId < 0) {
logger.warn("Negative wordId {}", wordId);
}
final long urlInternal = translateUrl(entry.docId());
if (wordId > 0) {
rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
} else {
rwf.put(wordWriteOffset[wordId]++, urlInternal);
}
}
}
rwf.write(urlsTmpFileChannel);
}
urlsTmpFileChannel.force(false);
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
if (wordOffsetsTable.length() > 0) {
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
urlsTmpFileMap.force();
} else {
logger.warn("urls table empty -- nothing to sort");
}
}
try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
// Note: The return value is accumulated into accumulatorIdx!
return writer.write(accumulatorIdx, length,
slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
});
} catch (Exception e) {
logger.error("Error while writing BTree", e);
}
}
}
private long translateUrl(long url) {
int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
}
private boolean isRelevantEntry(SearchIndexJournalReader.JournalEntry entry) {
return block.equals(entry.header.block())
&& !blacklist.isBlacklisted(entry.domainId())
&& partitioner.filterUnsafe(entry.domainId(), bucketId);
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service;
package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject;
import com.google.inject.Singleton;
@ -10,7 +10,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -18,41 +18,28 @@ import org.slf4j.LoggerFactory;
@Singleton
public class SearchIndexDao {
private final HikariDataSource dataSource;
private RankingDomainFetcher rankingDomains;
private final RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public SearchIndexDao(HikariDataSource dataSource,
RankingDomainFetcher rankingDomains,
RankingSettings rankingSettings)
{
this.dataSource = dataSource;
this.rankingDomains = rankingDomains;
this.rankingSettings = rankingSettings;
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
}
@SneakyThrows
public TIntHashSet getSpamDomains() {
final TIntHashSet result = new TIntHashSet(1_000_000);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
var rsp = stmt.executeQuery();
while (rsp.next()) {
result.add(rsp.getInt(1));
}
}
}
return result;
}
@SneakyThrows
public TIntHashSet goodUrls() {
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -79,36 +66,36 @@ public class SearchIndexDao {
@SneakyThrows
public TIntList getRetroDomains() {
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getSmallWebDomains() {
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
return rpr.pageRankWithPeripheralNodes(rpr.size(), false);
return rpr.pageRankWithPeripheralNodes(rpr.size());
}
@SneakyThrows
public TIntList getAcademiaDomains() {
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getStandardDomains() {
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2);
}
@SneakyThrows
public TIntList getSpecialDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2")
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
) {
var rs = stmt.executeQuery();
while (rs.next()) {

View File

@ -1,11 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.query;
package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking;
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -124,7 +122,7 @@ public class SearchIndexPartitioner {
public Lock getReadLock() {
return rwl.readLock();
}
public boolean filterUnsafe(Lock lock, int domainId, int bucketId) {
public boolean filterUnsafe(int domainId, int bucketId) {
return partitionSet.test(domainId, bucketId);
}

View File

@ -1,10 +1,11 @@
package nu.marginalia.wmsa.edge.index.service.index;
package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -47,23 +48,16 @@ public class SearchIndexPreconverter {
}
}
final RandomAccessFile raf = new RandomAccessFile(inputFile, "r");
SearchIndexJournalReader indexJournalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
var fileLength = raf.readLong();
var wordCount = raf.readInt();
final int wordCountOriginal = wordCount;
final long wordCountOriginal = indexJournalReader.fileHeader.wordCount();
logger.info("Word Count: {}", wordCount);
logger.info("File Length: {}", fileLength);
var channel = raf.getChannel();
ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000);
logger.info("{}", indexJournalReader.fileHeader);
RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length];
for (int i = 0; i < randomAccessFiles.length; i++) {
randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw");
randomAccessFiles[i].seek(12);
randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
}
FileChannel[] fileChannels = new FileChannel[outputFiles.length];
for (int i = 0; i < fileChannels.length; i++) {
@ -74,33 +68,24 @@ public class SearchIndexPreconverter {
var lock = partitioner.getReadLock();
try {
lock.lock();
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
while (channel.position() < fileLength) {
inByteBuffer.clear();
inByteBuffer.limit(CHUNK_HEADER_SIZE);
channel.read(inByteBuffer);
inByteBuffer.flip();
long urlId = inByteBuffer.getLong();
int chunkBlock = inByteBuffer.getInt();
int count = inByteBuffer.getInt();
// inByteBuffer.clear();
inByteBuffer.limit(count * 4 + CHUNK_HEADER_SIZE);
channel.read(inByteBuffer);
inByteBuffer.position(CHUNK_HEADER_SIZE);
for (int i = 0; i < count; i++) {
wordCount = Math.max(wordCount, 1 + inByteBuffer.getInt());
for (var entry : indexJournalReader) {
if (!partitioner.isGoodUrl(entry.urlId())
|| spamDomains.contains(entry.domainId())) {
continue;
}
inByteBuffer.position(count * 4 + CHUNK_HEADER_SIZE);
int domainId = entry.domainId();
buffer.clear();
entry.copyToBuffer(buffer);
if (isUrlAllowed(urlId)) {
for (int i = 0; i < randomAccessFiles.length; i++) {
if (partitioner.filterUnsafe(lock, (int) (urlId >>> 32L), i)) {
inByteBuffer.flip();
fileChannels[i].write(inByteBuffer);
}
if (partitioner.filterUnsafe(domainId, i)) {
buffer.flip();
while (buffer.position() < buffer.limit())
fileChannels[i].write(buffer);
}
}
}
@ -109,27 +94,16 @@ public class SearchIndexPreconverter {
lock.unlock();
}
if (wordCountOriginal < wordCount) {
logger.warn("Raised word count {} => {}", wordCountOriginal, wordCount);
}
for (int i = 0; i < randomAccessFiles.length; i++) {
long pos = randomAccessFiles[i].getFilePointer();
randomAccessFiles[i].seek(0);
randomAccessFiles[i].writeLong(pos);
randomAccessFiles[i].writeInt(wordCount);
randomAccessFiles[i].writeLong(wordCountOriginal);
fileChannels[i].force(true);
fileChannels[i].close();
randomAccessFiles[i].close();
}
}
private boolean isUrlAllowed(long url) {
int urlId = (int)(url & 0xFFFF_FFFFL);
int domainId = (int)(url >>> 32);
return partitioner.isGoodUrl(urlId) && !spamDomains.contains(domainId);
}
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
public class WordIndexLengthsTable {
final long[] table;
public WordIndexLengthsTable(int size) {
this.table = new long[size];
}
public void increment(int idx) { table[idx]++; }
}

View File

@ -0,0 +1,67 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
import java.io.IOException;
public class WordIndexOffsetsTable {
final long[] table;
public final int numberOfUsedWords;
public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) {
this.table = table;
this.numberOfUsedWords = numberOfUsedWords;
}
public int length() {
return table.length;
}
public void forEachRange(OffsetTableEntryConsumer o) throws IOException {
if (table[0] > 0) {
o.accept(0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
o.accept(start, length);
}
}
}
/**
* Fold over each span in the file, left to right, accumulating the return value
*/
public long foldRanges(OffsetTableEntryFoldConsumer o) throws IOException {
long total = 0;
if (table[0] > 0) {
total = o.accept(total,0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
total += o.accept(total, start, length);
}
}
return total;
}
public long get(int i) {
return table[i];
}
public interface OffsetTableEntryConsumer {
void accept(long start, int length) throws IOException;
}
public interface OffsetTableEntryFoldConsumer {
long accept(long accumulator, long start, int length) throws IOException;
}
}
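
A minimal sketch (not part of the diff) of how such an offsets table is typically consumed: each entry is the cumulative end offset for a word, and forEachRange/foldRanges visit the non-empty spans between consecutive entries. The demo class name and the sample lengths {1, 2, 0, 3} are invented for illustration.

import java.io.IOException;
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;

// Hypothetical demo class, not part of the repository
class WordIndexOffsetsTableDemo {
    public static void main(String[] args) throws IOException {
        // Offsets built from the per-word lengths {1, 2, 0, 3}: cumulative sums {1, 3, 3, 6};
        // three of the four words have at least one posting
        var offsets = new WordIndexOffsetsTable(new long[] {1, 3, 3, 6}, 3);

        // Visits the non-empty spans: (start=0, length=1), (start=1, length=2), (start=3, length=3)
        offsets.forEachRange((start, length) ->
                System.out.printf("span at %d, %d urls%n", start, length));

        // Accumulates the per-range return values left to right; each range
        // contributes its own length here, so the fold returns 6
        long total = offsets.foldRanges((acc, start, length) -> length);
        System.out.println("total = " + total);
    }
}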

View File

@ -0,0 +1,56 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
/** Contains a stateful table of word index offsets, initially in lengths mode
* where the table contains how many postings exist for each word; then in offsets
* mode, where the lengths are converted into the necessary offsets for each block
* of document data.
*
* Caveat! This uses the same underlying array to conserve space.
*
*/
public class WordIndexTables {
private WordIndexLengthsTable lengthsTable;
private WordIndexOffsetsTable offsetsTable;
private boolean converted = false;
public WordIndexTables(int size) {
lengthsTable = new WordIndexLengthsTable(size);
}
public WordIndexLengthsTable lengths() {
if (converted) throw new IllegalStateException("Table has been converted");
return lengthsTable;
}
public WordIndexOffsetsTable offsets() {
if (!converted) throw new IllegalStateException("Table has not been converted");
return offsetsTable;
}
public void convert() {
if (converted) throw new IllegalStateException("Table has been converted");
// Go from lengths to offsets, i.e.
// BEFORE: 1, 2, 1, 3, 0, 2
// AFTER: 1, 3, 4, 7, 7, 9
long[] table = lengthsTable.table;
int numberOfUsedWords = 0;
if (table[0] != 0) numberOfUsedWords = 1;
for (int i = 1; i < table.length; i++) {
if (table[i] != 0) {
numberOfUsedWords++;
}
table[i] += table[i-1];
}
lengthsTable = null;
offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords);
converted = true;
}
}
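
As a quick illustration of the lengths-to-offsets conversion described in the comment above, this hedged sketch (the demo class name is invented) feeds the example lengths 1, 2, 1, 3, 0, 2 into the table and prints the resulting offsets 1, 3, 4, 7, 7, 9.

import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexTables;

// Hypothetical demo class, not part of the repository
class WordIndexTablesDemo {
    public static void main(String[] args) {
        var tables = new WordIndexTables(6);

        // Record the per-word posting counts: 1, 2, 1, 3, 0, 2
        int[] lengths = {1, 2, 1, 3, 0, 2};
        for (int wordId = 0; wordId < lengths.length; wordId++) {
            for (int n = 0; n < lengths[wordId]; n++) {
                tables.lengths().increment(wordId);
            }
        }

        // In-place prefix sum: the same array now holds end offsets 1, 3, 4, 7, 7, 9
        tables.convert();

        for (int wordId = 0; wordId < lengths.length; wordId++) {
            System.out.println(wordId + " -> " + tables.offsets().get(wordId));
        }
    }
}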

View File

@ -0,0 +1,75 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
public class WordsTableWriter {
private final WordIndexTables table;
private final Logger logger = LoggerFactory.getLogger(getClass());
public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
public WordsTableWriter(int length) {
table = new WordIndexTables(length);
}
public void acceptWord(int wordId) {
table.lengths().increment(wordId);
}
public WordIndexOffsetsTable getTable() {
return table.offsets();
}
public void write(File file) throws IOException {
table.convert();
logger.info("Writing table - {} max", table.offsets().numberOfUsedWords);
final int tableSize = table.offsets().numberOfUsedWords;
try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) {
mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
long offset = 1;
var writer = new BTreeWriter(mmf, wordsBTreeContext);
writer.write(offset, tableSize, this::writeBTreeDataBlock);
}
}
private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
long urlFileOffset = 0;
int idx = 0;
var offsetTable = table.offsets().table;
if (offsetTable[0] != 0) {
int length = (int) offsetTable[0];
mapSlice.put(idx++, (long)length<<32);
mapSlice.put(idx++, 0);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
for (int i = 1; i < offsetTable.length; i++) {
final int length = (int)(offsetTable[i] - offsetTable[i-1]);
if (length > 0) {
mapSlice.put(idx++, (long)length << 32 | i);
mapSlice.put(idx++, urlFileOffset);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
}
}
}

View File

@ -0,0 +1,126 @@
package nu.marginalia.wmsa.edge.index.journal;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalFileHeader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.jetbrains.annotations.NotNull;
import java.nio.ByteBuffer;
import java.util.Iterator;
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
public static final long FILE_HEADER_SIZE_LONGS = 2;
public static final long FILE_HEADER_SIZE_BYTES = 8*FILE_HEADER_SIZE_LONGS;
public final SearchIndexJournalFileHeader fileHeader;
private final MultimapFileLongSlice map;
private final long committedSize;
public SearchIndexJournalReader(MultimapFileLong map) {
fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
map.advice(NativeIO.Advice.Sequential);
this.map = map.atOffset(FILE_HEADER_SIZE_LONGS);
}
@NotNull
@Override
public Iterator<JournalEntry> iterator() {
return new JournalEntryIterator();
}
private class JournalEntryIterator implements Iterator<JournalEntry> {
private JournalEntry entry;
@Override
public boolean hasNext() {
if (entry == null) {
return committedSize > 0;
}
return entry.hasNext();
}
@Override
public JournalEntry next() {
if (entry == null) {
entry = new JournalEntry(0);
}
else {
entry = entry.next();
}
return entry;
}
}
public class JournalEntry {
private final long offset;
public final SearchIndexJournalEntryHeader header;
JournalEntry(long offset) {
final long sizeBlock = map.get(offset);
final long docId = map.get(offset + 1);
this.offset = offset;
this.header = new SearchIndexJournalEntryHeader(
(int)(sizeBlock >>> 32L),
docId,
IndexBlock.byId((int)(sizeBlock & 0xFFFF_FFFFL)));
}
public boolean hasNext() {
return nextId() < committedSize;
}
public long docId() {
return header.documentId();
}
public int domainId() {
return (int) (docId() >>> 32L);
}
public int urlId() {
return (int)(docId() & 0xFFFF_FFFFL);
}
public IndexBlock block() {
return header.block();
}
public int wordCount() { return header.entrySize(); }
public SearchIndexJournalEntry readEntry() {
long[] dest = new long[header.entrySize()];
map.read(dest, offset + HEADER_SIZE_LONGS);
return new SearchIndexJournalEntry(header.entrySize(), dest);
}
public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) {
if (dest.length >= header.entrySize()) {
map.read(dest, header.entrySize(), offset + HEADER_SIZE_LONGS);
return new SearchIndexJournalEntry(header.entrySize(), dest);
}
else {
return readEntry();
}
}
public long nextId() {
return offset + HEADER_SIZE_LONGS + header.entrySize();
}
public JournalEntry next() { return new JournalEntry(nextId()); }
public void copyToBuffer(ByteBuffer buffer) {
var dest = buffer.asLongBuffer();
dest.position(buffer.position() * 8);
dest.limit(buffer.position()*8 + header.entrySize() + HEADER_SIZE_LONGS);
map.read(dest, offset);
buffer.position(dest.limit()*8);
}
}
}
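
For orientation, a hedged sketch of how the reader is driven elsewhere in this diff (see SearchIndexConverter.convert): open the journal through MultimapFileLong, print the file header, and iterate the entries. The file path and demo class name are placeholders, not names used by the project.

import java.io.IOException;
import java.nio.file.Path;

import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;

// Hypothetical demo class, not part of the repository
class SearchIndexJournalDumpDemo {
    public static void main(String[] args) throws IOException {
        // "journal.dat" is a placeholder path, assumed to contain an already written journal
        var reader = new SearchIndexJournalReader(
                MultimapFileLong.forReading(Path.of("journal.dat")));

        System.out.println(reader.fileHeader);

        long[] buffer = new long[SearchIndexJournalEntry.MAX_LENGTH];
        for (var entry : reader) {
            var words = entry.readEntryUsingBuffer(buffer);
            System.out.printf("domain=%d url=%d block=%s words=%d%n",
                    entry.domainId(), entry.urlId(), entry.block(), words.size());
        }
    }
}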

View File

@ -0,0 +1,13 @@
package nu.marginalia.wmsa.edge.index.journal;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
public interface SearchIndexJournalWriter {
void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry);
void forceWrite();
void flushWords();
}

View File

@ -1,13 +1,11 @@
package nu.marginalia.wmsa.edge.index.service.index;
package nu.marginalia.wmsa.edge.index.journal;
import io.reactivex.rxjava3.disposables.Disposable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -17,23 +15,22 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.List;
import java.util.concurrent.TimeUnit;
public class SearchIndexWriterImpl implements SearchIndexWriter {
private final DictionaryWriter dictionaryWriter;
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private final KeywordLexicon dictionaryWriter;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Disposable writerTask;
private RandomAccessFile raf;
private FileChannel channel;
public static final int MAX_BLOCK_SIZE = 1000*32*8*4;
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
private final ByteBuffer byteBuffer;
private long pos;
@SneakyThrows
public SearchIndexWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) {
public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
this.dictionaryWriter = dictionaryWriter;
initializeIndexFile(indexFile);
@ -61,23 +58,16 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
@Override
@SneakyThrows
public synchronized void put(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block, List<String> wordsSuspect) {
int numGoodWords = 0;
for (String word : wordsSuspect) {
if (word.length() < Byte.MAX_VALUE) numGoodWords++;
}
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
byteBuffer.clear();
long url_id = ((long) domainId.getId() << 32) | urlId.getId();
byteBuffer.putLong(url_id);
byteBuffer.putInt(block.id);
byteBuffer.putInt(numGoodWords);
for (String word : wordsSuspect) {
if (word.length() < Byte.MAX_VALUE) {
byteBuffer.putInt(dictionaryWriter.get(word));
}
}
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
entryData.write(byteBuffer);
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
@ -104,11 +94,11 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
}
private void writePositionMarker() throws IOException {
var lock = channel.lock(0, 12, false);
var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
raf.writeInt(dictionaryWriter.size());
raf.writeLong(dictionaryWriter.size());
raf.seek(pos);
lock.release();
}

View File

@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.index.journal.model;
import java.nio.ByteBuffer;
import java.util.Arrays;
public class SearchIndexJournalEntry {
private final int size;
private final long[] underlyingArray;
public static final int MAX_LENGTH = 1000;
public SearchIndexJournalEntry(long[] underlyingArray) {
this.size = underlyingArray.length;
this.underlyingArray = underlyingArray;
}
public SearchIndexJournalEntry(int size, long[] underlyingArray) {
this.size = size;
this.underlyingArray = underlyingArray;
}
public void write(ByteBuffer buffer) {
for (int i = 0; i < size; i++) {
buffer.putLong(underlyingArray[i]);
}
}
public long get(int idx) {
if (idx >= size)
throw new ArrayIndexOutOfBoundsException();
return underlyingArray[idx];
}
public int size() {
return size;
}
public long[] toArray() {
if (size == underlyingArray.length)
return underlyingArray;
else
return Arrays.copyOf(underlyingArray, size);
}
public String toString() {
return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
}
}

Some files were not shown because too many files have changed in this diff.