Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)

Merge pull request 'master' (#35) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/35

Commit 8c6a8fb7aa
@@ -59,12 +59,12 @@ dependencies {
    implementation "com.sparkjava:spark-core:2.9.3"
    implementation 'com.opencsv:opencsv:5.6'

    implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
    implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'

    implementation 'org.slf4j:slf4j-api:1.7.36'

@@ -76,7 +76,6 @@ dependencies {
    implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'

    implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
    testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1'

    implementation 'org.jsoup:jsoup:1.14.3'
    implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'

@@ -86,7 +85,7 @@ dependencies {
    implementation 'com.zaxxer:HikariCP:5.0.1'

    implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
    implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
    implementation 'io.prometheus:simpleclient:0.15.0'
    implementation 'io.prometheus:simpleclient_servlet:0.15.0'
    implementation 'io.prometheus:simpleclient_httpserver:0.15.0'

@@ -123,15 +122,19 @@ dependencies {
    testImplementation 'org.projectlombok:lombok:1.18.24'
    testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'

    testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
    testImplementation 'org.testcontainers:mariadb:1.17.2'
    testImplementation "org.testcontainers:junit-jupiter:1.17.2"

    e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
    e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
    e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
    e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
    e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
    e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
    e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
    e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
    e2eTestImplementation "org.testcontainers:selenium:1.17.1"
    e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
    e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
    e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
    e2eTestImplementation "org.testcontainers:selenium:1.17.2"
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
    e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
}

@@ -12,7 +12,10 @@ import org.openqa.selenium.chrome.ChromeOptions;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.BrowserWebDriverContainer;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.NginxContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;

@@ -28,6 +31,7 @@ import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
import static org.junit.jupiter.api.Assertions.assertEquals;

@Tag("e2e")
@Testcontainers

@@ -40,8 +44,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
    @Container
    public static GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT, mariaDB);
    @Container
    public static GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
    @Container
    public static GenericContainer<?> indexContainer = forService(EDGE_INDEX, mariaDB);

    @Container

@@ -156,6 +158,16 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        return wikipediaFiles.toString();
    }

    private List<String> getTitlesFromSearchResults(String html) {
        List<String> ret = new ArrayList<>();

        for (var title : Jsoup.parse(html).select(".card.search-result > h2")) {
            ret.add(title.text());
        }

        return ret;
    }

    @Test
    public void testFrontPage() throws IOException {
        var driver = chrome.getWebDriver();

@@ -173,8 +185,9 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        driver.get("http://proxyNginx/search?query=bird&profile=corpo");
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
    }

@@ -187,20 +200,24 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info"));
    }

    @Test
    public void testSiteSearch() throws IOException {
        var driver = chrome.getWebDriver();

        driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog");
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));

        assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
    }

    @Test
    public void testBrowse() throws IOException {
        var driver = chrome.getWebDriver();

@@ -209,7 +226,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
    }
    @Test

@@ -220,7 +236,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
        System.out.println(driver.getTitle());
        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
    }
    @Test

@@ -10,7 +10,7 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.mariadb.jdbc.Driver;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.chrome.ChromeOptions;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.*;

@@ -23,16 +23,16 @@ import org.testcontainers.utility.MountableFile;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.concurrent.TimeUnit;

import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

@Tag("e2e")
@Testcontainers

@@ -80,12 +80,23 @@ public class EncyclopediaE2ETest extends E2ETestBase {
        return Path.of(System.getProperty("user.dir")).resolve("data/test");
    }

    private static Path screenshotFilename(String operation) throws IOException {
        var path = Path.of(System.getProperty("user.dir")).resolve("build/test/e2e/");
        Files.createDirectories(path);

        String name = String.format("test-encyclopedia-%s-%s.png", operation, LocalDateTime.now());
        path = path.resolve(name);

        System.out.println("Screenshot in " + path);
        return path;
    }

    @Test
    public void run() throws MalformedURLException {
    public void run() throws IOException {
        new Driver();

        try (var conn = DriverManager.getConnection(mariaDB.getJdbcUrl(), "wmsa", "wmsa");
             var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_TITLE(NAME,REF_NAME) VALUES (?,?)")) {
             var stmt = conn.prepareStatement("INSERT IGNORE INTO REF_WIKI_ARTICLE(NAME,REF_NAME) VALUES (?,?)")) {

            stmt.setString(1, "Forg");
            stmt.setString(2, "Frog");

@@ -102,24 +113,16 @@ public class EncyclopediaE2ETest extends E2ETestBase {
        var driver = chrome.getWebDriver();

        driver.get("http://proxyNginx/wiki/Frog");
        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article"));

        driver.get("http://proxyNginx/wiki/Forg");
        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("get-article-redir"));

        System.out.println(driver.getTitle());
        driver.get("http://proxyNginx/wiki-search?query=Forg");
        Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("disambig"));
        System.out.println(driver.getTitle());

        assertTrue(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Frog", Boolean.class));

        assertFalse(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Marginalia", Boolean.class));

        assertFalse(get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/wiki/has?url=Marginalia", Boolean.class));

        var resultsForMarginalia = get(encyclopediaContainer.getHost(),
                encyclopediaContainer.getMappedPort(ENCYCLOPEDIA.port),
                "/encyclopedia/Marginalia", WikiArticles.class);

@@ -70,4 +70,4 @@ dating dating
EOF

echo "*** Starting $1"
WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

@@ -0,0 +1,37 @@
package nu.marginalia.util;

import java.nio.ByteBuffer;

public class DenseBitMap {
    public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;

    public final long cardinality;
    private final ByteBuffer buffer;

    public DenseBitMap(long cardinality) {
        this.cardinality = cardinality;

        boolean misaligned = (cardinality & 7) > 0;
        this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
    }

    public boolean get(long pos) {
        return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
    }

    /** Set the bit indexed by pos, returns
     *  its previous value.
     */
    public boolean set(long pos) {
        int offset = (int) (pos >>> 3);
        int oldVal = buffer.get(offset);
        int mask = (byte) 1 << (int) (pos & 7);
        buffer.put(offset, (byte) (oldVal | mask));
        return (oldVal & mask) != 0;
    }

    public void clear(long pos) {
        int offset = (int)(pos >>> 3);
        buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
    }
}
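Illustrative usage sketch of the new DenseBitMap (not part of the commit; the capacity and the id are made up):

// One bit per possible id, allocated off-heap.
DenseBitMap visited = new DenseBitMap(1_000_000);

boolean firstVisit = !visited.set(1234);   // set() returns the bit's previous value
if (visited.get(1234)) {
    // id 1234 has been marked
}
visited.clear(1234);                       // reset the bit again
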
@@ -0,0 +1,31 @@
package nu.marginalia.util;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ListChunker {

    /** Chops data into a list of lists of max length size
     *
     * Caveat: Relies on subList and does not clone "data", so
     * changes to the original list may affect the sub-lists
     * in unspecified ways
     *
     * @see List#subList
     */
    public static <T> List<List<T>> chopList(List<T> data, int size) {
        if (data.isEmpty())
            return Collections.emptyList();
        else if (data.size() < size)
            return List.of(data);

        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);

        for (int i = 0; i < data.size(); i+=size) {
            ret.add(data.subList(i, Math.min(data.size(), i+size)));
        }

        return ret;
    }
}
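A quick illustration of chopList (a sketch, not from the commit):

// Chopping six elements into chunks of at most four.
List<Integer> data = List.of(1, 2, 3, 4, 5, 6);
List<List<Integer>> chunks = ListChunker.chopList(data, 4);
// chunks = [[1, 2, 3, 4], [5, 6]]; the sub-lists are views into 'data', not copies
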
@@ -1,6 +1,6 @@
package nu.marginalia.util;

import io.prometheus.client.Gauge;
import lombok.SneakyThrows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -18,10 +18,6 @@ import java.nio.file.Path;
 * */
public class RandomWriteFunnel implements AutoCloseable {

    private final static Gauge write_rate = Gauge.build("wmsa_rwf_write_bytes", "Bytes/s")
            .register();
    private final static Gauge transfer_rate = Gauge.build("wmsa_rwf_transfer_bytes", "Bytes/s")
            .register();
    private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
    private final DataBin[] bins;

@@ -34,7 +30,7 @@ public class RandomWriteFunnel implements AutoCloseable {
            int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
            bins = new DataBin[binCount];
            for (int i = 0; i < binCount; i++) {
                bins[i] = new DataBin(tempDir, (int) Math.min(size - binSize * i, binSize));
                bins[i] = new DataBin(tempDir, Math.min((int) (size - binSize * i), binSize));
            }
        }
        else {

@@ -42,25 +38,25 @@ public class RandomWriteFunnel implements AutoCloseable {
        }
    }

    public void put(long address, long data) throws IOException {
        bins[((int)(address / binSize))].put((int)(address%binSize), data);
    @SneakyThrows
    public void put(long address, long data) {
        int bin = (int)(address / binSize);
        int offset = (int)(address%binSize);

        bins[bin].put(offset, data);
    }

    public void write(FileChannel o) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);
        logger.debug("Writing from RWF");

        for (int i = 0; i < bins.length; i++) {
            var bin = bins[i];
        for (var bin : bins) {
            buffer.clear();
            bin.eval(buffer);

            while (buffer.hasRemaining()) {
                int wb = o.write(buffer);
                write_rate.set(wb);
                o.write(buffer);
            }
        }
        logger.debug("Done");
    }

    @Override

@@ -84,12 +80,12 @@ public class RandomWriteFunnel implements AutoCloseable {
        }

        void put(int address, long data) throws IOException {
            buffer.putInt(address);
            buffer.putLong(data);

            if (buffer.capacity() - buffer.position() < 12) {
            if (buffer.remaining() < 12) {
                flushBuffer();
            }

            buffer.putInt(address);
            buffer.putLong(data);
        }

        private void flushBuffer() throws IOException {

@@ -97,12 +93,15 @@ public class RandomWriteFunnel implements AutoCloseable {
                return;

            buffer.flip();
            while (channel.write(buffer) > 0);
            while (buffer.hasRemaining())
                channel.write(buffer);

            buffer.clear();
        }

        private void eval(ByteBuffer dest) throws IOException {
            flushBuffer();
            channel.force(false);

            channel.position(0);
            buffer.clear();

@@ -117,14 +116,17 @@ public class RandomWriteFunnel implements AutoCloseable {
                if (rb < 0) {
                    break;
                }
                else {
                    transfer_rate.set(rb);
                }
                buffer.flip();
                while (buffer.limit() - buffer.position() >= 12) {
                    int addr = buffer.getInt();
                    int addr = 8 * buffer.getInt();
                    long data = buffer.getLong();
                    dest.putLong(8*addr, data);

                    try {
                        dest.putLong(addr, data);
                    }
                    catch (IndexOutOfBoundsException ex) {
                        logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
                    }
                }
                buffer.compact();
            }

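Rough usage sketch of RandomWriteFunnel as implied by the hunks above: values are put at scattered long-addresses, buffered in per-range bins, then drained sequentially into a FileChannel. Not part of the commit; the constructor arguments are an assumption inferred from the fields visible in the diff (tempDir, size, binSize), and the numbers are made up.

Path tempDir = Files.createTempDirectory("rwf");
try (var funnel = new RandomWriteFunnel(tempDir, 1_000_000, 10_000);    // assumed signature
     var out = FileChannel.open(tempDir.resolve("index.dat"),
             StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
    funnel.put(123_456L, 42L);   // scatter: place the value 42 at long-offset 123456
    funnel.write(out);           // gather: drain every bin sequentially into the channel
}
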
@@ -4,101 +4,80 @@ import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static java.lang.Math.min;

public class BTreeReader {

    private final MultimapFileLong file;
    private final BTreeContext ctx;
    private final Logger logger = LoggerFactory.getLogger(BTreeReader.class);
    private final long mask;
    private final MultimapSearcher searcher;

    private final MultimapSearcher indexSearcher;
    private final MultimapSearcher dataSearcher;

    public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
        this.file = file;
        this.searcher = file.createSearcher();
        this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());

        this.ctx = ctx;
        this.mask = ctx.equalityMask();
    }

    public long fileSize() {
        return file.size();
    public BTreeHeader getHeader(long fileOffset) {
        return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
    }

    public BTreeHeader getHeader(long offset) {
        return new BTreeHeader(file.get(offset), file.get(offset+1), file.get(offset+2));
    }
    /**
     *
     * @return file offset of entry matching keyRaw, negative if absent
     */
    public long findEntry(BTreeHeader header, final long keyRaw) {
        final int blockSize = ctx.BLOCK_SIZE_WORDS();

    public long offsetForEntry(BTreeHeader header, final long keyRaw) {
        final long key = keyRaw & mask;
        final long key = keyRaw & ctx.equalityMask();
        final long dataAddress = header.dataOffsetLongs();

        if (header.layers() == 0) {
            return trivialSearch(header, key);
        final long searchStart;
        final long numEntries;

        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
            searchStart = dataAddress;
            numEntries = header.numEntries();
        }

        long p = searchEntireTopLayer(header, key);
        if (p < 0) return -1;

        long cumOffset = p * ctx.BLOCK_SIZE_WORDS();
        for (int i = header.layers() - 2; i >= 0; --i) {
            long offsetBase = header.indexOffsetLongs() + header.relativeLayerOffset(ctx, i);
            p = searchLayerBlock(key, offsetBase+cumOffset);
            if (p < 0)
                return -1;
            cumOffset = ctx.BLOCK_SIZE_WORDS()*(p + cumOffset);
        }

        long dataMax = header.dataOffsetLongs() + (long) header.numEntries() * ctx.entrySize();
        return searchDataBlock(key,
                header.dataOffsetLongs() + ctx.entrySize()*cumOffset,
                dataMax);
    }

    private long searchEntireTopLayer(BTreeHeader header, long key) {
        long offset = header.indexOffsetLongs();

        return searcher.binarySearchUpperBound(key, offset, offset + ctx.BLOCK_SIZE_WORDS()) - offset;
    }

    private long searchLayerBlock(long key, long blockOffset) {
        if (blockOffset < 0)
            return blockOffset;

        return searcher.binarySearchUpperBound(key, blockOffset, blockOffset + ctx.BLOCK_SIZE_WORDS()) - blockOffset;
    }

    private long searchDataBlock(long key, long blockOffset, long dataMax) {
        if (blockOffset < 0)
            return blockOffset;

        long lastOffset = Math.min(blockOffset+ctx.BLOCK_SIZE_WORDS()*(long)ctx.entrySize(), dataMax);
        int length = (int)(lastOffset - blockOffset);

        if (ctx.entrySize() == 1) {
            if (mask == ~0L) return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length);
            return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length, mask);
        }

        return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, ctx.entrySize(), length/ctx.entrySize(), mask);
    }

    private long trivialSearch(BTreeHeader header, long key) {
        long offset = header.dataOffsetLongs();

        if (ctx.entrySize() == 1) {
            if (mask == ~0L) {
                return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries());
            }
            else {
                return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries(), mask);
        else {
            long dataLayerOffset = searchIndex(header, key);
            if (dataLayerOffset < 0) {
                return dataLayerOffset;
            }

            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
        }

        return searcher.binarySearchUpperBoundNoMiss(key, offset, ctx.entrySize(), header.numEntries(), mask);
        return dataSearcher.binarySearch(key, searchStart, numEntries);
    }

    private long searchIndex(BTreeHeader header, long key) {
        final int blockSize = ctx.BLOCK_SIZE_WORDS();
        final long indexAddress = header.indexOffsetLongs();

        long layerOffset = 0;

        for (int i = header.layers() - 1; i >= 0; --i) {
            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;

            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
            if (nextLayerOffset < 0)
                return nextLayerOffset;

            layerOffset = blockSize * (nextLayerOffset + layerOffset);
        }

        return layerOffset;
    }

    private long relativePositionInIndex(long key, long start, long n) {
        return indexSearcher.binarySearchUpper(key, start, n) - start;
    }

}
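The reworked reader descends the index one block per layer: each block stores the largest (masked) key of the region it covers, so an "upper bound" search inside the block picks the child to follow. For orientation, here is a minimal self-contained sketch of that per-block search (illustrative only; MultimapSearcher's real implementation is not shown in this diff):

// Returns the index of the first slot in [start, start+n) whose value is >= key,
// or n if every slot is smaller.
static long upperBound(long[] block, int start, int n, long key) {
    int lo = 0, hi = n;
    while (lo < hi) {
        int mid = (lo + hi) >>> 1;
        if (block[start + mid] < key) lo = mid + 1;
        else hi = mid;
    }
    return lo;
}
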
@@ -2,19 +2,16 @@ package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

import java.io.IOException;

public class BTreeWriter {
    private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
    private final BTreeContext ctx;
    private final MultimapFileLong map;
    private final MultimapFileLongSlice map;

    public BTreeWriter(MultimapFileLong map, BTreeContext ctx) {
    public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
        this.map = map;
        this.ctx = ctx;
    }

@@ -26,30 +23,35 @@ public class BTreeWriter {
        long size = 0;
        for (int layer = 0; layer < numLayers; layer++) {
            size += ctx.layerSize(numWords, layer);
            size += ctx.indexLayerSize(numWords, layer);
        }
        return size;
    }

    public long write(long offset, int numEntries, WriteCallback writeIndex)
    /** Construct a BTree with numEntries entries at offset in the associated map
     *
     * @return The size of the written data
     */
    public long write(long offset, int numEntries, WriteCallback writeIndexCallback)
            throws IOException
    {
        var header = makeHeader(offset, numEntries);
        BTreeHeader header = makeHeader(offset, numEntries);

        header.write(map, offset);
        writeIndex.write(header.dataOffsetLongs());

        if (header.layers() < 1) {
        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));

        if (header.layers() < 1) { // The data is too small to benefit from indexing
            return ctx.calculateSize(numEntries);
        }
        else {
            writeIndex(header);
            return ctx.calculateSize(numEntries);
        }

        writeIndex(header);

        return ctx.calculateSize(numEntries);
    }

    public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
        final int numLayers = ctx.numLayers(numEntries);
        final int numLayers = ctx.numIndexLayers(numEntries);

        final int padding = BTreeHeader.getPadding(ctx, offset, numLayers);

@@ -65,46 +67,50 @@ public class BTreeWriter {

    private void writeIndex(BTreeHeader header) {
        var layerOffsets = getRelativeLayerOffsets(header);
        var layerOffsets = header.getRelativeLayerOffsets(ctx);

        long stride = ctx.BLOCK_SIZE_WORDS();
        long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS();

        /* Index layer 0 indexes the data itself
           Index layer 1 indexes layer 0
           Index layer 2 indexes layer 1
           And so on
         */
        for (int layer = 0; layer < header.layers(); layer++,
                stride*=ctx.BLOCK_SIZE_WORDS()) {
            long indexWord = 0;
            long offsetBase = layerOffsets[layer] + header.indexOffsetLongs();
            long numEntries = header.numEntries();
            for (long idx = 0; idx < numEntries; idx += stride, indexWord++) {
                long dataOffset = header.dataOffsetLongs() + (idx + (stride-1)) * ctx.entrySize();
                long val;
                indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) {

                if (idx + (stride-1) < numEntries) {
                    val = map.get(dataOffset) & ctx.equalityMask();
                }
                else {
                    val = Long.MAX_VALUE;
                }
                if (offsetBase + indexWord < 0) {
                    logger.error("bad put @ {}", offsetBase + indexWord);
                    logger.error("layer{}", layer);
                    logger.error("layer offsets {}", layerOffsets);
                    logger.error("offsetBase = {}", offsetBase);
                    logger.error("numEntries = {}", numEntries);
                    logger.error("indexWord = {}", indexWord);
                }
                map.put(offsetBase + indexWord, val);
            }
            for (; (indexWord % ctx.BLOCK_SIZE_WORDS()) != 0; indexWord++) {
                map.put(offsetBase + indexWord, Long.MAX_VALUE);
            }
            writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer);
        }

    }

    private long[] getRelativeLayerOffsets(BTreeHeader header) {
        long[] layerOffsets = new long[header.layers()];
        for (int i = 0; i < header.layers(); i++) {
            layerOffsets[i] = header.relativeLayerOffset(ctx, i);
    private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
                                 final long indexedDataStepSize,
                                 final int layer) {

        final long indexOffsetBase = layerOffsets[layer] + header.indexOffsetLongs();
        final long dataOffsetBase = header.dataOffsetLongs();

        final long dataEntriesMax = header.numEntries();
        final int entrySize = ctx.entrySize();

        final long lastDataEntryOffset = indexedDataStepSize - 1;

        long indexWord = 0;

        for (long dataPtr = 0;
             dataPtr + lastDataEntryOffset < dataEntriesMax;
             dataPtr += indexedDataStepSize)
        {
            long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
            map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
        }
        return layerOffsets;

        // Fill the remaining block with LONG_MAX
        map.setRange(indexOffsetBase+indexWord,
                (int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
                Long.MAX_VALUE);
    }

}

@@ -1,7 +1,9 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.multimap.MultimapFileLongSlice;

import java.io.IOException;

public interface WriteCallback {
    void write(long offset) throws IOException;
    void write(MultimapFileLongSlice slice) throws IOException;
}

@@ -10,7 +10,6 @@ public record BTreeContext(int MAX_LAYERS,

    public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
        this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
    }

    public long calculateSize(int numEntries) {

@@ -19,7 +18,7 @@ public record BTreeContext(
        return header.dataOffsetLongs() + (long)numEntries * entrySize;
    }

    public int numLayers(int numEntries) {
    public int numIndexLayers(int numEntries) {
        if (numEntries <= BLOCK_SIZE_WORDS*2) {
            return 0;
        }

@@ -36,21 +35,14 @@ public record BTreeContext(
        return MAX_LAYERS;
    }

    public long layerSize(int numEntries, int level) {
        return BLOCK_SIZE_WORDS * numBlocks(numEntries, level);
    }
    public long indexLayerSize(int numWords, int level) {
        final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
        final long numBlocks = numWords / layerSize;

    private long numBlocks(int numWords, int level) {
        long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
        int numBlocks = 0;

        numBlocks += numWords / layerSize;
        if (numWords % layerSize != 0) {
            numBlocks++;
        return BLOCK_SIZE_WORDS * (numBlocks + 1);
        }

        return numBlocks;
        return BLOCK_SIZE_WORDS * numBlocks;
    }
}
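Reading the new indexLayerSize as it appears in the hunk above, a quick arithmetic sanity check (values chosen purely for illustration, not from the source):

// Assuming BLOCK_SIZE_BITS = 6, so BLOCK_SIZE_WORDS = 64, and numWords = 10_000:
// level 0: layerSize = 1 << 6  = 64,    numBlocks = 10_000 / 64   = 156,  size = 64 * 157 = 10_048 longs
// level 1: layerSize = 1 << 12 = 4096,  numBlocks = 10_000 / 4096 = 2,    size = 64 * 3   = 192 longs
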
@@ -1,6 +1,6 @@
package nu.marginalia.util.btree.model;

import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
    public BTreeHeader {

@@ -28,19 +28,27 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
        return padding;
    }

    public void write(MultimapFileLong dest, long offset) {
    public void write(MultimapFileLongSlice dest, long offset) {
        dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
        dest.put(offset+1, indexOffsetLongs);
        dest.put(offset+2, dataOffsetLongs);
    }

    public long relativeLayerOffset(BTreeContext ctx, int n) {
    public long relativeIndexLayerOffset(BTreeContext ctx, int n) {
        long offset = 0;
        for (int i = n+1; i < layers; i++) {
            offset += ctx.layerSize( numEntries, i);
            offset += ctx.indexLayerSize( numEntries, i);
        }
        return offset;
    }

    public long[] getRelativeLayerOffsets(BTreeContext ctx) {
        long[] layerOffsets = new long[layers()];
        for (int i = 0; i < layers(); i++) {
            layerOffsets[i] = relativeIndexLayerOffset(ctx, i);
        }
        return layerOffsets;
    }

}

@@ -5,7 +5,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.nio.LongBuffer;

public class DictionaryData {

@@ -17,22 +17,22 @@ public class DictionaryData {
    public DictionaryData(int bankSize) {
        DICTIONARY_BANK_SIZE = bankSize;

        banks.add(new DictionaryDataBank(0));
        banks.add(new DictionaryDataBank(0, bankSize));
    }

    public int size() {
        return banks.end();
    }

    public int add(byte[] data, int value) {
    public int add(long key) {
        var activeBank = banks.last();
        int rb = activeBank.add(data, value);
        int rb = activeBank.add(key);

        if (rb == -1) {
            int end = activeBank.getEnd();
            logger.debug("Switching bank @ {}", end);
            var newBank = new DictionaryDataBank(end);
            rb = newBank.add(data, value);
            var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
            rb = newBank.add(key);

            banks.add(newBank);
        }

@@ -41,33 +41,32 @@ public class DictionaryData {
    }

    public byte[] getBytes(int offset) {
        return banks.bankForOffset(offset).getBytes(offset);
    public long getKey(int offset) {
        return banks.bankForOffset(offset).getKey(offset);
    }
    public boolean keyEquals(int offset, byte[] data) {
        return banks.bankForOffset(offset).keyEquals(offset, data);
    public boolean keyEquals(int offset, long otherKey) {
        return banks.bankForOffset(offset).keyEquals(offset, otherKey);
    }

    public int getValue(int offset) {
        return banks.bankForOffset(offset).getValue(offset);
    }

    public class DictionaryDataBank {
    private static class DictionaryDataBank {

        private final int start_idx;
        private final ByteBuffer data;

        // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and
        // can make the GC behave poorly. Using off-heap memory seems preferred when their
        // lifetime is "forever"

        private final LongBuffer keys;

        private int size;
        private int[] offset;
        private int[] value;
        private final int capacity;

        public DictionaryDataBank(int start_idx) {

        public DictionaryDataBank(int start_idx, int sz) {
            this.start_idx = start_idx;
            this.capacity = sz;

            data = ByteBuffer.allocateDirect(DICTIONARY_BANK_SIZE);

            offset = new int[DICTIONARY_BANK_SIZE/16];
            value = new int[DICTIONARY_BANK_SIZE/16];
            keys = ByteBuffer.allocateDirect(8*capacity).asLongBuffer();
            size = 0;
        }

@@ -79,102 +78,26 @@ public class DictionaryData {
            return start_idx + size;
        }

        public byte[] getBytes(int idx) {
        public long getKey(int idx) {
            if (idx < start_idx || idx - start_idx >= size) {
                throw new IndexOutOfBoundsException(idx);
            }
            return keys.get(idx - start_idx);
        }

        public boolean keyEquals(int idx, long other) {
            if (idx < start_idx || idx - start_idx >= size) {
                throw new IndexOutOfBoundsException(idx);
            }

            idx = idx - start_idx;

            final int start;
            final int end = offset[idx];

            if (idx == 0) start = 0;
            else start = offset[idx-1];

            byte[] dst = new byte[end-start];
            data.get(start, dst);
            return dst;
            return keys.get(idx - start_idx) == other;
        }

        public int getValue(int idx) {
            if (idx < start_idx || idx - start_idx >= size) {
                throw new IndexOutOfBoundsException(idx);
            }
            return value[idx - start_idx];
        }
        public int add(long newKey) {
            if (size >= capacity)
                return -1;

        public boolean keyEquals(int idx, byte[] data) {
            if (idx < start_idx || idx - start_idx >= size) {
                throw new IndexOutOfBoundsException(idx);
            }

            idx = idx - start_idx;
            int start;
            int end = offset[idx];

            if (idx == 0) {
                start = 0;
            }
            else {
                start = offset[idx-1];
            }
            if (data.length != end - start) {
                return false;
            }
            for (int i = 0; i < data.length; i++) {
                if (this.data.get(start + i) != data[i]) {
                    return false;
                }
            }
            return true;
        }

        public long longHashCode(int idx) {
            if (idx < start_idx || idx - start_idx >= size) {
                throw new IndexOutOfBoundsException(idx);
            }

            idx = idx - start_idx;
            int start;
            int end = offset[idx];

            if (idx == 0) {
                start = 0;
            }
            else {
                start = offset[idx-1];
            }

            long result = 1;
            for (int i = start; i < end; i++)
                result = 31 * result + data.get(i);

            return result;
        }

        public int add(byte[] newData, int newValue) {
            if (size == offset.length) {
                logger.debug("Growing bank from {} to {}", offset.length, offset.length*2);
                offset = Arrays.copyOf(offset, offset.length*2);
                value = Arrays.copyOf(value, value.length*2);
            }

            if (size > 0 && offset[size-1]+newData.length >= DICTIONARY_BANK_SIZE) {
                if (offset.length > size+1) {
                    logger.debug("Shrinking bank from {} to {}", offset.length, size - 1);
                    offset = Arrays.copyOf(offset, size + 1);
                    value = Arrays.copyOf(value, size + 1);
                }
                return -1; // Full
            }

            int dataOffset = size > 0 ? offset[size-1] : 0;

            data.put(dataOffset, newData);

            offset[size] = dataOffset + newData.length;
            value[size] = newValue;
            keys.put(size, newKey);

            return start_idx + size++;
        }
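The comment retained above explains the motivation for the change: long-lived giant arrays stress the GC, so the keys now live off-heap. A minimal sketch of that pattern, not project code:

// A direct ByteBuffer viewed as a LongBuffer keeps a large, effectively immortal
// key array outside the garbage-collected heap.
LongBuffer keys = ByteBuffer.allocateDirect(8 * 1_000_000).asLongBuffer();
keys.put(0, 0xDEADBEEFL);   // store a 64-bit key at slot 0
long k = keys.get(0);       // read it back; no on-heap long[1_000_000] is ever created
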
@@ -66,8 +66,7 @@ public class DictionaryHashMap {
            logger.debug("Buffer size sanity checked passed");
        }

        dictionaryData = new DictionaryData(Math.min(1<<30, Math.max(32, (int)(sizeMemory/4))));
        dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4)));

        initializeBuffers();
    }

@@ -82,9 +81,6 @@ public class DictionaryHashMap {
        }
    }

    public int memSz() {
        return dictionaryData.size();
    }
    public int size() {
        return sz.get();
    }

@@ -101,20 +97,20 @@ public class DictionaryHashMap {
        buffers[buffer].put(bufferIdx, val);
    }

    public int put(byte[] data, int value) {
    public int put(long key) {

        long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
        long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;

        long idx = hash % hashTableSize;

        if (getCell(idx) == NO_VALUE) {
            return setValue(data, value, idx);
            return setValue(key, idx);
        }

        return putRehash(data, value, idx, hash);
        return putRehash(key, idx, hash);
    }

    private int putRehash(byte[] data, int value, long idx, long hash) {
    private int putRehash(long key, long idx, long hash) {
        final long pStride = 1 + (hash % (hashTableSize - 2));

        for (long j = 1; j < maxProbeLength; j++) {

@@ -129,9 +125,9 @@ public class DictionaryHashMap {
            if (val == NO_VALUE) {
                probe_count_metrics.set(j);

                return setValue(data, value, idx);
                return setValue(key, idx);
            }
            else if (dictionaryData.keyEquals(val, data)) {
            else if (dictionaryData.keyEquals(val, key)) {
                return val;
            }
        }

@@ -139,16 +135,16 @@ public class DictionaryHashMap {
        throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
    }

    private int setValue(byte[] data, int value, long cell) {
    private int setValue(long key, long cell) {
        sz.incrementAndGet();

        int di = dictionaryData.add(data, value);
        int di = dictionaryData.add(key);
        setCell(cell, di);
        return di;
    }

    public int get(byte[] data) {
        final long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
    public int get(long key) {
        final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
        final long cell = hash % hashTableSize;

        if (getCell(cell) == NO_VALUE) {

@@ -157,15 +153,15 @@ public class DictionaryHashMap {
        else {
            int val = getCell(cell);

            if (dictionaryData.keyEquals(val, data)) {
                return dictionaryData.getValue(val);
            if (dictionaryData.keyEquals(val, key)) {
                return val;
            }
        }

        return getRehash(data, cell, hash);
        return getRehash(key, cell, hash);
    }

    private int getRehash(byte[] data, long idx, long hash) {
    private int getRehash(long key, long idx, long hash) {
        final long pStride = 1 + (hash % (hashTableSize - 2));

        for (long j = 1; j < maxProbeLength; j++) {

@@ -180,29 +176,12 @@ public class DictionaryHashMap {
            if (val == NO_VALUE) {
                return NO_VALUE;
            }
            else if (dictionaryData.keyEquals(val, data)) {
                return dictionaryData.getValue(val);
            else if (dictionaryData.keyEquals(val, key)) {
                return val;
            }
        }

        throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
    }

    private long longHash(byte[] bytes) {
        if (bytes == null)
            return 0;

        // https://cp-algorithms.com/string/string-hashing.html
        int p = 127;
        long m = (1L<<61)-1;
        long p_power = 1;
        long hash_val = 0;

        for (byte element : bytes) {
            hash_val = (hash_val + (element+1) * p_power) % m;
            p_power = (p_power * p) % m;
        }
        return hash_val;
    }

}
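For reference, the put/get paths above use open addressing with double hashing: the starting cell and the probe stride are both derived from the key's hash. A self-contained sketch of the probe walk; illustrative only, since the exact per-iteration step is not visible in the hunks and is therefore an assumption here:

// Cell inspected on the j:th probe for a given key, assuming stride-based double hashing.
static long probe(long key, long j, long hashTableSize) {
    long hash   = key & 0x7FFF_FFFF_FFFF_FFFFL;
    long start  = hash % hashTableSize;
    long stride = 1 + (hash % (hashTableSize - 2));
    return (start + j * stride) % hashTableSize;
}
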
@@ -1,9 +1,7 @@
package nu.marginalia.util.hash;

import io.prometheus.client.Gauge;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger;

@@ -17,9 +15,7 @@ import static java.lang.Math.round;
 */
public class LongPairHashMap {
    private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
    private static final Gauge probe_count_metrics
            = Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count")
            .register();
    private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police

    private final long hashTableSize;
    private final MultimapFileLong data;

@@ -27,26 +23,37 @@ public class LongPairHashMap {
    private int sz = 0;
    private static final int HEADER_SIZE = 2;

    public LongPairHashMap(MultimapFileLong data, long size) {
    private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
        this.data = data;
        // Actually use a prime size for Donald Knuth reasons
        hashTableSize = PrimeUtil.nextPrime(size, 1);
        maxProbeLength = hashTableSize / 2;
        this.hashTableSize = hashTableSize;
        this.maxProbeLength = maxProbeLength;
    }

        logger.debug("Table size = " + hashTableSize);
    public static LongPairHashMap createNew(MultimapFileLong data, long size) {
        var tableSize = PrimeUtil.nextPrime(size, 1);
        var ret = new LongPairHashMap(data, tableSize, tableSize/2);

        data.put(0, IndexWordsTable.Strategy.HASH.ordinal());
        data.put(1, hashTableSize);
        for (int i = 2; i < hashTableSize; i++) {
        data.put(0, MAGIC_WORD);
        data.put(1, tableSize);

        for (int i = 2; i < tableSize; i++) {
            data.put(HEADER_SIZE + 2L*i, 0);
        }
    }
    public LongPairHashMap(MultimapFileLong data) {
        this.data = data;
        hashTableSize = data.get(1);
        maxProbeLength = hashTableSize / 10;

        logger.debug("Table size = " + hashTableSize);
        return ret;
    }

    public static LongPairHashMap loadExisting(MultimapFileLong data) {
        long key = data.get(0);

        if (key != MAGIC_WORD) {
            logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
        }

        var hashTableSize = data.get(1);
        var maxProbeLength = hashTableSize / 10;

        return new LongPairHashMap(data, hashTableSize, maxProbeLength);
    }

    public int size() {

@@ -91,8 +98,6 @@ public class LongPairHashMap {
            final var val = getCell(idx);

            if (!val.isSet()) {
                probe_count_metrics.set(j);

                return setValue(data, idx);
            }
            else if (val.getKey() == data.getKey()) {
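A hedged sketch of how the two new factories are meant to differ (not from the commit; 'mmf' stands in for an already-opened MultimapFileLong backing file):

LongPairHashMap fresh = LongPairHashMap.createNew(mmf, 100_000);  // writes MAGIC_WORD and the table size, zeroes the cells
LongPairHashMap again = LongPairHashMap.loadExisting(mmf);        // reads the header back; warns if MAGIC_WORD is absent
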
@@ -69,7 +69,7 @@ public class DocumentDebugger {
        Set<String> reps = new HashSet<>();

        // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
        kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
        kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));

        try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

@@ -3,7 +3,9 @@ package nu.marginalia.util.language;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;

@@ -13,21 +15,13 @@ public class WordPatterns {

    public static final String WORD_TOKEN_JOINER = "_";
    public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
    public static final Pattern wordPatternRestrictive = Pattern.compile("[#]?[@a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
    public static final Pattern keyWordPattern = Pattern.compile("[A-Z\\u00C0-\\u00D6][_a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{0,32}('[a-zA-Z])?");
    public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?");
    public static final Pattern joinWord = Pattern.compile("(as|an|the|of|in|a)");
    public static final Pattern keywordAppendixPattern = Pattern.compile("([0-9A-Z][A-Z0-9]{0,3})");
    public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
    public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");

    public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
    public static final Predicate<String> restrictivePredicate = wordPatternRestrictive.asMatchPredicate();
    public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
    public static final Predicate<String> keywordPredicate = keyWordPattern.asMatchPredicate();
    public static final Predicate<String> keywordAppendixPredicate = keywordAppendixPattern.asMatchPredicate();
    public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
    public static final Predicate<String> keywordPredicateEither = keywordPredicate.or(keywordAppendixPredicate);
    public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();

    public static final Set<String> topWords;

@@ -88,16 +82,6 @@ public class WordPatterns {
        return true;
    }

    public static boolean filterStrict(String word) {

        int numDigits = (int) word.chars().filter(Character::isDigit).count();
        if (numDigits == word.length()) {
            return false;
        }

        return true;
    }

    public static boolean isStopWord(String s) {
        if (s.length() < MIN_WORD_LENGTH) {
            return true;

@@ -39,13 +39,12 @@ public class DocumentKeywordExtractor {

    public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {

        var titleWords = extractTitleWords(documentLanguageData);

        var wordsTfIdf = tfIdfCounter.count(documentLanguageData, 0.75);
        var wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
        var wordsNamesAll = nameCounter.count(documentLanguageData, 1);
        var subjects = subjectCounter.count(documentLanguageData);
        List<WordRep> titleWords = extractTitleWords(documentLanguageData);

        List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
        List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
        List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);

        int totalSize = wordsTfIdf.size();

@@ -55,8 +54,8 @@ public class DocumentKeywordExtractor {
        List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);

        for(var v : wordsTfIdf) {
            if (topKeywords.size() < totalSize / 10) topKeywords.add(v);
            else if (midKeywords.size() < totalSize / 5) midKeywords.add(v);
            if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
            else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
            else lowKeywords.add(v);
        }

@@ -125,17 +124,18 @@ public class DocumentKeywordExtractor {
            }
        }

        return counts.entrySet().stream().filter(c2 -> c2.getValue()>=1)
                .sorted(Comparator.comparing(this::value))
        return counts.entrySet().stream()
                .sorted(Comparator.comparing(e -> {
                    double N = 11820118.; // Number of documents in term freq dictionary

                    // Caveat: This is actually the *negated* term score, because the second logarithm has
                    // its parameter inverted (log(a^b) = b log(a); here b = -1)
                    return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
                }))
                .map(Map.Entry::getKey)
                .limit(512).collect(Collectors.toSet());
                .limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
    }

    private double value(Map.Entry<String, Integer> e) {
        double N = 11820118.; // Number of documents in term freq dictionary

        return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
    }

    public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
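Restating the scoring trick noted in the comparator's comment above (a sketch for clarity, not project code):

// conventional tf-idf:      (1 + log tf) * log(N / (1 + df))
// what the comparator uses: (1 + log tf) * log((1 + df) / N)  ==  the negation of the line above,
// so sorting ascending by the comparator value lists the highest-scoring terms first.
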
@@ -1,15 +1,12 @@
package nu.marginalia.util.language.processing;

import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class KeywordCounter {
    private final KeywordExtractor keywordExtractor;

@@ -20,58 +17,29 @@ public class KeywordCounter {
        this.keywordExtractor = keywordExtractor;
    }

    public List<WordRep> count(DocumentLanguageData dld, double cutoff) {
    public List<WordRep> count(DocumentLanguageData dld) {
        HashMap<String, Double> counts = new HashMap<>(1000);
        HashMap<String, HashSet<String>> instances = new HashMap<>(1000);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

        for (int i = 0; i < dld.sentences.length; i++) {
            DocumentSentence sent = dld.sentences[i];
            double value = 1.0 / Math.log(1+i);
        for (var sent : dld.sentences) {
            var keywords = keywordExtractor.getKeywordsFromSentence(sent);
            for (var span : keywords) {
                var stemmed = sent.constructStemmedWordFromSpan(span);
                if (stemmed.isBlank())
                    continue;

                counts.merge(stemmed, value, Double::sum);
                String stemmed = sent.constructStemmedWordFromSpan(span);

                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(sent.constructWordFromSpan(span));
                counts.merge(stemmed, 1., Double::sum);
                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
            }
        }

        var topWords = counts.entrySet().stream()
                .filter(w -> w.getValue() > cutoff)
        return counts.entrySet().stream()
                .filter(e -> e.getValue() > 1)
                .sorted(Comparator.comparing(this::getTermValue))
                .limit(Math.min(100, counts.size()/2))
                .map(Map.Entry::getKey)
                .flatMap(w -> instances.get(w).stream())
                .filter(w -> w.word.length() > 1)
                .limit(150)
                .collect(Collectors.toList());

        var topWordsSet = new HashSet<>(topWords);

        final Set<WordRep> keywords = new HashSet<>();

        for (var sentence : dld.sentences) {
            for (WordSpan kw : keywordExtractor.getKeywordsFromSentence(sentence)) {
                String stemmedWord = sentence.constructStemmedWordFromSpan(kw);
                if (topWords.contains(stemmedWord)) {
                    keywords.add(new WordRep(sentence, kw));
                }
            }
        }

        for (var sentence : dld.sentences) {
            for (var kw : keywordExtractor.getKeywordsFromSentenceStrict(sentence, topWordsSet, true)) {
                keywords.add(new WordRep(sentence, kw));
            }
        }

        Map<String, Integer> sortOrder = IntStream.range(0, topWords.size()).boxed().collect(Collectors.toMap(topWords::get, i->i));

        Comparator<WordRep> comp = Comparator.comparing(wr -> sortOrder.getOrDefault(wr.stemmed, topWords.size()));

        var ret = new ArrayList<>(keywords);
        ret.sort(comp);
        return ret;
    }

    private static final Pattern separator = Pattern.compile("_");

@@ -86,7 +54,11 @@ public class KeywordCounter {
    }

    double value(String key, double value) {
        return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
        double freq = dict.getTermFreqStemmed(key);
        if (freq < 1) {
            freq = 10;
        }
        return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
    }

@ -56,7 +56,7 @@ public class LongNameCounter {
|
||||
}
|
||||
|
||||
double value(String key, double value) {
|
||||
return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
|
||||
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
|
||||
}
|
||||
|
||||
|
||||
|
@ -5,7 +5,9 @@ import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.util.language.processing.model.WordSpan;
|
||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SubjectCounter {
|
||||
@ -15,6 +17,14 @@ public class SubjectCounter {
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
}
|
||||
|
||||
// Seeks out subjects in a sentence by constructs like
|
||||
//
|
||||
// [Name] (Verbs) (the|a|Adverb|Verb) ...
|
||||
// e.g.
|
||||
//
|
||||
// Greeks bearing gifts -> Greeks
|
||||
// Steve McQueen drove fast | cars -> Steve McQueen
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
|
||||
Map<WordRep, Integer> counts = new HashMap<>();
|
||||
@ -27,9 +37,10 @@ public class SubjectCounter {
|
||||
|| sentence.separators[kw.end + 1] == WordSeparator.COMMA)
|
||||
break;
|
||||
|
||||
if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
|
||||
&& ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
|
||||
) {
|
||||
String nextTag = sentence.posTags[kw.end];
|
||||
String nextNextTag = sentence.posTags[kw.end+1];
|
||||
|
||||
if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
|
||||
counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
|
||||
}
|
||||
}
|
||||
@ -43,4 +54,16 @@ public class SubjectCounter {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private boolean isDetOrAdverbOrVerb(String posTag) {
|
||||
return "DT".equals(posTag) // determinant
|
||||
|| "RB".equals(posTag) // adverb
|
||||
|| posTag.startsWith("VB") // verb
|
||||
|| posTag.startsWith("JJ"); // adjective
|
||||
}
|
||||
|
||||
boolean isVerb(String posTag) {
|
||||
return posTag.startsWith("VB")
|
||||
&& !posTag.equals("VB"); // not interested in the infinitive
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;
|
||||
|
||||
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.lang.ref.SoftReference;
|
||||
import java.util.BitSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.StringJoiner;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class DocumentSentence {
|
||||
public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
|
||||
public final String originalSentence;
|
||||
public final String[] words;
|
||||
public final int[] separators;
|
||||
@ -85,4 +87,37 @@ public class DocumentSentence {
|
||||
public String toString() {
|
||||
return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<SentencePos> iterator() {
|
||||
return new Iterator<>() {
|
||||
int i = -1;
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i+1 < length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SentencePos next() {
|
||||
return new SentencePos(++i);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public class SentencePos {
|
||||
public final int pos;
|
||||
|
||||
public SentencePos(int pos) {
|
||||
this.pos = pos;
|
||||
}
|
||||
|
||||
public String word() { return words[pos]; }
|
||||
public String wordLowerCase() { return wordsLowerCase[pos]; }
|
||||
public String posTag() { return posTags[pos]; }
|
||||
public String stemmed() { return stemmedWords[pos]; }
|
||||
public int separator() { return separators[pos]; }
|
||||
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
|
||||
import static nu.marginalia.util.FileSizeUtil.readableSize;
|
||||
|
||||
|
||||
public class MultimapFileLong implements AutoCloseable {
|
||||
public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
private final ArrayList<LongBuffer> buffers = new ArrayList<>();
|
||||
private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();
|
||||
@ -36,9 +36,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
private long mappedSize;
|
||||
final static long WORD_SIZE = 8;
|
||||
|
||||
private boolean loadAggressively;
|
||||
|
||||
private final NativeIO.Advice advice = null;
|
||||
private NativeIO.Advice defaultAdvice = null;
|
||||
|
||||
public static MultimapFileLong forReading(Path file) throws IOException {
|
||||
long fileSize = Files.size(file);
|
||||
@ -70,12 +68,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
long mapSize,
|
||||
int bufferSize) throws IOException {
|
||||
|
||||
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize, false);
|
||||
}
|
||||
|
||||
public MultimapFileLong loadAggressively(boolean v) {
|
||||
this.loadAggressively = v;
|
||||
return this;
|
||||
this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize);
|
||||
}
|
||||
|
||||
private static String translateToRAFMode(FileChannel.MapMode mode) {
|
||||
@ -91,13 +84,11 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
public MultimapFileLong(RandomAccessFile file,
|
||||
FileChannel.MapMode mode,
|
||||
long mapSizeBytes,
|
||||
int bufferSizeWords,
|
||||
boolean loadAggressively) throws IOException {
|
||||
int bufferSizeWords) throws IOException {
|
||||
this.mode = mode;
|
||||
this.bufferSize = bufferSizeWords;
|
||||
this.mapSize = mapSizeBytes;
|
||||
this.fileLength = file.length();
|
||||
this.loadAggressively = loadAggressively;
|
||||
|
||||
channel = file.getChannel();
|
||||
mappedSize = 0;
|
||||
@ -106,8 +97,8 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
|
||||
}
|
||||
|
||||
public MultimapSearcher createSearcher() {
|
||||
return new MultimapSearcher(this);
|
||||
public MultimapSearcherBase createSearcher() {
|
||||
return new MultimapSearcherBase(this);
|
||||
}
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit);
|
||||
@ -115,6 +106,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
|
||||
@SneakyThrows
|
||||
public void advice(NativeIO.Advice advice) {
|
||||
this.defaultAdvice = advice;
|
||||
for (var buffer : mappedByteBuffers) {
|
||||
NativeIO.madvise(buffer, advice);
|
||||
}
|
||||
@ -157,7 +149,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void grow(long posIdxRequired) {
|
||||
public void grow(long posIdxRequired) {
|
||||
if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) {
|
||||
throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")");
|
||||
}
|
||||
@ -182,11 +174,8 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
|
||||
var buffer = channel.map(mode, posBytes, bzBytes);
|
||||
|
||||
if (loadAggressively)
|
||||
buffer.load();
|
||||
|
||||
if (advice != null) {
|
||||
NativeIO.madvise(buffer, advice);
|
||||
if (defaultAdvice != null) {
|
||||
NativeIO.madvise(buffer, defaultAdvice);
|
||||
}
|
||||
|
||||
buffers.add(buffer.asLongBuffer());
|
||||
@ -196,10 +185,12 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return fileLength;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put(long idx, long val) {
|
||||
if (idx >= mappedSize)
|
||||
grow(idx);
|
||||
@ -214,6 +205,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long idx) {
|
||||
if (idx >= mappedSize)
|
||||
grow(idx);
|
||||
@ -229,10 +221,12 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, long idx) {
|
||||
read(vals, vals.length, idx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
@ -257,10 +251,38 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(LongBuffer vals, long idx) {
|
||||
int n = vals.limit() - vals.position();
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
vals.put(vals.position() + i, buffer, bufferOffset, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, long idx) {
|
||||
write(vals, vals.length, idx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
@ -285,6 +307,7 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, long idx) {
|
||||
int n = vals.limit() - vals.position();
|
||||
if (idx+n >= mappedSize) {
|
||||
@ -309,7 +332,36 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
if (n == 0) return;
|
||||
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
for (int p = 0; p < l; p++) {
|
||||
buffer.put(bufferOffset + p, val);
|
||||
}
|
||||
|
||||
i+=l;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
|
||||
|
||||
int length = (int)(sourceEnd - sourceStart);
|
||||
@ -354,8 +406,10 @@ public class MultimapFileLong implements AutoCloseable {
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
force();
|
||||
|
||||
mappedByteBuffers.clear();
|
||||
buffers.clear();
|
||||
|
||||
channel.close();
|
||||
|
||||
// I want to believe
|
||||
|
@ -0,0 +1,78 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
private final long off;
|
||||
private final MultimapFileLongSlice map;
|
||||
|
||||
public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
|
||||
this.off = off;
|
||||
this.map = map;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return map.size() - off;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put(long idx, long val) {
|
||||
map.put(off+idx, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
map.setRange(off+idx, n, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long idx) {
|
||||
return map.get(off+idx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, long idx) {
|
||||
map.read(vals, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(long[] vals, int n, long idx) {
|
||||
map.read(vals, n, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); }
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, long idx) {
|
||||
map.write(vals, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(long[] vals, int n, long idx) {
|
||||
map.write(vals, n, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, long idx) {
|
||||
map.write(vals, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
|
||||
throws IOException {
|
||||
map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MultimapFileLongSlice atOffset(long off) {
|
||||
// If we don't override this, the default implementation would build a pyramid of
|
||||
// MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
|
||||
// if this is called iteratively (e.g. to walk over a file)
|
||||
|
||||
return new MultimapFileLongOffsetSlice(map, this.off + off);
|
||||
}
|
||||
}
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
||||
public interface MultimapFileLongSlice {
|
||||
long size();
|
||||
|
||||
void put(long idx, long val);
|
||||
|
||||
void setRange(long idx, int n, long val);
|
||||
|
||||
long get(long idx);
|
||||
|
||||
void read(long[] vals, long idx);
|
||||
|
||||
void read(long[] vals, int n, long idx);
|
||||
|
||||
void read(LongBuffer vals, long idx);
|
||||
|
||||
void write(long[] vals, long idx);
|
||||
|
||||
void write(long[] vals, int n, long idx);
|
||||
|
||||
void write(LongBuffer vals, long idx);
|
||||
|
||||
void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
|
||||
|
||||
default MultimapFileLongSlice atOffset(long off) {
|
||||
return new MultimapFileLongOffsetSlice(this, off);
|
||||
}
|
||||
}
|
@ -1,128 +1,80 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import lombok.experimental.Delegate;
|
||||
public interface MultimapSearcher {
|
||||
long binarySearchUpper(long key, long fromIndex, long n);
|
||||
long binarySearch(long key, long fromIndex, long n);
|
||||
|
||||
public class MultimapSearcher {
|
||||
@Delegate
|
||||
private final MultimapFileLong mmf;
|
||||
|
||||
public MultimapSearcher(MultimapFileLong mmf) {
|
||||
this.mmf = mmf;
|
||||
}
|
||||
|
||||
public boolean binarySearch(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return true; // key found
|
||||
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
|
||||
if (mask == ~0L && stepSize == 1) {
|
||||
return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
|
||||
}
|
||||
return false; // key not found.
|
||||
}
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
else if (stepSize == 1) {
|
||||
return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex, long mask) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
else {
|
||||
return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex, long mask) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long step, long steps, long mask) {
|
||||
|
||||
long low = 0;
|
||||
long high = steps - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
class SimpleMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
|
||||
SimpleMultimapSearcher(MultimapSearcherBase base) {
|
||||
this.base = base;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearch(key, fromIndex, n);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MaskedMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
private final long mask;
|
||||
|
||||
MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
|
||||
this.base = base;
|
||||
this.mask = mask;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearch(key, fromIndex, n, mask);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class SteppingMaskedMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
private final long mask;
|
||||
private final int step;
|
||||
|
||||
SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
|
||||
this.base = base;
|
||||
this.mask = mask;
|
||||
this.step = step;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, step, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearch(key, fromIndex, step, n, mask);
|
||||
}
|
||||
}
|
@ -0,0 +1,143 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import lombok.experimental.Delegate;
|
||||
|
||||
public class MultimapSearcherBase {
|
||||
@Delegate
|
||||
private final MultimapFileLongSlice mmf;
|
||||
|
||||
public MultimapSearcherBase(MultimapFileLongSlice mmf) {
|
||||
this.mmf = mmf;
|
||||
}
|
||||
|
||||
public boolean binarySearchTest(long key, long fromIndex, long n) {
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
||||
public class MultimapSorter {
|
||||
private final Path tmpFileDir;
|
||||
private final int internalSortLimit;
|
||||
private final MultimapFileLong multimapFileLong;
|
||||
private final MultimapFileLongSlice multimapFileLong;
|
||||
private final long[] buffer;
|
||||
|
||||
public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) {
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
|
||||
this.multimapFileLong = multimapFileLong;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.internalSortLimit = internalSortLimit;
|
||||
|
@ -1,49 +0,0 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class AcademiaRank {
|
||||
private final TIntArrayList result;
|
||||
private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
|
||||
|
||||
public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
|
||||
|
||||
TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
|
||||
TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
|
||||
|
||||
for (int i = 0; i < rankingResults.size(); i++) {
|
||||
idToRanking.put(rankingResults.get(i), i);
|
||||
}
|
||||
|
||||
result = new TIntArrayList(10000);
|
||||
try (var conn = ds.getConnection();
|
||||
var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
|
||||
|
||||
stmt.setFetchSize(1000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
result.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("SQL error", ex);
|
||||
}
|
||||
|
||||
int[] internalArray = result.toArray();
|
||||
IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
|
||||
result.set(0, internalArray);
|
||||
}
|
||||
|
||||
public TIntArrayList getResult() {
|
||||
return result;
|
||||
}
|
||||
}
|
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BetterReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,14 +1,10 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BetterStandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BuggyReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,14 +1,10 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BuggyStandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
|
||||
super(dataSource, origins);
|
||||
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,224 +1,129 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.function.IntToDoubleFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
|
||||
public abstract class RankingAlgorithm {
|
||||
final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
|
||||
final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
||||
final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||
protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
|
||||
protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
|
||||
|
||||
private final TIntHashSet spamDomains;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
TIntArrayList[] linkDataSrc2Dest;
|
||||
TIntArrayList[] linkDataDest2Src;
|
||||
protected TIntArrayList[] linkDataSrc2Dest;
|
||||
protected TIntArrayList[] linkDataDest2Src;
|
||||
|
||||
public final Set<String> originDomains = new HashSet<>();
|
||||
public final Set<Integer> originDomainIds = new HashSet<>();
|
||||
|
||||
private int maxKnownUrls = Integer.MAX_VALUE;
|
||||
|
||||
private static final boolean getNames = true;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static void main(String... args) throws IOException {
|
||||
var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
|
||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
private final RankingDomainFetcher domains;
|
||||
|
||||
var rankVector = spr.pageRankVector();
|
||||
var norm = rankVector.norm();
|
||||
rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> {
|
||||
System.out.println(spr.domainNameFromId(i));
|
||||
return true;
|
||||
});
|
||||
}
|
||||
public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
|
||||
this.domains = domains;
|
||||
|
||||
public String domainNameFromId(int id) {
|
||||
return domainsById.get(id).name;
|
||||
}
|
||||
public boolean isPeripheral(int id) {
|
||||
return domainsById.get(id).peripheral;
|
||||
}
|
||||
|
||||
public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
|
||||
this.dataSource = dataSource;
|
||||
var blacklist = new EdgeDomainBlacklistImpl(dataSource);
|
||||
|
||||
spamDomains = blacklist.getSpamDomains();
|
||||
originDomains.addAll(Arrays.asList(origins));
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
domains.getDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
domainsById.put(id, domainData);
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
});
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
});
|
||||
|
||||
for (var namePattern : this.originDomains) {
|
||||
domains.domainsByPattern(namePattern, i -> {
|
||||
int ival = domainIdToIndex.get(i);
|
||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||
originDomainIds.add(ival);
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
logger.debug("No value for {}", i);
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!spamDomains.contains(id)) {
|
||||
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
|
||||
linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
if (src == dst) continue;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
|
||||
for (var seed : this.originDomains) {
|
||||
stmt.setString(1, seed);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int i = rsp.getInt(1);
|
||||
int ival = domainIdToIndex.get(i);
|
||||
if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
|
||||
originDomainIds.add(ival);
|
||||
}
|
||||
else {
|
||||
logger.debug("No value for {}", i);
|
||||
}
|
||||
}
|
||||
logger.debug("{} -> {}", seed, originDomainIds.size());
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
});
|
||||
}
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
}
|
||||
|
||||
public void addPeripheralNodes(boolean includeErrorStates) {
|
||||
public void addPeripheralNodes() {
|
||||
|
||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||
|
||||
logger.info("Inserting peripheral nodes");
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
domains.getPeripheralDomains(domainData -> {
|
||||
int id = domainData.id;
|
||||
|
||||
if (domainsById.put(id, domainData) == null) { // true if id was not already present
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
});
|
||||
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||
|
||||
if (!spamDomains.contains(id)) {
|
||||
domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
|
||||
domains.eachDomainLink((src, dst) -> {
|
||||
if (src == dst) return;
|
||||
|
||||
domainIndexToId.put(domainIndexToId.size(), id);
|
||||
domainIdToIndex.put(id, domainIdToIndex.size());
|
||||
}
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
// This looks like a bug, but it improves the results
|
||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||
return;
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
}
|
||||
|
||||
linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
|
||||
linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
if (src == dst) continue;
|
||||
|
||||
if (domainsById.contains(src) && domainsById.contains(dst)) {
|
||||
|
||||
int srcIdx = domainIdToIndex.get(src);
|
||||
int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
|
||||
|
||||
// This looks like a bug, but it improves the results
|
||||
if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
|
||||
continue;
|
||||
|
||||
if (linkDataSrc2Dest[srcIdx] == null) {
|
||||
linkDataSrc2Dest[srcIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataSrc2Dest[srcIdx].add(dstIdx);
|
||||
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
if (linkDataDest2Src[dstIdx] == null) {
|
||||
linkDataDest2Src[dstIdx] = new TIntArrayList();
|
||||
}
|
||||
linkDataDest2Src[dstIdx].add(srcIdx);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
}
|
||||
});
|
||||
|
||||
logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
|
||||
}
|
||||
@ -271,14 +176,14 @@ public abstract class RankingAlgorithm {
|
||||
return rank.getRanking(resultCount);
|
||||
}
|
||||
|
||||
public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
|
||||
public TIntList pageRankWithPeripheralNodes(int resultCount) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
if (i == iter_max-1) {
|
||||
addPeripheralNodes(includeErrorStates);
|
||||
addPeripheralNodes();
|
||||
}
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
@ -323,7 +228,7 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
abstract RankVector createNewRankVector(RankVector rank);
|
||||
|
||||
public boolean includeInRanking(DomainData data) {
|
||||
public boolean includeInRanking(RankingDomainData data) {
|
||||
if (data.isAlias())
|
||||
return false;
|
||||
if (data.isSpecial())
|
||||
@ -445,32 +350,4 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
}
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
static class DomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private int state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
return alias;
|
||||
}
|
||||
|
||||
public boolean isAlias() {
|
||||
return alias != 0;
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return EdgeDomainIndexingState.SPECIAL.code == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,33 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
class RankingDomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
public boolean peripheral;
|
||||
|
||||
public int resolveAlias() {
|
||||
if (alias == 0) return id;
|
||||
return alias;
|
||||
}
|
||||
|
||||
public boolean isAlias() {
|
||||
return alias != 0;
|
||||
}
|
||||
|
||||
public boolean isSpecial() {
|
||||
return EdgeDomainIndexingState.SPECIAL == state;
|
||||
}
|
||||
|
||||
public boolean isSocialMedia() {
|
||||
return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
|
||||
}
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
public class RankingDomainFetcher {
|
||||
private final HikariDataSource dataSource;
|
||||
private final EdgeDomainBlacklistImpl blacklist;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final boolean getNames = false;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
this.dataSource = dataSource;
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
|
||||
{
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
consumer.accept(src, dst);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domain links", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void domainsByPattern(String pattern, IntConsumer idConsumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
|
||||
stmt.setString(1, pattern);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
idConsumer.accept(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domains by pattern", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public interface DomainLinkConsumer {
|
||||
void accept(int from, int to);
|
||||
}
|
||||
}
|
@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
|
||||
originDomains.add("memex.marginalia.nu");
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
for (var seed : this.originDomains) {
|
||||
|
@ -48,7 +48,7 @@ public class StandardPageRank {
|
||||
originDomains.addAll(Arrays.asList(origins));
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -78,7 +78,7 @@ public class StandardPageRank {
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
for (var seed : this.originDomains) {
|
||||
stmt.setString(1, seed);
|
||||
var rsp = stmt.executeQuery();
|
||||
|
@ -50,7 +50,7 @@ public class DedupTool {
|
||||
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
||||
|
||||
try (var conn = ds.getConnection();
|
||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
||||
|
||||
) {
|
||||
|
@ -112,10 +112,10 @@ public class PerusePageRankV2 {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
String s;
|
||||
if (getNames) {
|
||||
s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
else {
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
|
||||
s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
|
||||
}
|
||||
try (var stmt = conn.prepareStatement(s)) {
|
||||
stmt.setFetchSize(10000);
|
||||
|
@ -1,30 +0,0 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.AcademiaRank;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestAcademiaRankTool {
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
Driver driver = new Driver();
|
||||
var conn = new DatabaseModule().provideConnection();
|
||||
|
||||
var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu");
|
||||
var res = rank.getResult();
|
||||
|
||||
try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
||||
for (int i = 0; i < Math.min(res.size(), 100); i++) {
|
||||
stmt.setInt(1, res.getQuick(i));
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next())
|
||||
System.out.println(rsp.getString(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
@ -43,12 +44,14 @@ public class UpdateDomainRanksTool {
|
||||
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
||||
|
||||
logger.info("Ranking");
|
||||
var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||
|
||||
rankMax = spr.size()*2;
|
||||
uploader.start();
|
||||
|
||||
spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
||||
spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
@ -83,11 +86,6 @@ public class UpdateDomainRanksTool {
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Recalculating quality");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
|
||||
} catch (SQLException | InterruptedException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
|
@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
@ -45,7 +46,9 @@ public class UpdateDomainRanksTool2 {
|
||||
logger.info("Ranking");
|
||||
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
||||
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
||||
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
|
||||
@ -58,7 +61,7 @@ public class UpdateDomainRanksTool2 {
|
||||
rankMax = rpr.size();
|
||||
|
||||
|
||||
rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
|
||||
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
@ -94,9 +97,6 @@ public class UpdateDomainRanksTool2 {
|
||||
}
|
||||
|
||||
logger.info("Recalculating quality");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
|
||||
} catch (SQLException | InterruptedException throwables) {
|
||||
throwables.printStackTrace();
|
||||
|
@ -6,6 +6,7 @@ import io.reactivex.rxjava3.core.Observable;
|
||||
import io.reactivex.rxjava3.core.ObservableSource;
|
||||
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
|
||||
import lombok.SneakyThrows;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
import nu.marginalia.wmsa.client.exception.LocalException;
|
||||
import nu.marginalia.wmsa.client.exception.NetworkException;
|
||||
import nu.marginalia.wmsa.client.exception.RemoteException;
|
||||
@ -30,9 +31,12 @@ import java.util.zip.GZIPOutputStream;
|
||||
|
||||
public abstract class AbstractClient implements AutoCloseable {
|
||||
public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
|
||||
private final Gson gson = new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.create();
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
|
||||
|
||||
private final OkHttpClient client;
|
||||
|
||||
|
@ -4,10 +4,10 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||
import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
||||
import org.eclipse.jetty.util.UrlEncoded;
|
||||
|
||||
import java.util.List;
|
||||
@ -21,18 +21,38 @@ public class AssistantClient extends AbstractDynamicClient {
|
||||
}
|
||||
|
||||
public Observable<DictionaryResponse> dictionaryLookup(Context ctx, String word) {
|
||||
return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
|
||||
try {
|
||||
return super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Observable<List<String>> spellCheck(Context ctx, String word) {
|
||||
return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);
|
||||
try {
|
||||
return (Observable<List<String>>) (Object) super.get(ctx, "/spell-check/" + UrlEncoded.encodeString(word), List.class);
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
public Observable<String> unitConversion(Context ctx, String value, String from, String to) {
|
||||
return super.get(ctx,"/unit-conversion?value="+value + "&from="+from+"&to="+to);
|
||||
try {
|
||||
return super.get(ctx, "/unit-conversion?value=" + value + "&from=" + from + "&to=" + to);
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
|
||||
public Observable<String> evalMath(Context ctx, String expression) {
|
||||
return super.get(ctx,"/eval-expression?value="+UrlEncoded.encodeString(expression));
|
||||
try {
|
||||
return super.get(ctx, "/eval-expression?value=" + UrlEncoded.encodeString(expression));
|
||||
}
|
||||
catch (RouteNotConfiguredException ex) {
|
||||
return Observable.empty();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
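The AssistantClient changes above all apply one pattern: when the assistant route is not configured, catch RouteNotConfiguredException and return an empty Observable rather than failing the whole query. A minimal sketch of that pattern in isolation; the helper class and exception stub below are illustrative, not part of the commit.

// Sketch of the degrade-gracefully pattern used in AssistantClient above.
// The exception name mirrors the diff; the wrapper itself is hypothetical.
import io.reactivex.rxjava3.core.Observable;
import java.util.function.Supplier;

class OptionalRouteCall {
    static class RouteNotConfiguredException extends RuntimeException { }

    static <T> Observable<T> callIfConfigured(Supplier<Observable<T>> call) {
        try {
            return call.get();
        }
        catch (RouteNotConfiguredException ex) {
            // The assistant service is optional; treat a missing route as "no results"
            return Observable.empty();
        }
    }
}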
@ -79,10 +79,7 @@ public class WikiCleaner {
|
||||
}
|
||||
});
|
||||
|
||||
Optional.ofNullable(doc.getElementsByTag("cite")).ifPresent(cite -> cite.forEach(c -> {
|
||||
c.tagName("span");
|
||||
}));
|
||||
|
||||
doc.getElementsByTag("cite").tagName("span");
|
||||
|
||||
removeIds(doc, "toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav");
|
||||
removeByClass(doc, "mw-references-wrap", "references", "reference", "siteSub", "refbegin");
|
||||
@ -205,7 +202,7 @@ public class WikiCleaner {
|
||||
}
|
||||
});
|
||||
doc.getAllElements().forEach(elem -> {
|
||||
var classes = elem.classNames().stream().filter(this::isWikiClass).collect(Collectors.toList());
|
||||
var classes = elem.classNames().stream().filter(this::isWikiClass).toList();
|
||||
classes.forEach(elem::removeClass);
|
||||
elem.removeAttr("lang");
|
||||
elem.removeAttr("dir");
|
||||
@ -251,9 +248,8 @@ public class WikiCleaner {
|
||||
var formula = math.getElementsByTag("math");
|
||||
var converter = net.sourceforge.jeuclid.converter.Converter.getInstance();
|
||||
var sos = new ByteArrayOutputStream();
|
||||
var alt = Optional.ofNullable(formula.attr("alttext"))
|
||||
.or(() -> Optional.ofNullable(math.getElementsByTag("annotation").text()))
|
||||
.orElse("");
|
||||
var alt = Optional.of(formula.attr("alttext")).filter(s -> !s.isBlank())
|
||||
.orElseGet(() -> math.getElementsByTag("annotation").text());
|
||||
|
||||
var layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext());
|
||||
|
||||
@ -309,16 +305,16 @@ public class WikiCleaner {
|
||||
@NotNull
|
||||
private List<Pair<String, String>> getWikiPageLinks(Document doc) {
|
||||
List<Pair<String,String>> topLinks = new ArrayList<>();
|
||||
Optional.ofNullable(doc.select("p a")).ifPresent(links -> links.forEach(atag -> {
|
||||
doc.select("p a").forEach(atag -> {
|
||||
String href = atag.attr("href");
|
||||
|
||||
if (href != null && !href.isBlank()
|
||||
if (!href.isBlank()
|
||||
&& !href.contains(":")
|
||||
&& !href.startsWith("#")
|
||||
) {
|
||||
topLinks.add(Pair.of(href, atag.attr("title")));
|
||||
}
|
||||
}));
|
||||
});
|
||||
return topLinks;
|
||||
}
|
||||
|
||||
@ -336,19 +332,16 @@ public class WikiCleaner {
|
||||
private List<Pair<String, String>> getDisambiguationLinks(Document doc) {
|
||||
List<Pair<String,String>> disambig = new ArrayList<>();
|
||||
|
||||
for (var note: doc.getElementsByClass("hatnote")) {
|
||||
for (var atag : note.getElementsByTag("a")) {
|
||||
String href = atag.attr("href");
|
||||
if (atag.hasClass("mw-disambig") && !href.isBlank()) {
|
||||
disambig.add(Pair.of(href, atag.attr("title")));
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.getElementsByClass("hatnote").remove();
|
||||
|
||||
Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(hatnotes -> {
|
||||
hatnotes.forEach(note -> {
|
||||
Optional.ofNullable(note.getElementsByTag("a"))
|
||||
.ifPresent(links -> links.forEach(atag -> {
|
||||
String href = atag.attr("href");
|
||||
if (atag.hasClass("mw-disambig") && href != null) {
|
||||
disambig.add(Pair.of(href, atag.attr("title")));
|
||||
}
|
||||
}));
|
||||
});
|
||||
});
|
||||
Optional.ofNullable(doc.getElementsByClass("hatnote")).ifPresent(Elements::remove);
|
||||
return disambig;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,40 @@
package nu.marginalia.wmsa.edge.assistant.screenshot;

import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.mariadb.jdbc.Driver;

import java.io.FileInputStream;
import java.io.IOException;
import java.sql.SQLException;
import java.util.zip.GZIPInputStream;

public class ScreenshotLoaderMain {
    public static void main(String... args) throws IOException {

        org.mariadb.jdbc.Driver driver = new Driver();
        var ds = new DatabaseModule().provideConnection();

        try (var tis = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(args[0])));
             var conn = ds.getConnection();
             var ps = conn.prepareStatement("REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) VALUES (?,?,?)")
        ) {
            for (TarArchiveEntry entry = tis.getNextTarEntry(); entry != null; entry = tis.getNextTarEntry()) {
                if (entry.isFile()) {
                    String fileName = entry.getName();
                    String domainName = fileName.substring(fileName.indexOf('/')+1, fileName.lastIndexOf('.'));

                    ps.setString(1, domainName);
                    ps.setString(2, "image/webp");
                    ps.setBlob(3, tis);
                    ps.executeUpdate();

                    System.out.println(domainName);
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
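The loader above derives the row key purely from the tar entry path (everything between the first slash and the last dot). A minimal sketch of that mapping with a hypothetical archive layout; the example path is an assumption, not taken from the commit.

// Illustration only: how ScreenshotLoaderMain turns a tar entry name into a domain name.
class ScreenshotEntryNameExample {
    public static void main(String... args) {
        String fileName = "screenshots/www.example.com.webp"; // hypothetical entry path
        String domainName = fileName.substring(fileName.indexOf('/') + 1, fileName.lastIndexOf('.'));
        System.out.println(domainName); // prints www.example.com
    }
}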
@ -2,47 +2,49 @@ package nu.marginalia.wmsa.edge.assistant.screenshot;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.utils.IOUtils;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
import static java.lang.Integer.parseInt;
|
||||
|
||||
public class ScreenshotService {
|
||||
|
||||
private final Path screenshotsRoot = Path.of("/var/lib/wmsa/archive/screenshots/screenshots/");
|
||||
private final Path screenshotsRootWebp = Path.of("/var/lib/wmsa/archive.fast/screenshots/");
|
||||
private final EdgeDataStoreDao edgeDataStoreDao;
|
||||
private final long MIN_FILE_SIZE = 4096;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao) {
|
||||
public ScreenshotService(EdgeDataStoreDao edgeDataStoreDao, HikariDataSource dataSource) {
|
||||
this.edgeDataStoreDao = edgeDataStoreDao;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public boolean hasScreenshot(EdgeId<EdgeDomain> domainId) {
|
||||
EdgeDomain domain = edgeDataStoreDao.getDomain(domainId);
|
||||
|
||||
Path p = getScreenshotPath(screenshotsRootWebp, domain, ".webp");
|
||||
if (p == null) {
|
||||
p = getScreenshotPath(screenshotsRoot, domain, ".png");
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT TRUE
|
||||
FROM DATA_DOMAIN_SCREENSHOT
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=?
|
||||
""")) {
|
||||
ps.setInt(1, domainId.id());
|
||||
var rs = ps.executeQuery();
|
||||
return rs.next();
|
||||
}
|
||||
|
||||
try {
|
||||
return p != null && Files.size(p) >= MIN_FILE_SIZE;
|
||||
} catch (IOException e) {
|
||||
return false;
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -54,79 +56,55 @@ public class ScreenshotService {
|
||||
|
||||
int id = parseInt(request.params("id"));
|
||||
|
||||
Path p = null;
|
||||
if (id == 0) {
|
||||
p = screenshotsRootWebp.resolve("dummy-snapshot.webp");
|
||||
} else {
|
||||
EdgeDomain domain;
|
||||
try {
|
||||
domain = edgeDataStoreDao.getDomain(new EdgeId<>(id));
|
||||
p = getScreenshotPath(screenshotsRootWebp, domain, ".webp");
|
||||
if (p == null) {
|
||||
p = getScreenshotPath(screenshotsRoot, domain, ".png");
|
||||
}
|
||||
|
||||
if (p != null && Files.size(p) <= MIN_FILE_SIZE) {
|
||||
p = null;
|
||||
}
|
||||
} catch (NoSuchElementException ex) {
|
||||
domain = new EdgeDomain("error.example.com");
|
||||
}
|
||||
|
||||
if (p == null) {
|
||||
response.type("image/svg+xml");
|
||||
|
||||
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
||||
"<svg\n" +
|
||||
" xmlns=\"http://www.w3.org/2000/svg\"\n" +
|
||||
" width=\"640px\"\n" +
|
||||
" height=\"480px\"\n" +
|
||||
" viewBox=\"0 0 640 480\"\n" +
|
||||
" version=\"1.1\">\n" +
|
||||
" <g>\n" +
|
||||
" <rect\n" +
|
||||
" style=\"fill:#808080\"\n" +
|
||||
" id=\"rect288\"\n" +
|
||||
" width=\"595.41992\"\n" +
|
||||
" height=\"430.01825\"\n" +
|
||||
" x=\"23.034981\"\n" +
|
||||
" y=\"27.850344\" />\n" +
|
||||
" <text\n" +
|
||||
" xml:space=\"preserve\"\n" +
|
||||
" style=\"font-size:100px;fill:#909090;font-family:sans-serif;\"\n" +
|
||||
" x=\"20\"\n" +
|
||||
" y=\"120\">Placeholder</text>\n" +
|
||||
" <text\n" +
|
||||
" xml:space=\"preserve\"\n" +
|
||||
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
||||
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
||||
" </g>\n" +
|
||||
"</svg>\n", domain);
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT CONTENT_TYPE, DATA
|
||||
FROM DATA_DOMAIN_SCREENSHOT
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=?
|
||||
""")) {
|
||||
ps.setInt(1, id);
|
||||
var rsp = ps.executeQuery();
|
||||
if (rsp.next()) {
|
||||
response.type(rsp.getString(1));
|
||||
rsp.getBlob(2).getBinaryStream().transferTo(response.raw().getOutputStream());
|
||||
return "";
|
||||
}
|
||||
}
|
||||
response.status(200);
|
||||
response.header("Cache-control", "public,max-age=3600");
|
||||
if (p.toString().endsWith("webp")) {
|
||||
response.type("image/webp");
|
||||
} else {
|
||||
response.type("image/png");
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
IOUtils.copy(new ByteArrayInputStream(Files.readAllBytes(p)), response.raw().getOutputStream());
|
||||
return "";
|
||||
|
||||
return serveSvgPlaceholder(response, id);
|
||||
}
|
||||
|
||||
private Path getScreenshotPath(Path root, EdgeDomain domain, String ending) {
|
||||
|
||||
var p = root.resolve(domain.toString() + ending);
|
||||
if (!p.normalize().startsWith(root)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!Files.exists(p)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return p;
|
||||
private Object serveSvgPlaceholder(Response response, int id) {
|
||||
response.type("image/svg+xml");
|
||||
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
||||
"<svg\n" +
|
||||
" xmlns=\"http://www.w3.org/2000/svg\"\n" +
|
||||
" width=\"640px\"\n" +
|
||||
" height=\"480px\"\n" +
|
||||
" viewBox=\"0 0 640 480\"\n" +
|
||||
" version=\"1.1\">\n" +
|
||||
" <g>\n" +
|
||||
" <rect\n" +
|
||||
" style=\"fill:#808080\"\n" +
|
||||
" id=\"rect288\"\n" +
|
||||
" width=\"595.41992\"\n" +
|
||||
" height=\"430.01825\"\n" +
|
||||
" x=\"23.034981\"\n" +
|
||||
" y=\"27.850344\" />\n" +
|
||||
" <text\n" +
|
||||
" xml:space=\"preserve\"\n" +
|
||||
" style=\"font-size:100px;fill:#909090;font-family:sans-serif;\"\n" +
|
||||
" x=\"20\"\n" +
|
||||
" y=\"120\">Placeholder</text>\n" +
|
||||
" <text\n" +
|
||||
" xml:space=\"preserve\"\n" +
|
||||
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
||||
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
||||
" </g>\n" +
|
||||
"</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -45,7 +45,7 @@ public class ConvertedDomainReader {
            try {
                ret.add(gson.fromJson(parts[1], type));
            }
            catch (JsonParseException ex) {
            catch (NullPointerException|JsonParseException ex) {
                logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255));
                logger.warn("Json error", ex);
            }

@ -1,18 +1,19 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import com.google.gson.*;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.util.ParallelPipe;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
|
||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.util.ParallelPipe;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -52,13 +53,6 @@ public class ConverterMain {
|
||||
injector.getInstance(ConverterMain.class);
|
||||
}
|
||||
|
||||
private static void requireArgs(String[] args, String... help) {
|
||||
if (args.length != help.length) {
|
||||
System.out.println("Usage: " + String.join(", ", help));
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ConverterMain(
|
||||
EdgeCrawlPlan plan,
|
||||
@ -103,7 +97,12 @@ public class ConverterMain {
|
||||
|
||||
domainToId.forEach((domain, id) -> {
|
||||
String fileName = idToFileName.get(id);
|
||||
Path dest = getFilePath(plan.crawl.getDir(), fileName);
|
||||
|
||||
if (Strings.isNullOrEmpty(fileName))
|
||||
return;
|
||||
|
||||
Path dest = plan.getCrawledFilePath(fileName);
|
||||
|
||||
logger.info("{} - {} - {}", domain, id, dest);
|
||||
|
||||
if (!processLog.isJobFinished(id)) {
|
||||
@ -128,10 +127,4 @@ public class ConverterMain {
|
||||
|
||||
record ProcessingInstructions(String id, List<Instruction> instructions) {}
|
||||
|
||||
private Path getFilePath(Path dir, String fileName) {
|
||||
String sp1 = fileName.substring(0, 2);
|
||||
String sp2 = fileName.substring(2, 4);
|
||||
return dir.resolve(sp1).resolve(sp2).resolve(fileName);
|
||||
}
|
||||
|
||||
}
|
||||
|
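The removed getFilePath helper above encoded a two-level directory sharding scheme (first two characters of the file name, then the next two) that plan.getCrawledFilePath is now assumed to encapsulate. A small standalone sketch of that layout; the directory and file name are made up for illustration.

// Sketch of the sharded path layout used by the removed getFilePath helper.
import java.nio.file.Path;

class ShardedPathExample {
    static Path shard(Path dir, String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return dir.resolve(sp1).resolve(sp2).resolve(fileName);
    }

    public static void main(String... args) {
        // e.g. crawl/ab/cd/abcd1234.zstd (hypothetical file name)
        System.out.println(shard(Path.of("crawl"), "abcd1234.zstd"));
    }
}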
@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

import java.net.URISyntaxException;
import java.nio.file.Path;

public class ConverterModule extends AbstractModule {

@ -27,7 +26,8 @@ public class ConverterModule extends AbstractModule {
        bind(Gson.class).toInstance(createGson());

        bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
        bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
        bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
        bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
        bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
        bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

@ -0,0 +1,194 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
|
||||
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class LinkKeywordExtractorMain {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
|
||||
|
||||
public static void main(String... args) throws IOException, InterruptedException {
|
||||
|
||||
if (args.length < 2) {
|
||||
System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
String command = args[0];
|
||||
var plan = new CrawlPlanLoader().load(Path.of(args[1]));
|
||||
|
||||
switch (command) {
|
||||
case "crawl": getKeywordsFromCrawl(plan); break;
|
||||
case "so": getKeywordsFromSo(plan, args[2]); break;
|
||||
case "wiki": getKeywordsFromWiki(plan, args[2]); break;
|
||||
default: System.err.println("Unrecognized command");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
|
||||
|
||||
|
||||
HashSet<String> crawledDomains = new HashSet<>();
|
||||
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
|
||||
|
||||
logger.info("Loading URLs");
|
||||
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
|
||||
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
|
||||
.mapToInt(String::hashCode)
|
||||
.forEach(crawledUrls::add);
|
||||
|
||||
logger.info("Loading input spec");
|
||||
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
|
||||
spec -> { crawledDomains.add(spec.domain); });
|
||||
|
||||
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
|
||||
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
|
||||
&& !domain.contains("wiki")
|
||||
&& !domain.contains("isni")
|
||||
&& !domain.contains("wiktionary"),
|
||||
url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
output::write);
|
||||
|
||||
new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
|
||||
anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
|
||||
}).join();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
|
||||
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
|
||||
|
||||
logger.info("Loading URLs");
|
||||
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
|
||||
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
|
||||
.mapToInt(String::hashCode)
|
||||
.forEach(crawledUrls::add);
|
||||
|
||||
logger.info("Loading input spec");
|
||||
|
||||
HashSet<String> crawledDomains = new HashSet<>();
|
||||
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
|
||||
spec -> crawledDomains.add(spec.domain));
|
||||
|
||||
crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
|
||||
crawledDomains.remove("jsbin.com");
|
||||
crawledDomains.remove("codepad.org");
|
||||
|
||||
|
||||
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
|
||||
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
|
||||
url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
output::write);
|
||||
|
||||
new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
|
||||
anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
|
||||
}).join();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
|
||||
|
||||
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
|
||||
|
||||
logger.info("Loading URLs");
|
||||
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
|
||||
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
|
||||
.mapToInt(String::hashCode)
|
||||
.forEach(crawledUrls::add);
|
||||
|
||||
|
||||
logger.info("Loading input spec");
|
||||
|
||||
HashSet<String> crawledDomains = new HashSet<>();
|
||||
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
|
||||
spec -> crawledDomains.add(spec.domain));
|
||||
|
||||
List<String> fileNames = new ArrayList<>();
|
||||
|
||||
logger.info("Replaying crawl log");
|
||||
WorkLog.readLog(plan.crawl.getLogFile(),
|
||||
entry -> fileNames.add(entry.path()));
|
||||
|
||||
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
|
||||
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
|
||||
url -> url.param != null,
|
||||
//url -> crawledUrls.contains(url.toString().hashCode()),
|
||||
output::write);
|
||||
|
||||
logger.info("Reading files");
|
||||
for (var fn : fileNames) {
|
||||
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
|
||||
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
|
||||
if (crawledDomain.doc == null) continue;
|
||||
|
||||
System.out.println("# " + crawledDomain.domain);
|
||||
|
||||
for (var doc : crawledDomain.doc) {
|
||||
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
|
||||
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class UrlKeywordTsvWriter implements AutoCloseable {
|
||||
|
||||
private final OutputStream stream;
|
||||
|
||||
UrlKeywordTsvWriter(Path outputFile) throws IOException {
|
||||
this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
|
||||
}
|
||||
|
||||
void write(EdgeUrl url, String keyword) {
|
||||
try {
|
||||
stream.write(url.toString().getBytes());
|
||||
stream.write('\t');
|
||||
stream.write(keyword.getBytes());
|
||||
stream.write('\n');
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
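UrlKeywordTsvWriter above emits one tab-separated (url, keyword) pair per line into links.tsv. Nothing in the commit shows the consumer side, so the reader below is a hedged sketch of what parsing that file back might look like, not project code.

// Hypothetical reader for the links.tsv produced by UrlKeywordTsvWriter above:
// each line is "<url>\t<keyword>".
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class LinksTsvReaderSketch {
    public static void main(String... args) throws IOException {
        try (var lines = Files.lines(Path.of("links.tsv"))) {
            lines.map(line -> line.split("\t", 2))
                 .filter(parts -> parts.length == 2)
                 .forEach(parts -> System.out.println(parts[1] + " -> " + parts[0]));
        }
    }
}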
@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
@ -25,14 +24,15 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class LoaderMain {
|
||||
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
||||
|
||||
private final Path processDir;
|
||||
private final EdgeCrawlPlan plan;
|
||||
private final ConvertedDomainReader instructionsReader;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
||||
private final LoaderFactory loaderFactory;
|
||||
private final EdgeIndexClient indexClient;
|
||||
|
||||
private volatile boolean running = true;
|
||||
|
||||
final Thread processorThread = new Thread(this::processor, "Processor Thread");
|
||||
@ -56,14 +56,12 @@ public class LoaderMain {
|
||||
@Inject
|
||||
public LoaderMain(EdgeCrawlPlan plan,
|
||||
ConvertedDomainReader instructionsReader,
|
||||
HikariDataSource dataSource,
|
||||
LoaderFactory loaderFactory,
|
||||
EdgeIndexClient indexClient) {
|
||||
|
||||
this.processDir = plan.process.getDir();
|
||||
this.plan = plan;
|
||||
this.instructionsReader = instructionsReader;
|
||||
this.dataSource = dataSource;
|
||||
this.loaderFactory = loaderFactory;
|
||||
this.indexClient = indexClient;
|
||||
|
||||
@ -79,7 +77,7 @@ public class LoaderMain {
|
||||
LoaderMain.loadTotal = loadTotal.get();
|
||||
|
||||
WorkLog.readLog(logFile, entry -> {
|
||||
load(entry.path(), entry.cnt());
|
||||
load(plan, entry.path(), entry.cnt());
|
||||
});
|
||||
|
||||
running = false;
|
||||
@ -90,15 +88,9 @@ public class LoaderMain {
|
||||
}
|
||||
|
||||
private volatile static int loadTotal;
|
||||
private static final int loaded = 0;
|
||||
|
||||
private void load(String path, int cnt) {
|
||||
String first = path.substring(0, 2);
|
||||
String second = path.substring(2, 4);
|
||||
Path destDir = processDir.resolve(first).resolve(second).resolve(path);
|
||||
|
||||
|
||||
|
||||
private void load(EdgeCrawlPlan plan, String path, int cnt) {
|
||||
Path destDir = plan.getProcessedFilePath(path);
|
||||
try {
|
||||
var loader = loaderFactory.create(cnt);
|
||||
var instructions = instructionsReader.read(destDir, cnt);
|
||||
@ -120,7 +112,8 @@ public class LoaderMain {
|
||||
loader.finish();
|
||||
long loadTime = System.currentTimeMillis() - startTime;
|
||||
taskStats.observe(loadTime);
|
||||
logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(), loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime());
|
||||
logger.info("Loaded {}/{} : {} ({}) {}ms {} l/s", taskStats.getCount(),
|
||||
loadTotal, path, loader.data.sizeHint, loadTime, taskStats.avgTime());
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ public class ReindexTriggerMain {
                .build();

        try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
            var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
            var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %s %s %d\n",
                        rs.getInt(1),
@ -38,7 +38,7 @@ public class ReindexTriggerMain {
                        rs.getInt(4));
            }

            rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
            rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
            while (rs.next()) {
                System.out.printf("%d %d %s %d %s\n",
                        rs.getInt(1),
@ -0,0 +1,149 @@
package nu.marginalia.wmsa.edge.converting.atags;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AnchorTextExtractor {
    private final Predicate<String> includeDomainPredicate;
    private final Predicate<EdgeUrl> includeUrlPredicate;
    private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;

    private final LinkParser linkParser = new LinkParser();

    private final HashFunction hashFunction = Hashing.murmur3_128();

    // This bit map is used as a bloom filter to deduplicate url-keyword combinations
    // false positives are expected, but that's an acceptable trade-off to not have to deal with
    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

    public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                               Predicate<EdgeUrl> includeUrlPredicate,
                               BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
        this.includeDomainPredicate = includeDomainPredicate;
        this.includeUrlPredicate = includeUrlPredicate;
        this.linkKeywordConsumer = linkKeywordConsumer;
    }

    @SneakyThrows
    public void processDocument(String docUrl, String documentBody) {
        final Document processed = Jsoup.parse(documentBody);
        final EdgeUrl documentUrl = new EdgeUrl(docUrl);

        for (var link : processed.getElementsByTag("a")) {
            if (link.hasAttr("href")) {
                String href = link.attr("href");
                String text = getLinkText(link);

                processAnchor(documentUrl, href, text);
            }
        }
    }

    private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");

    private String getLinkText(Element link) {
        String text = link.text();

        if (link.text().isBlank()) {
            for (var img: link.getElementsByTag("img")) {
                if (img.hasAttr("alt")) {
                    text = img.attr("alt");
                    break;
                }
            }
        }

        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
    }

    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
        if (!isInterestingAnchorText(text)) {
            return;
        }

        var optLinkUrl = linkParser.parseLink(documentUrl, href);
        if (optLinkUrl.isEmpty()) return;

        var linkUrl = optLinkUrl.get();

        if (!isInterestingAnchorLink(linkUrl)) {
            return;
        }

        for (String word: anchorTextNoise.split(text)) {
            if (WordPatterns.isStopWord(word))
                continue;

            word = word.toLowerCase();
            if (!WordPatterns.filter(word)) {
                continue;
            }

            if (linkUrl.domain.equals(documentUrl.domain)) {
                continue;
            }

            if (isNewKeywordForLink(word, linkUrl.toString())) {
                linkKeywordConsumer.accept(linkUrl, word);
            }
        }
    }

    // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

    private boolean isInterestingAnchorText(String text) {
        if (text.isBlank()) return false;
        if (text.length() > 32) return false;

        // Google loves questions, and so does SEO spammers
        if (text.endsWith("?")) return false;

        if (text.startsWith("http:") || text.startsWith("https:")) return false;

        if (looksLikeAnURL.test(text)) return false;

        return switch (text) {
            case "this", "here", "click", "click here", "download", "source" -> false;
            default -> true;
        };
    }

    private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
        if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
            return false;
        }

        if (!includeUrlPredicate.test(linkUrl)) {
            return false;
        }

        return includeDomainPredicate.test(linkUrl.domain.toString());
    }

    private boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
        hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong();

        // Remove sign bit because we don't want a negative index in deduplicateHashBitset
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;

        return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
    }
}
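The deduplication in isNewKeywordForLink above XORs the murmur3 hashes of the link and the keyword, clears the sign bit, and indexes a huge bitmap used as a bloom-style filter. A minimal sketch of the same idea, using java.util.BitSet as a stand-in for the project's DenseBitMap; the class name and capacity below are illustrative assumptions.

// Sketch only: the XOR-combine-and-mask dedup scheme from isNewKeywordForLink,
// backed by a small BitSet instead of the real 2 GB DenseBitMap.
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class KeywordDedupSketch {
    private static final int CAPACITY = 1 << 24; // far smaller than the real bitmap
    private final BitSet seen = new BitSet(CAPACITY);

    boolean isNew(String url, String word) {
        long hash = Hashing.murmur3_128().hashString(url, StandardCharsets.UTF_8).padToLong()
                  ^ Hashing.murmur3_128().hashString(word, StandardCharsets.UTF_8).padToLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;      // drop the sign bit so the index is non-negative
        int idx = (int) (hash % CAPACITY);
        boolean alreadySet = seen.get(idx);  // false positives are possible, as with the bloom-style bitmap
        seen.set(idx);
        return !alreadySet;
    }
}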
@ -14,7 +14,7 @@ public interface Interpreter {
    void loadRssFeed(EdgeUrl[] rssFeed);
    void loadDomainLink(DomainLink[] links);

    void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
    void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
    void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
    void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;

public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {

    @Override
    public void apply(Interpreter interpreter) {
        interpreter.loadProcessedDomain(domain, state, quality);
        interpreter.loadProcessedDomain(domain, state, ip);
    }

    @Override
@ -76,9 +76,9 @@ public class Loader implements Interpreter {
    }

    @Override
    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
        logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality);
        sqlLoadProcessedDomain.load(data, domain, state, quality);
    public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
        logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
        sqlLoadProcessedDomain.load(data, domain, state, ip);
    }

    @Override
@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
                    INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
                    SELECT SOURCE.ID,DEST.ID
                    FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
                    ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN;
                    ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
                    END
                    """);
        }
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
                }
            }
        }
        catch (SQLException sql) {
            sql.printStackTrace();
        catch (SQLException ex) {
            logger.warn("SQL error inserting domain links", ex);
        }

    }
@ -25,15 +25,9 @@ public class SqlLoadDomains {
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_DOMAIN (
|
||||
IN DOMAIN_NAME VARCHAR(255),
|
||||
IN SUB_DOMAIN VARCHAR(255),
|
||||
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN);
|
||||
|
||||
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
|
||||
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
|
||||
FROM EC_TOP_DOMAIN
|
||||
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
|
||||
INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
|
||||
END
|
||||
""");
|
||||
}
|
||||
@ -46,10 +40,9 @@ public class SqlLoadDomains {
|
||||
public void load(LoaderData data, EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.subDomain);
|
||||
insertCall.setString(3, domain.domain);
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
|
||||
var ret = insertCall.executeUpdate();
|
||||
@ -57,12 +50,11 @@ public class SqlLoadDomains {
|
||||
logger.warn("load({}) -- bad row count {}", domain, ret);
|
||||
}
|
||||
|
||||
connection.commit();
|
||||
findIdForTargetDomain(connection, data);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domain", ex);
|
||||
}
|
||||
|
||||
|
||||
@ -73,12 +65,11 @@ public class SqlLoadDomains {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
connection.setAutoCommit(false);
|
||||
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
|
||||
for (var domain : domains) {
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.subDomain);
|
||||
insertCall.setString(3, domain.domain);
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
@ -95,7 +86,7 @@ public class SqlLoadDomains {
|
||||
findIdForTargetDomain(connection, data);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,7 +95,7 @@ public class SqlLoadDomains {
|
||||
return;
|
||||
}
|
||||
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?"))
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
|
||||
var targetDomain = data.getTargetDomain();
|
||||
@ -118,7 +109,7 @@ public class SqlLoadDomains {
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error finding id for domain", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument {
|
||||
IN TITLE VARCHAR(255),
|
||||
IN DESCRIPTION VARCHAR(255),
|
||||
IN LENGTH INT,
|
||||
IN QUALITY_MEASURE DOUBLE,
|
||||
IN FEATURES INT,
|
||||
IN STANDARD VARCHAR(32),
|
||||
IN QUALITY DOUBLE,
|
||||
IN HASH INT)
|
||||
BEGIN
|
||||
SET FOREIGN_KEY_CHECKS=0;
|
||||
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES);
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID;
|
||||
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||
SET FOREIGN_KEY_CHECKS=1;
|
||||
END
|
||||
""");
|
||||
@ -47,7 +47,8 @@ public class SqlLoadProcessedDocument {
|
||||
IN URL_ID INT,
|
||||
IN STATE VARCHAR(32))
|
||||
BEGIN
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID;
|
||||
UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
|
||||
DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID;
|
||||
END
|
||||
""");
|
||||
|
||||
@ -61,6 +62,7 @@ public class SqlLoadProcessedDocument {
|
||||
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
for (var doc : documents) {
|
||||
int urlId = data.getUrlId(doc.url());
|
||||
@ -74,9 +76,9 @@ public class SqlLoadProcessedDocument {
|
||||
stmt.setString(3, doc.title());
|
||||
stmt.setString(4, doc.description());
|
||||
stmt.setInt(5, doc.length());
|
||||
stmt.setDouble(6, doc.quality());
|
||||
stmt.setInt(7, doc.htmlFeatures());
|
||||
stmt.setString(8, doc.standard().name());
|
||||
stmt.setInt(6, doc.htmlFeatures());
|
||||
stmt.setString(7, doc.standard().name());
|
||||
stmt.setDouble(8, doc.quality());
|
||||
stmt.setInt(9, (int) doc.hash());
|
||||
stmt.addBatch();
|
||||
}
|
||||
@ -89,11 +91,9 @@ public class SqlLoadProcessedDocument {
|
||||
}
|
||||
|
||||
conn.commit();
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting document", ex);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {
|
||||
@ -117,8 +117,8 @@ public class SqlLoadProcessedDocument {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
e.printStackTrace();
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting failed document", ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
|
||||
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INITIALIZE_DOMAIN (
|
||||
IN ST INT,
|
||||
IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
|
||||
IN IDX INT,
|
||||
IN QUAL DOUBLE,
|
||||
IN DID INT)
|
||||
IN DID INT,
|
||||
IN IP VARCHAR(32))
|
||||
BEGIN
|
||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID;
|
||||
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
|
||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
|
||||
END
|
||||
""");
|
||||
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
|
||||
}
|
||||
}
|
||||
|
||||
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) {
|
||||
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||
data.setTargetDomain(domain);
|
||||
|
||||
loadDomains.load(data, domain);
|
||||
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
|
||||
{
|
||||
initCall.setInt(1, state.code);
|
||||
initCall.setString(1, state.name());
|
||||
initCall.setInt(2, 1 + data.sizeHint / 100);
|
||||
initCall.setDouble(3, quality);
|
||||
initCall.setInt(4, data.getDomainId(domain));
|
||||
initCall.setInt(3, data.getDomainId(domain));
|
||||
initCall.setString(4, ip);
|
||||
int rc = initCall.executeUpdate();
|
||||
if (rc < 1) {
|
||||
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc);
|
||||
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
|
||||
}
|
||||
conn.commit();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error initializing domain", ex);
|
||||
}
|
||||
|
||||
}
|
||||
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
UPDATE EC_DOMAIN TARGET
|
||||
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=?
|
||||
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
|
||||
SET TARGET.DOMAIN_ALIAS=ALIAS.ID
|
||||
WHERE TARGET.URL_PART=?
|
||||
WHERE TARGET.DOMAIN_NAME=?
|
||||
""")) {
|
||||
stmt.setString(1, link.to().toString());
|
||||
stmt.setString(2, link.from().toString());
|
||||
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting domain alias", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,14 @@
|
||||
package nu.marginalia.wmsa.edge.converting.loader;
|
||||
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Types;
|
||||
|
||||
@ -25,12 +28,14 @@ public class SqlLoadUrls {
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_URL (
|
||||
IN PROTO VARCHAR(255),
|
||||
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN URL VARCHAR(255)
|
||||
IN PATH VARCHAR(255),
|
||||
IN PARAM VARCHAR(255),
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME;
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
@ -42,12 +47,16 @@ public class SqlLoadUrls {
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
for (var url : urls) {
|
||||
if (url.path.length() >= 255) {
|
||||
logger.warn("Skipping bad URL {}", url);
|
||||
continue;
|
||||
}
|
||||
|
||||
insertCall.setString(1, url.proto);
|
||||
insertCall.setString(2, url.domain.toString());
|
||||
@ -58,10 +67,12 @@ public class SqlLoadUrls {
|
||||
insertCall.setNull(3, Types.INTEGER);
|
||||
}
|
||||
insertCall.setString(4, url.path);
|
||||
insertCall.setString(5, url.param);
|
||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
for (int rv = 0; rv < urls.length; rv++) {
|
||||
for (int rv = 0; rv < ret.length; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
|
||||
}
|
||||
@ -80,13 +91,26 @@ public class SqlLoadUrls {
|
||||
int urlId = rsp.getInt(1);
|
||||
String proto = rsp.getString(2);
|
||||
String path = rsp.getString(3);
|
||||
String param = rsp.getString(4);
|
||||
|
||||
data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId);
|
||||
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
|
||||
}
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
ex.printStackTrace();
|
||||
logger.warn("SQL error inserting URLs", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static final HashFunction murmur3_128 = Hashing.murmur3_128();
|
||||
private long hashPath(String path, String queryParam) {
|
||||
long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();
|
||||
|
||||
if (queryParam == null) {
|
||||
return pathHash;
|
||||
}
|
||||
else {
|
||||
return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
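hashPath above combines the murmur3-128 hash of the URL path with the hash of the query string (when present) to produce the PATH_HASH value inserted by INSERT_URL. A standalone sketch of that same computation; the example path and parameter are not from the commit.

// Standalone sketch mirroring SqlLoadUrls.hashPath above.
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

class PathHashExample {
    private static final HashFunction murmur3_128 = Hashing.murmur3_128();

    static long hashPath(String path, String queryParam) {
        long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();
        if (queryParam == null) {
            return pathHash;
        }
        return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();
    }

    public static void main(String... args) {
        System.out.println(hashPath("/wiki/Hash_function", null));     // path only
        System.out.println(hashPath("/search", "q=marginalia"));       // path plus query parameter
    }
}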
@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception {
    }

    public enum DisqualificationReason {
        LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY
        LENGTH,
        CONTENT_TYPE,
        LANGUAGE,
        STATUS,
        QUALITY,
        ACCEPTABLE_ADS
    }
}
@ -0,0 +1,22 @@
package nu.marginalia.wmsa.edge.converting.processor;

import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import org.jsoup.nodes.Document;


public class AcceptableAds {
    /* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers.
     *
     * In practice, from looking at crawled data, the only sites in the crawled corpus that seem to
     * follow this standard are domain squatters and other nuisance sites.
     *
     */

    public static boolean hasAcceptableAdsTag(Document parsedDocument) {
        return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey");
    }

    public static boolean hasAcceptableAdsHeader(CrawledDocument document) {
        return document.headers.contains("X-Adblock-Key");
    }
}
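For context, a minimal sketch of how the tag check above catches the Acceptable Ads marker in raw HTML; the sample markup and key value are made up for illustration.

// Illustration only: detecting the data-adblockkey marker with jsoup, as hasAcceptableAdsTag does.
import org.jsoup.Jsoup;

class AcceptableAdsCheckExample {
    public static void main(String... args) {
        String html = "<html data-adblockkey=\"MFww...\"><body>parked domain</body></html>";
        boolean flagged = Jsoup.parse(html).getElementsByTag("html").hasAttr("data-adblockkey");
        System.out.println(flagged); // true -> the document would be disqualified as ACCEPTABLE_ADS
    }
}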
@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor;

import com.google.common.hash.HashCode;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@ -81,6 +77,10 @@ public class DocumentProcessor {

if (ret.state == EdgeUrlState.OK) {

if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}

if (isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);

@ -101,7 +101,7 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
logger.info("Disqualified {}: {}", ret.url, ex.reason);
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
@ -113,7 +113,19 @@ public class DocumentProcessor {
}

private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase());
if (crawledDocument.contentType == null) {
return false;
}

var ct = crawledDocument.contentType;

if (acceptedContentTypes.contains(ct))
return true;

if (ct.contains(";")) {
return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';')));
}
return false;
}

private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
@ -128,6 +140,11 @@ public class DocumentProcessor {
throws DisqualifiedException, URISyntaxException {

var doc = Jsoup.parse(crawledDocument.documentBody);

if (AcceptableAds.hasAcceptableAdsTag(doc)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}

var dld = sentenceExtractor.extractSentences(doc.clone());

checkDocumentLanguage(dld);
@ -158,7 +175,6 @@ public class DocumentProcessor {
var edgeDomain = url.domain;
tagWords.add("format:"+ret.standard.toString().toLowerCase());

tagWords.add("site:" + edgeDomain.toString().toLowerCase());
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
@ -167,19 +183,12 @@ public class DocumentProcessor {
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());

if (ret.features.contains(HtmlFeature.MEDIA)) {
tagWords.add("special:media");
}
if (ret.features.contains(HtmlFeature.TRACKING)) {
tagWords.add("special:tracking");
}
if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
tagWords.add("special:affiliate");
}
if (ret.features.contains(HtmlFeature.COOKIES)) {
tagWords.add("special:cookies");
if (domain.ip != null) {
tagWords.add("ip:" + domain.ip.toLowerCase()); // lower case because IPv6 is hexadecimal
}

ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
}
@ -196,7 +205,9 @@ public class DocumentProcessor {
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}

for (var frame : doc.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
for (var link : doc.select("link[rel=alternate]")) {
feedExtractor
.getFeedFromAlternateTag(baseUrl, link)
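In the reworked isAcceptedContentType above, a content type that carries parameters, such as text/html; charset=utf-8, is accepted by comparing only the part before the semicolon against the whitelist. A standalone sketch of that matching rule (the accepted set below is an assumption for illustration, not the project's actual list):

import java.util.Set;

class ContentTypeSketch {
    static final Set<String> ACCEPTED = Set.of("text/html", "application/xhtml+xml");

    static boolean isAccepted(String contentType) {
        if (contentType == null)
            return false;
        if (ACCEPTED.contains(contentType))
            return true;
        // strip parameters such as "; charset=utf-8" before comparing
        int semi = contentType.indexOf(';');
        if (semi >= 0)
            return ACCEPTED.contains(contentType.substring(0, semi));
        return false;
    }

    public static void main(String[] args) {
        System.out.println(isAccepted("text/html; charset=utf-8")); // true
        System.out.println(isAccepted("image/png"));                // false
    }
}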
@ -1,21 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class DomainProcessor {
private final DocumentProcessor documentProcessor;
private final Double minAvgDocumentQuality;

@Inject
public DomainProcessor(DocumentProcessor documentProcessor) {
public DomainProcessor(DocumentProcessor documentProcessor,
@Named("min-avg-document-quality") Double minAvgDocumentQuality
) {
this.documentProcessor = documentProcessor;
this.minAvgDocumentQuality = minAvgDocumentQuality;
}

public ProcessedDomain process(CrawledDomain crawledDomain) {
@ -37,17 +45,37 @@ public class DomainProcessor {
ret.documents.add(processedDoc);
}
}

}
else {
ret.documents = Collections.emptyList();
}

double averageQuality = getAverageQuality(ret.documents);
if (averageQuality < minAvgDocumentQuality) {
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
}

ret.state = getState(crawledDomain.crawlerStatus);

return ret;
}

private double getAverageQuality(List<ProcessedDocument> documents) {
int n = 0;
double q = 0.;
for (var doc : documents) {
if (doc.quality().isPresent()) {
n++;
q += doc.quality().getAsDouble();
}
}

if (n > 0) {
return q / n;
}
return -5.;
}

private EdgeDomainIndexingState getState(String crawlerStatus) {
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
case OK -> EdgeDomainIndexingState.ACTIVE;
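With the @Named("min-avg-document-quality") parameter and getAverageQuality above, a whole domain is disqualified when the mean quality of its processed documents drops below the configured threshold. A simplified sketch of that gating step, using a plain list of optional scores in place of ProcessedDocument and a made-up threshold (both are assumptions):

import java.util.List;
import java.util.OptionalDouble;

class QualityGateSketch {
    // average of the known scores; a low sentinel when nothing is known, mirroring getAverageQuality
    static double averageQuality(List<OptionalDouble> qualities) {
        int n = 0;
        double sum = 0;
        for (var q : qualities) {
            if (q.isPresent()) {
                n++;
                sum += q.getAsDouble();
            }
        }
        return n > 0 ? sum / n : -5.;
    }

    public static void main(String[] args) {
        double minAvgDocumentQuality = -3.0; // hypothetical threshold
        var qualities = List.of(OptionalDouble.of(-2.0), OptionalDouble.of(-6.5), OptionalDouble.empty());
        boolean disqualifyAll = averageQuality(qualities) < minAvgDocumentQuality;
        System.out.println(disqualifyAll); // (-2.0 - 6.5) / 2 = -4.25, below -3.0, so true
    }
}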
@ -15,7 +15,7 @@ public class InstructionsCompiler {
public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4);

ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.)));
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));

if (domain.documents != null) {
compileUrls(ret, domain.documents);
@ -42,15 +42,16 @@ public class InstructionsCompiler {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());

documents.stream().map(doc -> doc.url).forEach(seenUrls::add);

for (var doc : documents) {
if (doc.details == null) continue;
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
seenUrls.add(doc.url);

if (doc.details != null) {
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}

ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import crawlercommons.utils.Strings;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;

@ -35,7 +35,7 @@ public class DocumentValuator {
throw new DisqualifiedException(LENGTH);
}

return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty
- smutCoefficient;
@ -52,17 +52,13 @@ public class DocumentValuator {

double scriptPenalty = 0;
for (var tag : scriptTags) {
String srcTag = tag.attr("src");
if (Strings.isBlank(srcTag)) {
scriptPenalty += 1;
}
else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
String srcAttr = tag.attr("src");
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
scriptPenalty += 0.49;
}
else {
else if (!Strings.isBlank(srcAttr)) {
scriptPenalty += 1;
}

}
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
}
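The valuation tweak above adds one to the raw length so the logarithm stays defined for a degenerate document, and re-orders the script penalty so that inline scripts with a blank src are no longer the expensive case: WordPress and jQuery bundles cost 0.49 while other external scripts cost 1. A worked example with made-up inputs (the scale, offset and tag counts are placeholders, not the project's real constants):

class DocumentValueSketch {
    public static void main(String[] args) {
        int textBodyLength = 4_000;   // characters of visible text (assumed)
        int rawLength      = 40_000;  // characters of raw HTML (assumed)
        double scale = 1.0, offset = 0.0;    // per-standard constants (placeholders)
        double scriptPenalty = 2 * 0.49 + 1; // two WP/jQuery scripts plus one other external script
        double smutCoefficient = 0.0;

        double quality = Math.log(textBodyLength / (double) (1 + rawLength)) * scale
                       + offset
                       - scriptPenalty
                       - smutCoefficient;

        System.out.printf("quality = %.2f%n", quality); // log(4000/40001) is about -2.30, minus 1.98 gives about -4.28
    }
}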
@ -3,26 +3,32 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection;

public enum HtmlFeature {
MEDIA(0),
JS(1),
AFFILIATE_LINK(2),
TRACKING(3),
COOKIES(4)
MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING("special:tracking"),
COOKIES("special:cookies")
;

public final int bit;
private final String keyword;

HtmlFeature(int bit) {
this.bit = bit;
HtmlFeature(String keyword) {
this.keyword = keyword;
}

public String getKeyword() {
return keyword;
}

public static int encode(Collection<HtmlFeature> featuresAll) {
return featuresAll.stream().mapToInt(f -> 1 << f.bit).reduce(0, (l, r) -> (l|r));
int ret = 0;
for (var feature : featuresAll) {
ret |= (1 << (feature.ordinal()));
}
return ret;
}

public static boolean hasFeature(int value, HtmlFeature feature) {
return (value & (1<< feature.bit)) != 0;
}
public static int addFeature(int value, HtmlFeature feature) {
return (value | (1<< feature.bit));
return (value & (1<< feature.ordinal())) != 0;
}
}
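After this change each feature carries its search keyword directly and its bit position comes from the enum ordinal, so encode and hasFeature stay in sync as long as the declaration order is stable. A small round-trip sketch using a stand-in enum of the same shape (copied here for illustration; it is not the project class):

import java.util.Collection;
import java.util.EnumSet;

class HtmlFeatureSketch {
    enum Feature {
        MEDIA("special:media"), JS("special:scripts"), TRACKING("special:tracking");

        final String keyword;
        Feature(String keyword) { this.keyword = keyword; }
    }

    // pack a set of features into an int, one bit per ordinal
    static int encode(Collection<Feature> features) {
        int ret = 0;
        for (var f : features) ret |= (1 << f.ordinal());
        return ret;
    }

    static boolean hasFeature(int value, Feature f) {
        return (value & (1 << f.ordinal())) != 0;
    }

    public static void main(String[] args) {
        int bits = encode(EnumSet.of(Feature.JS, Feature.TRACKING));
        System.out.println(hasFeature(bits, Feature.JS));    // true
        System.out.println(hasFeature(bits, Feature.MEDIA)); // false
        System.out.println(Feature.JS.keyword);              // special:scripts
    }
}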
@ -102,26 +102,34 @@ public class LinkParser {
return url;
}

private static final Pattern paramRegex = Pattern.compile("\\?.*$");
private static final Pattern spaceRegex = Pattern.compile(" ");

@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
s = paramRegex.matcher(s).replaceAll("");

// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
return s;
}

// url looks like /my-page
if (s.startsWith("/")) {
return baseUrl.withPath(s).toString();
String[] parts = s.split("\\?", 2);
String path = parts[0];
String param;
if (parts.length > 1) {
param = QueryParams.queryParamsSanitizer(parts[0], parts[1]);
}
else {
param = null;
}

final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
// url looks like /my-page
if (path.startsWith("/")) {
return baseUrl.withPathAndParam(path, param).toString();
}

return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");

return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
}

// for a relative url that looks like /foo or /foo/bar; return / or /foo
@ -145,13 +153,8 @@ public class LinkParser {
}

private boolean isRelRelevant(String rel) {
if (null == rel) {
return true;
}
return switch (rel) {
case "noindex" -> false;
default -> true;
};
// this is null safe
return !"noindex".equalsIgnoreCase(rel);
}

private boolean isUrlRelevant(String href) {
@ -188,4 +191,5 @@ public class LinkParser {

return documentUrl;
}

}
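resolveUrl now splits the query string off the href, passes it through QueryParams.queryParamsSanitizer, and re-attaches whatever survives, instead of stripping parameters wholesale as the old paramRegex did. A rough standalone sketch of that splitting step against java.net.URI (EdgeUrl and the real sanitizer are not reproduced; the stub here simply drops every parameter, which is an assumption):

import java.net.URI;

class LinkResolveSketch {
    // stand-in for QueryParams.queryParamsSanitizer: keep nothing
    static String sanitize(String path, String query) {
        return null;
    }

    static String resolve(URI base, String href) {
        String[] parts = href.split("\\?", 2);
        String path = parts[0].replace(" ", "%20");   // escape spaces, like spaceRegex above
        String param = parts.length > 1 ? sanitize(parts[0], parts[1]) : null;

        URI resolved = base.resolve(path);
        return param == null ? resolved.toString() : resolved + "?" + param;
    }

    public static void main(String[] args) {
        URI base = URI.create("https://www.example.com/dir/page.html");
        System.out.println(resolve(base, "/my page?utm_source=x")); // https://www.example.com/my%20page
    }
}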
@ -72,7 +72,7 @@ public class LinkProcessor {
return false;
}

if (urlBlocklist.isForumLink(link)) {
if (urlBlocklist.isMailingListLink(link)) {
return false;
}

@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class QueryParams {

private static final Pattern paramSplitterPattern = Pattern.compile("&");

@Nullable
public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
if (queryParams == null) {
return null;
}

var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
.filter(param -> QueryParams.isPermittedParam(path, param))
.sorted()
.collect(Collectors.joining("&"));

if (ret.isBlank())
return null;

return ret;
}

public static boolean isPermittedParam(String path, String param) {
if (path.endsWith("index.php")) {
if (param.startsWith("showtopic"))
return true;
if (param.startsWith("showforum"))
return true;
}
if (path.endsWith("viewtopic.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("viewforum.php")) {
return param.startsWith("v=");
}
if (path.endsWith("showthread.php")) {
return (param.startsWith("t=") || param.startsWith("p="));
}
if (path.endsWith("showforum.php")) {
return param.startsWith("v=");
}
return false;
}
}
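The new QueryParams helper keeps only forum-style parameters that identify a topic or forum, drops everything else, and sorts what remains so equivalent URLs compare equal. A short usage sketch (the inputs are made up):

import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;

class QueryParamsDemo {
    public static void main(String[] args) {
        // a phpBB topic id is kept, the session id is discarded
        System.out.println(QueryParams.queryParamsSanitizer("/viewtopic.php", "t=123&sid=abcdef")); // t=123

        // tracking parameters on an ordinary page are dropped entirely
        System.out.println(QueryParams.queryParamsSanitizer("/article.html", "utm_source=x&utm_medium=y")); // null
    }
}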
@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {

private static final String domainsSql =
"""
SELECT ID, LOWER(EC_DOMAIN.URL_PART)
SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
FROM EC_DOMAIN
WHERE QUALITY_RAW>-100
AND INDEXED>0
AND STATE<2
WHERE INDEXED>0
AND STATE='ACTIVE' OR STATE='EXHAUSTED'
ORDER BY
INDEX_DATE ASC,
DISCOVER_DATE ASC,
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {

private static final String urlsSql =
"""
SELECT CONCAT(PROTO, "://", ?, URL)
FROM EC_URL
SELECT URL
FROM EC_URL_VIEW
WHERE DOMAIN_ID=?
ORDER BY
VISITED DESC,
@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
@ -30,19 +31,19 @@ public class CrawlJobExtractorPageRankMain {
|
||||
"""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE URL_PART=?
|
||||
WHERE DOMAIN_NAME=?
|
||||
""";
|
||||
private static final String specificDomainSqlFromId =
|
||||
"""
|
||||
SELECT LOWER(URL_PART)
|
||||
SELECT LOWER(DOMAIN_NAME)
|
||||
FROM EC_DOMAIN
|
||||
WHERE ID=?
|
||||
""";
|
||||
|
||||
private static final String urlsSql =
|
||||
"""
|
||||
SELECT CONCAT(PROTO, "://", ?, URL)
|
||||
FROM EC_URL
|
||||
SELECT URL
|
||||
FROM EC_URL_VIEW
|
||||
WHERE DOMAIN_ID=?
|
||||
ORDER BY
|
||||
VISITED DESC,
|
||||
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
|
||||
|
||||
Gson gson = new GsonBuilder().create();
|
||||
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
rpr.setMaxKnownUrls(750);
|
||||
|
||||
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
||||
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||
|
||||
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
||||
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
|
||||
@ -102,7 +105,7 @@ public class CrawlJobExtractorPageRankMain {
|
||||
try (var domainQuery = conn.prepareStatement(specificDomainSqlFromId);
|
||||
var urlQuery = conn.prepareStatement(urlsSql))
|
||||
{
|
||||
domainQuery.setInt(1, domainId.getId());
|
||||
domainQuery.setInt(1, domainId.id());
|
||||
ResultSet rsp = domainQuery.executeQuery();
|
||||
domainName = rsp.next() ? rsp.getString(1) : "";
|
||||
|
||||
@ -110,10 +113,10 @@ public class CrawlJobExtractorPageRankMain {
|
||||
spec.id = createId(new EdgeDomain(domainName));
|
||||
spec.urls = new ArrayList<>(1000);
|
||||
|
||||
spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.getId()));
|
||||
spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.id()));
|
||||
|
||||
urlQuery.setString(1, domainName.toString());
|
||||
urlQuery.setInt(2, domainId.getId());
|
||||
urlQuery.setInt(2, domainId.id());
|
||||
urlQuery.setFetchSize(1000);
|
||||
rsp = urlQuery.executeQuery();
|
||||
|
||||
|
@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class UrlBlocklist {
|
||||
private final List<Predicate<String>> patterns = new ArrayList<>();
|
||||
|
||||
// domains that have a lot of links but we know we don't want to crawl
|
||||
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
|
||||
"instagram.com", "youtube.com",
|
||||
"youtu.be", "amzn.to");
|
||||
|
||||
public UrlBlocklist() {
|
||||
patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate());
|
||||
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
|
||||
// Don't deep-crawl git repos
|
||||
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
|
||||
|
||||
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
|
||||
|
||||
// link farms &c
|
||||
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
|
||||
@ -22,34 +33,33 @@ public class UrlBlocklist {
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
try {
|
||||
if (badDomains.contains(url.domain.domain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ("github.com".equals(url.domain.domain)) {
|
||||
return url.path.chars().filter(c -> c == '/').count() > 2;
|
||||
}
|
||||
|
||||
return patterns.stream().anyMatch(p -> p.test(url.path));
|
||||
for (var p : patterns) {
|
||||
if (p.test(url.path))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
catch (StackOverflowError ex) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isForumLink(EdgeUrl linkUrl) {
|
||||
var path = linkUrl.path;
|
||||
if (path.startsWith("/forum")) {
|
||||
return true;
|
||||
}
|
||||
if (path.startsWith("/lists/")) {
|
||||
return true;
|
||||
}
|
||||
if (path.startsWith("mailinglist")) {
|
||||
return true;
|
||||
}
|
||||
if (path.contains("phpbb")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean isMailingListLink(EdgeUrl linkUrl) {
|
||||
var path = linkUrl.path;
|
||||
if (path.startsWith("/lists/")) {
|
||||
return true;
|
||||
}
|
||||
if (path.contains("mailinglist")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ public class CrawlerRetreiver {
|
||||
|
||||
if (queue.peek() != null) {
|
||||
var fst = queue.peek();
|
||||
var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/");
|
||||
var root = fst.domain.toRootUrl();
|
||||
if (known.add(root))
|
||||
queue.addFirst(root);
|
||||
}
|
||||
@ -110,7 +110,7 @@ public class CrawlerRetreiver {
|
||||
.build());
|
||||
}
|
||||
|
||||
var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/"));
|
||||
var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
|
||||
if (!fetchResult.ok()) {
|
||||
logger.debug("Bad status on {}", domain);
|
||||
return Optional.of(createErrorPostFromStatus(fetchResult));
|
||||
@ -121,6 +121,8 @@ public class CrawlerRetreiver {
|
||||
private CrawledDomain crawlDomain() {
|
||||
String ip = findIp(domain);
|
||||
|
||||
assert !queue.isEmpty();
|
||||
|
||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||
long crawlDelay = robotsRules.getCrawlDelay();
|
||||
|
||||
@ -209,7 +211,7 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseLink(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
@ -217,7 +219,7 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseFrame(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
@ -225,14 +227,14 @@ public class CrawlerRetreiver {
|
||||
linkParser.parseFrame(baseUrl, link)
|
||||
.filter(this::isSameDomain)
|
||||
.filter(u -> !urlBlocklist.isUrlBlocked(u))
|
||||
.filter(u -> !urlBlocklist.isForumLink(u))
|
||||
.filter(u -> !urlBlocklist.isMailingListLink(u))
|
||||
.filter(known::add)
|
||||
.ifPresent(queue::addLast);
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
|
||||
baseUrl = baseUrl.withPath("/");
|
||||
baseUrl = baseUrl.domain.toRootUrl();
|
||||
|
||||
for (var link : parsed.select("link[rel=canonical]")) {
|
||||
return linkParser.parseLink(baseUrl, link);
|
||||
|
@ -109,7 +109,7 @@ public class HttpFetcher {
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
.url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString())
.url(url.domain.toRootUrl().toString())
.build();

var call = client.newCall(head);
@ -293,7 +293,7 @@ public class HttpFetcher {

private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt");
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url)));
}
catch (Exception ex) {
|
@ -13,44 +13,14 @@ import java.util.Optional;
|
||||
|
||||
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
||||
public interface EdgeDataStoreDao {
|
||||
boolean isBlacklisted(EdgeDomain domain);
|
||||
|
||||
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
|
||||
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
|
||||
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
|
||||
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
|
||||
|
||||
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||
|
||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
|
||||
|
||||
|
||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||
|
||||
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
|
||||
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
|
||||
|
||||
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
|
||||
|
||||
|
||||
int getPagesKnown(EdgeId<EdgeDomain> domainId);
|
||||
int getPagesVisited(EdgeId<EdgeDomain> domainId);
|
||||
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
|
||||
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
double getDomainQuality(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
|
||||
|
||||
double getRank(EdgeId<EdgeDomain> domainId);
|
||||
|
||||
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
|
||||
}
|
||||
|
@ -17,13 +17,8 @@ import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Types;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@ -33,7 +28,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
|
||||
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
private static final String DEFAULT_PROTOCOL = "http";
|
||||
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
|
||||
@Inject
|
||||
public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
|
||||
@ -48,30 +42,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
domainIdCache.invalidateAll();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public boolean isBlacklisted(EdgeDomain domain) {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
|
||||
stmt.setString(1, domain.domain);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return domainIdCache.get(domain, () -> {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
@ -86,104 +63,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return urlIdCache.get(url, () -> {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
|
||||
stmt.setString(1, url.path);
|
||||
stmt.setString(2, url.domain.toString());
|
||||
stmt.setString(3, url.proto);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new EdgeId<>(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
// Lenient mode for http->https upgrades etc
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
|
||||
stmt.setString(1, url.path);
|
||||
stmt.setString(2, url.domain.toString());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new EdgeId<>(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
throw new NoSuchElementException(url.toString());
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw ex.getCause();
|
||||
private <T> String idList(List<EdgeId<T>> ids) {
|
||||
StringJoiner j = new StringJoiner(",", "(", ")");
|
||||
for (var id : ids) {
|
||||
j.add(Integer.toString(id.id()));
|
||||
}
|
||||
return j.toString();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
|
||||
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
|
||||
|
||||
if (urlIds.isEmpty())
|
||||
return results;
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
|
||||
.stream()
|
||||
.map(EdgeId::getId)
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining(",", "(", ")"))))
|
||||
{
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
static final Pattern badChars = Pattern.compile("[';\\\\]");
|
||||
private String saneString(String s) {
|
||||
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
|
||||
if (rsp.next()) {
|
||||
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
|
||||
}
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
|
||||
if (rsp.next()) {
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
||||
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
||||
}
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
|
||||
@ -193,16 +80,39 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
// This is SQL-injection safe, the IDs are of type int
|
||||
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
|
||||
String idString = idList(ids);
|
||||
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID, URL,
|
||||
TITLE, DESCRIPTION,
|
||||
QUALITY,
|
||||
WORDS_TOTAL, FORMAT, FEATURES,
|
||||
IP, DOMAIN_STATE,
|
||||
DATA_HASH
|
||||
FROM EC_URL_VIEW WHERE ID IN
|
||||
""" + idString)) {
|
||||
stmt.setFetchSize(ids.size());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
|
||||
var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
|
||||
EdgeUrl url = new EdgeUrl(rsp.getString(2));
|
||||
var val = new EdgeUrlDetails(rsp.getInt(1), url,
|
||||
rsp.getString(3), // title
|
||||
rsp.getString(4), // description
|
||||
rsp.getDouble(5), // quality
|
||||
rsp.getInt(6), // wordsTotal
|
||||
rsp.getString(7), // format
|
||||
rsp.getInt(8), // features
|
||||
rsp.getString(9), // ip
|
||||
EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState
|
||||
rsp.getInt(11), // dataHash
|
||||
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||
Integer.MAX_VALUE, // rankingId
|
||||
Double.MAX_VALUE, // termScore
|
||||
0 // queryLength
|
||||
);
|
||||
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
||||
result.add(val);
|
||||
}
|
||||
@ -214,82 +124,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
||||
final Set<BrowseResult> domains = new HashSet<>(count*3);
|
||||
|
||||
final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(q)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
||||
try (var stmt = connection.prepareStatement(q2)) {
|
||||
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
|
||||
try (var stmt = connection.prepareStatement(q3)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(2, count);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
|
||||
|
||||
return new ArrayList<>(domains);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
|
||||
final Set<BrowseResult> domains = new HashSet<>(count*3);
|
||||
|
||||
final String q = """
|
||||
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT
|
||||
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
|
||||
FROM EC_DOMAIN_NEIGHBORS
|
||||
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
@ -308,7 +149,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(q)) {
|
||||
stmt.setFetchSize(count);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(1, domainId.id());
|
||||
stmt.setInt(2, count);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -316,16 +157,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (domains.size() < count/2) {
|
||||
final String q2 = """
|
||||
SELECT EC_DOMAIN.ID, URL_PART
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
@ -339,7 +178,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
try (var stmt = connection.prepareStatement(q2)) {
|
||||
|
||||
stmt.setFetchSize(count/2);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(1, domainId.id());
|
||||
stmt.setInt(2, count/2 - domains.size());
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next() && domains.size() < count/2) {
|
||||
@ -347,9 +186,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -357,11 +194,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
|
||||
if (domains.size() < count/2) {
|
||||
final String q3 = """
|
||||
SELECT EC_DOMAIN.ID, URL_PART
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
SELECT EC_DOMAIN.ID, DOMAIN_NAME
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
|
||||
INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
WHERE B.DEST_DOMAIN_ID=?
|
||||
AND STATE<2
|
||||
AND KNOWN_URLS<1000
|
||||
@ -372,7 +209,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
LIMIT ?""";
|
||||
try (var stmt = connection.prepareStatement(q3)) {
|
||||
stmt.setFetchSize(count/2);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
stmt.setInt(1, domainId.id());
|
||||
stmt.setInt(2, count/2 - domains.size());
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
@ -381,9 +218,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -399,7 +234,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
@Override
|
||||
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
|
||||
|
||||
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?";
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
LIMIT ?
|
||||
""";
|
||||
List<BrowseResult> domains = new ArrayList<>(count);
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.prepareStatement(q)) {
|
||||
@ -410,9 +253,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
String domain = rsp.getString(2);
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
|
||||
|
||||
domains.add(new BrowseResult(url, id));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -428,8 +269,8 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, id.getId());
|
||||
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, id.id());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new EdgeDomain(rsp.getString(1));
|
||||
@ -439,330 +280,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
@Override @SneakyThrows
|
||||
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
|
||||
|
||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt =
|
||||
connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
||||
stmt.setFetchSize(limit);
|
||||
stmt.setInt(1, id.getId());
|
||||
stmt.setInt(2, limit);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@Override @SneakyThrows
|
||||
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
|
||||
|
||||
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt =
|
||||
connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
|
||||
stmt.setFetchSize(limit);
|
||||
stmt.setInt(1, id.getId());
|
||||
stmt.setInt(2, limit);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
ret.add(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, "https://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, "http://"+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, "https://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, "http://www."+name);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return Optional.of(new EdgeId<>(rsp.getInt(1)));
|
||||
}
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
logger.info("Could not resolve domain id for {}", name);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
return -5;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return EdgeDomainIndexingState.ERROR;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
List<EdgeDomain> results = new ArrayList<>(25);
|
||||
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
results.add(new EdgeDomain(rsp.getString(1)));
|
||||
}
|
||||
return results;
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
|
||||
Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
|
||||
stmt.setFetchSize(500);
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
edgeUrlByPath.remove(rs.getString(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return new ArrayList<>(edgeUrlByPath.values());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getRank(EdgeId<EdgeDomain> domainId) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
|
||||
stmt.setInt(1, domainId.getId());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getDouble(1);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
} catch (SQLException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) {
|
||||
stmt.setInt(1, state.code);
|
||||
if (null == alias) {
|
||||
stmt.setNull(2, Types.INTEGER);
|
||||
}
|
||||
else {
|
||||
stmt.setInt(2, getDomainId(alias).getId());
|
||||
}
|
||||
|
||||
stmt.setInt(3, minIndexed);
|
||||
stmt.setInt(4, getDomainId(domain).getId());
|
||||
stmt.executeUpdate();
|
||||
connection.commit();
|
||||
}
|
||||
catch (SQLException throwables) {
|
||||
logger.error("SQL error", throwables);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private double getDomainQuality(Connection connection, EdgeDomain src) {
|
||||
try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
|
||||
stmt.setString(1, src.toString());
|
||||
var res = stmt.executeQuery();
|
||||
|
||||
if (res.next()) {
|
||||
var q = res.getDouble(1);
|
||||
if (q > 0.5) {
|
||||
logger.warn("gDQ({}) -> 1", src);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("DB error", ex);
|
||||
}
|
||||
|
||||
return -5;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -9,7 +9,7 @@ import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
public interface EdgeDomainBlacklist {
|
||||
boolean isBlacklisted(int domainId);
|
||||
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
|
||||
return isBlacklisted(domainId.getId());
|
||||
return isBlacklisted(domainId.id());
|
||||
}
|
||||
default TIntHashSet getSpamDomains() {
|
||||
return new TIntHashSet();
|
||||
|
@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
|
||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
|
||||
stmt.setFetchSize(1000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
@ -1,13 +1,11 @@
|
||||
package nu.marginalia.wmsa.edge.index.radix;
|
||||
package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.EdgeIndexControl;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter;
|
||||
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.service.query.Query;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.Query;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -31,7 +29,7 @@ public class EdgeIndexBucket {
|
||||
@NotNull
|
||||
private final IndexServicesFactory servicesFactory;
|
||||
private final EdgeIndexControl indexControl;
|
||||
private final SearchIndexWriter writer;
|
||||
private final SearchIndexJournalWriter writer;
|
||||
|
||||
private final int id;
|
||||
|
@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
public class EdgeIndexControl {
|
||||
@ -21,13 +23,16 @@ public class EdgeIndexControl {
|
||||
|
||||
for (IndexBlock block : IndexBlock.values()) {
|
||||
try {
|
||||
servicesFactory.getIndexConverter(id, block);
|
||||
servicesFactory.convertIndex(id, block);
|
||||
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
}
|
||||
catch (ConversionUnnecessaryException unnecessary) {
|
||||
|
||||
// swallow quietly
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
@ -35,10 +40,6 @@ public class EdgeIndexControl {
|
||||
System.gc();
|
||||
}
|
||||
|
||||
public long wordCount(int id) {
|
||||
return servicesFactory.wordCount(id);
|
||||
}
|
||||
|
||||
public void switchIndexFiles(int id) throws Exception {
|
||||
servicesFactory.switchFilesJob(id).call();
|
||||
}
|
||||
|
@ -11,15 +11,24 @@ import gnu.trove.set.hash.TIntHashSet;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
import nu.marginalia.util.ListChunker;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.index.model.*;
|
||||
import nu.marginalia.wmsa.edge.index.service.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.model.*;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
@ -48,8 +57,11 @@ public class EdgeIndexService extends Service {
|
||||
@NotNull
|
||||
private final Initialization init;
|
||||
private final SearchIndexes indexes;
|
||||
private final KeywordLexicon keywordLexicon;
|
||||
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
private final Gson gson = new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.create();
|
||||
|
||||
private static final Histogram wmsa_edge_index_query_time
|
||||
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
|
||||
@ -66,12 +78,13 @@ public class EdgeIndexService extends Service {
|
||||
@Named("service-port") Integer port,
|
||||
Initialization init,
|
||||
MetricsServer metricsServer,
|
||||
SearchIndexes indexes
|
||||
) {
|
||||
SearchIndexes indexes,
|
||||
IndexServicesFactory servicesFactory) {
|
||||
super(ip, port, init, metricsServer);
|
||||
|
||||
this.init = init;
|
||||
this.indexes = indexes;
|
||||
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
||||
|
||||
Spark.post("/words/", this::putWords);
|
||||
Spark.post("/search/", this::search, gson::toJson);
|
||||
@ -173,29 +186,22 @@ public class EdgeIndexService extends Service {
|
||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
||||
EdgePageWords words, int idx
|
||||
) {
|
||||
SearchIndexWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
||||
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
||||
|
||||
if (!words.words.isEmpty()) {
|
||||
if (words.size() < 1000) {
|
||||
indexWriter.put(domainId, urlId, words.block, words.words);
|
||||
} else {
|
||||
chunks(words.words, 1000).forEach(chunk -> {
|
||||
indexWriter.put(domainId, urlId, words.block, chunk);
|
||||
});
|
||||
}
|
||||
}
|
||||
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
|
||||
|
||||
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
||||
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
|
||||
|
||||
indexWriter.put(header, entry);
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private <T> List<List<T>> chunks(Collection<T> coll, int size) {
|
||||
List<List<T>> ret = new ArrayList<>();
|
||||
List<T> data = List.copyOf(coll);
|
||||
|
||||
for (int i = 0; i < data.size(); i+=size) {
|
||||
ret.add(data.subList(i, Math.min(data.size(), i+size)));
|
||||
}
|
||||
|
||||
return ret;
|
||||
private long[] getOrInsertWordIds(List<String> words) {
|
||||
return words.stream()
|
||||
.filter(w -> w.getBytes().length < Byte.MAX_VALUE)
|
||||
.mapToLong(keywordLexicon::getOrInsert)
|
||||
.toArray();
|
||||
}
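The reworked putWords() above delegates chunking to ListChunker.chopList and caps each journal record at SearchIndexJournalEntry.MAX_LENGTH. A minimal sketch of that chopping step, assuming it behaves like the chunks() helper this change removes:

// Sketch only: assumes ListChunker.chopList partitions a list into slices of
// at most maxLen elements, equivalent to the removed chunks() helper above.
import java.util.ArrayList;
import java.util.List;

class ListChunkerSketch {
    static <T> List<List<T>> chopList(List<T> data, int maxLen) {
        List<List<T>> ret = new ArrayList<>();
        for (int i = 0; i < data.size(); i += maxLen) {
            ret.add(data.subList(i, Math.min(data.size(), i + maxLen)));
        }
        return ret;
    }
}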
|
||||
|
||||
private Object search(Request request, Response response) {
|
||||
@ -341,7 +347,7 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
||||
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
||||
.filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri))
|
||||
.filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
|
||||
.limit(specs.limitTotal * 3L)
|
||||
.distinct()
|
||||
.limit(Math.min(specs.limitByBucket
|
||||
@ -350,7 +356,7 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
|
||||
for (var result : resultsForBucket) {
|
||||
seenResults.add(result.url.getId());
|
||||
seenResults.add(result.url.id());
|
||||
}
|
||||
for (var result : resultsForBucket) {
|
||||
for (var searchTerm : sq.searchTermsInclude) {
|
||||
@ -401,7 +407,7 @@ public class EdgeIndexService extends Service {
|
||||
public boolean filterRawValue(int bucket, long value) {
|
||||
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
|
||||
|
||||
if (domain.getId() == Integer.MAX_VALUE) {
|
||||
if (domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -409,11 +415,11 @@ public class EdgeIndexService extends Service {
|
||||
}
|
||||
|
||||
long getKey(int bucket, EdgeId<EdgeDomain> id) {
|
||||
return ((long)bucket) << 32 | id.getId();
|
||||
return ((long)bucket) << 32 | id.id();
|
||||
}
|
||||
|
||||
public boolean test(int bucket, EdgeSearchResultItem item) {
|
||||
if (item.domain.getId() == Integer.MAX_VALUE) {
|
||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -431,7 +437,7 @@ public class EdgeIndexService extends Service {
|
||||
}
|
||||
|
||||
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
|
||||
if (item.domain.getId() == Integer.MAX_VALUE) {
|
||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
|
||||
|
@ -4,13 +4,19 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket;
|
||||
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader;
|
||||
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
|
||||
import nu.marginalia.wmsa.edge.index.service.index.*;
|
||||
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -32,15 +38,16 @@ public class IndexServicesFactory {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final PartitionedDataFile writerIndexFile;
|
||||
private final RootDataFile writerDictionaryFile;
|
||||
private final RootDataFile keywordLexiconFile;
|
||||
private final PartitionedDataFile preconverterOutputFile;
|
||||
private final DoublePartitionedDataFile indexReadWordsFile;
|
||||
private final DoublePartitionedDataFile indexReadUrlsFile;
|
||||
private final DoublePartitionedDataFile indexWriteWordsFile;
|
||||
private final DoublePartitionedDataFile indexWriteUrlsFile;
|
||||
private volatile static DictionaryWriter dictionaryWriter;
|
||||
private volatile static KeywordLexicon keywordLexicon;
|
||||
private final Long dictionaryHashMapSize;
|
||||
private final SearchIndexPartitioner partitoner;
|
||||
private final SearchIndexPartitioner partitioner;
|
||||
|
||||
@Inject
|
||||
public IndexServicesFactory(
|
||||
@Named("tmp-file-dir") Path tmpFileDir,
|
||||
@ -48,14 +55,14 @@ public class IndexServicesFactory {
|
||||
@Named("partition-root-slow-tmp") Path partitionRootSlowTmp,
|
||||
@Named("partition-root-fast") Path partitionRootFast,
|
||||
@Named("edge-writer-page-index-file") String writerIndexFile,
|
||||
@Named("edge-writer-dictionary-file") String writerDictionaryFile,
|
||||
@Named("edge-writer-dictionary-file") String keywordLexiconFile,
|
||||
@Named("edge-index-read-words-file") String indexReadWordsFile,
|
||||
@Named("edge-index-read-urls-file") String indexReadUrlsFile,
|
||||
@Named("edge-index-write-words-file") String indexWriteWordsFile,
|
||||
@Named("edge-index-write-urls-file") String indexWriteUrlsFile,
|
||||
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
|
||||
EdgeDomainBlacklist domainBlacklist,
|
||||
SearchIndexPartitioner partitoner
|
||||
SearchIndexPartitioner partitioner
|
||||
) {
|
||||
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
@ -63,41 +70,46 @@ public class IndexServicesFactory {
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
|
||||
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, writerIndexFile);
|
||||
this.writerDictionaryFile = new RootDataFile(partitionRootSlow, writerDictionaryFile);
|
||||
this.keywordLexiconFile = new RootDataFile(partitionRootSlow, keywordLexiconFile);
|
||||
this.indexReadWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadWordsFile);
|
||||
this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
|
||||
this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
|
||||
this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
|
||||
this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
|
||||
this.partitoner = partitoner;
|
||||
this.partitioner = partitioner;
|
||||
}
|
||||
|
||||
public SearchIndexWriterImpl getIndexWriter(int idx) {
|
||||
return new SearchIndexWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx));
|
||||
}
|
||||
|
||||
public DictionaryWriter getDictionaryWriter() {
|
||||
if (dictionaryWriter == null) {
|
||||
dictionaryWriter = new DictionaryWriter(writerDictionaryFile.get(), dictionaryHashMapSize, true);
|
||||
}
|
||||
return dictionaryWriter;
|
||||
public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
|
||||
return new SearchIndexJournalWriterImpl(getKeywordLexicon(), writerIndexFile.get(idx));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public DictionaryReader getDictionaryReader() {
|
||||
return new DictionaryReader(getDictionaryWriter());
|
||||
public KeywordLexicon getKeywordLexicon() {
|
||||
if (keywordLexicon == null) {
|
||||
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
|
||||
keywordLexicon = new KeywordLexicon(journal,
|
||||
new DictionaryHashMap(dictionaryHashMapSize));
|
||||
}
|
||||
return keywordLexicon;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public KeywordLexiconReadOnlyView getDictionaryReader() {
|
||||
return new KeywordLexiconReadOnlyView(getKeywordLexicon());
|
||||
|
||||
}
|
||||
|
||||
public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
|
||||
return new SearchIndexConverter(block, id, tmpFileDir,
|
||||
public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
|
||||
var converter = new SearchIndexConverter(block, id, tmpFileDir,
|
||||
preconverterOutputFile.get(id),
|
||||
indexWriteWordsFile.get(id, block.id),
|
||||
indexWriteUrlsFile.get(id, block.id),
|
||||
partitoner,
|
||||
partitioner,
|
||||
domainBlacklist
|
||||
);
|
||||
converter.convert();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public SearchIndexPreconverter getIndexPreconverter() {
|
||||
File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1];
|
||||
@ -106,7 +118,7 @@ public class IndexServicesFactory {
|
||||
}
|
||||
return new SearchIndexPreconverter(writerIndexFile.get(0),
|
||||
outputFiles,
|
||||
partitoner,
|
||||
partitioner,
|
||||
domainBlacklist
|
||||
);
|
||||
}
|
||||
@ -115,10 +127,6 @@ public class IndexServicesFactory {
|
||||
return preconverterOutputFile.get(i);
|
||||
}
|
||||
|
||||
public long wordCount(int id) {
|
||||
return SearchIndexConverter.wordCount(writerIndexFile.get(0));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public SearchIndexReader getIndexReader(int id) {
|
||||
EnumMap<IndexBlock, SearchIndex> indexMap = new EnumMap<>(IndexBlock.class);
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.service.index;
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
public class ConversionUnnecessaryException extends Exception {
|
||||
public ConversionUnnecessaryException() {
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.service;
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
@ -0,0 +1,213 @@
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
import nu.marginalia.util.RandomWriteFunnel;
|
||||
import nu.marginalia.util.btree.BTreeWriter;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
|
||||
|
||||
public class SearchIndexConverter {
|
||||
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
|
||||
|
||||
private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
|
||||
|
||||
private final Path tmpFileDir;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final IndexBlock block;
|
||||
private final int bucketId;
|
||||
|
||||
private final File inputFile;
|
||||
private final File outputFileWords;
|
||||
private final File outputFileUrls;
|
||||
|
||||
private final SearchIndexPartitioner partitioner;
|
||||
private final EdgeDomainBlacklist blacklist;
|
||||
|
||||
private final static int internalSortLimit =
|
||||
Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256;
|
||||
|
||||
public SearchIndexConverter(IndexBlock block,
|
||||
int bucketId,
|
||||
Path tmpFileDir,
|
||||
File inputFile,
|
||||
File outputFileWords,
|
||||
File outputFileUrls,
|
||||
SearchIndexPartitioner partitioner,
|
||||
EdgeDomainBlacklist blacklist)
|
||||
{
|
||||
this.block = block;
|
||||
this.bucketId = bucketId;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.inputFile = inputFile;
|
||||
this.outputFileWords = outputFileWords;
|
||||
this.outputFileUrls = outputFileUrls;
|
||||
this.partitioner = partitioner;
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void convert() throws IOException {
|
||||
Files.deleteIfExists(outputFileWords.toPath());
|
||||
Files.deleteIfExists(outputFileUrls.toPath());
|
||||
|
||||
SearchIndexJournalReader journalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
|
||||
|
||||
if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) {
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
|
||||
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
lock.lock();
|
||||
|
||||
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
||||
|
||||
logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
|
||||
WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
|
||||
|
||||
logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
|
||||
createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
|
||||
|
||||
Files.delete(tmpUrlsFile);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to convert", ex);
|
||||
throw ex;
|
||||
}
|
||||
finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
|
||||
File outputFileWords) throws IOException
|
||||
{
|
||||
final int topWord = (int) journalReader.fileHeader.wordCount();
|
||||
|
||||
WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
|
||||
|
||||
for (var entry : journalReader) {
|
||||
if (!isRelevantEntry(entry)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
if (wordId < 0 || wordId >= topWord) {
|
||||
logger.warn("Bad wordId {}", wordId);
|
||||
}
|
||||
wordsTableWriter.acceptWord(wordId);
|
||||
}
|
||||
}
|
||||
|
||||
wordsTableWriter.write(outputFileWords);
|
||||
|
||||
return wordsTableWriter.getTable();
|
||||
}
|
||||
|
||||
private void createUrlTable(SearchIndexJournalReader journalReader,
|
||||
Path tmpUrlsFile,
|
||||
WordIndexOffsetsTable wordOffsetsTable) throws IOException
|
||||
{
|
||||
long numberOfWordsTotal = 0;
|
||||
for (var entry : journalReader) {
|
||||
if (isRelevantEntry(entry))
|
||||
numberOfWordsTotal += entry.wordCount();
|
||||
}
|
||||
|
||||
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
|
||||
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
|
||||
|
||||
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
|
||||
int[] wordWriteOffset = new int[wordOffsetsTable.length()];
|
||||
|
||||
for (var entry : journalReader) {
|
||||
if (!isRelevantEntry(entry)) continue;
|
||||
|
||||
var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
|
||||
if (wordId >= wordWriteOffset.length)
|
||||
continue;
|
||||
if (wordId < 0) {
|
||||
logger.warn("Negative wordId {}", wordId);
|
||||
}
|
||||
|
||||
final long urlInternal = translateUrl(entry.docId());
|
||||
if (wordId > 0) {
|
||||
rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
|
||||
} else {
|
||||
rwf.put(wordWriteOffset[wordId]++, urlInternal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rwf.write(urlsTmpFileChannel);
|
||||
}
|
||||
|
||||
urlsTmpFileChannel.force(false);
|
||||
|
||||
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
|
||||
if (wordOffsetsTable.length() > 0) {
|
||||
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
|
||||
|
||||
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
|
||||
|
||||
urlsTmpFileMap.force();
|
||||
} else {
|
||||
logger.warn("urls table empty -- nothing to sort");
|
||||
}
|
||||
}
|
||||
|
||||
try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
|
||||
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
|
||||
|
||||
wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
|
||||
// Note: The return value is accumulated into accumulatorIdx!
|
||||
|
||||
return writer.write(accumulatorIdx, length,
|
||||
slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
|
||||
});
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Error while writing BTree", e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private long translateUrl(long url) {
|
||||
int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
|
||||
return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
|
||||
}
|
||||
|
||||
private boolean isRelevantEntry(SearchIndexJournalReader.JournalEntry entry) {
|
||||
return block.equals(entry.header.block())
|
||||
&& !blacklist.isBlacklisted(entry.domainId())
|
||||
&& partitioner.filterUnsafe(entry.domainId(), bucketId);
|
||||
}
|
||||
|
||||
}
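translateUrl() above, like JournalEntry.domainId()/urlId() further down, relies on the 64-bit document id layout: domain id in the high 32 bits, url id in the low 32 bits. A small standalone illustration (the ids are invented):

// Illustration of the document id packing used by the converter and journal.
public class DocIdPackingDemo {
    public static void main(String[] args) {
        int domainId = 7;
        int urlId = 1234;

        long docId = ((long) domainId << 32) | (urlId & 0xFFFFFFFFL);

        System.out.println(docId >>> 32);           // 7    -- domain part
        System.out.println(docId & 0xFFFFFFFFL);    // 1234 -- url part
    }
}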
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.service;
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
@ -10,7 +10,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -18,41 +18,28 @@ import org.slf4j.LoggerFactory;
|
||||
@Singleton
|
||||
public class SearchIndexDao {
|
||||
private final HikariDataSource dataSource;
|
||||
private RankingDomainFetcher rankingDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchIndexDao(HikariDataSource dataSource,
|
||||
RankingDomainFetcher rankingDomains,
|
||||
RankingSettings rankingSettings)
|
||||
{
|
||||
this.dataSource = dataSource;
|
||||
this.rankingDomains = rankingDomains;
|
||||
this.rankingSettings = rankingSettings;
|
||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntHashSet getSpamDomains() {
|
||||
final TIntHashSet result = new TIntHashSet(1_000_000);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
result.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntHashSet goodUrls() {
|
||||
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
|
||||
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
|
||||
stmt.setFetchSize(10_000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
@ -79,36 +66,36 @@ public class SearchIndexDao {
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getRetroDomains() {
|
||||
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getSmallWebDomains() {
|
||||
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new));
|
||||
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
||||
|
||||
rpr.setMaxKnownUrls(750);
|
||||
|
||||
return rpr.pageRankWithPeripheralNodes(rpr.size(), false);
|
||||
return rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getAcademiaDomains() {
|
||||
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getStandardDomains() {
|
||||
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2, false);
|
||||
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getSpecialDomains() {
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2")
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
@ -1,11 +1,9 @@
|
||||
package nu.marginalia.wmsa.edge.index.service.query;
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking;
|
||||
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -124,7 +122,7 @@ public class SearchIndexPartitioner {
|
||||
public Lock getReadLock() {
|
||||
return rwl.readLock();
|
||||
}
|
||||
public boolean filterUnsafe(Lock lock, int domainId, int bucketId) {
|
||||
public boolean filterUnsafe(int domainId, int bucketId) {
|
||||
return partitionSet.test(domainId, bucketId);
|
||||
}
|
||||
|
@ -1,10 +1,11 @@
|
||||
package nu.marginalia.wmsa.edge.index.service.index;
|
||||
package nu.marginalia.wmsa.edge.index.conversion;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -47,23 +48,16 @@ public class SearchIndexPreconverter {
|
||||
}
|
||||
}
|
||||
|
||||
final RandomAccessFile raf = new RandomAccessFile(inputFile, "r");
|
||||
SearchIndexJournalReader indexJournalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
|
||||
|
||||
var fileLength = raf.readLong();
|
||||
var wordCount = raf.readInt();
|
||||
final int wordCountOriginal = wordCount;
|
||||
final long wordCountOriginal = indexJournalReader.fileHeader.wordCount();
|
||||
|
||||
logger.info("Word Count: {}", wordCount);
|
||||
logger.info("File Length: {}", fileLength);
|
||||
|
||||
var channel = raf.getChannel();
|
||||
|
||||
ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000);
|
||||
logger.info("{}", indexJournalReader.fileHeader);
|
||||
|
||||
RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length];
|
||||
for (int i = 0; i < randomAccessFiles.length; i++) {
|
||||
randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw");
|
||||
randomAccessFiles[i].seek(12);
|
||||
randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
|
||||
}
|
||||
FileChannel[] fileChannels = new FileChannel[outputFiles.length];
|
||||
for (int i = 0; i < fileChannels.length; i++) {
|
||||
@ -74,33 +68,24 @@ public class SearchIndexPreconverter {
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
lock.lock();
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
|
||||
|
||||
while (channel.position() < fileLength) {
|
||||
inByteBuffer.clear();
|
||||
inByteBuffer.limit(CHUNK_HEADER_SIZE);
|
||||
channel.read(inByteBuffer);
|
||||
inByteBuffer.flip();
|
||||
long urlId = inByteBuffer.getLong();
|
||||
int chunkBlock = inByteBuffer.getInt();
|
||||
int count = inByteBuffer.getInt();
|
||||
// inByteBuffer.clear();
|
||||
inByteBuffer.limit(count * 4 + CHUNK_HEADER_SIZE);
|
||||
channel.read(inByteBuffer);
|
||||
inByteBuffer.position(CHUNK_HEADER_SIZE);
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
wordCount = Math.max(wordCount, 1 + inByteBuffer.getInt());
|
||||
for (var entry : indexJournalReader) {
|
||||
if (!partitioner.isGoodUrl(entry.urlId())
|
||||
|| spamDomains.contains(entry.domainId())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
inByteBuffer.position(count * 4 + CHUNK_HEADER_SIZE);
|
||||
int domainId = entry.domainId();
|
||||
buffer.clear();
|
||||
entry.copyToBuffer(buffer);
|
||||
|
||||
for (int i = 0; i < randomAccessFiles.length; i++) {
|
||||
if (partitioner.filterUnsafe(domainId, i)) {
|
||||
buffer.flip();
|
||||
|
||||
if (isUrlAllowed(urlId)) {
|
||||
for (int i = 0; i < randomAccessFiles.length; i++) {
|
||||
if (partitioner.filterUnsafe(lock, (int) (urlId >>> 32L), i)) {
|
||||
inByteBuffer.flip();
|
||||
fileChannels[i].write(inByteBuffer);
|
||||
}
|
||||
while (buffer.position() < buffer.limit())
|
||||
fileChannels[i].write(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -109,27 +94,16 @@ public class SearchIndexPreconverter {
|
||||
lock.unlock();
|
||||
}
|
||||
|
||||
if (wordCountOriginal < wordCount) {
|
||||
logger.warn("Raised word count {} => {}", wordCountOriginal, wordCount);
|
||||
}
|
||||
|
||||
for (int i = 0; i < randomAccessFiles.length; i++) {
|
||||
long pos = randomAccessFiles[i].getFilePointer();
|
||||
randomAccessFiles[i].seek(0);
|
||||
randomAccessFiles[i].writeLong(pos);
|
||||
randomAccessFiles[i].writeInt(wordCount);
|
||||
randomAccessFiles[i].writeLong(wordCountOriginal);
|
||||
fileChannels[i].force(true);
|
||||
fileChannels[i].close();
|
||||
randomAccessFiles[i].close();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isUrlAllowed(long url) {
|
||||
int urlId = (int)(url & 0xFFFF_FFFFL);
|
||||
int domainId = (int)(url >>> 32);
|
||||
|
||||
return partitioner.isGoodUrl(urlId) && !spamDomains.contains(domainId);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,10 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
public class WordIndexLengthsTable {
final long[] table;
public WordIndexLengthsTable(int size) {
this.table = new long[size];
}
public void increment(int idx) { table[idx]++; }
}
@ -0,0 +1,67 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
import java.io.IOException;
public class WordIndexOffsetsTable {
final long[] table;
public final int numberOfUsedWords;
public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) {
this.table = table;
this.numberOfUsedWords = numberOfUsedWords;
}
public int length() {
return table.length;
}
public void forEachRange(OffsetTableEntryConsumer o) throws IOException {
if (table[0] > 0) {
o.accept(0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
o.accept(start, length);
}
}
}
/**
* Fold over each span in the file, left to right, accumulating the return value
*/
public long foldRanges(OffsetTableEntryFoldConsumer o) throws IOException {
long total = 0;
if (table[0] > 0) {
total = o.accept(total,0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
total += o.accept(total, start, length);
}
}
return total;
}
public long get(int i) {
return table[i];
}
public interface OffsetTableEntryConsumer {
void accept(long start, int length) throws IOException;
}
public interface OffsetTableEntryFoldConsumer {
long accept(long accumulator, long start, int length) throws IOException;
}
}
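A hypothetical usage sketch of the range iteration above: with an offsets table of [1, 3, 4, 7, 7, 9], forEachRange reports one (start, length) span per word that actually has postings.

// Hypothetical usage: enumerate each word's postings span.
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;

import java.io.IOException;

public class OffsetsTableDemo {
    public static void main(String[] args) throws IOException {
        var offsets = new WordIndexOffsetsTable(new long[]{1, 3, 4, 7, 7, 9}, 5);

        offsets.forEachRange((start, length) ->
                System.out.println("start=" + start + " length=" + length));
        // prints (0,1), (1,2), (3,1), (4,3), (7,2); word 4 has no postings and is skipped
    }
}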
@ -0,0 +1,56 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
/** Contains a stateful table of word index offsets, initially in lengths mode
* where the table contains how many postings exist for each word; then in offsets
* mode, where the lengths are converted into the necessary offsets for each block
* of document data.
*
* Caveat! This uses the same underlying array to conserve space.
*
*/
public class WordIndexTables {
private WordIndexLengthsTable lengthsTable;
private WordIndexOffsetsTable offsetsTable;
private boolean converted = false;
public WordIndexTables(int size) {
lengthsTable = new WordIndexLengthsTable(size);
}
public WordIndexLengthsTable lengths() {
if (converted) throw new IllegalStateException("Table has been converted");
return lengthsTable;
}
public WordIndexOffsetsTable offsets() {
if (!converted) throw new IllegalStateException("Table has not been converted");
return offsetsTable;
}
public void convert() {
if (converted) throw new IllegalStateException("Table has been converted");
// Go from lengths to offsets, i.e.
// BEFORE: 1, 2, 1, 3, 0, 2
// AFTER: 1, 3, 4, 7, 7, 9
long[] table = lengthsTable.table;
int numberOfUsedWords = 0;
if (table[0] != 0) numberOfUsedWords = 1;
for (int i = 1; i < table.length; i++) {
if (table[i] != 0) {
numberOfUsedWords++;
}
table[i] += table[i-1];
}
lengthsTable = null;
offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords);
converted = true;
}
}
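Taken together with WordIndexLengthsTable and WordIndexOffsetsTable above, the intended call sequence is count, convert, then read offsets, which is what WordsTableWriter below does. A miniature sketch with invented word ids:

// Miniature end-to-end use of WordIndexTables, mirroring WordsTableWriter.
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexTables;

public class WordIndexTablesDemo {
    public static void main(String[] args) {
        var tables = new WordIndexTables(6);

        // lengths mode: count postings per word id (word ids are made up)
        int[] wordIds = {0, 1, 1, 2, 3, 3, 3, 5, 5};
        for (int wordId : wordIds) {
            tables.lengths().increment(wordId);
        }

        tables.convert();   // lengths 1,2,1,3,0,2 become offsets 1,3,4,7,7,9

        // offsets mode: word 3's postings occupy [offset(2), offset(3)) = [4, 7)
        System.out.println(tables.offsets().get(3));   // 7
    }
}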
@ -0,0 +1,75 @@
|
||||
package nu.marginalia.wmsa.edge.index.conversion.words;
|
||||
|
||||
import nu.marginalia.util.btree.BTreeWriter;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapFileLongSlice;
|
||||
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
|
||||
|
||||
public class WordsTableWriter {
|
||||
private final WordIndexTables table;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
|
||||
|
||||
public WordsTableWriter(int length) {
|
||||
table = new WordIndexTables(length);
|
||||
}
|
||||
|
||||
public void acceptWord(int wordId) {
|
||||
table.lengths().increment(wordId);
|
||||
}
|
||||
|
||||
public WordIndexOffsetsTable getTable() {
|
||||
return table.offsets();
|
||||
}
|
||||
|
||||
public void write(File file) throws IOException {
|
||||
table.convert();
|
||||
|
||||
logger.info("Writing table - {} max", table.offsets().numberOfUsedWords);
|
||||
|
||||
final int tableSize = table.offsets().numberOfUsedWords;
|
||||
|
||||
try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) {
|
||||
mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
|
||||
long offset = 1;
|
||||
|
||||
var writer = new BTreeWriter(mmf, wordsBTreeContext);
|
||||
|
||||
writer.write(offset, tableSize, this::writeBTreeDataBlock);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
|
||||
long urlFileOffset = 0;
|
||||
int idx = 0;
|
||||
|
||||
var offsetTable = table.offsets().table;
|
||||
|
||||
if (offsetTable[0] != 0) {
|
||||
int length = (int) offsetTable[0];
|
||||
mapSlice.put(idx++, (long)length<<32);
|
||||
mapSlice.put(idx++, 0);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
}
|
||||
|
||||
for (int i = 1; i < offsetTable.length; i++) {
|
||||
final int length = (int)(offsetTable[i] - offsetTable[i-1]);
|
||||
|
||||
if (length > 0) {
|
||||
mapSlice.put(idx++, (long)length << 32 | i);
|
||||
mapSlice.put(idx++, urlFileOffset);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,126 @@
|
||||
package nu.marginalia.wmsa.edge.index.journal;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapFileLongSlice;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalFileHeader;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Iterator;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
|
||||
|
||||
public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
|
||||
public static final long FILE_HEADER_SIZE_LONGS = 2;
|
||||
public static final long FILE_HEADER_SIZE_BYTES = 8*FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
public final SearchIndexJournalFileHeader fileHeader;
|
||||
|
||||
private final MultimapFileLongSlice map;
|
||||
private final long committedSize;
|
||||
|
||||
public SearchIndexJournalReader(MultimapFileLong map) {
|
||||
fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
|
||||
committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
map.advice(NativeIO.Advice.Sequential);
|
||||
|
||||
this.map = map.atOffset(FILE_HEADER_SIZE_LONGS);
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<JournalEntry> iterator() {
|
||||
return new JournalEntryIterator();
|
||||
}
|
||||
|
||||
private class JournalEntryIterator implements Iterator<JournalEntry> {
|
||||
private JournalEntry entry;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (entry == null) {
|
||||
return committedSize > 0;
|
||||
}
|
||||
|
||||
return entry.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public JournalEntry next() {
|
||||
if (entry == null) {
|
||||
entry = new JournalEntry(0);
|
||||
}
|
||||
else {
|
||||
entry = entry.next();
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
}
|
||||
|
||||
public class JournalEntry {
|
||||
private final long offset;
|
||||
public final SearchIndexJournalEntryHeader header;
|
||||
|
||||
JournalEntry(long offset) {
|
||||
final long sizeBlock = map.get(offset);
|
||||
final long docId = map.get(offset + 1);
|
||||
|
||||
this.offset = offset;
|
||||
this.header = new SearchIndexJournalEntryHeader(
|
||||
(int)(sizeBlock >>> 32L),
|
||||
docId,
|
||||
IndexBlock.byId((int)(sizeBlock & 0xFFFF_FFFFL)));
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextId() < committedSize;
|
||||
}
|
||||
public long docId() {
|
||||
return header.documentId();
|
||||
}
|
||||
public int domainId() {
|
||||
return (int) (docId() >>> 32L);
|
||||
}
|
||||
public int urlId() {
|
||||
return (int)(docId() & 0xFFFF_FFFFL);
|
||||
}
|
||||
public IndexBlock block() {
|
||||
return header.block();
|
||||
}
|
||||
public int wordCount() { return header.entrySize(); }
|
||||
|
||||
public SearchIndexJournalEntry readEntry() {
|
||||
long[] dest = new long[header.entrySize()];
|
||||
map.read(dest, offset + HEADER_SIZE_LONGS);
|
||||
return new SearchIndexJournalEntry(header.entrySize(), dest);
|
||||
}
|
||||
|
||||
public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) {
|
||||
if (dest.length >= header.entrySize()) {
|
||||
map.read(dest, header.entrySize(), offset + HEADER_SIZE_LONGS);
|
||||
return new SearchIndexJournalEntry(header.entrySize(), dest);
|
||||
}
|
||||
else {
|
||||
return readEntry();
|
||||
}
|
||||
}
|
||||
|
||||
public long nextId() {
|
||||
return offset + HEADER_SIZE_LONGS + header.entrySize();
|
||||
}
|
||||
public JournalEntry next() { return new JournalEntry(nextId()); }
|
||||
|
||||
public void copyToBuffer(ByteBuffer buffer) {
|
||||
var dest = buffer.asLongBuffer();
|
||||
dest.position(buffer.position() * 8);
|
||||
dest.limit(buffer.position()*8 + header.entrySize() + HEADER_SIZE_LONGS);
|
||||
map.read(dest, offset);
|
||||
buffer.position(dest.limit()*8);
|
||||
}
|
||||
}
|
||||
}
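For reading the journal back, the iterator above walks entry headers and payloads in file order. A hypothetical dump loop (the file path is a placeholder), mirroring how SearchIndexConverter consumes the journal:

// Hypothetical read loop over a journal file; the path is a placeholder.
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;

import java.nio.file.Path;

public class JournalDumpSketch {
    public static void main(String[] args) throws Exception {
        var reader = new SearchIndexJournalReader(
                MultimapFileLong.forReading(Path.of("/tmp/index-journal.dat")));

        long[] buffer = new long[SearchIndexJournalEntry.MAX_LENGTH];

        for (var entry : reader) {
            var words = entry.readEntryUsingBuffer(buffer);
            System.out.printf("domain=%d url=%d block=%s words=%d%n",
                    entry.domainId(), entry.urlId(), entry.block(), words.size());
        }
    }
}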
|
@ -0,0 +1,13 @@
package nu.marginalia.wmsa.edge.index.journal;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
public interface SearchIndexJournalWriter {
void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry);
void forceWrite();
void flushWords();
}
@ -1,13 +1,11 @@
|
||||
package nu.marginalia.wmsa.edge.index.service.index;
|
||||
package nu.marginalia.wmsa.edge.index.journal;
|
||||
|
||||
import io.reactivex.rxjava3.disposables.Disposable;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -17,23 +15,22 @@ import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SearchIndexWriterImpl implements SearchIndexWriter {
|
||||
private final DictionaryWriter dictionaryWriter;
|
||||
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
private final KeywordLexicon dictionaryWriter;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Disposable writerTask;
|
||||
private RandomAccessFile raf;
|
||||
private FileChannel channel;
|
||||
|
||||
public static final int MAX_BLOCK_SIZE = 1000*32*8*4;
|
||||
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
|
||||
private final ByteBuffer byteBuffer;
|
||||
private long pos;
|
||||
|
||||
@SneakyThrows
|
||||
public SearchIndexWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) {
|
||||
public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
|
||||
this.dictionaryWriter = dictionaryWriter;
|
||||
initializeIndexFile(indexFile);
|
||||
|
||||
@ -61,23 +58,16 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public synchronized void put(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block, List<String> wordsSuspect) {
|
||||
int numGoodWords = 0;
|
||||
for (String word : wordsSuspect) {
|
||||
if (word.length() < Byte.MAX_VALUE) numGoodWords++;
|
||||
}
|
||||
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
|
||||
|
||||
byteBuffer.clear();
|
||||
long url_id = ((long) domainId.getId() << 32) | urlId.getId();
|
||||
byteBuffer.putLong(url_id);
|
||||
byteBuffer.putInt(block.id);
|
||||
byteBuffer.putInt(numGoodWords);
|
||||
|
||||
for (String word : wordsSuspect) {
|
||||
if (word.length() < Byte.MAX_VALUE) {
|
||||
byteBuffer.putInt(dictionaryWriter.get(word));
|
||||
}
|
||||
}
|
||||
byteBuffer.putInt(entryData.size());
|
||||
byteBuffer.putInt(header.block().id);
|
||||
byteBuffer.putLong(header.documentId());
|
||||
|
||||
entryData.write(byteBuffer);
|
||||
|
||||
byteBuffer.limit(byteBuffer.position());
|
||||
byteBuffer.rewind();
|
||||
|
||||
@ -104,11 +94,11 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
|
||||
}
|
||||
|
||||
private void writePositionMarker() throws IOException {
|
||||
var lock = channel.lock(0, 12, false);
|
||||
var lock = channel.lock(0, 16, false);
|
||||
pos = channel.size();
|
||||
raf.seek(0);
|
||||
raf.writeLong(pos);
|
||||
raf.writeInt(dictionaryWriter.size());
|
||||
raf.writeLong(dictionaryWriter.size());
|
||||
raf.seek(pos);
|
||||
lock.release();
|
||||
}
|
@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.index.journal.model;
import java.nio.ByteBuffer;
import java.util.Arrays;
public class SearchIndexJournalEntry {
private final int size;
private final long[] underlyingArray;
public static final int MAX_LENGTH = 1000;
public SearchIndexJournalEntry(long[] underlyingArray) {
this.size = underlyingArray.length;
this.underlyingArray = underlyingArray;
}
public SearchIndexJournalEntry(int size, long[] underlyingArray) {
this.size = size;
this.underlyingArray = underlyingArray;
}
public void write(ByteBuffer buffer) {
for (int i = 0; i < size; i++) {
buffer.putLong(underlyingArray[i]);
}
}
public long get(int idx) {
if (idx >= size)
throw new ArrayIndexOutOfBoundsException();
return underlyingArray[idx];
}
public int size() {
return size;
}
public long[] toArray() {
if (size == underlyingArray.length)
return underlyingArray;
else
return Arrays.copyOf(underlyingArray, size);
}
public String toString() {
return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
}
}
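Read together with the writer interface above, a journal record is assembled from a header plus a word-id payload. A hedged sketch of that call sequence (the ids and word-id array are invented, and the helper name is hypothetical):

// Hypothetical assembly of one journal record, mirroring putWords() earlier in the diff.
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

public class JournalEntryAssemblySketch {
    static void writeExample(SearchIndexJournalWriter writer, IndexBlock block) {
        long[] wordIds = {101L, 205L, 999L};   // ids as returned by the keyword lexicon

        var entry = new SearchIndexJournalEntry(wordIds);
        var header = new SearchIndexJournalEntryHeader(
                new EdgeId<EdgeDomain>(5), new EdgeId<EdgeUrl>(42), block);

        writer.put(header, entry);
        writer.forceWrite();
    }
}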
Some files were not shown because too many files have changed in this diff.