Revert "Revert "Merge branch 'experimental' into master""

This reverts commit 81c77e7fcb.
vlofgren 2022-06-16 14:09:57 +02:00
parent 5ef953ae3d
commit 2e55599850
100 changed files with 1564 additions and 1654 deletions

View File

@@ -59,12 +59,12 @@ dependencies {
 implementation "com.sparkjava:spark-core:2.9.3"
 implementation 'com.opencsv:opencsv:5.6'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.1'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.1'
-implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.1'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
+implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
 implementation 'org.slf4j:slf4j-api:1.7.36'

@@ -76,7 +76,6 @@ dependencies {
 implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
 implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
-testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.3.1'
 implementation 'org.jsoup:jsoup:1.14.3'
 implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'

@@ -86,7 +85,7 @@ dependencies {
 implementation 'com.zaxxer:HikariCP:5.0.1'
-implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
+implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
 implementation 'io.prometheus:simpleclient:0.15.0'
 implementation 'io.prometheus:simpleclient_servlet:0.15.0'
 implementation 'io.prometheus:simpleclient_httpserver:0.15.0'

@@ -123,15 +122,19 @@ dependencies {
 testImplementation 'org.projectlombok:lombok:1.18.24'
 testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
+testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
+testImplementation platform('org.testcontainers:testcontainers-bom:1.17.2')
+testImplementation 'org.testcontainers:mariadb:1.17.2'
+testImplementation "org.testcontainers:junit-jupiter:1.17.2"
 e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
 e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
 e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
-e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
-e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
-e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
-e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
-e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
-e2eTestImplementation "org.testcontainers:selenium:1.17.1"
+e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
+e2eTestImplementation 'org.testcontainers:nginx:1.17.2'
+e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
+e2eTestImplementation "org.testcontainers:selenium:1.17.2"
 e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
 e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
 }

View File

@@ -28,6 +28,7 @@ import java.util.ArrayList;
 import java.util.List;

 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;

 @Tag("e2e")
 @Testcontainers

@@ -156,6 +157,16 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 return wikipediaFiles.toString();
 }

+private List<String> getTitlesFromSearchResults(String html) {
+List<String> ret = new ArrayList<>();
+
+for (var title : Jsoup.parse(html).select(".card.search-result > h2")) {
+ret.add(title.text());
+}
+
+return ret;
+}
+
 @Test
 public void testFrontPage() throws IOException {
 var driver = chrome.getWebDriver();

@@ -173,8 +184,9 @@
 driver.get("http://proxyNginx/search?query=bird&profile=corpo");
 System.out.println(driver.getTitle());
-System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
+assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));

 Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
 }

@@ -187,20 +199,23 @@
 System.out.println(driver.getTitle());
 System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));

 Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-info"));
 }

 @Test
 public void testSiteSearch() throws IOException {
 var driver = chrome.getWebDriver();

 driver.get("http://proxyNginx/search?query=site:wikipedia.local%20frog");
 System.out.println(driver.getTitle());
-System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
+assertEquals(List.of("Frog", "Binomial nomenclature", "Amphibian", "Mantis"), getTitlesFromSearchResults(html));

 Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
 }

 @Test
 public void testBrowse() throws IOException {
 var driver = chrome.getWebDriver();

@@ -209,7 +224,6 @@
 System.out.println(driver.getTitle());
 System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
-
 Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
 }

 @Test

@@ -220,7 +234,6 @@
 System.out.println(driver.getTitle());
 System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
-
 Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
 }

 @Test

View File

@@ -69,4 +69,4 @@
 memex
 dating dating
 EOF
-WMSA_HOME=${HOME} java -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1
+WMSA_HOME=${HOME} java -server -Xmx2G -Dsmall-ram=TRUE -Dservice-host=0.0.0.0 -jar /WMSA.jar start $1

View File

@@ -3,6 +3,7 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -12,9 +13,9 @@ import java.io.IOException;
 public class BTreeWriter {
 private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
 private final BTreeContext ctx;
-private final MultimapFileLong map;
+private final MultimapFileLongSlice map;

-public BTreeWriter(MultimapFileLong map, BTreeContext ctx) {
+public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
 this.map = map;
 this.ctx = ctx;
 }

@@ -31,13 +32,18 @@ public class BTreeWriter {
 return size;
 }

-public long write(long offset, int numEntries, WriteCallback writeIndex)
+/** Construct a BTree with numEntries entries at offset in the associated map
+ *
+ * @return The size of the written data
+ */
+public long write(long offset, int numEntries, WriteCallback writeIndexCallback)
 throws IOException
 {
-var header = makeHeader(offset, numEntries);
+BTreeHeader header = makeHeader(offset, numEntries);
 header.write(map, offset);
-writeIndex.write(header.dataOffsetLongs());
+writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));

 if (header.layers() < 1) {
 return ctx.calculateSize(numEntries);

View File

@@ -1,7 +1,9 @@
 package nu.marginalia.util.btree;

+import nu.marginalia.util.multimap.MultimapFileLongSlice;
+
 import java.io.IOException;

 public interface WriteCallback {
-void write(long offset) throws IOException;
+void write(MultimapFileLongSlice slice) throws IOException;
 }
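
A minimal usage sketch (not part of the commit) of the new WriteCallback contract: the callback now receives a MultimapFileLongSlice already positioned at the data offset, so writes are relative to slice index 0 instead of an absolute file offset. The setup of BTreeContext and the one-long-per-entry layout are assumptions for illustration.

    import nu.marginalia.util.btree.BTreeWriter;
    import nu.marginalia.util.btree.model.BTreeContext;
    import nu.marginalia.util.multimap.MultimapFileLong;

    import java.io.IOException;

    class WriteCallbackSketch {
        long writeSortedKeys(MultimapFileLong map, BTreeContext ctx, long offset, long[] keys) throws IOException {
            var writer = new BTreeWriter(map, ctx);
            // The callback gets a slice starting at the data section,
            // so entry i is written at slice index i.
            return writer.write(offset, keys.length, slice -> {
                for (int i = 0; i < keys.length; i++) {
                    slice.put(i, keys[i]); // assumes one long per entry
                }
            });
        }
    }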

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.util.btree.model;

 import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;

 public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
 public BTreeHeader {

@@ -28,7 +29,7 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
 return padding;
 }

-public void write(MultimapFileLong dest, long offset) {
+public void write(MultimapFileLongSlice dest, long offset) {
 dest.put(offset, ((long) layers << 32L) | ((long)numEntries & 0xFFFF_FFFFL));
 dest.put(offset+1, indexOffsetLongs);
 dest.put(offset+2, dataOffsetLongs);

View File

@ -1,9 +1,7 @@
package nu.marginalia.util.hash; package nu.marginalia.util.hash;
import io.prometheus.client.Gauge;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.Getter; import lombok.Getter;
import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable;
import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.PrimeUtil; import nu.marginalia.util.PrimeUtil;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -17,9 +15,7 @@ import static java.lang.Math.round;
*/ */
public class LongPairHashMap { public class LongPairHashMap {
private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class); private static final Logger logger = LoggerFactory.getLogger(LongPairHashMap.class);
private static final Gauge probe_count_metrics private static final long MAGIC_WORD = 0xE00E00E00E0E0E0EL; // it's the data police
= Gauge.build("wmsa_wordfile_hash_map_probe_count", "Probing Count")
.register();
private final long hashTableSize; private final long hashTableSize;
private final MultimapFileLong data; private final MultimapFileLong data;
@ -27,26 +23,37 @@ public class LongPairHashMap {
private int sz = 0; private int sz = 0;
private static final int HEADER_SIZE = 2; private static final int HEADER_SIZE = 2;
public LongPairHashMap(MultimapFileLong data, long size) { private LongPairHashMap(MultimapFileLong data, long hashTableSize, long maxProbeLength) {
this.data = data; this.data = data;
// Actually use a prime size for Donald Knuth reasons this.hashTableSize = hashTableSize;
hashTableSize = PrimeUtil.nextPrime(size, 1); this.maxProbeLength = maxProbeLength;
maxProbeLength = hashTableSize / 2; }
logger.debug("Table size = " + hashTableSize); public static LongPairHashMap createNew(MultimapFileLong data, long size) {
var tableSize = PrimeUtil.nextPrime(size, 1);
var ret = new LongPairHashMap(data, tableSize, tableSize/2);
data.put(0, IndexWordsTable.Strategy.HASH.ordinal()); data.put(0, MAGIC_WORD);
data.put(1, hashTableSize); data.put(1, tableSize);
for (int i = 2; i < hashTableSize; i++) {
for (int i = 2; i < tableSize; i++) {
data.put(HEADER_SIZE + 2L*i, 0); data.put(HEADER_SIZE + 2L*i, 0);
} }
}
public LongPairHashMap(MultimapFileLong data) {
this.data = data;
hashTableSize = data.get(1);
maxProbeLength = hashTableSize / 10;
logger.debug("Table size = " + hashTableSize); return ret;
}
public static LongPairHashMap loadExisting(MultimapFileLong data) {
long key = data.get(0);
if (key != MAGIC_WORD) {
logger.warn("LongPairHashMap lacks magic word, could this be garbage data?");
}
var hashTableSize = data.get(1);
var maxProbeLength = hashTableSize / 10;
return new LongPairHashMap(data, hashTableSize, maxProbeLength);
} }
public int size() { public int size() {
@ -91,8 +98,6 @@ public class LongPairHashMap {
final var val = getCell(idx); final var val = getCell(idx);
if (!val.isSet()) { if (!val.isSet()) {
probe_count_metrics.set(j);
return setValue(data, idx); return setValue(data, idx);
} }
else if (val.getKey() == data.getKey()) { else if (val.getKey() == data.getKey()) {
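
With the constructor split into factory methods, creating and re-opening a map are now separate code paths guarded by the magic word. A short sketch of the intended call pattern, assuming mmf is an open MultimapFileLong and the size is illustrative:

    import nu.marginalia.util.hash.LongPairHashMap;
    import nu.marginalia.util.multimap.MultimapFileLong;

    class LongPairHashMapSketch {
        LongPairHashMap open(MultimapFileLong mmf, boolean fresh) {
            // createNew() picks a prime table size >= the requested size and
            // stamps word 0 with MAGIC_WORD; loadExisting() reads the table
            // size back from word 1 and warns if the magic word is absent.
            return fresh ? LongPairHashMap.createNew(mmf, 1_000_000)
                         : LongPairHashMap.loadExisting(mmf);
        }
    }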

View File

@@ -21,7 +21,7 @@ import static java.nio.channels.FileChannel.MapMode.READ_WRITE;
 import static nu.marginalia.util.FileSizeUtil.readableSize;

-public class MultimapFileLong implements AutoCloseable {
+public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {

 private final ArrayList<LongBuffer> buffers = new ArrayList<>();
 private final ArrayList<MappedByteBuffer> mappedByteBuffers = new ArrayList<>();

@@ -196,10 +196,12 @@
 }
 }

+@Override
 public long size() {
 return fileLength;
 }

+@Override
 public void put(long idx, long val) {
 if (idx >= mappedSize)
 grow(idx);

@@ -214,6 +216,7 @@
 }
 }

+@Override
 public long get(long idx) {
 if (idx >= mappedSize)
 grow(idx);

@@ -229,10 +232,12 @@
 }

+@Override
 public void read(long[] vals, long idx) {
 read(vals, vals.length, idx);
 }

+@Override
 public void read(long[] vals, int n, long idx) {
 if (idx+n >= mappedSize) {
 grow(idx+n);

@@ -257,10 +262,12 @@
 }

+@Override
 public void write(long[] vals, long idx) {
 write(vals, vals.length, idx);
 }

+@Override
 public void write(long[] vals, int n, long idx) {
 if (idx+n >= mappedSize) {
 grow(idx+n);

@@ -285,6 +292,7 @@
 }

+@Override
 public void write(LongBuffer vals, long idx) {
 int n = vals.limit() - vals.position();
 if (idx+n >= mappedSize) {

@@ -310,6 +318,7 @@
 }

+@Override
 public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
 int length = (int)(sourceEnd - sourceStart);

View File

@@ -0,0 +1,70 @@
+package nu.marginalia.util.multimap;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+
+public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
+private final long off;
+private final MultimapFileLongSlice map;
+
+public MultimapFileLongOffsetSlice(MultimapFileLongSlice map, long off) {
+this.off = off;
+this.map = map;
+}
+
+@Override
+public long size() {
+return map.size() - off;
+}
+
+@Override
+public void put(long idx, long val) {
+map.put(off+idx, val);
+}
+
+@Override
+public long get(long idx) {
+return map.get(off+idx);
+}
+
+@Override
+public void read(long[] vals, long idx) {
+map.read(vals, idx+off);
+}
+
+@Override
+public void read(long[] vals, int n, long idx) {
+map.read(vals, n, idx+off);
+}
+
+@Override
+public void write(long[] vals, long idx) {
+map.write(vals, idx+off);
+}
+
+@Override
+public void write(long[] vals, int n, long idx) {
+map.write(vals, n, idx+off);
+}
+
+@Override
+public void write(LongBuffer vals, long idx) {
+map.write(vals, idx+off);
+}
+
+@Override
+public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
+throws IOException {
+map.transferFromFileChannel(sourceChannel, destOffset + off, sourceStart, sourceEnd);
+}
+
+@Override
+public MultimapFileLongSlice atOffset(long off) {
+// If we don't override this, the default implementation would build a pyramid of
+// MultimapFileLongSlice(MultimapFileLongSlice(MultimapFileLongSlice(...)))
+// if this is called iteratively (e.g. to walk over a file)
+return new MultimapFileLongOffsetSlice(map, this.off + off);
+}
+}

View File

@@ -0,0 +1,29 @@
+package nu.marginalia.util.multimap;
+
+import java.io.IOException;
+import java.nio.LongBuffer;
+import java.nio.channels.FileChannel;
+
+public interface MultimapFileLongSlice {
+long size();
+
+void put(long idx, long val);
+
+long get(long idx);
+
+void read(long[] vals, long idx);
+
+void read(long[] vals, int n, long idx);
+
+void write(long[] vals, long idx);
+
+void write(long[] vals, int n, long idx);
+
+void write(LongBuffer vals, long idx);
+
+void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
+
+default MultimapFileLongSlice atOffset(long off) {
+return new MultimapFileLongOffsetSlice(this, off);
+}
+}
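
The default atOffset() wraps the receiver in a MultimapFileLongOffsetSlice; the override in the previous file folds offsets together instead, so iterative slicing stays one wrapper deep. A small sketch of the offset arithmetic (the base slice is assumed to be backed by an open MultimapFileLong):

    import nu.marginalia.util.multimap.MultimapFileLongSlice;

    class SliceSketch {
        void demo(MultimapFileLongSlice base) {
            var a = base.atOffset(100); // view starting at long index 100 of base
            var b = a.atOffset(50);     // one wrapper at offset 150, not a wrapper of a wrapper

            b.put(0, 42L);              // writes base index 150
            assert base.get(150) == 42L;
        }
    }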

View File

@@ -4,9 +4,9 @@ import lombok.experimental.Delegate;

 public class MultimapSearcher {
 @Delegate
-private final MultimapFileLong mmf;
+private final MultimapFileLongSlice mmf;

-public MultimapSearcher(MultimapFileLong mmf) {
+public MultimapSearcher(MultimapFileLongSlice mmf) {
 this.mmf = mmf;
 }

View File

@@ -13,10 +13,10 @@ import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
 public class MultimapSorter {
 private final Path tmpFileDir;
 private final int internalSortLimit;
-private final MultimapFileLong multimapFileLong;
+private final MultimapFileLongSlice multimapFileLong;
 private final long[] buffer;

-public MultimapSorter(MultimapFileLong multimapFileLong, Path tmpFileDir, int internalSortLimit) {
+public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
 this.multimapFileLong = multimapFileLong;
 this.tmpFileDir = tmpFileDir;
 this.internalSortLimit = internalSortLimit;

View File

@@ -1,49 +0,0 @@
-package nu.marginalia.util.ranking;
-
-import com.zaxxer.hikari.HikariDataSource;
-import gnu.trove.list.TIntList;
-import gnu.trove.list.array.TIntArrayList;
-import gnu.trove.map.hash.TIntIntHashMap;
-import it.unimi.dsi.fastutil.ints.IntArrays;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.sql.SQLException;
-
-public class AcademiaRank {
-private final TIntArrayList result;
-private static final Logger logger = LoggerFactory.getLogger(AcademiaRank.class);
-
-public AcademiaRank(HikariDataSource ds, String... origins) throws IOException {
-
-TIntList rankingResults = new BetterStandardPageRank(ds, origins).pageRank(100_000);
-
-TIntIntHashMap idToRanking = new TIntIntHashMap(100_000, 0.5f, -1, 1_000_000_000);
-for (int i = 0; i < rankingResults.size(); i++) {
-idToRanking.put(rankingResults.get(i), i);
-}
-
-result = new TIntArrayList(10000);
-try (var conn = ds.getConnection();
-var stmt = conn.prepareStatement("select EC_DOMAIN.ID,COUNT(SOURCE_DOMAIN_ID) AS CNT from EC_DOMAIN INNER JOIN DOMAIN_METADATA ON DOMAIN_METADATA.ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK ON EC_DOMAIN_LINK.DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE INDEXED>0 AND STATE>=0 AND STATE<2 AND ((VISITED_URLS>1000+1500*RANK AND RANK<1) OR (GOOD_URLS>1000 AND URL_PART LIKE '%edu')) GROUP BY EC_DOMAIN.ID HAVING CNT<1500 ORDER BY RANK ASC")) {
-stmt.setFetchSize(1000);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-result.add(rsp.getInt(1));
-}
-}
-catch (SQLException ex) {
-logger.error("SQL error", ex);
-}
-
-int[] internalArray = result.toArray();
-IntArrays.quickSort(internalArray, (a,b) -> idToRanking.get(a) - idToRanking.get(b));
-result.set(0, internalArray);
-}
-
-public TIntArrayList getResult() {
-return result;
-}
-}

View File

@@ -1,15 +1,11 @@
 package nu.marginalia.util.ranking;

-import com.zaxxer.hikari.HikariDataSource;
-
-import java.io.IOException;
-
 public class BetterReversePageRank extends RankingAlgorithm {

-public BetterReversePageRank(HikariDataSource dataSource, String... origins) {
-super(dataSource, origins);
+public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
+super(domains, origins);
 }

 @Override

View File

@@ -1,14 +1,10 @@
 package nu.marginalia.util.ranking;

-import com.zaxxer.hikari.HikariDataSource;
-
-import java.io.IOException;
-
 public class BetterStandardPageRank extends RankingAlgorithm {

-public BetterStandardPageRank(HikariDataSource dataSource, String... origins) {
-super(dataSource, origins);
+public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
+super(domains, origins);
 }

 @Override

View File

@@ -1,15 +1,11 @@
 package nu.marginalia.util.ranking;

-import com.zaxxer.hikari.HikariDataSource;
-
-import java.io.IOException;
-
 public class BuggyReversePageRank extends RankingAlgorithm {

-public BuggyReversePageRank(HikariDataSource dataSource, String... origins) {
-super(dataSource, origins);
+public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
+super(domains, origins);
 }

 @Override

View File

@@ -1,14 +1,10 @@
 package nu.marginalia.util.ranking;

-import com.zaxxer.hikari.HikariDataSource;
-
-import java.io.IOException;
-
 public class BuggyStandardPageRank extends RankingAlgorithm {

-public BuggyStandardPageRank(HikariDataSource dataSource, String... origins) {
-super(dataSource, origins);
+public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
+super(domains, origins);
 }

 @Override

View File

@@ -1,224 +1,129 @@
 package nu.marginalia.util.ranking;

-import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TIntObjectHashMap;
-import gnu.trove.set.hash.TIntHashSet;
 import it.unimi.dsi.fastutil.ints.IntComparator;
-import lombok.AllArgsConstructor;
-import lombok.Data;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.sql.SQLException;
 import java.util.*;
 import java.util.function.IntToDoubleFunction;
 import java.util.stream.IntStream;
 import it.unimi.dsi.fastutil.ints.IntArrays;

 public abstract class RankingAlgorithm {
-final TIntObjectHashMap<DomainData> domainsById = new TIntObjectHashMap<>();
-final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
-final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();
+protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
+protected final TIntIntHashMap domainIndexToId = new TIntIntHashMap();
+protected final TIntIntHashMap domainIdToIndex = new TIntIntHashMap();

-private final TIntHashSet spamDomains;
-private final HikariDataSource dataSource;
-
-TIntArrayList[] linkDataSrc2Dest;
-TIntArrayList[] linkDataDest2Src;
+protected TIntArrayList[] linkDataSrc2Dest;
+protected TIntArrayList[] linkDataDest2Src;

 public final Set<String> originDomains = new HashSet<>();
 public final Set<Integer> originDomainIds = new HashSet<>();

 private int maxKnownUrls = Integer.MAX_VALUE;

-private static final boolean getNames = true;
-
 private final Logger logger = LoggerFactory.getLogger(getClass());

-public static void main(String... args) throws IOException {
-var rpr = new BuggyReversePageRank(new DatabaseModule().provideConnection(), "wiki.xxiivv.com");
-var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
-
-var rankVector = spr.pageRankVector();
-var norm = rankVector.norm();
-rpr.pageRank(i -> rankVector.get(i) / norm, 25).forEach(i -> {
-System.out.println(spr.domainNameFromId(i));
-return true;
-});
-}
-
-public String domainNameFromId(int id) {
-return domainsById.get(id).name;
-}
-public boolean isPeripheral(int id) {
-return domainsById.get(id).peripheral;
-}
-
-public RankingAlgorithm(HikariDataSource dataSource, String... origins) {
-this.dataSource = dataSource;
-var blacklist = new EdgeDomainBlacklistImpl(dataSource);
-spamDomains = blacklist.getSpamDomains();
+private final RankingDomainFetcher domains;
+
+public RankingAlgorithm(RankingDomainFetcher domains, String... origins) {
+this.domains = domains;
+
 originDomains.addAll(Arrays.asList(origins));

-try (var conn = dataSource.getConnection()) {
-String s;
-if (getNames) {
-s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
-}
-else {
-s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
-}
-try (var stmt = conn.prepareStatement(s)) {
-stmt.setFetchSize(10000);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-int id = rsp.getInt(1);
-if (!spamDomains.contains(id)) {
-domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), false));
-domainIndexToId.put(domainIndexToId.size(), id);
-domainIdToIndex.put(id, domainIdToIndex.size());
-}
-}
-}
-
-linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
-linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
-
-try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
-stmt.setFetchSize(10000);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-int src = rsp.getInt(1);
-int dst = rsp.getInt(2);
-if (src == dst) continue;
-if (domainsById.contains(src) && domainsById.contains(dst)) {
-int srcIdx = domainIdToIndex.get(src);
-int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
-if (linkDataSrc2Dest[srcIdx] == null) {
-linkDataSrc2Dest[srcIdx] = new TIntArrayList();
-}
-linkDataSrc2Dest[srcIdx].add(dstIdx);
-if (linkDataDest2Src[dstIdx] == null) {
-linkDataDest2Src[dstIdx] = new TIntArrayList();
-}
-linkDataDest2Src[dstIdx].add(srcIdx);
-}
-}
-}
-
-try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART LIKE ?")) {
-for (var seed : this.originDomains) {
-stmt.setString(1, seed);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-int i = rsp.getInt(1);
-int ival = domainIdToIndex.get(i);
-if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
-originDomainIds.add(ival);
-}
-else {
-logger.debug("No value for {}", i);
-}
-}
-logger.debug("{} -> {}", seed, originDomainIds.size());
-}
-}
-logger.info("Origin Domains: {}", originDomainIds.size());
-} catch (SQLException throwables) {
-logger.error("SQL error", throwables);
-}
+domains.getDomains(domainData -> {
+int id = domainData.id;
+
+domainsById.put(id, domainData);
+
+domainIndexToId.put(domainIndexToId.size(), id);
+domainIdToIndex.put(id, domainIdToIndex.size());
+});
+
+linkDataSrc2Dest = new TIntArrayList[domainIndexToId.size()];
+linkDataDest2Src = new TIntArrayList[domainIndexToId.size()];
+
+domains.eachDomainLink((src, dst) -> {
+if (src == dst) return;
+
+if (domainsById.contains(src) && domainsById.contains(dst)) {
+int srcIdx = domainIdToIndex.get(src);
+int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
+
+if (linkDataSrc2Dest[srcIdx] == null) {
+linkDataSrc2Dest[srcIdx] = new TIntArrayList();
+}
+linkDataSrc2Dest[srcIdx].add(dstIdx);
+
+if (linkDataDest2Src[dstIdx] == null) {
+linkDataDest2Src[dstIdx] = new TIntArrayList();
+}
+linkDataDest2Src[dstIdx].add(srcIdx);
+}
+});
+
+for (var namePattern : this.originDomains) {
+domains.domainsByPattern(namePattern, i -> {
+int ival = domainIdToIndex.get(i);
+if (ival != domainIdToIndex.getNoEntryValue() || domainIndexToId.get(0) == i) {
+originDomainIds.add(ival);
+}
+else {
+logger.debug("No value for {}", i);
+}
+});
+}
+
+logger.info("Origin Domains: {}", originDomainIds.size());
 }

-public void addPeripheralNodes(boolean includeErrorStates) {
+public void addPeripheralNodes() {

 int newNodesIdxCutoff = domainIdToIndex.size();

 logger.info("Inserting peripheral nodes");

-try (var conn = dataSource.getConnection()) {
-String s;
-if (getNames) {
-s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
-}
-else {
-s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
-}
-try (var stmt = conn.prepareStatement(s)) {
-stmt.setFetchSize(10000);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-int id = rsp.getInt(1);
-if (!spamDomains.contains(id)) {
-domainsById.put(id, new DomainData(id, rsp.getString(2), rsp.getInt(3), rsp.getInt(4), rsp.getInt(5), true));
-domainIndexToId.put(domainIndexToId.size(), id);
-domainIdToIndex.put(id, domainIdToIndex.size());
-}
-}
-}
-
-linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
-linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
-
-try (var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
-stmt.setFetchSize(10000);
-var rsp = stmt.executeQuery();
-while (rsp.next()) {
-int src = rsp.getInt(1);
-int dst = rsp.getInt(2);
-if (src == dst) continue;
-if (domainsById.contains(src) && domainsById.contains(dst)) {
-int srcIdx = domainIdToIndex.get(src);
-int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
-// This looks like a bug, but it improves the results
-if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
-continue;
-if (linkDataSrc2Dest[srcIdx] == null) {
-linkDataSrc2Dest[srcIdx] = new TIntArrayList();
-}
-linkDataSrc2Dest[srcIdx].add(dstIdx);
-if (linkDataDest2Src[dstIdx] == null) {
-linkDataDest2Src[dstIdx] = new TIntArrayList();
-}
-linkDataDest2Src[dstIdx].add(srcIdx);
-}
-}
-}
-} catch (SQLException throwables) {
-logger.error("SQL error", throwables);
-}
+domains.getPeripheralDomains(domainData -> {
+int id = domainData.id;
+
+if (domainsById.put(id, domainData) == null) { // true if id was not already present
+domainIndexToId.put(domainIndexToId.size(), id);
+domainIdToIndex.put(id, domainIdToIndex.size());
+}
+});
+
+linkDataSrc2Dest = Arrays.copyOf(linkDataSrc2Dest, domainIndexToId.size());
+linkDataDest2Src = Arrays.copyOf(linkDataDest2Src, domainIndexToId.size());
+
+domains.eachDomainLink((src, dst) -> {
+if (src == dst) return;
+
+if (domainsById.contains(src) && domainsById.contains(dst)) {
+int srcIdx = domainIdToIndex.get(src);
+int dstIdx = domainIdToIndex.get(domainsById.get(dst).resolveAlias());
+
+// This looks like a bug, but it improves the results
+if (srcIdx < newNodesIdxCutoff || dstIdx < newNodesIdxCutoff)
+return;
+
+if (linkDataSrc2Dest[srcIdx] == null) {
+linkDataSrc2Dest[srcIdx] = new TIntArrayList();
+}
+linkDataSrc2Dest[srcIdx].add(dstIdx);
+
+if (linkDataDest2Src[dstIdx] == null) {
+linkDataDest2Src[dstIdx] = new TIntArrayList();
+}
+linkDataDest2Src[dstIdx].add(srcIdx);
+}
+});

 logger.info("Peripheral nodes inserted {} -> {}", newNodesIdxCutoff, domainIdToIndex.size());
 }

@@ -271,14 +176,14 @@ public abstract class RankingAlgorithm {
 return rank.getRanking(resultCount);
 }

-public TIntList pageRankWithPeripheralNodes(int resultCount, boolean includeErrorStates) {
+public TIntList pageRankWithPeripheralNodes(int resultCount) {
 RankVector rank = new RankVector(1.d / domainsById.size());

 int iter_max = 100;
 for (int i = 0; i < iter_max; i++) {
 if (i == iter_max-1) {
-addPeripheralNodes(includeErrorStates);
+addPeripheralNodes();
 }
 RankVector newRank = createNewRankVector(rank);

@@ -323,7 +228,7 @@
 abstract RankVector createNewRankVector(RankVector rank);

-public boolean includeInRanking(DomainData data) {
+public boolean includeInRanking(RankingDomainData data) {
 if (data.isAlias())
 return false;
 if (data.isSpecial())

@@ -445,32 +350,4 @@
 }
 }

-@Data
-@AllArgsConstructor
-static class DomainData {
-public final int id;
-public final String name;
-private int alias;
-private int state;
-public final int knownUrls;
-public boolean peripheral;
-
-public int resolveAlias() {
-if (alias == 0) return id;
-return alias;
-}
-
-public boolean isAlias() {
-return alias != 0;
-}
-
-public boolean isSpecial() {
-return EdgeDomainIndexingState.SPECIAL.code == state;
-}
-
-public boolean isSocialMedia() {
-return EdgeDomainIndexingState.SOCIAL_MEDIA.code == state;
-}
-}
 }

View File

@@ -0,0 +1,33 @@
+package nu.marginalia.util.ranking;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
+
+@Data
+@AllArgsConstructor
+class RankingDomainData {
+public final int id;
+public final String name;
+private int alias;
+private EdgeDomainIndexingState state;
+public final int knownUrls;
+
+public boolean peripheral;
+
+public int resolveAlias() {
+if (alias == 0) return id;
+return alias;
+}
+
+public boolean isAlias() {
+return alias != 0;
+}
+
+public boolean isSpecial() {
+return EdgeDomainIndexingState.SPECIAL == state;
+}
+
+public boolean isSocialMedia() {
+return EdgeDomainIndexingState.SOCIAL_MEDIA == state;
+}
+}

View File

@@ -0,0 +1,105 @@
+package nu.marginalia.util.ranking;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.function.Consumer;
+import java.util.function.IntConsumer;
+
+public class RankingDomainFetcher {
+private final HikariDataSource dataSource;
+private final EdgeDomainBlacklistImpl blacklist;
+private final Logger logger = LoggerFactory.getLogger(getClass());
+
+private final boolean getNames = false;
+
+@Inject
+public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
+this.dataSource = dataSource;
+this.blacklist = blacklist;
+}
+
+public void getDomains(Consumer<RankingDomainData> consumer) {
+String query;
+if (getNames) {
+query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
+}
+else {
+query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID GROUP BY EC_DOMAIN.ID";
+}
+getDomains(query, consumer);
+}
+
+public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
+String query;
+if (getNames) {
+query = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
+}
+else {
+query = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS,STATE,KNOWN_URLS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN EC_DOMAIN_LINK ON SOURCE_DOMAIN_ID=EC_DOMAIN.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND EC_DOMAIN_LINK.ID IS NULL GROUP BY EC_DOMAIN.ID";
+}
+getDomains(query, consumer);
+}
+
+private void getDomains(String query, Consumer<RankingDomainData> consumer) {
+try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
+stmt.setFetchSize(10000);
+var rsp = stmt.executeQuery();
+while (rsp.next()) {
+int id = rsp.getInt(1);
+
+if (!blacklist.isBlacklisted(id)) {
+consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5), false));
+}
+}
+}
+catch (SQLException ex) {
+logger.error("Failed to fetch domains", ex);
+}
+}
+
+public void eachDomainLink(DomainLinkConsumer consumer) {
+try (var conn = dataSource.getConnection();
+var stmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK"))
+{
+stmt.setFetchSize(10000);
+
+var rsp = stmt.executeQuery();
+
+while (rsp.next()) {
+int src = rsp.getInt(1);
+int dst = rsp.getInt(2);
+
+consumer.accept(src, dst);
+}
+}
+catch (SQLException ex) {
+logger.error("Failed to fetch domain links", ex);
+}
+}
+
+public void domainsByPattern(String pattern, IntConsumer idConsumer) {
+try (var conn = dataSource.getConnection();
+var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME LIKE ?")) {
+stmt.setString(1, pattern);
+var rsp = stmt.executeQuery();
+while (rsp.next()) {
+idConsumer.accept(rsp.getInt(1));
+}
+}
+catch (SQLException ex) {
+logger.error("Failed to fetch domains by pattern", ex);
+}
+}
+
+public interface DomainLinkConsumer {
+void accept(int from, int to);
+}
+}
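
RankingDomainFetcher gathers the SQL that previously lived inside RankingAlgorithm behind getDomains/getPeripheralDomains/eachDomainLink/domainsByPattern, so the ranking code no longer touches the database directly. A condensed wiring sketch, mirroring the UpdateDomainRanksTool changes later in this commit (the origin domain and result count are illustrative):

    import nu.marginalia.util.ranking.BetterStandardPageRank;
    import nu.marginalia.util.ranking.RankingDomainFetcher;
    import nu.marginalia.wmsa.configuration.module.DatabaseModule;
    import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;

    class RankingWiringSketch {
        public static void main(String... args) {
            var ds = new DatabaseModule().provideConnection();
            var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));

            var spr = new BetterStandardPageRank(domains, "memex.marginalia.nu");
            spr.pageRankWithPeripheralNodes(1000).forEach(id -> {
                System.out.println(id);
                return true; // trove forEach: return true to keep iterating
            });
        }
    }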

View File

@@ -66,7 +66,7 @@ public class OldReversePageRankV2 {
 originDomains.add("memex.marginalia.nu");

 try (var conn = dataSource.getConnection()) {
-try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY_RAW>=-10")) {
+try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE")) {
 stmt.setFetchSize(10000);
 var rsp = stmt.executeQuery();
 while (rsp.next()) {

@@ -90,7 +90,7 @@ public class OldReversePageRankV2 {
 }
 }

-try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
+try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
 stmt.setFetchSize(10000);
 for (var seed : this.originDomains) {

View File

@@ -48,7 +48,7 @@ public class StandardPageRank {
 originDomains.addAll(Arrays.asList(origins));

 try (var conn = dataSource.getConnection()) {
-try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,URL_PART FROM EC_DOMAIN WHERE INDEXED>1 AND STATE>=0 AND QUALITY>=-10")) {
+try (var stmt = conn.prepareStatement("SELECT ID,INDEXED,STATE,DOMAIN_NAME FROM EC_DOMAIN WHERE INDEXED>1 AND IS_ALIVE AND QUALITY>=-10")) {
 stmt.setFetchSize(10000);
 var rsp = stmt.executeQuery();
 while (rsp.next()) {

@@ -78,7 +78,7 @@ public class StandardPageRank {
 }
 }

-try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) {
+try (var stmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
 for (var seed : this.originDomains) {
 stmt.setString(1, seed);
 var rsp = stmt.executeQuery();

View File

@@ -50,7 +50,7 @@ public class DedupTool {
 Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();

 try (var conn = ds.getConnection();
-var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.URL_PART FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
+var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
 var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
 ) {

View File

@@ -112,10 +112,10 @@ public class PerusePageRankV2 {
 try (var conn = dataSource.getConnection()) {
 String s;
 if (getNames) {
-s = "SELECT EC_DOMAIN.ID,URL_PART,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
+s = "SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
 }
 else {
-s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND STATE >= 0) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) AND QUALITY_RAW>=-20 GROUP BY EC_DOMAIN.ID";
+s = "SELECT EC_DOMAIN.ID,\"\",DOMAIN_ALIAS FROM EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_METADATA.ID WHERE ((INDEXED>1 AND IS_ALIVE) OR (INDEXED=1 AND VISITED_URLS=KNOWN_URLS AND GOOD_URLS>0)) GROUP BY EC_DOMAIN.ID";
 }
 try (var stmt = conn.prepareStatement(s)) {
 stmt.setFetchSize(10000);

View File

@@ -1,30 +0,0 @@
-package nu.marginalia.util.ranking.tool;
-
-import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.AcademiaRank;
-import nu.marginalia.wmsa.configuration.module.DatabaseModule;
-import org.mariadb.jdbc.Driver;
-
-import java.io.IOException;
-
-public class TestAcademiaRankTool {
-
-@SneakyThrows
-public static void main(String... args) {
-Driver driver = new Driver();
-var conn = new DatabaseModule().provideConnection();
-
-var rank = new AcademiaRank(new DatabaseModule().provideConnection(), "www.perseus.tufts.edu", "xroads.virginia.edu");
-var res = rank.getResult();
-
-try (var c = conn.getConnection(); var stmt = c.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) {
-for (int i = 0; i < Math.min(res.size(), 100); i++) {
-stmt.setInt(1, res.getQuick(i));
-var rsp = stmt.executeQuery();
-while (rsp.next())
-System.out.println(rsp.getString(1));
-}
-}
-}
-}

View File

@@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.util.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
 import java.sql.SQLException;
 import java.util.HashSet;
 import java.util.Set;

@@ -43,12 +44,14 @@
 var uploader = new Thread(() -> uploadThread(conn), "Uploader");

 logger.info("Ranking");
-var spr = new BuggyStandardPageRank(new DatabaseModule().provideConnection(),"memex.marginalia.nu");
+var ds = new DatabaseModule().provideConnection();
+var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");

 rankMax = spr.size()*2;
 uploader.start();

-spr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
+spr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
 try {
 uploadQueue.put(i);
 } catch (InterruptedException e) {

@@ -83,11 +86,6 @@
 }
 }

-logger.info("Recalculating quality");
-
-try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
-stmt.executeUpdate();
-}
 } catch (SQLException | InterruptedException throwables) {
 throwables.printStackTrace();
 }

View File

@@ -3,12 +3,13 @@ package nu.marginalia.util.ranking.tool;
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.util.ranking.BetterReversePageRank;
+import nu.marginalia.util.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.IOException;
 import java.sql.SQLException;
 import java.util.HashSet;
 import java.util.Set;

@@ -45,7 +46,9 @@
 logger.info("Ranking");
 // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
 // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
-var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+var ds = new DatabaseModule().provideConnection();
+var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");

 // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
 // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");

@@ -58,7 +61,7 @@
 rankMax = rpr.size();

-rpr.pageRankWithPeripheralNodes(rankMax, false).forEach(i -> {
+rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
 try {
 uploadQueue.put(i);
 } catch (InterruptedException e) {

@@ -94,9 +97,6 @@
 }

 logger.info("Recalculating quality");
-
-try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET QUALITY=-5*RANK+IF(RANK=1,RANK*GREATEST(QUALITY_RAW,QUALITY_ORIGINAL)/2, 0)")) {
-stmt.executeUpdate();
-}

 } catch (SQLException | InterruptedException throwables) {
 throwables.printStackTrace();

View File

@@ -29,7 +29,7 @@ public class ReindexTriggerMain {
 .build();

 try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
-var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
+var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
 while (rs.next()) {
 System.out.printf("%d %s %s %d\n",
 rs.getInt(1),

@@ -38,7 +38,7 @@ public class ReindexTriggerMain {
 rs.getInt(4));
 }

-rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100");
+rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
 while (rs.next()) {
 System.out.printf("%d %d %s %d %s\n",
 rs.getInt(1),

View File

@@ -14,7 +14,7 @@ public interface Interpreter {
 void loadRssFeed(EdgeUrl[] rssFeed);
 void loadDomainLink(DomainLink[] links);

-void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality);
+void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip);
 void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument);
 void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError);

View File

@@ -6,11 +6,11 @@ import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;

-public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) implements Instruction {
+public record LoadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) implements Instruction {

 @Override
 public void apply(Interpreter interpreter) {
-interpreter.loadProcessedDomain(domain, state, quality);
+interpreter.loadProcessedDomain(domain, state, ip);
 }

 @Override
@ -76,9 +76,9 @@ public class Loader implements Interpreter {
} }
@Override @Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, double quality) { public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, quality); logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
sqlLoadProcessedDomain.load(data, domain, state, quality); sqlLoadProcessedDomain.load(data, domain, state, ip);
} }
@Override @Override
@ -30,7 +30,7 @@ public class SqlLoadDomainLinks {
INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID) INSERT IGNORE INTO EC_DOMAIN_LINK (SOURCE_DOMAIN_ID, DEST_DOMAIN_ID)
SELECT SOURCE.ID,DEST.ID SELECT SOURCE.ID,DEST.ID
FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST FROM EC_DOMAIN SOURCE INNER JOIN EC_DOMAIN DEST
ON SOURCE.URL_PART=FROM_DOMAIN AND DEST.URL_PART=TO_DOMAIN; ON SOURCE.DOMAIN_NAME=FROM_DOMAIN AND DEST.DOMAIN_NAME=TO_DOMAIN;
END END
"""); """);
} }
@ -61,8 +61,8 @@ public class SqlLoadDomainLinks {
} }
} }
} }
catch (SQLException sql) { catch (SQLException ex) {
sql.printStackTrace(); logger.warn("SQL error inserting domain links", ex);
} }
} }
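The recurring change in these loaders is swapping ex.printStackTrace() for a logged warning. For reference, the SLF4J idiom used — passing the exception as the final argument preserves the full stack trace in the log output:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.sql.DataSource;
import java.sql.SQLException;

class LoaderLoggingSketch {
    private static final Logger logger = LoggerFactory.getLogger(LoaderLoggingSketch.class);

    void load(DataSource dataSource) {
        try (var conn = dataSource.getConnection()) {
            // ... JDBC work elided ...
        } catch (SQLException ex) {
            // exception as last argument: SLF4J logs the message plus the stack trace
            logger.warn("SQL error inserting domain links", ex);
        }
    }
}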
@ -25,15 +25,9 @@ public class SqlLoadDomains {
stmt.execute(""" stmt.execute("""
CREATE PROCEDURE INSERT_DOMAIN ( CREATE PROCEDURE INSERT_DOMAIN (
IN DOMAIN_NAME VARCHAR(255), IN DOMAIN_NAME VARCHAR(255),
IN SUB_DOMAIN VARCHAR(255),
IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci) IN TOP_DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci)
BEGIN BEGIN
INSERT IGNORE INTO EC_TOP_DOMAIN (URL_PART) VALUES (TOP_DOMAIN); INSERT IGNORE INTO EC_DOMAIN(DOMAIN_NAME, DOMAIN_TOP) VALUES (DOMAIN_NAME, TOP_DOMAIN);
INSERT IGNORE INTO EC_DOMAIN(URL_PART, URL_SUBDOMAIN, URL_TOP_DOMAIN_ID)
SELECT DOMAIN_NAME,SUB_DOMAIN,ID
FROM EC_TOP_DOMAIN
WHERE EC_TOP_DOMAIN.URL_PART=TOP_DOMAIN;
END END
"""); """);
} }
@ -46,10 +40,9 @@ public class SqlLoadDomains {
public void load(LoaderData data, EdgeDomain domain) { public void load(LoaderData data, EdgeDomain domain) {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
insertCall.setString(1, domain.toString()); insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain); insertCall.setString(2, domain.domain);
insertCall.setString(3, domain.domain);
insertCall.addBatch(); insertCall.addBatch();
var ret = insertCall.executeUpdate(); var ret = insertCall.executeUpdate();
@ -57,12 +50,11 @@ public class SqlLoadDomains {
logger.warn("load({}) -- bad row count {}", domain, ret); logger.warn("load({}) -- bad row count {}", domain, ret);
} }
connection.commit();
findIdForTargetDomain(connection, data); findIdForTargetDomain(connection, data);
} }
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error inserting domain", ex);
} }
@ -73,12 +65,11 @@ public class SqlLoadDomains {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
connection.setAutoCommit(false); connection.setAutoCommit(false);
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?,?)")) { try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
for (var domain : domains) { for (var domain : domains) {
insertCall.setString(1, domain.toString()); insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.subDomain); insertCall.setString(2, domain.domain);
insertCall.setString(3, domain.domain);
insertCall.addBatch(); insertCall.addBatch();
} }
var ret = insertCall.executeBatch(); var ret = insertCall.executeBatch();
@ -95,7 +86,7 @@ public class SqlLoadDomains {
findIdForTargetDomain(connection, data); findIdForTargetDomain(connection, data);
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error inserting domains", ex);
} }
} }
@ -104,7 +95,7 @@ public class SqlLoadDomains {
return; return;
} }
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
{ {
var targetDomain = data.getTargetDomain(); var targetDomain = data.getTargetDomain();
@ -118,7 +109,7 @@ public class SqlLoadDomains {
} }
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error finding id for domain", ex);
} }
} }
} }
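INSERT_DOMAIN now takes just the full domain name and the top domain; the subdomain parameter and the EC_TOP_DOMAIN indirection are gone. A minimal sketch of the new call site (DataSource stands in for the injected HikariDataSource; the domain values are illustrative):

import javax.sql.DataSource;
import java.sql.SQLException;

class InsertDomainSketch {
    static void insertDomain(DataSource dataSource) throws SQLException {
        try (var connection = dataSource.getConnection();
             var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
            insertCall.setString(1, "www.example.com"); // full domain name
            insertCall.setString(2, "example.com");     // top domain
            insertCall.executeUpdate();
        }
    }
}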
@ -31,14 +31,14 @@ public class SqlLoadProcessedDocument {
IN TITLE VARCHAR(255), IN TITLE VARCHAR(255),
IN DESCRIPTION VARCHAR(255), IN DESCRIPTION VARCHAR(255),
IN LENGTH INT, IN LENGTH INT,
IN QUALITY_MEASURE DOUBLE,
IN FEATURES INT, IN FEATURES INT,
IN STANDARD VARCHAR(32), IN STANDARD VARCHAR(32),
IN QUALITY DOUBLE,
IN HASH INT) IN HASH INT)
BEGIN BEGIN
SET FOREIGN_KEY_CHECKS=0; SET FOREIGN_KEY_CHECKS=0;
REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES); REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY);
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=QUALITY_MEASURE, DATA_HASH=HASH WHERE ID=URL_ID; UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
SET FOREIGN_KEY_CHECKS=1; SET FOREIGN_KEY_CHECKS=1;
END END
"""); """);
@ -47,7 +47,8 @@ public class SqlLoadProcessedDocument {
IN URL_ID INT, IN URL_ID INT,
IN STATE VARCHAR(32)) IN STATE VARCHAR(32))
BEGIN BEGIN
UPDATE EC_URL SET VISITED=1, STATE=STATE, QUALITY_MEASURE=-100, DATA_HASH=NULL WHERE ID=URL_ID; UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID;
DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID;
END END
"""); """);
@ -61,6 +62,7 @@ public class SqlLoadProcessedDocument {
public void load(LoaderData data, List<LoadProcessedDocument> documents) { public void load(LoaderData data, List<LoadProcessedDocument> documents) {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
conn.setAutoCommit(false);
for (var doc : documents) { for (var doc : documents) {
int urlId = data.getUrlId(doc.url()); int urlId = data.getUrlId(doc.url());
@ -74,9 +76,9 @@ public class SqlLoadProcessedDocument {
stmt.setString(3, doc.title()); stmt.setString(3, doc.title());
stmt.setString(4, doc.description()); stmt.setString(4, doc.description());
stmt.setInt(5, doc.length()); stmt.setInt(5, doc.length());
stmt.setDouble(6, doc.quality()); stmt.setInt(6, doc.htmlFeatures());
stmt.setInt(7, doc.htmlFeatures()); stmt.setString(7, doc.standard().name());
stmt.setString(8, doc.standard().name()); stmt.setDouble(8, doc.quality());
stmt.setInt(9, (int) doc.hash()); stmt.setInt(9, (int) doc.hash());
stmt.addBatch(); stmt.addBatch();
} }
@ -89,8 +91,8 @@ public class SqlLoadProcessedDocument {
} }
conn.commit(); conn.commit();
} catch (SQLException e) { } catch (SQLException ex) {
e.printStackTrace(); logger.warn("SQL error inserting document", ex);
} }
@ -117,8 +119,8 @@ public class SqlLoadProcessedDocument {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
} }
} }
} catch (SQLException e) { } catch (SQLException ex) {
e.printStackTrace(); logger.warn("SQL error inserting failed document", ex);
} }
} }
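The INSERT_PAGE_VISIT parameter order changed along with the signature: features moved up to position 6, standard to 7, and quality now sits at 8. Getting the setXxx indices wrong only fails at execution time, so a sketch of the full binding sequence (placeholder values; DataSource assumed):

import javax.sql.DataSource;
import java.sql.SQLException;

class InsertPageVisitSketch {
    static void load(DataSource dataSource, int urlId) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
            conn.setAutoCommit(false);
            stmt.setInt(1, urlId);
            stmt.setString(2, "OK");            // state, illustrative
            stmt.setString(3, "A title");
            stmt.setString(4, "A description");
            stmt.setInt(5, 1500);               // length
            stmt.setInt(6, 0);                  // 6: features (quality used to live here)
            stmt.setString(7, "HTML5");         // 7: standard
            stmt.setDouble(8, -2.5);            // 8: quality, moved after standard
            stmt.setInt(9, 0xCAFE);             // 9: hash
            stmt.addBatch();
            stmt.executeBatch();
            conn.commit();
        }
    }
}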
@ -25,12 +25,12 @@ public class SqlLoadProcessedDomain {
stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN");
stmt.execute(""" stmt.execute("""
CREATE PROCEDURE INITIALIZE_DOMAIN ( CREATE PROCEDURE INITIALIZE_DOMAIN (
IN ST INT, IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'),
IN IDX INT, IN IDX INT,
IN QUAL DOUBLE, IN DID INT,
IN DID INT) IN IP VARCHAR(32))
BEGIN BEGIN
UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), QUALITY=QUAL, QUALITY_RAW=QUAL, QUALITY_ORIGINAL=QUAL WHERE ID=DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID;
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID;
END END
"""); """);
@ -41,7 +41,7 @@ public class SqlLoadProcessedDomain {
} }
} }
public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, double quality) { public void load(LoaderData data, EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
data.setTargetDomain(domain); data.setTargetDomain(domain);
loadDomains.load(data, domain); loadDomains.load(data, domain);
@ -49,18 +49,17 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)"))
{ {
initCall.setInt(1, state.code); initCall.setString(1, state.name());
initCall.setInt(2, 1 + data.sizeHint / 100); initCall.setInt(2, 1 + data.sizeHint / 100);
initCall.setDouble(3, quality); initCall.setInt(3, data.getDomainId(domain));
initCall.setInt(4, data.getDomainId(domain)); initCall.setString(4, ip);
int rc = initCall.executeUpdate(); int rc = initCall.executeUpdate();
if (rc < 1) { if (rc < 1) {
logger.warn("load({},{},{}) -- bad rowcount {}", domain, state, quality, rc); logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
} }
conn.commit();
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error initializing domain", ex);
} }
} }
@ -69,9 +68,9 @@ public class SqlLoadProcessedDomain {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(""" var stmt = conn.prepareStatement("""
UPDATE EC_DOMAIN TARGET UPDATE EC_DOMAIN TARGET
INNER JOIN EC_DOMAIN ALIAS ON ALIAS.URL_PART=? INNER JOIN EC_DOMAIN ALIAS ON ALIAS.DOMAIN_NAME=?
SET TARGET.DOMAIN_ALIAS=ALIAS.ID SET TARGET.DOMAIN_ALIAS=ALIAS.ID
WHERE TARGET.URL_PART=? WHERE TARGET.DOMAIN_NAME=?
""")) { """)) {
stmt.setString(1, link.to().toString()); stmt.setString(1, link.to().toString());
stmt.setString(2, link.from().toString()); stmt.setString(2, link.from().toString());
@ -81,7 +80,7 @@ public class SqlLoadProcessedDomain {
} }
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error inserting domain alias", ex);
} }
} }
} }
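INITIALIZE_DOMAIN now takes the state as a MariaDB ENUM, so the caller binds state.name() with setString instead of a numeric code, and the IP argument replaces the dropped quality column. A sketch of the call (DataSource assumed; values illustrative):

import javax.sql.DataSource;
import java.sql.SQLException;

class InitializeDomainSketch {
    static void initialize(DataSource dataSource, int domainId, int sizeHint, String ip) throws SQLException {
        try (var conn = dataSource.getConnection();
             var initCall = conn.prepareCall("CALL INITIALIZE_DOMAIN(?,?,?,?)")) {
            initCall.setString(1, "ACTIVE");        // ENUM bound by name, e.g. state.name()
            initCall.setInt(2, 1 + sizeHint / 100); // INDEXED estimate, same formula as the loader
            initCall.setInt(3, domainId);
            initCall.setString(4, ip);
            int rc = initCall.executeUpdate();
            if (rc < 1) {
                System.err.println("bad row count " + rc); // the loader logs a warning here
            }
        }
    }
}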
@ -1,11 +1,13 @@
package nu.marginalia.wmsa.edge.converting.loader; package nu.marginalia.wmsa.edge.converting.loader;
import com.google.common.hash.Hashing;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException; import java.sql.SQLException;
import java.sql.Types; import java.sql.Types;
@ -25,12 +27,13 @@ public class SqlLoadUrls {
stmt.execute(""" stmt.execute("""
CREATE PROCEDURE INSERT_URL ( CREATE PROCEDURE INSERT_URL (
IN PROTO VARCHAR(255), IN PROTO VARCHAR(255),
IN DOMAIN_NAME VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT, IN PORT INT,
IN URL VARCHAR(255) IN PATH VARCHAR(255),
IN PATH_HASH BIGINT
) )
BEGIN BEGIN
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,URL) SELECT PROTO,ID,PORT,URL FROM EC_DOMAIN WHERE URL_PART=DOMAIN_NAME; INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
END END
"""); """);
} }
@ -42,8 +45,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) { public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?)"); var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, URL FROM EC_URL WHERE DOMAIN_ID=?") var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
) )
{ {
conn.setAutoCommit(false); conn.setAutoCommit(false);
@ -58,6 +61,7 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER); insertCall.setNull(3, Types.INTEGER);
} }
insertCall.setString(4, url.path); insertCall.setString(4, url.path);
insertCall.setLong(5, hashPath(url.path));
insertCall.addBatch(); insertCall.addBatch();
} }
var ret = insertCall.executeBatch(); var ret = insertCall.executeBatch();
@ -86,7 +90,11 @@ public class SqlLoadUrls {
} }
catch (SQLException ex) { catch (SQLException ex) {
ex.printStackTrace(); logger.warn("SQL error inserting URLs", ex);
} }
} }
private long hashPath(String path) {
return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
}
} }
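hashPath gives each URL path a stable identity for the new PATH_HASH column; Guava's murmur3_128 is well-distributed and 64 bits wide, reducing collision risk versus a 32-bit String.hashCode(). Isolated for reference:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

class PathHashSketch {
    // same construction as SqlLoadUrls.hashPath above
    static long hashPath(String path) {
        return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
    }

    public static void main(String[] args) {
        System.out.println(hashPath("/index.html")); // stable across runs and machines
    }
}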
@ -15,7 +15,7 @@ public class InstructionsCompiler {
public List<Instruction> compile(ProcessedDomain domain) { public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4); List<Instruction> ret = new ArrayList<>(domain.size()*4);
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.averageQuality().orElse(-5.))); ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
if (domain.documents != null) { if (domain.documents != null) {
compileUrls(ret, domain.documents); compileUrls(ret, domain.documents);
@ -34,11 +34,10 @@ public class CrawlJobExtractorMain {
private static final String domainsSql = private static final String domainsSql =
""" """
SELECT ID, LOWER(EC_DOMAIN.URL_PART) SELECT ID, LOWER(EC_DOMAIN.DOMAIN_NAME)
FROM EC_DOMAIN FROM EC_DOMAIN
WHERE QUALITY_RAW>-100 WHERE INDEXED>0
AND INDEXED>0 AND STATE='ACTIVE' OR STATE='EXHAUSTED'
AND STATE<2
ORDER BY ORDER BY
INDEX_DATE ASC, INDEX_DATE ASC,
DISCOVER_DATE ASC, DISCOVER_DATE ASC,
@ -49,8 +48,8 @@ public class CrawlJobExtractorMain {
private static final String urlsSql = private static final String urlsSql =
""" """
SELECT CONCAT(PROTO, "://", ?, URL) SELECT URL
FROM EC_URL FROM EC_URL_VIEW
WHERE DOMAIN_ID=? WHERE DOMAIN_ID=?
ORDER BY ORDER BY
VISITED DESC, VISITED DESC,
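The job extractor now selects ready-made URLs from EC_URL_VIEW rather than reconstructing them with CONCAT(PROTO, "://", host, URL). A sketch of the consuming side (DataSource assumed; ORDER BY shortened to the part visible in the diff):

import javax.sql.DataSource;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

class UrlsForDomainSketch {
    static List<String> urlsForDomain(DataSource dataSource, int domainId) throws SQLException {
        List<String> urls = new ArrayList<>();
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT URL
                     FROM EC_URL_VIEW
                     WHERE DOMAIN_ID=?
                     ORDER BY VISITED DESC
                     """)) {
            stmt.setInt(1, domainId);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                urls.add(rs.getString(1)); // already a complete URL, no CONCAT needed
            }
        }
        return urls;
    }
}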
@ -6,6 +6,7 @@ import com.google.common.hash.Hashing;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@ -30,19 +31,19 @@ public class CrawlJobExtractorPageRankMain {
""" """
SELECT ID SELECT ID
FROM EC_DOMAIN FROM EC_DOMAIN
WHERE URL_PART=? WHERE DOMAIN_NAME=?
"""; """;
private static final String specificDomainSqlFromId = private static final String specificDomainSqlFromId =
""" """
SELECT LOWER(URL_PART) SELECT LOWER(DOMAIN_NAME)
FROM EC_DOMAIN FROM EC_DOMAIN
WHERE ID=? WHERE ID=?
"""; """;
private static final String urlsSql = private static final String urlsSql =
""" """
SELECT CONCAT(PROTO, "://", ?, URL) SELECT URL
FROM EC_URL FROM EC_URL_VIEW
WHERE DOMAIN_ID=? WHERE DOMAIN_ID=?
ORDER BY ORDER BY
VISITED DESC, VISITED DESC,
@ -73,10 +74,12 @@ public class CrawlJobExtractorPageRankMain {
Gson gson = new GsonBuilder().create(); Gson gson = new GsonBuilder().create();
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
rpr.setMaxKnownUrls(750); rpr.setMaxKnownUrls(750);
var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size(), false); var targetDomainIds = rpr.pageRankWithPeripheralNodes(rpr.size());
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection()); final var extractor = new CrawlJobExtractorPageRankMain(new DatabaseModule().provideConnection());
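The extractor writes its crawl specifications through a zstd-compressed stream, which keeps the potentially large spec files small on disk. The output chain isolated (zstd-jni's ZstdOutputStream, as imported above; file name and payload are placeholders):

import com.github.luben.zstd.ZstdOutputStream;
import java.io.*;

class SpecWriterSketch {
    public static void main(String[] args) throws IOException {
        try (var out = new PrintWriter(new ZstdOutputStream(
                new BufferedOutputStream(new FileOutputStream("crawl-specs.zstd"))))) {
            out.println("{\"domain\":\"example.com\"}"); // one spec per line, illustrative
        }
    }
}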
@ -13,44 +13,14 @@ import java.util.Optional;
@ImplementedBy(EdgeDataStoreDaoImpl.class) @ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao { public interface EdgeDataStoreDao {
boolean isBlacklisted(EdgeDomain domain);
EdgeId<EdgeDomain> getDomainId(EdgeDomain domain); EdgeId<EdgeDomain> getDomainId(EdgeDomain domain);
EdgeId<EdgeUrl> getUrlId(EdgeUrl domain);
EdgeUrl getUrl(EdgeId<EdgeUrl> id);
EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id);
List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count); List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist); List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids); List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds);
EdgeDomain getDomain(EdgeId<EdgeDomain> id); EdgeDomain getDomain(EdgeId<EdgeDomain> id);
List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit);
List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit);
Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name);
int getPagesKnown(EdgeId<EdgeDomain> domainId);
int getPagesVisited(EdgeId<EdgeDomain> domainId);
int getPagesIndexed(EdgeId<EdgeDomain> domainId);
int getIncomingLinks(EdgeId<EdgeDomain> domainId);
int getOutboundLinks(EdgeId<EdgeDomain> domainId);
double getDomainQuality(EdgeId<EdgeDomain> domainId);
EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId);
List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId);
List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links);
double getRank(EdgeId<EdgeDomain> domainId);
void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed);
} }
@ -33,7 +33,6 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private static final String DEFAULT_PROTOCOL = "http";
public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;
@Inject @Inject
public EdgeDataStoreDaoImpl(HikariDataSource dataSource) public EdgeDataStoreDaoImpl(HikariDataSource dataSource)
@ -48,30 +47,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
domainIdCache.invalidateAll(); domainIdCache.invalidateAll();
} }
@SneakyThrows
@Override
public boolean isBlacklisted(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
stmt.setString(1, domain.domain);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return true;
} else {
return false;
}
}
}
}
@SneakyThrows @SneakyThrows
@Override @Override
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) { public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> { return domainIdCache.get(domain, () -> {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE URL_PART=?")) { try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
stmt.setString(1, domain.toString()); stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
if (rsp.next()) { if (rsp.next()) {
@ -86,104 +68,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
} }
} }
Removed:

@Override
@SneakyThrows
public EdgeId<EdgeUrl> getUrlId(EdgeUrl url) {
    try (var connection = dataSource.getConnection()) {
        return urlIdCache.get(url, () -> {
            try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=? AND URL_PROTO=?")) {
                stmt.setString(1, url.path);
                stmt.setString(2, url.domain.toString());
                stmt.setString(3, url.proto);
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return new EdgeId<>(rsp.getInt(1));
                }
            }
            // Lenient mode for http->https upgrades etc
            try (var stmt = connection.prepareStatement("SELECT ID FROM EC_URL_VIEW WHERE URL_PATH=? AND URL_DOMAIN=?")) {
                stmt.setString(1, url.path);
                stmt.setString(2, url.domain.toString());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return new EdgeId<>(rsp.getInt(1));
                }
            }
            throw new NoSuchElementException(url.toString());
        });
    }
    catch (UncheckedExecutionException ex) {
        throw ex.getCause();
    }
}

Added:

private <T> String idList(List<EdgeId<T>> ids) {
    StringJoiner j = new StringJoiner(",", "(", ")");
    for (var id : ids) {
        j.add(Integer.toString(id.getId()));
    }
    return j.toString();
}
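idList replaces an inline stream collector at the call sites that build IN clauses. String-concatenating its output into SQL is safe here because EdgeId wraps an int, which cannot carry SQL metacharacters — the point the removed comment in getUrlDetailsMulti used to make explicit. A standalone version of the same idea:

import java.util.List;
import java.util.StringJoiner;

class IdListSketch {
    // safe to splice into SQL: every element is an int rendered by Integer.toString
    static String idList(List<Integer> ids) {
        StringJoiner j = new StringJoiner(",", "(", ")");
        for (int id : ids) {
            j.add(Integer.toString(id));
        }
        return j.toString();
    }

    public static void main(String[] args) {
        System.out.println("... WHERE ID IN " + idList(List.of(1, 2, 3))); // ... WHERE ID IN (1,2,3)
    }
}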
@SneakyThrows
@Override
public List<EdgeId<EdgeDomain>> getDomainIdsFromUrlIds(Collection<EdgeId<EdgeUrl>> urlIds) {
List<EdgeId<EdgeDomain>> results = new ArrayList<>(urlIds.size());
if (urlIds.isEmpty())
return results;
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_ID FROM EC_URL WHERE ID IN " + urlIds
.stream()
.map(EdgeId::getId)
.map(Object::toString)
.collect(Collectors.joining(",", "(", ")"))))
{
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return results;
}
static final Pattern badChars = Pattern.compile("[';\\\\]");
private String saneString(String s) {
return "\'"+badChars.matcher(s).replaceAll("?")+"\'";
}
@SneakyThrows
@Override
public EdgeUrl getUrl(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT URL_PROTO, URL_DOMAIN,URL_PORT,URL_PATH FROM EC_URL_VIEW WHERE ID=" + id.getId());
if (rsp.next()) {
return new EdgeUrl(rsp.getString(1), new EdgeDomain(rsp.getString(2)), rsp.getInt(3), rsp.getString(4));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows
@Override
public EdgeUrlDetails getUrlDetails(EdgeId<EdgeUrl> id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.createStatement()) {
var rsp = stmt.executeQuery("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID=" + id.getId());
if (rsp.next()) {
EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
return new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));
}
throw new NoSuchElementException();
}
}
}
@SneakyThrows @SneakyThrows
@Override @Override
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) { public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
@ -193,16 +85,39 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
List<EdgeUrlDetails> result = new ArrayList<>(ids.size()); List<EdgeUrlDetails> result = new ArrayList<>(ids.size());
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
Removed:

// This is SQL-injection safe, the IDs are of type int
String idString = ids.stream().map(EdgeId::getId).map(Objects::toString).collect(Collectors.joining(",", "(", ")"));

try (var stmt = connection.prepareStatement("SELECT ID,URL_PROTO,URL_DOMAIN,URL_PORT,URL_PATH,TITLE,DESCRIPTION,URL_QUALITY_MEASURE,DOMAIN_QUALITY_MEASURE,IFNULL(EC_DOMAIN_LINK_AGGREGATE.LINKS,1),WORDS_TOTAL,FORMAT,FEATURES,\"\",QUALITY_RAW,DOMAIN_STATE,DATA_HASH FROM EC_URL_VIEW LEFT JOIN EC_DOMAIN_LINK_AGGREGATE ON EC_DOMAIN_LINK_AGGREGATE.DOMAIN_ID=EC_URL_VIEW.DOMAIN_ID WHERE ID IN " + idString)) {
    stmt.setFetchSize(ids.size());
    var rsp = stmt.executeQuery();
    while (rsp.next()) {
        EdgeUrl url = new EdgeUrl(rsp.getString(2), new EdgeDomain(rsp.getString(3)), rsp.getInt(4), rsp.getString(5));
        var val = new EdgeUrlDetails(rsp.getInt(1), url, rsp.getString(6), rsp.getString(7), rsp.getDouble(8), rsp.getDouble(15), rsp.getDouble(9), rsp.getInt(10), rsp.getInt(11), rsp.getString(12), rsp.getInt(13), EdgePageScoreAdjustment.zero(), Integer.MAX_VALUE, Double.MAX_VALUE, rsp.getString(14), rsp.getInt(16), 0, rsp.getInt(17));

Added:

String idString = idList(ids);
try (var stmt = connection.prepareStatement(
        """
        SELECT ID, URL,
               TITLE, DESCRIPTION,
               QUALITY,
               WORDS_TOTAL, FORMAT, FEATURES,
               IP, DOMAIN_STATE,
               DATA_HASH
        FROM EC_URL_VIEW WHERE ID IN
        """ + idString)) {
    stmt.setFetchSize(ids.size());
    var rsp = stmt.executeQuery();
    while (rsp.next()) {
        EdgeUrl url = new EdgeUrl(rsp.getString(2));
        var val = new EdgeUrlDetails(rsp.getInt(1), url,
                rsp.getString(3), // title
                rsp.getString(4), // description
                rsp.getDouble(5), // quality
                rsp.getInt(6),    // wordsTotal
                rsp.getString(7), // format
                rsp.getInt(8),    // features
                rsp.getString(9), // ip
                EdgeDomainIndexingState.valueOf(rsp.getString(10)), // domainState
                rsp.getInt(11),   // dataHash
                EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                Integer.MAX_VALUE, // rankingId
                Double.MAX_VALUE,  // termScore
                0 // queryLength
        );
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) { if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val); result.add(val);
} }
@ -214,82 +129,13 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
return result; return result;
} }
@Override
public List<BrowseResult> getDomainNeighbors(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = "SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART from EC_DOMAIN_NEIGHBORS INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL AND EC_DOMAIN_NEIGHBORS.DOMAIN_ID = ? ORDER BY ADJ_IDX LIMIT ?";
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
final String q2 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE SOURCE_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
try (var stmt = connection.prepareStatement(q2)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
final String q3 = "SELECT EC_DOMAIN.ID, URL_PART FROM EC_DOMAIN_LINK INNER JOIN EC_DOMAIN ON DEST_DOMAIN_ID=EC_DOMAIN.ID WHERE DEST_DOMAIN_ID=? AND STATE<2 AND DOMAIN_ALIAS IS NULL GROUP BY EC_DOMAIN.ID ORDER BY RANK ASC LIMIT ?";
try (var stmt = connection.prepareStatement(q3)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.getId());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/");
domains.add(new BrowseResult(url, id));
}
}
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return new ArrayList<>(domains);
}
@Override @Override
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) { public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist blacklist, int count) {
final Set<BrowseResult> domains = new HashSet<>(count*3); final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = """ final String q = """
SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, URL_PART, COUNT(*) AS CNT SELECT EC_DOMAIN.ID AS NEIGHBOR_ID, DOMAIN_NAME, COUNT(*) AS CNT
FROM EC_DOMAIN_NEIGHBORS FROM EC_DOMAIN_NEIGHBORS
INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN ON NEIGHBOR_ID=EC_DOMAIN.ID
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
@ -316,16 +162,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2); String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
domains.add(new BrowseResult(url, id));
} }
} }
} }
if (domains.size() < count/2) { if (domains.size() < count/2) {
final String q2 = """ final String q2 = """
SELECT EC_DOMAIN.ID, URL_PART SELECT EC_DOMAIN.ID, DOMAIN_NAME
FROM EC_DOMAIN FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK B ON DEST_DOMAIN_ID=EC_DOMAIN.ID
@ -347,9 +191,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2); String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
domains.add(new BrowseResult(url, id));
} }
} }
} }
@ -357,11 +199,11 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
if (domains.size() < count/2) { if (domains.size() < count/2) {
final String q3 = """ final String q3 = """
SELECT EC_DOMAIN.ID, URL_PART SELECT EC_DOMAIN.ID, DOMAIN_NAME
FROM EC_DOMAIN FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK B ON B.SOURCE_DOMAIN_ID=EC_DOMAIN.ID
INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID INNER JOIN EC_DOMAIN_LINK O ON O.DEST_DOMAIN_ID=EC_DOMAIN.ID
WHERE B.DEST_DOMAIN_ID=? WHERE B.DEST_DOMAIN_ID=?
AND STATE<2 AND STATE<2
AND KNOWN_URLS<1000 AND KNOWN_URLS<1000
@ -381,9 +223,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2); String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
domains.add(new BrowseResult(url, id));
} }
} }
} }
@ -399,7 +239,15 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@Override @Override
public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) { public List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist blacklist) {
final String q = "SELECT DOMAIN_ID,URL_PART FROM EC_RANDOM_DOMAINS INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE STATE<2 AND DOMAIN_ALIAS IS NULL ORDER BY RAND() LIMIT ?"; final String q = """
SELECT DOMAIN_ID, DOMAIN_NAME
FROM EC_RANDOM_DOMAINS
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
WHERE STATE<2
AND DOMAIN_ALIAS IS NULL
ORDER BY RAND()
LIMIT ?
""";
List<BrowseResult> domains = new ArrayList<>(count); List<BrowseResult> domains = new ArrayList<>(count);
try (var conn = dataSource.getConnection()) { try (var conn = dataSource.getConnection()) {
try (var stmt = conn.prepareStatement(q)) { try (var stmt = conn.prepareStatement(q)) {
@ -410,9 +258,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
String domain = rsp.getString(2); String domain = rsp.getString(2);
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
var url = new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/"); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
domains.add(new BrowseResult(url, id));
} }
} }
} }
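Every removed new EdgeUrl(DEFAULT_PROTOCOL, new EdgeDomain(domain), null, "/") is replaced by EdgeDomain.toRootUrl(). The diff only shows the call sites; a plausible shape for the method, offered as a guess rather than the actual implementation:

// Hypothetical sketch of EdgeDomain.toRootUrl(), inferred from the call sites it replaces
public EdgeUrl toRootUrl() {
    // the replaced call sites always used the default protocol, no port, and path "/"
    return new EdgeUrl("http", this, null, "/");
}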
@ -428,7 +274,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) { public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT URL_PART FROM EC_DOMAIN WHERE ID=?")) { try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.getId()); stmt.setInt(1, id.getId());
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
if (rsp.next()) { if (rsp.next()) {
@ -439,330 +285,4 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
} }
} }
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> inboudUrls(EdgeId<EdgeUrl> id, int limit) {
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt =
connection.prepareStatement("SELECT SRC_URL_ID FROM EC_RELATED_LINKS_IN WHERE DEST_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
stmt.setFetchSize(limit);
stmt.setInt(1, id.getId());
stmt.setInt(2, limit);
var rsp = stmt.executeQuery();
while (rsp.next()) {
ret.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return ret;
}
@Override @SneakyThrows
public List<EdgeId<EdgeUrl>> outboundUrls(EdgeId<EdgeUrl> id, int limit) {
List<EdgeId<EdgeUrl>> ret = new ArrayList<>();
try (var connection = dataSource.getConnection()) {
try (var stmt =
connection.prepareStatement("SELECT DEST_URL_ID FROM EC_RELATED_LINKS_IN WHERE SRC_URL_ID=? ORDER BY SRC_URL_QUALITY DESC LIMIT ?")) {
stmt.setFetchSize(limit);
stmt.setInt(1, id.getId());
stmt.setInt(2, limit);
var rsp = stmt.executeQuery();
while (rsp.next()) {
ret.add(new EdgeId<>(rsp.getInt(1)));
}
}
}
return ret;
}
@Override
public Optional<EdgeId<EdgeUrl>> resolveAmbiguousDomain(String name) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "https://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "http://"+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "https://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
try (var stmt = connection.prepareStatement("SELECT IFNULL(DOMAIN_ALIAS,ID) FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, "http://www."+name);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeId<>(rsp.getInt(1)));
}
}
} catch (SQLException throwables) {
logger.info("Could not resolve domain id for {}", name);
}
return Optional.empty();
}
@SneakyThrows
@Override
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
@Override
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return -5;
}
}
@Override
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return EdgeDomainIndexingState.fromCode(rsp.getInt(1));
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return EdgeDomainIndexingState.ERROR;
}
@Override
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
}
return results;
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return Collections.emptyList();
}
@Override
public List<EdgeUrl> getNewUrls(EdgeId<EdgeDomain> domainId, Collection<EdgeUrl> links) {
Map<String, EdgeUrl> edgeUrlByPath = links.stream().collect(Collectors.toMap(EdgeUrl::getPath, Function.identity(), (a,b)->a));
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT URL FROM EC_URL WHERE DOMAIN_ID=?")) {
stmt.setFetchSize(500);
stmt.setInt(1, domainId.getId());
var rs = stmt.executeQuery();
while (rs.next()) {
edgeUrlByPath.remove(rs.getString(1));
}
}
}
catch (Exception ex) {
return Collections.emptyList();
}
return new ArrayList<>(edgeUrlByPath.values());
}
@Override
public double getRank(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return 1;
}
@Override
public void updateDomainIndexTimestamp(EdgeDomain domain, EdgeDomainIndexingState state, EdgeDomain alias, int minIndexed) {
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=?, DOMAIN_ALIAS=?, INDEXED=GREATEST(INDEXED,?) WHERE ID=?")) {
stmt.setInt(1, state.code);
if (null == alias) {
stmt.setNull(2, Types.INTEGER);
}
else {
stmt.setInt(2, getDomainId(alias).getId());
}
stmt.setInt(3, minIndexed);
stmt.setInt(4, getDomainId(domain).getId());
stmt.executeUpdate();
connection.commit();
}
catch (SQLException throwables) {
logger.error("SQL error", throwables);
}
}
@SneakyThrows
private double getDomainQuality(Connection connection, EdgeDomain src) {
try (var stmt = connection.prepareStatement("SELECT QUALITY_RAW FROM EC_DOMAIN WHERE URL_PART=?")) {
stmt.setString(1, src.toString());
var res = stmt.executeQuery();
if (res.next()) {
var q = res.getDouble(1);
if (q > 0.5) {
logger.warn("gDQ({}) -> 1", src);
}
return 0;
}
}
catch (SQLException ex) {
logger.error("DB error", ex);
}
return -5;
}
} }
@ -50,7 +50,7 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
final TIntHashSet result = new TIntHashSet(1_000_000); final TIntHashSet result = new TIntHashSet(1_000_000);
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) { try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
stmt.setFetchSize(1000); stmt.setFetchSize(1000);
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
while (rsp.next()) { while (rsp.next()) {
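With DOMAIN_TOP denormalized onto EC_DOMAIN, the blacklist join no longer needs the EC_TOP_DOMAIN table. The surrounding loading pattern for reference — a large streamed result set collected into a primitive int set (Trove's TIntHashSet, as the class already uses; DataSource assumed):

import gnu.trove.set.hash.TIntHashSet;
import javax.sql.DataSource;
import java.sql.SQLException;

class BlacklistLoadSketch {
    static TIntHashSet loadSpamDomains(DataSource dataSource) throws SQLException {
        final TIntHashSet result = new TIntHashSet(1_000_000);
        try (var connection = dataSource.getConnection();
             var stmt = connection.prepareStatement(
                     "SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_DOMAIN.DOMAIN_TOP")) {
            stmt.setFetchSize(1000); // stream rather than buffer the whole result set
            var rsp = stmt.executeQuery();
            while (rsp.next()) {
                result.add(rsp.getInt(1));
            }
        }
        return result;
    }
}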
@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.index.radix; package nu.marginalia.wmsa.edge.index;
import nu.marginalia.wmsa.edge.index.EdgeIndexControl; import nu.marginalia.wmsa.edge.index.EdgeIndexControl;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriter; import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter;
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.service.query.Query; import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import java.io.IOException;
public class EdgeIndexControl { public class EdgeIndexControl {
@ -27,7 +29,10 @@ public class EdgeIndexControl {
System.gc(); System.gc();
} }
catch (ConversionUnnecessaryException unnecessary) { catch (ConversionUnnecessaryException unnecessary) {
// swallow quietly
}
catch (IOException e) {
e.printStackTrace();
} }
} }
@ -15,9 +15,9 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.model.*; import nu.marginalia.wmsa.edge.index.model.*;
import nu.marginalia.wmsa.edge.index.service.SearchIndexes; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
@ -5,12 +5,16 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.service.index.*; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -89,7 +93,7 @@ public class IndexServicesFactory {
} }
public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
return new SearchIndexConverter(block, id, tmpFileDir, return new SearchIndexConverter(block, id, tmpFileDir,
preconverterOutputFile.get(id), preconverterOutputFile.get(id),
indexWriteWordsFile.get(id, block.id), indexWriteWordsFile.get(id, block.id),
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.conversion;
public class ConversionUnnecessaryException extends Exception { public class ConversionUnnecessaryException extends Exception {
public ConversionUnnecessaryException() { public ConversionUnnecessaryException() {
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service; package nu.marginalia.wmsa.edge.index.conversion;
import gnu.trove.list.TIntList; import gnu.trove.list.TIntList;
import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntIntHashMap;
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
@ -6,9 +6,10 @@ import gnu.trove.set.hash.TIntHashSet;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter; import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import nu.marginalia.util.btree.BTreeWriter; import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext; import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLong;
@ -32,18 +33,24 @@ public class SearchIndexConverter {
private final long fileLength; private final long fileLength;
private final long urlsFileSize; private final long urlsFileSize;
private final Path tmpFileDir;
private final FileChannel urlsTmpFileChannel; private final FileChannel urlsTmpFileChannel;
private final int wordCount; private final int wordCount;
private final MultimapFileLong urlsTmpFileMap; private final MultimapFileLong urlsTmpFileMap;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexBlock block; private final IndexBlock block;
private final int bucketId; private final int bucketId;
@org.jetbrains.annotations.NotNull
private final File urlsFile; private final File urlsFile;
private final SearchIndexPartitioner partitioner; private final SearchIndexPartitioner partitioner;
private final TIntHashSet spamDomains; private final TIntHashSet spamDomains;
private final MultimapSorter urlTmpFileSorter; private final MultimapSorter urlTmpFileSorter;
private final static int internalSortLimit =
Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256;
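internalSortLimit is toggled with a JVM flag rather than configuration plumbing: Boolean.getBoolean reads a system property (not to be confused with Boolean.parseBoolean, which parses a string argument). Usage:

class SmallRamFlagSketch {
    public static void main(String[] args) {
        // enable with: java -Dsmall-ram=true ...
        int sortLimit = Boolean.getBoolean("small-ram") ? 1024 * 1024 : 1024 * 1024 * 256;
        System.out.println("internal sort limit: " + sortLimit);
    }
}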
@SneakyThrows @SneakyThrows
public static long wordCount(File inputFile) { public static long wordCount(File inputFile) {
try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) { try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) {
@ -52,7 +59,6 @@ public class SearchIndexConverter {
} }
} }
@SneakyThrows
@Inject @Inject
public SearchIndexConverter(IndexBlock block, public SearchIndexConverter(IndexBlock block,
int bucketId, @Named("tmp-file-dir") Path tmpFileDir, int bucketId, @Named("tmp-file-dir") Path tmpFileDir,
@ -61,13 +67,15 @@ public class SearchIndexConverter {
@Named("edge-index-write-urls-file") File outputFileUrls, @Named("edge-index-write-urls-file") File outputFileUrls,
SearchIndexPartitioner partitioner, SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist) EdgeDomainBlacklist blacklist)
throws ConversionUnnecessaryException throws ConversionUnnecessaryException, IOException
{ {
this.block = block; this.block = block;
this.bucketId = bucketId; this.bucketId = bucketId;
urlsFile = outputFileUrls; this.tmpFileDir = tmpFileDir;
this.urlsFile = outputFileUrls;
this.partitioner = partitioner; this.partitioner = partitioner;
this.spamDomains = blacklist.getSpamDomains(); this.spamDomains = blacklist.getSpamDomains();
logger.info("Converting {} ({}) {}", block.id, block, inputFile); logger.info("Converting {} ({}) {}", block.id, block, inputFile);
Files.deleteIfExists(outputFileWords.toPath()); Files.deleteIfExists(outputFileWords.toPath());
@ -89,18 +97,16 @@ public class SearchIndexConverter {
urlsFileSize = getUrlsSize(buffer, inputChannel); urlsFileSize = getUrlsSize(buffer, inputChannel);
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block); logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
long[] wordIndexTable = createWordIndexTable(outputFileWords, inputChannel); WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel);
logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block); logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
createUrlTable(tmpFileDir, buffer, raf, wordIndexTable); createUrlTable(buffer, raf, wordIndexTable);
Files.delete(tmpUrlsFile); Files.delete(tmpUrlsFile);
raf.close(); raf.close();
@ -140,99 +146,69 @@ public class SearchIndexConverter {
return reader.size; return reader.size;
} }
private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException {
logger.debug("Table size = {}", wordIndexTable.length); logger.info("Table size = {}", wordOffsetsTable.length());
int[] wordIndex = new int[wordIndexTable.length];
raf.seek(FILE_HEADER_SIZE); raf.seek(FILE_HEADER_SIZE);
var channel = raf.getChannel(); var channel = raf.getChannel();
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
var reader = new IndexReader(buffer, channel) { int[] wordWriteOffset = new int[wordOffsetsTable.length()];
new IndexReader(buffer, channel) {
@Override @Override
public void eachWord(long urlId, int wordId) throws IOException { public void eachWord(long urlId, int wordId) throws IOException {
if (wordId >= wordIndex.length) if (wordId >= wordWriteOffset.length)
return; return;
if (wordId != 0) {
if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
wordId,
wordIndex[wordId],
wordIndexTable[wordId - 1],
wordIndexTable[wordId]);
throw new IllegalStateException();
}
}
if (wordId > 0) { if (wordId > 0) {
rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId));
} else { } else {
rwf.put(wordIndex[wordId]++, translateUrl(urlId)); rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId));
} }
} }
}; }.read();
reader.read();
rwf.write(urlsTmpFileChannel); rwf.write(urlsTmpFileChannel);
} }
urlsTmpFileChannel.force(false); urlsTmpFileChannel.force(false);
logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024));
logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); if (wordOffsetsTable.length() > 0) {
logger.info("Sorting urls table");
wordOffsetsTable.forEach(urlTmpFileSorter::sort);
if (wordIndexTable.length > 0) {
logger.debug("Sorting urls table");
sortUrls(wordIndexTable);
urlsTmpFileMap.force(); urlsTmpFileMap.force();
} }
else { else {
logger.warn("urls table empty -- nothing to sort"); logger.warn("urls table empty -- nothing to sort");
} }
logger.info("Writing BTree");
long idx = 0;
try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
if (wordIndexTable[0] != 0) { wordOffsetsTable.fold((accumulatorIdx, start, length) -> {
int start = 0; // Note: The return value is accumulated into accumulatorIdx!
int end = (int) wordIndexTable[0];
idx += writer.write(idx, (int) wordIndexTable[0], return writer.write(accumulatorIdx, length,
offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
} });
for (int i = 1; i < wordIndexTable.length; i++) {
if (wordIndexTable[i] != wordIndexTable[i - 1]) {
long start = wordIndexTable[i-1];
long end = wordIndexTable[i];
idx += writer.write(idx, (int) (end-start),
offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
}
}
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); logger.error("Error while writing BTree", e);
} }
} }
@SneakyThrows private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException {
private void sortUrls(long[] wordIndices) {
urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
for (int i = 1; i < wordIndices.length; i++) {
urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
}
}
private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
inputChannel.position(FILE_HEADER_SIZE); inputChannel.position(FILE_HEADER_SIZE);
logger.debug("Table size = {}", wordCount); logger.debug("Table size = {}", wordCount);
WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE);
logger.debug("Reading words"); logger.debug("Reading words");
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service; package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
@ -10,7 +10,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank; import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank; import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank; import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.model.RankingSettings; import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -18,41 +18,28 @@ import org.slf4j.LoggerFactory;
@Singleton @Singleton
public class SearchIndexDao { public class SearchIndexDao {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private RankingDomainFetcher rankingDomains;
private final RankingSettings rankingSettings; private final RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject @Inject
public SearchIndexDao(HikariDataSource dataSource, public SearchIndexDao(HikariDataSource dataSource,
RankingDomainFetcher rankingDomains,
RankingSettings rankingSettings) RankingSettings rankingSettings)
{ {
this.dataSource = dataSource; this.dataSource = dataSource;
this.rankingDomains = rankingDomains;
this.rankingSettings = rankingSettings; this.rankingSettings = rankingSettings;
logger.info("SearchIndexDao ranking settings = {}", rankingSettings); logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
} }
@SneakyThrows
public TIntHashSet getSpamDomains() {
final TIntHashSet result = new TIntHashSet(1_000_000);
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT EC_DOMAIN.ID FROM EC_DOMAIN INNER JOIN EC_TOP_DOMAIN ON EC_DOMAIN.URL_TOP_DOMAIN_ID = EC_TOP_DOMAIN.ID INNER JOIN EC_DOMAIN_BLACKLIST ON EC_DOMAIN_BLACKLIST.URL_DOMAIN = EC_TOP_DOMAIN.URL_PART")) {
var rsp = stmt.executeQuery();
while (rsp.next()) {
result.add(rsp.getInt(1));
}
}
}
return result;
}
@SneakyThrows @SneakyThrows
public TIntHashSet goodUrls() { public TIntHashSet goodUrls() {
TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1); TIntHashSet domains = new TIntHashSet(10_000_000, 0.5f, -1);
TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1); TIntHashSet urls = new TIntHashSet(100_000_000, 0.5f, -1);
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND STATE>=0")) { try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
stmt.setFetchSize(10_000); stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery(); var rsp = stmt.executeQuery();
while (rsp.next()) { while (rsp.next()) {
@ -79,36 +66,36 @@ public class SearchIndexDao {
@SneakyThrows @SneakyThrows
public TIntList getRetroDomains() { public TIntList getRetroDomains() {
var spr = new BetterStandardPageRank(dataSource,rankingSettings.retro.toArray(String[]::new)); var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2);
} }
@SneakyThrows @SneakyThrows
public TIntList getSmallWebDomains() { public TIntList getSmallWebDomains() {
var rpr = new BetterReversePageRank(new DatabaseModule().provideConnection(), rankingSettings.small.toArray(String[]::new)); var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750); rpr.setMaxKnownUrls(750);
return rpr.pageRankWithPeripheralNodes(rpr.size(), false); return rpr.pageRankWithPeripheralNodes(rpr.size());
} }
@SneakyThrows @SneakyThrows
public TIntList getAcademiaDomains() { public TIntList getAcademiaDomains() {
var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), rankingSettings.academia.toArray(String[]::new)); var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2);
} }
@SneakyThrows @SneakyThrows
public TIntList getStandardDomains() { public TIntList getStandardDomains() {
var spr = new BuggyStandardPageRank(dataSource,rankingSettings.standard.toArray(String[]::new)); var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
return spr.pageRankWithPeripheralNodes(spr.size()/2, false); return spr.pageRankWithPeripheralNodes(spr.size()/2);
} }
@SneakyThrows @SneakyThrows
public TIntList getSpecialDomains() { public TIntList getSpecialDomains() {
TIntArrayList results = new TIntArrayList(); TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection(); try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE=2") var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
) { ) {
var rs = stmt.executeQuery(); var rs = stmt.executeQuery();
while (rs.next()) { while (rs.next()) {
View File
@ -1,11 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.query; package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.service.SearchEngineRanking;
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
View File
@ -1,10 +1,9 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
View File
@ -0,0 +1,10 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
public class WordIndexLengthsTable {
final long[] table;
public WordIndexLengthsTable(int size) {
this.table = new long[size];
}
public void increment(int idx) { table[idx]++; }
}
View File
@ -0,0 +1,67 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
import java.io.IOException;
public class WordIndexOffsetsTable {
final long[] table;
public final int numberOfUsedWords;
public WordIndexOffsetsTable(long[] table, int numberOfUsedWords) {
this.table = table;
this.numberOfUsedWords = numberOfUsedWords;
}
public int length() {
return table.length;
}
public void forEach(OffsetTableEntryConsumer o) throws IOException {
if (table[0] > 0) {
o.accept(0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
o.accept(start, length);
}
}
}
/**
* Fold over each span in the file, left to right
*/
public long fold(OffsetTableEntryFoldConsumer o) throws IOException {
long total = 0;
if (table[0] > 0) {
total = o.accept(total,0, (int) table[0]);
}
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
int length = (int) (table[i] - start);
if (length != 0) {
total += o.accept(total, start, length);
}
}
return total;
}
public long get(int i) {
return table[i];
}
public interface OffsetTableEntryConsumer {
void accept(long start, int length) throws IOException;
}
public interface OffsetTableEntryFoldConsumer {
long accept(long accumulator, long start, int length) throws IOException;
}
}
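Editor's note: a minimal usage sketch of the table above, showing the forEach contract the converter relies on when it passes urlTmpFileSorter::sort (the numbers are illustrative):

    import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;

    public class OffsetsTableDemo {
        public static void main(String[] args) throws java.io.IOException {
            // Cumulative offsets: word 0 has 2 urls, word 1 none, word 2 three.
            var table = new WordIndexOffsetsTable(new long[] {2, 2, 5}, 2);
            table.forEach((start, length) ->
                    System.out.println("sort " + length + " urls at offset " + start));
            // prints: sort 2 urls at offset 0
            //         sort 3 urls at offset 2
        }
    }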
View File
@ -0,0 +1,56 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
/** Contains a stateful table of word index offsets, initially in lengths mode
* where the table contains how many postings exist for each word; then in offsets
* mode, where the lengths are converted into the necessary offsets for each block
* of document data.
*
* Caveat! This uses the same underlying array to conserve space.
*
*/
public class WordIndexTables {
private WordIndexLengthsTable lengthsTable;
private WordIndexOffsetsTable offsetsTable;
private boolean converted = false;
public WordIndexTables(int size) {
lengthsTable = new WordIndexLengthsTable(size);
}
public WordIndexLengthsTable lengths() {
if (converted) throw new IllegalStateException("Table has been converted");
return lengthsTable;
}
public WordIndexOffsetsTable offsets() {
if (!converted) throw new IllegalStateException("Table has not been converted");
return offsetsTable;
}
public void convert() {
if (converted) throw new IllegalStateException("Table has been converted");
// Go from lengths to offsets, i.e.
// BEFORE: 1, 2, 1, 3, 0, 2
// AFTER: 1, 3, 4, 7, 7, 9
long[] table = lengthsTable.table;
int numberOfUsedWords = 0;
if (table[0] != 0) numberOfUsedWords = 1;
for (int i = 1; i < table.length; i++) {
if (table[i] != 0) {
numberOfUsedWords++;
}
table[i] += table[i-1];
}
lengthsTable = null;
offsetsTable = new WordIndexOffsetsTable(table, numberOfUsedWords);
converted = true;
}
}
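Editor's note: running the BEFORE/AFTER example from the comment above through the class (a sketch; only increment, convert and get are exercised):

    import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexTables;

    public class TablesConversionDemo {
        public static void main(String[] args) {
            int[] postings = {1, 2, 1, 3, 0, 2};       // BEFORE, as lengths
            WordIndexTables tables = new WordIndexTables(postings.length);
            for (int wordId = 0; wordId < postings.length; wordId++)
                for (int n = 0; n < postings[wordId]; n++)
                    tables.lengths().increment(wordId); // lengths mode
            tables.convert();                           // in place, same array
            for (int wordId = 0; wordId < postings.length; wordId++)
                System.out.print(tables.offsets().get(wordId) + " "); // 1 3 4 7 7 9
        }
    }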
View File
@ -0,0 +1,75 @@
package nu.marginalia.wmsa.edge.index.conversion.words;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
public class WordsTableWriter {
private final WordIndexTables table;
private final Logger logger = LoggerFactory.getLogger(getClass());
public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
public WordsTableWriter(int length) {
table = new WordIndexTables(length);
}
public void acceptWord(int wordId) {
table.lengths().increment(wordId);
}
public WordIndexOffsetsTable getTable() {
return table.offsets();
}
public void write(File file) throws IOException {
table.convert();
logger.info("Writing table - {} max", table.offsets().numberOfUsedWords);
final int tableSize = table.offsets().numberOfUsedWords;
try (var mmf = MultimapFileLong.forOutput(file.toPath(), tableSize/8L)) {
mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
long offset = 1;
var writer = new BTreeWriter(mmf, wordsBTreeContext);
writer.write(offset, tableSize, this::writeBTreeBlock);
}
}
private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
long urlFileOffset = 0;
int idx = 0;
var offsetTable = table.offsets().table;
if (offsetTable[0] != 0) {
int length = (int) offsetTable[0];
mapSlice.put(idx++, (long)length<<32);
mapSlice.put(idx++, 0);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
for (int i = 1; i < offsetTable.length; i++) {
final int length = (int)(offsetTable[i] - offsetTable[i-1]);
if (length > 0) {
mapSlice.put(idx++, (long)length << 32 | i);
mapSlice.put(idx++, urlFileOffset);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
}
}
}
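Editor's note: each data entry written above is a pair of longs, (length << 32 | wordId) followed by the word's url-file offset. A small sketch of unpacking the first long, mirroring how IndexWordsTable reads it back later in this diff:

    public class WordEntryDecodeDemo {
        public static void main(String[] args) {
            long meta = (5L << 32) | 42;                  // as packed in writeBTreeBlock
            int length = (int) (meta >>> 32);             // high half: posting count
            int wordId = (int) (meta & 0xFFFF_FFFFL);     // low half: word id
            System.out.println("wordId=" + wordId + ", length=" + length); // 42, 5
        }
    }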
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.dictionary; package nu.marginalia.wmsa.edge.index.dictionary;
import com.google.common.cache.Cache; import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheBuilder;
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.dictionary; package nu.marginalia.wmsa.edge.index.dictionary;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.dictionary; package nu.marginalia.wmsa.edge.index.dictionary;
import nu.marginalia.util.ByteFolder; import nu.marginalia.util.ByteFolder;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.DictionaryHashMap;
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.journal;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
View File
@ -1,10 +1,10 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.journal;
import io.reactivex.rxjava3.disposables.Disposable; import io.reactivex.rxjava3.disposables.Disposable;
import io.reactivex.rxjava3.schedulers.Schedulers; import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
View File
@ -1,36 +1,80 @@
package nu.marginalia.wmsa.edge.index.service.index.wordstable; package nu.marginalia.wmsa.edge.index.reader;
import com.upserve.uppend.blobs.NativeIO; import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.function.LongConsumer; import java.util.function.LongConsumer;
import static nu.marginalia.wmsa.edge.index.service.index.wordstable.WordsTableWriter.wordsBTreeContext; import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext;
public class BtreeWordsTable extends IndexWordsTable{ public class IndexWordsTable implements AutoCloseable {
private final MultimapFileLong words; protected final MultimapFileLong words;
private final BTreeReader reader; protected final BTreeReader reader;
private final BTreeHeader header; protected final BTreeHeader header;
private final int HEADER_OFFSET = 1; protected final int HEADER_OFFSET = 1;
final Logger logger = LoggerFactory.getLogger(getClass());
public BtreeWordsTable(MultimapFileLong words) { private static final int BUFFER_SIZE = 1024*1024*64;
public IndexWordsTable(MultimapFileLong words) {
this.words = words; this.words = words;
reader = new BTreeReader(words, wordsBTreeContext); reader = new BTreeReader(words, wordsBTreeContext);
header = reader.getHeader(HEADER_OFFSET); header = reader.getHeader(HEADER_OFFSET);
madvise(); madvise();
} }
private void madvise() { public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException {
var wordsFile = openWordsFile(file);
long signature = wordsFile.get(0);
if (signature == Strategy.BTREE.ordinal()) {
return new IndexWordsTable(wordsFile);
}
throw new IllegalArgumentException("Unknown signature " + signature);
}
private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
return new MultimapFileLong(wordsFile,
FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
}
public long positionForWord(int wordId) {
long offset = reader.offsetForEntry(header, wordId);
if (offset < 0) {
return -1L;
}
return words.get(offset+1);
}
public int wordLength(int wordId) {
long offset = reader.offsetForEntry(header, wordId);
if (offset < 0) {
return -1;
}
return (int)(words.get(offset) >> 32);
}
protected void madvise() {
words.advice(NativeIO.Advice.Random); words.advice(NativeIO.Advice.Random);
words.advice0(NativeIO.Advice.WillNeed); words.advice0(NativeIO.Advice.WillNeed);
var h = reader.getHeader(HEADER_OFFSET); var h = reader.getHeader(HEADER_OFFSET);
int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs()); int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());
words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length); words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
words.pokeRange(h.indexOffsetLongs(), length); words.pokeRange(h.indexOffsetLongs(), length);
} }
@ -58,31 +102,13 @@ public class BtreeWordsTable extends IndexWordsTable{
} }
} }
@Override
public long positionForWord(int wordId) {
long offset = reader.offsetForEntry(header, wordId);
if (offset < 0) {
return -1L;
}
return words.get(offset+1);
}
@Override
public int wordLength(int wordId) {
long offset = reader.offsetForEntry(header, wordId);
if (offset < 0) {
return -1;
}
return (int)(words.get(offset) >> 32);
}
@Override @Override
public void close() throws Exception { public void close() throws Exception {
words.close(); words.close();
} }
public enum Strategy {
BTREE
}
} }
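Editor's note: a usage sketch for the consolidated reader class above; the file path is illustrative and would in practice be the words file produced by WordsTableWriter during conversion:

    import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
    import java.io.RandomAccessFile;

    public class WordsTableLookupDemo {
        public static void main(String[] args) throws Exception {
            try (var raf = new RandomAccessFile("words.dat", "r"); // illustrative path
                 var table = IndexWordsTable.ofFile(raf)) {
                int wordId = 42;  // hypothetical dictionary id
                System.out.println("url file offset: " + table.positionForWord(wordId)); // -1 if absent
                System.out.println("posting count:   " + table.wordLength(wordId));      // -1 if absent
            }
        }
    }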
View File
@ -1,20 +1,18 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.reader;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import com.upserve.uppend.blobs.NativeIO; import com.upserve.uppend.blobs.NativeIO;
import io.reactivex.rxjava3.schedulers.Schedulers; import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.wmsa.edge.index.service.index.wordstable.IndexWordsTable; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.util.btree.BTreeReader; import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.util.multimap.MultimapFileLong;
import org.eclipse.jetty.util.thread.ThreadPool;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.util.concurrent.ForkJoinPool;
import java.util.stream.LongStream; import java.util.stream.LongStream;
public class SearchIndex implements AutoCloseable { public class SearchIndex implements AutoCloseable {
View File
@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.index.service.index; package nu.marginalia.wmsa.edge.index.reader;
import com.google.common.cache.Cache; import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheBuilder;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.query.IndexQueryBuilder; import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.service.query.Query; import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -105,10 +105,8 @@ public class SearchIndexReader implements AutoCloseable {
.mapToLong(idx -> idx.numUrls(word)) .mapToLong(idx -> idx.numUrls(word))
.sum() .sum()
); );
} }
public IndexBlock getBlockForResult(int searchTerm, long urlId) { public IndexBlock getBlockForResult(int searchTerm, long urlId) {
for (var block : indicesBySearchOrder) { for (var block : indicesBySearchOrder) {
var index = indices.get(block); var index = indices.get(block);
View File
@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.index.service; package nu.marginalia.wmsa.edge.index.reader;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.radix.EdgeIndexBucket; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader; import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl; import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
View File
@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.index.service.query; package nu.marginalia.wmsa.edge.index.reader.query;
import com.google.common.collect.Streams; import com.google.common.collect.Streams;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndex; import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
View File
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.service.query; package nu.marginalia.wmsa.edge.index.reader.query;
public class IndexSearchBudget { public class IndexSearchBudget {

View File
package nu.marginalia.wmsa.edge.index.service.query; package nu.marginalia.wmsa.edge.index.reader.query;
import java.util.stream.LongStream; import java.util.stream.LongStream;

View File
package nu.marginalia.wmsa.edge.index.service;
public enum SearchOrder {
ASCENDING,
REVERSED
View File

@ -1,48 +0,0 @@
package nu.marginalia.wmsa.edge.index.service.index.wordstable;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.function.LongConsumer;
public abstract class IndexWordsTable implements AutoCloseable {
final Logger logger = LoggerFactory.getLogger(getClass());
private static final int BUFFER_SIZE = 1024*1024*64;
public static IndexWordsTable ofFile(RandomAccessFile file) throws IOException {
var wordsFile = openWordsFile(file);
long signature = wordsFile.get(0);
if (signature == Strategy.BTREE.ordinal()) {
return new BtreeWordsTable(wordsFile);
}
throw new IllegalArgumentException("Unknown signature " + signature);
}
private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
return new MultimapFileLong(wordsFile,
FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
}
public abstract long positionForWord(int wordId);
public abstract int wordLength(int wordId);
public abstract void forEachWordsOffset(LongConsumer offsetConsumer);
@Override
public void close() throws Exception {
}
public record TableWordRange(long start, long end) {}
public enum Strategy {
FLAT, HASH, BTREE_OLD, BTREE
}
View File

@ -1,85 +0,0 @@
package nu.marginalia.wmsa.edge.index.service.index.wordstable;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import static nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter.urlsBTreeContext;
public class WordsTableWriter {
private final long[] table;
private final Logger logger = LoggerFactory.getLogger(getClass());
public static final BTreeContext wordsBTreeContext = new BTreeContext(7, 2, 0x0000_0000_FFFF_FFFFL, 8);
public WordsTableWriter(int length) {
table = new long[length];
}
public void acceptWord(int wordId) {
if (wordId >= table.length) {
logger.warn("Invalid word-id {}", wordId);
}
else {
table[wordId]++;
}
}
public long[] getTable() {
return table;
}
public void write(File file) throws Exception {
int tableSize = 0;
if (table[0] != 0) tableSize = 1;
for (int i = 1; i < table.length; i++) {
if (table[i] != 0) {
tableSize++;
}
table[i] += table[i-1];
}
logger.info("Writing table {} words {} max", tableSize, table.length);
writeBtreeWordsFile(file, table, tableSize);
}
private void writeBtreeWordsFile(File outputFileWords, long[] table, int tableSize) throws Exception {
try (var mmf = MultimapFileLong.forOutput(outputFileWords.toPath(), tableSize/8L)) {
mmf.put(0, IndexWordsTable.Strategy.BTREE.ordinal());
long offset = 1;
var writer = new BTreeWriter(mmf, wordsBTreeContext);
writer.write(offset, tableSize, (idx) -> {
long urlFileOffset = 0;
if (table[0] != 0) {
int length = (int) table[0];
mmf.put(idx++, (long)length<<32);
mmf.put(idx++, 0);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
for (int i = 1; i < table.length; i++) {
if (table[i] != table[i - 1]) {
int length = (int)(table[i] - table[i-1]);
mmf.put(idx++, (long)length << 32 | i);
mmf.put(idx++, urlFileOffset);
urlFileOffset += (urlsBTreeContext.calculateSize(length));
}
}
});
}
}
View File

@ -55,8 +55,11 @@ public class EdgeDomain implements WideHashable {
} }
} }
} }
}
public EdgeUrl toRootUrl() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/");
} }
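Editor's note: a brief usage sketch of the new method (domain name illustrative; EdgeDomain's string constructor is the same one used to parse hostnames elsewhere in this diff):

    import nu.marginalia.wmsa.edge.model.EdgeDomain;
    import nu.marginalia.wmsa.edge.model.EdgeUrl;

    public class RootUrlDemo {
        public static void main(String[] args) {
            EdgeUrl root = new EdgeDomain("www.example.com").toRootUrl();
            System.out.println(root); // expected: http://www.example.com/
        }
    }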
public String toString() { public String toString() {
View File
@ -1,27 +1,12 @@
package nu.marginalia.wmsa.edge.model.crawl; package nu.marginalia.wmsa.edge.model.crawl;
public enum EdgeDomainIndexingState { public enum EdgeDomainIndexingState {
ACTIVE(0), ACTIVE,
EXHAUSTED(1), EXHAUSTED,
SPECIAL(2), SPECIAL,
SOCIAL_MEDIA(3), SOCIAL_MEDIA,
BLOCKED(-1), BLOCKED,
REDIR(-2), REDIR,
ERROR(-3), ERROR,
UNKNOWN(-100); UNKNOWN
public final int code;
EdgeDomainIndexingState(int code) {
this.code = code;
}
public static EdgeDomainIndexingState fromCode(int code) {
for (var state : values()) {
if (state.code == code) {
return state;
}
}
return UNKNOWN;
}
} }
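Editor's note: with the integer codes removed, the state round-trips on the enum name alone, which is what the STATE ENUM(...) column later in this diff stores and what DomainInformationService reads back with valueOf:

    import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;

    public class StateRoundTripDemo {
        public static void main(String[] args) {
            String stored = EdgeDomainIndexingState.SPECIAL.name();       // written to EC_DOMAIN.STATE
            EdgeDomainIndexingState read = EdgeDomainIndexingState.valueOf(stored);
            System.out.println(read == EdgeDomainIndexingState.SPECIAL);  // true
        }
    }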
View File
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.model.search;
import lombok.*; import lombok.*;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.SearchOrder;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@ -21,14 +20,13 @@ public class EdgeSearchSpecification {
public final int limitTotal; public final int limitTotal;
public final String humanQuery; public final String humanQuery;
public final SearchOrder searchOrder;
public boolean stagger; public boolean stagger;
public boolean experimental; public boolean experimental;
public static EdgeSearchSpecification justIncludes(String... words) { public static EdgeSearchSpecification justIncludes(String... words) {
return new EdgeSearchSpecification( return new EdgeSearchSpecification(
IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(), IntStream.range(0, DYNAMIC_BUCKET_LENGTH+1).boxed().toList(),
Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", SearchOrder.ASCENDING, false, false); Collections.singletonList(new EdgeSearchSubquery(Arrays.asList(words), Collections.emptyList(), IndexBlock.Title)), 10, 10, 10, "", false, false);
} }
} }
View File
@ -16,25 +16,24 @@ public class EdgeUrlDetails {
public String description; public String description;
public double urlQuality; public double urlQuality;
public double urlQualityRaw;
public double domainQuality;
public int links; // DEAD
public int words; public int words;
public String format; public String format;
public int features; public int features;
public EdgePageScoreAdjustment urlQualityAdjustment;
public long rankingId;
public double termScore;
public String ip; // BROKEN public String ip; // BROKEN
public int domainState; public EdgeDomainIndexingState domainState;
public int queryLength;
public int dataHash; public int dataHash;
public EdgePageScoreAdjustment urlQualityAdjustment;
public long rankingId;
public double termScore;
public int queryLength;
public long rankingIdAdjustment() { public long rankingIdAdjustment() {
int penalty = 0; int penalty = 0;
@ -136,7 +135,7 @@ public class EdgeUrlDetails {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
} }
public boolean isSpecialDomain() { public boolean isSpecialDomain() {
return domainState == EdgeDomainIndexingState.SPECIAL.code; return domainState == EdgeDomainIndexingState.SPECIAL;
} }
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); } public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
View File
@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.SearchOrder;
import nu.marginalia.wmsa.edge.model.*; import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
@ -136,7 +135,7 @@ public class EdgeSearchOperator {
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", SearchOrder.ASCENDING, EdgeSearchProfile.YOLO.equals(profile), false); EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, 100, limitPerDomain, limitTotal, "", EdgeSearchProfile.YOLO.equals(profile), false);
return performQuery(ctx, new EdgeSearchQuery(specs), true); return performQuery(ctx, new EdgeSearchQuery(specs), true);
} }
View File
@ -1,7 +1,6 @@
package nu.marginalia.wmsa.edge.search; package nu.marginalia.wmsa.edge.search;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.SearchOrder;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@ -9,27 +8,27 @@ import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public enum EdgeSearchProfile { public enum EdgeSearchProfile {
DEFAULT("default", SearchOrder.ASCENDING, DEFAULT("default",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
0, 1), 0, 1),
MODERN("modern", SearchOrder.ASCENDING, MODERN("modern",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
2), 2),
CORPO("corpo", SearchOrder.ASCENDING, CORPO("corpo",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5, 6, 7), 4, 5, 6, 7),
YOLO("yolo", SearchOrder.ASCENDING, YOLO("yolo",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
0, 2, 1, 3, 4, 6), 0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean", SearchOrder.ASCENDING, CORPO_CLEAN("corpo-clean",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5), 4, 5),
ACADEMIA("academia", SearchOrder.ASCENDING, ACADEMIA("academia",
Collections.emptyList(), Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
3), 3),
@ -37,17 +36,15 @@ public enum EdgeSearchProfile {
public final String name; public final String name;
public final SearchOrder order;
public final List<String> additionalSearchTerm; public final List<String> additionalSearchTerm;
public final List<Integer> buckets; public final List<Integer> buckets;
public final List<IndexBlock> indexBlocks; public final List<IndexBlock> indexBlocks;
EdgeSearchProfile(String name, SearchOrder order, EdgeSearchProfile(String name,
List<String> additionalSearchTerm, List<String> additionalSearchTerm,
List<IndexBlock> indexBlocks, List<IndexBlock> indexBlocks,
int... buckets) { int... buckets) {
this.name = name; this.name = name;
this.order = order;
this.additionalSearchTerm = additionalSearchTerm; this.additionalSearchTerm = additionalSearchTerm;
this.indexBlocks = indexBlocks; this.indexBlocks = indexBlocks;
this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList());
View File
@ -32,7 +32,7 @@ import java.util.regex.Pattern;
public class SiteSearchCommand implements SearchCommandInterface { public class SiteSearchCommand implements SearchCommandInterface {
private final EdgeDataStoreDao dataStoreDao; private final EdgeDataStoreDao dataStoreDao;
private final EdgeSearchOperator searchOperator; private final EdgeSearchOperator searchOperator;
private DomainInformationService domainInformationService; private final DomainInformationService domainInformationService;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final MustacheRenderer<DomainInformation> siteInfoRenderer; private final MustacheRenderer<DomainInformation> siteInfoRenderer;
@ -91,7 +91,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
logger.info("Fetching Site Info: {}", word); logger.info("Fetching Site Info: {}", word);
var results = domainInformationService.domainInfo(word) var results = domainInformationService.domainInfo(word)
.orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList())); .orElseGet(() -> new DomainInformation(null, false, 0, 0, 0, 0, 0, 0, EdgeDomainIndexingState.UNKNOWN, Collections.emptyList()));
logger.debug("Results = {}", results); logger.debug("Results = {}", results);
View File
@ -18,7 +18,6 @@ public class DomainInformation {
int pagesIndexed; int pagesIndexed;
int incomingLinks; int incomingLinks;
int outboundLinks; int outboundLinks;
double nominalQuality;
double ranking; double ranking;
EdgeDomainIndexingState state; EdgeDomainIndexingState state;
View File
@ -138,7 +138,6 @@ public class QueryFactory {
.subqueries(subqueries) .subqueries(subqueries)
.limitByBucket(50) .limitByBucket(50)
.limitTotal(100) .limitTotal(100)
.searchOrder(profile.order)
.humanQuery(query) .humanQuery(query)
.buckets(profile.buckets); .buckets(profile.buckets);
View File
@ -107,7 +107,7 @@ public class SearchResultDecorator {
private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) { private double calculateTermScore(IndexBlock block, EdgeSearchResultItem resultItem, EdgeUrlDetails details) {
return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength) return valuator.evaluateTerms(resultItem.scores, block, details.words) / Math.sqrt(1 + resultItem.queryLength)
+ ((details.domainState == EdgeDomainIndexingState.SPECIAL.code) ? 1.25 : 0); + ((details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0);
} }
View File

@ -1,24 +1,43 @@
package nu.marginalia.wmsa.edge.search.siteinfo; package nu.marginalia.wmsa.edge.search.siteinfo;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.search.model.DomainInformation; import nu.marginalia.wmsa.edge.search.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject; import javax.inject.Inject;
import javax.inject.Singleton; import javax.inject.Singleton;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
/*
   TODO: This class needs refactoring; most of these SQL queries are
   redundant and could be collapsed into a single query that fetches
   all the information at once (see the sketch at the end of this
   file's diff).
 */
@Singleton @Singleton
public class DomainInformationService { public class DomainInformationService {
private EdgeDataStoreDao dataStore; private EdgeDataStoreDaoImpl dataStoreDao;
private HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject @Inject
public DomainInformationService(EdgeDataStoreDao dataStore) { public DomainInformationService(
this.dataStore = dataStore; EdgeDataStoreDaoImpl dataStoreDao,
HikariDataSource dataSource) {
this.dataStoreDao = dataStoreDao;
this.dataSource = dataSource;
} }
@ -28,29 +47,28 @@ public class DomainInformationService {
if (domainId == null) { if (domainId == null) {
return Optional.empty(); return Optional.empty();
} }
EdgeDomain domain = dataStore.getDomain(domainId); EdgeDomain domain = dataStoreDao.getDomain(domainId);
boolean blacklisted = dataStore.isBlacklisted(domain); boolean blacklisted = isBlacklisted(domain);
int pagesKnown = dataStore.getPagesKnown(domainId); int pagesKnown = getPagesKnown(domainId);
int pagesVisited = dataStore.getPagesVisited(domainId); int pagesVisited = getPagesVisited(domainId);
int pagesIndexed = dataStore.getPagesIndexed(domainId); int pagesIndexed = getPagesIndexed(domainId);
int incomingLinks = dataStore.getIncomingLinks(domainId); int incomingLinks = getIncomingLinks(domainId);
int outboundLinks = dataStore.getOutboundLinks(domainId); int outboundLinks = getOutboundLinks(domainId);
double rank = Math.round(10000.0*(1.0-dataStore.getRank(domainId)))/100; double rank = Math.round(10000.0*(1.0-getRank(domainId)))/100;
EdgeDomainIndexingState state = dataStore.getDomainState(domainId); EdgeDomainIndexingState state = getDomainState(domainId);
double nominalQuality = Math.round(100*100*Math.exp(dataStore.getDomainQuality(domainId)))/100.; List<EdgeDomain> linkingDomains = getLinkingDomains(domainId);
List<EdgeDomain> linkingDomains = dataStore.getLinkingDomains(domainId);
return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, nominalQuality, rank, state, linkingDomains)); return Optional.of(new DomainInformation(domain, blacklisted, pagesKnown, pagesVisited, pagesIndexed, incomingLinks, outboundLinks, rank, state, linkingDomains));
} }
private EdgeId<EdgeDomain> getDomainFromPartial(String site) { private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
try { try {
return dataStore.getDomainId(new EdgeDomain(site)); return dataStoreDao.getDomainId(new EdgeDomain(site));
} }
catch (Exception ex) { catch (Exception ex) {
try { try {
return dataStore.getDomainId(new EdgeDomain(site)); return dataStoreDao.getDomainId(new EdgeDomain(site));
} }
catch (Exception ex2) { catch (Exception ex2) {
return null; return null;
@ -58,4 +76,178 @@ public class DomainInformationService {
} }
} }
@SneakyThrows
public boolean isBlacklisted(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN_BLACKLIST WHERE URL_DOMAIN=?")) {
stmt.setString(1, domain.domain);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return true;
} else {
return false;
}
}
}
}
@SneakyThrows
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return 0;
}
}
@SneakyThrows
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
return -5;
}
}
public EdgeDomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return EdgeDomainIndexingState.valueOf(rsp.getString(1));
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return EdgeDomainIndexingState.ERROR;
}
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
}
return results;
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return Collections.emptyList();
}
public double getRank(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.getId());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
}
} catch (Exception ex) {
logger.error("DB error", ex);
}
} catch (SQLException throwables) {
throwables.printStackTrace();
}
return 1;
}
} }
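Editor's note: a sketch of the consolidation the TODO at the top of this class suggests. The join and column list are assumptions inferred from the individual queries above, not code from the commit:

    import javax.sql.DataSource;

    public class DomainInfoQuerySketch {
        // Hypothetical single query replacing the per-field lookups above.
        static final String QUERY = """
                SELECT M.KNOWN_URLS, M.VISITED_URLS, M.GOOD_URLS,
                       D.STATE, IFNULL(D.RANK, 1)
                FROM EC_DOMAIN D
                LEFT JOIN DOMAIN_METADATA M ON M.ID = D.ID
                WHERE D.ID = ?
                """;

        // dataSource would be the same HikariDataSource injected above.
        static void printInfo(DataSource dataSource, int domainId) throws Exception {
            try (var connection = dataSource.getConnection();
                 var stmt = connection.prepareStatement(QUERY)) {
                stmt.setInt(1, domainId);
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    System.out.printf("known=%d visited=%d good=%d state=%s rank=%.2f%n",
                            rsp.getInt(1), rsp.getInt(2), rsp.getInt(3),
                            rsp.getString(4), rsp.getDouble(5));
                }
            }
        }
    }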
View File
@ -3,12 +3,13 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl; import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.model.RankingSettings; import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.service.SearchIndexDao; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexDao;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.roaringbitmap.longlong.Roaring64Bitmap; import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -59,7 +60,9 @@ public class IndexMergerMain {
} }
var hikari = new DatabaseModule().provideConnection(); var hikari = new DatabaseModule().provideConnection();
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, new RankingSettings())); var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var partitioner = new SearchIndexPartitioner(new SearchIndexDao(hikari, domains, new RankingSettings()));
var blacklist = new EdgeDomainBlacklistImpl(hikari); var blacklist = new EdgeDomainBlacklistImpl(hikari);
new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist); new IndexMergerMain(file1, file2, outputFile, partitioner, blacklist);
View File
@ -1,24 +1,11 @@
DROP TABLE IF EXISTS EC_URL_LINK; DROP TABLE IF EXISTS DOMAIN_METADATA;
DROP VIEW IF EXISTS EC_PAGE_VIEW;
DROP TABLE IF EXISTS DISC_DOMAIN_TAG;
DROP TABLE IF EXISTS DISC_TAG;
DROP TABLE IF EXISTS DISC_USER;
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
DROP TABLE IF EXISTS EC_FEED_URL; DROP TABLE IF EXISTS EC_FEED_URL;
DROP TABLE IF EXISTS EC_DOMAIN_LINK; DROP TABLE IF EXISTS EC_DOMAIN_LINK;
DROP TABLE IF EXISTS EC_PAGE_DATA; DROP TABLE IF EXISTS EC_PAGE_DATA;
DROP TABLE IF EXISTS EC_URL; DROP TABLE IF EXISTS EC_URL;
DROP TABLE IF EXISTS EC_DOMAIN_NEIGHBORS;
DROP TABLE IF EXISTS EC_DOMAIN; DROP TABLE IF EXISTS EC_DOMAIN;
DROP TABLE IF EXISTS EC_TOP_DOMAIN;
DROP TABLE IF EXISTS EC_URL_DETAILS;
DROP VIEW IF EXISTS EC_URL_VIEW;
DROP VIEW IF EXISTS EC_URL_PART_HASH;
DROP TABLE IF EXISTS EC_URL_WORD;
DROP TABLE IF EXISTS EC_DICTIONARY;
DROP TABLE IF EXISTS DOMAIN_METADATA;
CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
ID INT PRIMARY KEY, ID INT PRIMARY KEY,
@@ -27,52 +14,31 @@ CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
     GOOD_URLS INT DEFAULT 0
 );

-CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
-    ID INT PRIMARY KEY AUTO_INCREMENT,
-    URL_PART VARCHAR(255) UNIQUE NOT NULL,
-    ALIVE BOOLEAN DEFAULT TRUE NOT NULL
-)
-CHARACTER SET utf8mb4
-COLLATE utf8mb4_unicode_ci;
-
 CREATE TABLE IF NOT EXISTS EC_DOMAIN (
     ID INT PRIMARY KEY AUTO_INCREMENT,
-    URL_PART VARCHAR(255) UNIQUE NOT NULL,
-    INDEXED INT DEFAULT 0 NOT NULL,
-    QUALITY DOUBLE DEFAULT -5 NOT NULL,
-    QUALITY_RAW DOUBLE DEFAULT -5 NOT NULL,
-    QUALITY_ORIGINAL DOUBLE DEFAULT -5 NOT NULL,
-    URL_TOP_DOMAIN_ID INT NOT NULL,
-    URL_SUBDOMAIN VARCHAR(255) NOT NULL,
-    STATE INT DEFAULT 0 NOT NULL,
+    DOMAIN_NAME VARCHAR(255) UNIQUE NOT NULL,
+    DOMAIN_TOP VARCHAR(255) NOT NULL,
+    INDEXED INT DEFAULT 0 NOT NULL COMMENT "~number of documents visited / 100",
+    STATE ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN') NOT NULL DEFAULT 'active' COMMENT "@see EdgeDomainIndexingState",
     RANK DOUBLE,
     DOMAIN_ALIAS INTEGER,
+    IP VARCHAR(32),
     INDEX_DATE TIMESTAMP DEFAULT NOW(),
     DISCOVER_DATE TIMESTAMP DEFAULT NOW(),
-    FOREIGN KEY (URL_TOP_DOMAIN_ID) REFERENCES EC_TOP_DOMAIN(ID) ON DELETE CASCADE
-)
-CHARACTER SET utf8mb4
-COLLATE utf8mb4_unicode_ci;
-
-CREATE TABLE IF NOT EXISTS EC_DOMAIN_HISTORY (
-    ID INT PRIMARY KEY AUTO_INCREMENT,
-    URL_PART VARCHAR(255) UNIQUE NOT NULL,
-    QUALITY_MEASURE DOUBLE DEFAULT -5 NOT NULL,
-    INBOUND_LINKS INT DEFAULT 1,
-    LINK_ADJUSTED_QUALITY DOUBLE GENERATED ALWAYS AS (0.3*QUALITY_MEASURE + 0.7*QUALITY_MEASURE / GREATEST(1, INBOUND_LINKS)),
-    RANK DOUBLE
+    IS_ALIVE BOOLEAN AS (STATE='ACTIVE' OR STATE='EXHAUSTED' OR STATE='SPECIAL' OR STATE='SOCIAL_MEDIA') VIRTUAL
 )
 CHARACTER SET utf8mb4
 COLLATE utf8mb4_unicode_ci;

 CREATE TABLE IF NOT EXISTS EC_DOMAIN_BLACKLIST (
     ID INT PRIMARY KEY AUTO_INCREMENT,
     URL_DOMAIN VARCHAR(255) UNIQUE NOT NULL
 )
 CHARACTER SET utf8mb4
 COLLATE utf8mb4_unicode_ci;
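STATE is now an enum and IS_ALIVE a generated column derived from it, so consumers can filter on liveness without restating the four crawlable states. A hedged JDBC sketch of such a consumer (the class is illustrative; only the table and column names come from the schema above):

import javax.sql.DataSource;
import java.sql.SQLException;

class AliveDomains {
    // The virtual IS_ALIVE column evaluates the STATE check server-side.
    static void print(DataSource dataSource) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID, DOMAIN_NAME FROM EC_DOMAIN WHERE IS_ALIVE");
             var rs = stmt.executeQuery()) {
            while (rs.next()) {
                System.out.println(rs.getInt("ID") + " " + rs.getString("DOMAIN_NAME"));
            }
        }
    }
}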
@@ -81,18 +47,15 @@ CREATE TABLE IF NOT EXISTS EC_URL (
     ID INT PRIMARY KEY AUTO_INCREMENT,
     DOMAIN_ID INT NOT NULL,
     PROTO ENUM('http','https','gemini') NOT NULL,
-    URL VARCHAR(255) NOT NULL,
+    PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
     PORT INT,
+    PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
     VISITED BOOLEAN NOT NULL DEFAULT FALSE,
-    DATA_HASH INTEGER,
-    QUALITY_MEASURE DOUBLE,
     STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
-    IP VARCHAR(32),
-    CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
+    CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
     FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
 )
 CHARACTER SET utf8mb4
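Uniqueness per domain is now enforced on an 8-byte PATH_HASH instead of the VARCHAR URL, which keeps the unique index compact and collation-independent (PATH itself is utf8mb4_bin). The diff does not show how PATH_HASH is computed; the sketch below uses FNV-1a purely as a stand-in for whatever stable 64-bit hash the loader actually applies:

class PathHash {
    // Stand-in only: any stable 64-bit hash of PATH satisfies the
    // UNIQUE (DOMAIN_ID, PATH_HASH) constraint's role.
    static long fnv1a64(String path) {
        long hash = 0xcbf29ce484222325L; // FNV offset basis
        for (int i = 0; i < path.length(); i++) {
            hash ^= path.charAt(i);
            hash *= 0x100000001b3L;      // FNV prime
        }
        return hash;
    }
}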
@@ -101,13 +64,15 @@ COLLATE utf8mb4_unicode_ci;
 CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
     ID INT PRIMARY KEY AUTO_INCREMENT,
-    TITLE VARCHAR(255),
-    DESCRIPTION VARCHAR(255),
-    WORDS_DISTINCT INTEGER,
-    WORDS_TOTAL INTEGER,
-    FORMAT VARCHAR(8),
-    FEATURES INT,
+    TITLE VARCHAR(255) NOT NULL,
+    DESCRIPTION VARCHAR(255) NOT NULL,
+    WORDS_TOTAL INTEGER NOT NULL,
+    FORMAT ENUM('PLAIN', 'UNKNOWN', 'HTML123', 'HTML4', 'XHTML', 'HTML5', 'MARKDOWN') NOT NULL,
+    FEATURES INT COMMENT "Bit-encoded feature set of document, @see HtmlFeature" NOT NULL,
+    DATA_HASH INTEGER NOT NULL,
+    QUALITY DOUBLE NOT NULL,
     FOREIGN KEY (ID) REFERENCES EC_URL(ID) ON DELETE CASCADE
 )
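FEATURES packs a document's feature set into one int, per the @see HtmlFeature comment; the loader test later in this commit round-trips it through HtmlFeature.encode(...) and isAffiliate(). A generic sketch of ordinal bit-packing with a stand-in enum, since the project's actual bit layout is not part of this diff:

import java.util.Set;

enum Feature { AFFILIATE_LINK, JAVASCRIPT, COOKIES }

class FeatureMask {
    // Each enum constant claims the bit at its ordinal position.
    static int encode(Set<Feature> features) {
        int mask = 0;
        for (Feature f : features) {
            mask |= 1 << f.ordinal();
        }
        return mask;
    }

    static boolean has(int mask, Feature f) {
        return (mask & (1 << f.ordinal())) != 0;
    }
}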
@@ -115,13 +80,9 @@ CHARACTER SET utf8mb4
 COLLATE utf8mb4_unicode_ci;

 CREATE TABLE EC_FEED_URL (
-    ID INT PRIMARY KEY AUTO_INCREMENT,
-    DOMAIN_ID INT NOT NULL,
-    PROTO VARCHAR(8) NOT NULL,
-    URL VARCHAR(255) NOT NULL,
-    PORT INT,
-    CONSTRAINT CONS UNIQUE (DOMAIN_ID, URL),
+    URL VARCHAR(255) PRIMARY KEY,
+    DOMAIN_ID INT,
     FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
 )
 CHARACTER SET utf8mb4
@@ -150,92 +111,63 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
     FOREIGN KEY (DEST_DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
 );

-CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK_AGGREGATE (
-    DOMAIN_ID INT PRIMARY KEY NOT NULL,
-    LINKS INT
-);
-
 CREATE OR REPLACE VIEW EC_URL_VIEW AS
 SELECT
-    EC_DOMAIN.URL_PART AS URL_DOMAIN,
-    EC_URL.URL AS URL_PATH,
-    EC_TOP_DOMAIN.URL_PART AS URL_TOP,
+    IF(PORT IS NULL,
+       CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
+       CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
+       AS URL,
+    EC_URL.PATH_HASH AS PATH_HASH,
+    EC_URL.PATH AS PATH,
+    EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
+    EC_DOMAIN.DOMAIN_TOP AS DOMAIN_TOP,
     EC_URL.ID AS ID,
     EC_DOMAIN.ID AS DOMAIN_ID,
-    EC_TOP_DOMAIN.ID AS TOP_DOMAIN_ID,
-    EC_URL.PROTO AS URL_PROTO,
-    EC_URL.PORT AS URL_PORT,
     EC_URL.VISITED AS VISITED,
-    EC_URL.DATA_HASH AS DATA_HASH,
-    EC_URL.QUALITY_MEASURE AS URL_QUALITY_MEASURE,
-    EC_DOMAIN.QUALITY AS DOMAIN_QUALITY_MEASURE,
-    EC_DOMAIN.QUALITY_RAW AS QUALITY_RAW,
+    EC_PAGE_DATA.QUALITY AS QUALITY,
+    EC_PAGE_DATA.DATA_HASH AS DATA_HASH,
     EC_PAGE_DATA.TITLE AS TITLE,
     EC_PAGE_DATA.DESCRIPTION AS DESCRIPTION,
-    EC_URL.IP AS IP,
-    EC_DOMAIN.STATE AS STATE,
     EC_PAGE_DATA.WORDS_TOTAL AS WORDS_TOTAL,
     EC_PAGE_DATA.FORMAT AS FORMAT,
     EC_PAGE_DATA.FEATURES AS FEATURES,
+    EC_DOMAIN.IP AS IP,
+    EC_DOMAIN.STATE AS STATE,
     EC_DOMAIN.RANK AS RANK,
     EC_DOMAIN.STATE AS DOMAIN_STATE
 FROM EC_URL
 LEFT JOIN EC_PAGE_DATA
     ON EC_PAGE_DATA.ID = EC_URL.ID
 INNER JOIN EC_DOMAIN
-    ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID
-INNER JOIN EC_TOP_DOMAIN
-    ON EC_DOMAIN.URL_TOP_DOMAIN_ID=EC_TOP_DOMAIN.ID;
-
-CREATE OR REPLACE VIEW EC_DISCOVER_TASKS_VIEW AS
-SELECT
-    ID,
-    URL_PART
-FROM EC_DOMAIN
-WHERE
-    DOMAIN_ALIAS IS NULL
-    AND INDEXED = 0
-ORDER BY QUALITY DESC, ID ASC;
+    ON EC_URL.DOMAIN_ID = EC_DOMAIN.ID;

 CREATE OR REPLACE VIEW EC_RELATED_LINKS_VIEW AS
 SELECT
     SOURCE_DOMAIN_ID,
-    SOURCE_DOMAIN.URL_PART AS SOURCE_URL,
-    SOURCE_TOP_DOMAIN.URL_PART AS SOURCE_TOP_URL,
+    SOURCE_DOMAIN.DOMAIN_NAME AS SOURCE_DOMAIN,
+    SOURCE_DOMAIN.DOMAIN_TOP AS SOURCE_TOP_DOMAIN,
     DEST_DOMAIN_ID,
-    DEST_DOMAIN.URL_PART AS DEST_URL,
-    DEST_TOP_DOMAIN.URL_PART AS DEST_TOP_URL
+    DEST_DOMAIN.DOMAIN_NAME AS DEST_DOMAIN,
+    DEST_DOMAIN.DOMAIN_TOP AS DEST_TOP_DOMAIN
 FROM EC_DOMAIN_LINK
 INNER JOIN EC_DOMAIN AS SOURCE_DOMAIN
     ON SOURCE_DOMAIN.ID=SOURCE_DOMAIN_ID
-INNER JOIN EC_TOP_DOMAIN AS SOURCE_TOP_DOMAIN
-    ON SOURCE_TOP_DOMAIN.ID=SOURCE_DOMAIN.URL_TOP_DOMAIN_ID
 INNER JOIN EC_DOMAIN AS DEST_DOMAIN
     ON DEST_DOMAIN.ID=DEST_DOMAIN_ID
-INNER JOIN EC_TOP_DOMAIN AS DEST_TOP_DOMAIN
-    ON DEST_TOP_DOMAIN.ID=DEST_DOMAIN.URL_TOP_DOMAIN_ID
 ;

 CREATE OR REPLACE VIEW EC_RELATED_LINKS_IN AS
 SELECT
     IN_URL.ID AS SRC_URL_ID,
-    IN_URL.QUALITY_MEASURE AS SRC_URL_QUALITY,
-    OUT_URL.ID AS DEST_URL_ID,
-    OUT_URL.QUALITY_MEASURE AS DEST_URL_QUALITY
-FROM EC_URL AS IN_URL
-INNER JOIN EC_DOMAIN_LINK
-    ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
-INNER JOIN EC_URL AS OUT_URL
-    ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
-WHERE IN_URL.VISITED=TRUE
-  AND IN_URL.DATA_HASH IS NOT NULL
-  AND OUT_URL.VISITED=TRUE
-  AND OUT_URL.DATA_HASH IS NOT NULL;
-
-CREATE TABLE IF NOT EXISTS EC_DOMAIN_BACKLINKS (
-    ID INT PRIMARY KEY,
-    LINKEDNESS INT
-);
+    OUT_URL.ID AS DEST_URL_ID
+FROM EC_DOMAIN_LINK
+INNER JOIN EC_URL AS IN_URL ON IN_URL.DOMAIN_ID=EC_DOMAIN_LINK.SOURCE_DOMAIN_ID
+INNER JOIN EC_URL AS OUT_URL ON OUT_URL.DOMAIN_ID=EC_DOMAIN_LINK.DEST_DOMAIN_ID
+WHERE IN_URL.VISITED AND IN_URL.STATE = 'ok'
+  AND OUT_URL.VISITED AND OUT_URL.STATE = 'ok';

 CREATE TABLE IF NOT EXISTS EC_API_KEY (
     LICENSE_KEY VARCHAR(255) UNIQUE,
@@ -245,16 +177,8 @@ CREATE TABLE IF NOT EXISTS EC_API_KEY (
     RATE INT DEFAULT 10
 );

-CREATE INDEX IF NOT EXISTS EC_DOMAIN_RANK_INDEX ON EC_DOMAIN (RANK);
-CREATE INDEX IF NOT EXISTS EC_DOMAIN_QUALITY_INDEX ON EC_DOMAIN (QUALITY,STATE);
 CREATE INDEX IF NOT EXISTS EC_DOMAIN_INDEXED_INDEX ON EC_DOMAIN (INDEXED);
-CREATE INDEX IF NOT EXISTS EC_DOMAIN_ID_INDEXED_INDEX ON EC_DOMAIN (ID, INDEXED);
-CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, INDEXED, QUALITY);
-CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
-CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
-CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
+CREATE INDEX IF NOT EXISTS EC_DOMAIN_TOP_DOMAIN ON EC_DOMAIN (DOMAIN_TOP);
 ---;
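EC_URL_VIEW now assembles the full URL in SQL, adding the :PORT segment only when PORT is non-NULL. For readers tracing URL handling on the Java side, the same rule transcribed directly (illustrative helper, not a class in this codebase):

import java.util.OptionalInt;

class UrlAssembly {
    // Mirrors the IF(PORT IS NULL, ...) expression in EC_URL_VIEW:
    // PROTO "://" DOMAIN_NAME [":" PORT] PATH
    static String assemble(String proto, String domainName, OptionalInt port, String path) {
        if (port.isEmpty()) {
            return proto + "://" + domainName + path;
        }
        return proto + "://" + domainName + ":" + port.getAsInt() + path;
    }
}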

View File

@@ -10,5 +10,4 @@ Pages Known: {{pagesKnown}}
 Pages Indexed: {{pagesKnown}}
 Inbound Links: {{inboundLinks}}
 Outbound Links: {{outboundLinks}}
-Nominal Quality: {{nominalQuality}}%
 Crawl Ranking: {{ranking}}%

View File

@@ -37,7 +37,6 @@
 <div class="card info">
     <h2>Links</h2>
     <p class="description">
-        Nominal Quality: {{nominalQuality}}%<br/>
         Crawl Ranking: {{ranking}}%<br/>
         Incoming Links: {{incomingLinks}} <br/>
         Outbound Links: {{outboundLinks}} <br/>

View File

@@ -43,7 +43,7 @@ public class TestUtil {
         logger.info("Running script {}", scriptFile);
         try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
              var stmt = conn.createStatement()) {
-            for (String s : new String(scriptStream.readAllBytes()).split(";")) {
+            for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) {
                 if (!s.isBlank()) {
                     try {
                         Assertions.assertTrue(stmt.executeUpdate(s) >= 0);
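The new delimiter is what lets TestUtil run the schema above verbatim: the file ends its statement section with a ---; marker, and splitting on the regex (;|---) reduces that marker to blank fragments which the isBlank() guard discards. A self-contained illustration:

import java.util.Arrays;

class SplitDemo {
    public static void main(String[] args) {
        String script = "CREATE TABLE A (X INT);\n---;\nCREATE TABLE B (Y INT);";
        // Splits at statement-ending ';' and at the '---' marker alike,
        // so the marker survives only as blank fragments.
        Arrays.stream(script.split("(;|---)"))
              .filter(s -> !s.isBlank())
              .map(String::trim)
              .forEach(System.out::println); // prints the two CREATE statements
    }
}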

View File

@@ -90,10 +90,10 @@ class BTreeWriterTest {
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + 2L*i, data[i]);
-                    mmf.put(offset + 2L*i + 1, i);
+                    slice.put(2L*i, data[i]);
+                    slice.put(2L*i + 1, i);
                 }
             });
             mmf.force();
@@ -133,10 +133,10 @@ class BTreeWriterTest {
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + 2L*i, data[i]);
-                    mmf.put(offset + 2L*i + 1, i);
+                    slice.put(2L*i, data[i]);
+                    slice.put(2L*i + 1, i);
                 }
             });
             mmf.force();
@@ -182,9 +182,9 @@ class BTreeWriterTest {
     try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + i, data[i]);
+                    slice.put(i, data[i]);
                 }
             });
             mmf.force();
@@ -235,9 +235,9 @@ class BTreeWriterTest {
     try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + i, data[i]);
+                    slice.put(i, data[i]);
                 }
             });
             mmf.force();
@@ -288,10 +288,10 @@ class BTreeWriterTest {
     try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
         {
             var writer = new BTreeWriter(mmf, ctx);
-            writer.write(0, toPut.size(), (offset) -> {
+            writer.write(0, toPut.size(), (slice) -> {
                 for (int i = 0; i < data.length; i++) {
-                    mmf.put(offset + i*2L, data[i]);
-                    mmf.put(offset + i*2L+1, i);
+                    slice.put(i*2L, data[i]);
+                    slice.put(i*2L+1, i);
                 }
             });
             mmf.force();
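Across all five hunks the callback's parameter changed from an absolute file offset to a slice addressed from zero, moving the base-offset arithmetic out of every call site. A sketch of what such a slice amounts to, with a stand-in interface since only put() is visible in these tests:

// Stand-in for the memory-mapped file; only put() matters for this sketch.
interface LongStore {
    void put(long offset, long value);
}

class Slice {
    private final LongStore file;
    private final long base;

    Slice(LongStore file, long base) {
        this.file = file;
        this.base = base;
    }

    // Callers write at slice-relative indices; the slice adds the base once.
    void put(long idx, long value) {
        file.put(base + idx, value);
    }
}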

View File

@@ -27,7 +27,7 @@ class LongPairHashMapTest {
         try {
             RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
             MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
-            var lphm = new LongPairHashMap(mmf, 1024);
+            var lphm = LongPairHashMap.createNew(mmf, 1024);
             toPut.forEach(i -> {
                 lphm.put(new LongPairHashMap.CellData(i, i));
             });
@@ -36,7 +36,7 @@ class LongPairHashMapTest {
             RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
             MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
-            var lphm2 = new LongPairHashMap(mmf2);
+            var lphm2 = LongPairHashMap.loadExisting(mmf2);
             toPut.forEach(i -> {
                 Assertions.assertTrue(lphm2.get(i).isSet());
                 Assertions.assertEquals(i, (int) lphm2.get(i).getKey());
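The constructor overloads new LongPairHashMap(mmf, size) and new LongPairHashMap(mmf) gave no hint which call initialized a table and which re-opened one; the named factories make that explicit at the call site. The shape of the pattern, with internals reduced to comments because they are not shown in this diff:

class PairMapSketch {
    private final long capacity;

    private PairMapSketch(long capacity) {
        this.capacity = capacity;
    }

    // Create-path: size (and presumably zero) a fresh table.
    static PairMapSketch createNew(long capacity) {
        return new PairMapSketch(capacity);
    }

    // Load-path: trust the header already on disk and map the table as-is.
    static PairMapSketch loadExisting(long capacityFromHeader) {
        return new PairMapSketch(capacityFromHeader);
    }
}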

View File

@@ -0,0 +1,48 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
@Testcontainers
class SqlLoadDomainLinksTest {
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetworkAliases("mariadb");
HikariDataSource dataSource;
LoaderData loaderData;
@BeforeEach
public void setUp() {
dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
var loadDomains = new SqlLoadDomains(dataSource);
loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
}
@AfterEach
public void tearDown() {
dataSource.close();
}
@Test
public void loadDomainLinks() {
var loader = new SqlLoadDomainLinks(dataSource);
loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
}
}

View File

@@ -0,0 +1,52 @@
package nu.marginalia.wmsa.edge.converting.loader;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import static org.junit.jupiter.api.Assertions.*;
@Testcontainers
class SqlLoadDomainsTest {
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetworkAliases("mariadb");
@Test
public void loadDomain() {
try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) {
var loadDomains = new SqlLoadDomains(dataSource);
var loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
}
}
@Test
public void loadDomains() {
try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) {
var loadDomains = new SqlLoadDomains(dataSource);
var loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0);
}
}
}

View File

@@ -0,0 +1,94 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@Testcontainers
class SqlLoadProcessedDocumentTest {
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetworkAliases("mariadb");
HikariDataSource dataSource;
LoaderData loaderData;
EdgeDataStoreDaoImpl dataStoreDao;
@BeforeEach
public void setUp() throws URISyntaxException {
dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
var loadDomains = new SqlLoadDomains(dataSource);
var loadUrls = new SqlLoadUrls(dataSource);
loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")});
}
@AfterEach
public void tearDown() {
dataStoreDao.clearCaches();
dataSource.close();
}
@Test
public void loadProcessedDocument() throws URISyntaxException {
var loader = new SqlLoadProcessedDocument(dataSource);
var url = new EdgeUrl("https://www.marginalia.nu/");
loader.load(loaderData, List.of(new LoadProcessedDocument(
url,
EdgeUrlState.OK,
"TITLE",
"DESCR",
HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
EdgeHtmlStandard.HTML5,
100,
12345,
-3.14
)));
var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))));
assertEquals(1, details.size());
var urlDetails = details.get(0);
assertEquals("TITLE", urlDetails.getTitle());
assertEquals("DESCR", urlDetails.getDescription());
assertTrue(urlDetails.isAffiliate());
assertEquals(100, urlDetails.words);
assertEquals(12345, urlDetails.dataHash);
assertEquals(-3.14, urlDetails.getUrlQuality());
}
}

View File

@@ -0,0 +1,54 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
@Testcontainers
class SqlLoadProcessedDomainTest {
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetworkAliases("mariadb");
HikariDataSource dataSource;
LoaderData loaderData;
@BeforeEach
public void setUp() {
dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
var loadDomains = new SqlLoadDomains(dataSource);
loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
}
@AfterEach
public void tearDown() {
dataSource.close();
}
@Test
public void loadProcessedDomain() {
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
}
@Test
public void loadDomainAlias() {
var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu")));
}
}

View File

@@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.converting.loader;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.net.URISyntaxException;
@Testcontainers
class SqlLoadUrlsTest {
@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withInitScript("sql/edge-crawler-cache.sql")
.withNetworkAliases("mariadb");
HikariDataSource dataSource;
LoaderData loaderData;
@BeforeEach
public void setUp() {
dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
var loadDomains = new SqlLoadDomains(dataSource);
loaderData = new LoaderData(10);
loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
}
@AfterEach
public void tearDown() {
dataSource.close();
}
@Test
public void loadUrl() throws URISyntaxException {
var loadUrls = new SqlLoadUrls(dataSource);
loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") });
}
}

View File

@@ -1,11 +1,11 @@
 package nu.marginalia.wmsa.edge.index.service;

 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

View File

@@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service;
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.util.TestUtil;
-import nu.marginalia.wmsa.client.exception.RemoteException;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.EdgeIndexService;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.EdgeId;
@@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode;
 import org.junit.jupiter.api.parallel.ResourceLock;
 import spark.Spark;

-import java.io.File;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
@@ -31,7 +30,6 @@ import java.util.List;
 import java.util.stream.Collectors;

 import static nu.marginalia.util.TestUtil.getConnection;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;

View File

@@ -1,89 +0,0 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
class SearchIndexConverterTest {
private final Logger logger = LoggerFactory.getLogger(getClass());
@Test @Disabled @SneakyThrows
public void test() {
// File dictFile = new File("/home/vlofgren/dictionary.dat");
File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat");
new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), inFile,
new File("/home/vlofgren/Work/converter/words.dat"),
new File("/home/vlofgren/Work/converter/urls.dat"), new SearchIndexPartitioner(null), val -> false);
// sanityCheck();
}
@Test @Disabled
public void sanityCheck() {
File inFile = new File("/home/vlofgren/write/6/page-index.dat");
// SearchIndexReader sir = new SearchIndexReader(new SearchIndex[]{
// new SearchIndex("body", Path.of("/tmp"),
// new File("/home/vlofgren/data/urls.dat"),
// new File("/home/vlofgren/data/words.dat")),
// new SearchIndex("body", Path.of("/tmp"),
// new File("/home/vlofgren/data/urls.dat"),
// new File("/home/vlofgren/data/words.dat"))
// ,
// new SearchIndex("body", Path.of("/tmp"),
// new File("/home/vlofgren/data/urls.dat"),
// new File("/home/vlofgren/data/words.dat"))
// ,
// new SearchIndex("body", Path.of("/tmp"),
// new File("/home/vlofgren/data/urls.dat"),
// new File("/home/vlofgren/data/words.dat"))
// });
// getQuery(sir, new EdgeIndexSearchTerms(List.of(152, 106), Collections.emptyList())).stream().forEach(System.out::println);
// sir.findWord(152).also(106).stream().forEach(System.out::println);
// scanFile(inFile, (url, word) -> {
// //System.out.println(url + " " + word);
// if (!sir.findWord(word).stream().anyMatch(url::equals)) {
// logger.error("Can't find word {} in {}", word, url);
// }
// });
}
/*
private SearchIndexReader.Query getQuery(SearchIndexReader indexReader, EdgeIndexSearchTerms searchTerms) {
var orderedIncludes = searchTerms.includes
.stream()
.sorted(Comparator.comparingLong(indexReader::numHits))
.distinct()
.mapToInt(Integer::intValue)
.toArray();
logger.info("Includes: ({}); excludes: ({})", Arrays.
stream(orderedIncludes)
.mapToObj(String::valueOf)
.collect(Collectors.joining(",")),
searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(",")));
SearchIndexReader.Query query = indexReader.findWord(orderedIncludes[0]);
for (int i = 1; i < orderedIncludes.length; i++) {
query = query.also(orderedIncludes[i]);
}
for (int term : searchTerms.excludes) {
query = query.not(term);
}
return query;
}
*/
}

View File

@@ -1,14 +1,14 @@
 package nu.marginalia.wmsa.edge.index.service;

 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndex;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
 import nu.marginalia.wmsa.edge.model.EdgeId;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.wmsa.edge.index.service;

-import nu.marginalia.wmsa.edge.index.service.dictionary.TokenCompressor;
+import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor;
 import org.junit.jupiter.api.Test;

 import java.util.Arrays;

View File

@@ -13,6 +13,7 @@ class QueryVariantsTest {
     QueryVariants variants;
     QueryParser parser;
     SentenceExtractor se;
+
     @BeforeEach
     public void setUp() {
         LanguageModels lm = TestLanguageModels.getLanguageModels();
@@ -24,7 +25,7 @@ class QueryVariantsTest {
         parser = new QueryParser(new EnglishDictionary(dict), variants);
     }

-    @Test
+    @Test @SuppressWarnings("unchecked")
     void getQueryVariants() {
         System.out.println(se.extractSentence("we are alone"));
         testCase("DOS", List.of("DOS"));
@@ -50,7 +51,5 @@ class QueryVariantsTest {
     private void testCase(String input, List<String>... expected) {
         var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
         System.out.println(tokens);
-//        var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet());
-//        assertEquals(Set.of(expected), result, "Case failed: " + input);
     }
 }
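The new @SuppressWarnings("unchecked") on getQueryVariants silences the heap-pollution warning that testCase(String, List<String>...) generates at each call site. An alternative worth knowing is @SafeVarargs on the receiving method; a self-contained illustration (class and method names here are ours, not the test's):

import java.util.List;

class VarargsDemo {
    // Generic varargs allocate a List<String>[] under the hood; @SafeVarargs
    // asserts the method never stores into or leaks that array.
    @SafeVarargs
    static void testCase(String input, List<String>... expected) {
        System.out.println(input + " -> " + List.of(expected));
    }

    public static void main(String[] args) {
        testCase("DOS", List.of("DOS")); // compiles without an unchecked warning
    }
}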