Links
- Nominal Quality: {{nominalQuality}}%
- Crawl Ranking: {{ranking}}%
- Incoming Links: {{incomingLinks}}
- Outbound Links: {{outboundLinks}}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
index 84b9f165..26d397a8 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
@@ -43,7 +43,7 @@ public class TestUtil {
logger.info("Running script {}", scriptFile);
try (var scriptStream = ClassLoader.getSystemResourceAsStream(scriptFile);
var stmt = conn.createStatement()) {
- for (String s : new String(scriptStream.readAllBytes()).split(";")) {
+ for (String s : new String(scriptStream.readAllBytes()).split("(;|---)")) {
if (!s.isBlank()) {
try {
Assertions.assertTrue(stmt.executeUpdate(s) >= 0);
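The split pattern change above widens the statement delimiter from `;` alone to `;` or a `---` marker. A minimal sketch of how the new pattern behaves (the script text here is hypothetical, not taken from the repo's init scripts):

    public class SplitDemo {
        public static void main(String[] args) {
            // Two statements separated by a "---" marker line.
            String script = "CREATE TABLE FOO (ID INT);\n---\nCREATE TABLE BAR (ID INT);";
            // Same pattern as TestUtil above: split on either ';' or "---",
            // then skip blank fragments such as the leftover newlines.
            for (String s : script.split("(;|---)")) {
                if (!s.isBlank()) {
                    System.out.println(s.trim());
                }
            }
        }
    }

This prints the two CREATE TABLE statements; the marker line itself only contributes blank fragments, which the isBlank() check discards.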
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
index 1915d989..875cda37 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
@@ -90,10 +90,10 @@ class BTreeWriterTest {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + 2L*i, data[i]);
- mmf.put(offset + 2L*i + 1, i);
+ slice.put(2L*i, data[i]);
+ slice.put( 2L*i + 1, i);
}
});
mmf.force();
@@ -133,10 +133,10 @@ class BTreeWriterTest {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write( 0, toPut.size(), (offset) -> {
+ writer.write( 0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + 2L*i, data[i]);
- mmf.put(offset + 2L*i + 1, i);
+ slice.put(2L*i, data[i]);
+ slice.put(2L*i + 1, i);
}
});
mmf.force();
@@ -182,9 +182,9 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i, data[i]);
+ slice.put(i, data[i]);
}
});
mmf.force();
@@ -235,9 +235,9 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i, data[i]);
+ slice.put(i, data[i]);
}
});
mmf.force();
@@ -288,10 +288,10 @@ class BTreeWriterTest {
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
{
var writer = new BTreeWriter(mmf, ctx);
- writer.write(0, toPut.size(), (offset) -> {
+ writer.write(0, toPut.size(), (slice) -> {
for (int i = 0; i < data.length; i++) {
- mmf.put(offset + i*2L, data[i]);
- mmf.put(offset + i*2L+1, i);
+ slice.put(i*2L, data[i]);
+ slice.put(i*2L+1, i);
}
});
mmf.force();
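The recurring change in this file replaces the raw (offset) callback with a slice: indices passed to put are now relative to the start of the region the writer hands out, instead of absolute positions in the backing MultimapFileLong. A toy sketch of the idea, assuming nothing about the real slice API beyond what the diff shows:

    import java.util.HashMap;
    import java.util.Map;

    public class SliceDemo {
        // Stand-in for a long-addressable file (an assumption, not the real class).
        static class Store {
            final Map<Long, Long> cells = new HashMap<>();
            void put(long idx, long val) { cells.put(idx, val); }
        }

        // A slice translates slice-relative indices to absolute positions,
        // which is what lets the test bodies above drop the "offset +" terms.
        record Slice(Store store, long base) {
            void put(long idx, long val) { store.put(base + idx, val); }
        }

        public static void main(String[] args) {
            Store store = new Store();
            Slice slice = new Slice(store, 100);
            slice.put(0, 42); // lands at absolute position 100
            System.out.println(store.cells); // {100=42}
        }
    }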
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
index 326c9b15..9331a998 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
@@ -27,7 +27,7 @@ class LongPairHashMapTest {
try {
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
- var lphm = new LongPairHashMap(mmf, 1024);
+ var lphm = LongPairHashMap.createNew(mmf, 1024);
toPut.forEach(i -> {
lphm.put(new LongPairHashMap.CellData(i, i));
});
@@ -36,7 +36,7 @@ class LongPairHashMapTest {
RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
- var lphm2 = new LongPairHashMap(mmf2);
+ var lphm2 = LongPairHashMap.loadExisting(mmf2);
toPut.forEach(i -> {
Assertions.assertTrue(lphm2.get(i).isSet());
Assertions.assertEquals(i, (int) lphm2.get(i).getKey());
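Replacing the constructor overloads with named static factories makes the create-vs-open distinction explicit at the call site. A minimal sketch of the pattern (the stored-size detail is invented for illustration):

    public class PairMap {
        private final int buckets;

        private PairMap(int buckets) { this.buckets = buckets; }

        // Allocates and initializes a fresh map of the given size.
        public static PairMap createNew(int buckets) {
            return new PairMap(buckets);
        }

        // Re-opens a map whose size is read back from existing storage.
        public static PairMap loadExisting() {
            return new PairMap(readStoredSize());
        }

        private static int readStoredSize() { return 1024; } // stand-in
    }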
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java
new file mode 100644
index 00000000..d839bbb2
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainLinksTest.java
@@ -0,0 +1,48 @@
+package nu.marginalia.wmsa.edge.converting.loader;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.util.TestUtil;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+@Testcontainers
+class SqlLoadDomainLinksTest {
+ @Container
+ static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+ .withDatabaseName("WMSA_prod")
+ .withUsername("wmsa")
+ .withPassword("wmsa")
+ .withInitScript("sql/edge-crawler-cache.sql")
+ .withNetworkAliases("mariadb");
+
+ HikariDataSource dataSource;
+ LoaderData loaderData;
+ @BeforeEach
+ public void setUp() {
+ dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
+
+ var loadDomains = new SqlLoadDomains(dataSource);
+ loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
+ }
+
+ @AfterEach
+ public void tearDown() {
+ dataSource.close();
+ }
+
+ @Test
+ public void loadDomainLinks() {
+ var loader = new SqlLoadDomainLinks(dataSource);
+ loader.load(new DomainLink[] { new DomainLink(new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu")) });
+ }
+
+}
\ No newline at end of file
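The new loader tests all share this setup: a static MariaDB container annotated with @Container, whose JDBC URL is fed to TestUtil.getConnection to build a Hikari pool. A standalone sketch of the same wiring with plain HikariConfig (the pool settings are assumptions, not taken from TestUtil):

    import com.zaxxer.hikari.HikariConfig;
    import com.zaxxer.hikari.HikariDataSource;

    public class PoolFromContainer {
        // jdbcUrl would come from mariaDBContainer.getJdbcUrl() in the tests above.
        static HikariDataSource open(String jdbcUrl) {
            HikariConfig cfg = new HikariConfig();
            cfg.setJdbcUrl(jdbcUrl);
            cfg.setUsername("wmsa");   // matches .withUsername("wmsa") above
            cfg.setPassword("wmsa");   // matches .withPassword("wmsa") above
            cfg.setMaximumPoolSize(5); // assumption: any small pool suffices for tests
            return new HikariDataSource(cfg);
        }
    }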
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java
new file mode 100644
index 00000000..25dd18b4
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomainsTest.java
@@ -0,0 +1,52 @@
+package nu.marginalia.wmsa.edge.converting.loader;
+
+import nu.marginalia.util.TestUtil;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@Testcontainers
+class SqlLoadDomainsTest {
+ @Container
+ static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+ .withDatabaseName("WMSA_prod")
+ .withUsername("wmsa")
+ .withPassword("wmsa")
+ .withInitScript("sql/edge-crawler-cache.sql")
+ .withNetworkAliases("mariadb");
+
+ @Test
+ public void loadDomain() {
+
+ try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) {
+ var loadDomains = new SqlLoadDomains(dataSource);
+ var loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
+
+ assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
+ }
+
+ }
+
+ @Test
+ public void loadDomains() {
+
+ try (var dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());) {
+ var loadDomains = new SqlLoadDomains(dataSource);
+ var loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain[] { new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
+
+ assertTrue(loaderData.getDomainId(new EdgeDomain("www.marginalia.nu")) >= 0);
+ assertTrue(loaderData.getDomainId(new EdgeDomain("memex.marginalia.nu")) >= 0);
+ }
+
+ }
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java
new file mode 100644
index 00000000..ecb0e88a
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocumentTest.java
@@ -0,0 +1,94 @@
+package nu.marginalia.wmsa.edge.converting.loader;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.util.TestUtil;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
+import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
+import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.Set;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+@Testcontainers
+class SqlLoadProcessedDocumentTest {
+ @Container
+ static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+ .withDatabaseName("WMSA_prod")
+ .withUsername("wmsa")
+ .withPassword("wmsa")
+ .withInitScript("sql/edge-crawler-cache.sql")
+ .withNetworkAliases("mariadb");
+
+ HikariDataSource dataSource;
+ LoaderData loaderData;
+ EdgeDataStoreDaoImpl dataStoreDao;
+
+ @BeforeEach
+ public void setUp() throws URISyntaxException {
+ dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
+ dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
+
+ var loadDomains = new SqlLoadDomains(dataSource);
+ var loadUrls = new SqlLoadUrls(dataSource);
+
+ loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
+
+ loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")});
+
+ }
+
+ @AfterEach
+ public void tearDown() {
+ dataStoreDao.clearCaches();
+ dataSource.close();
+ }
+
+ @Test
+ public void loadProcessedDocument() throws URISyntaxException {
+ var loader = new SqlLoadProcessedDocument(dataSource);
+ var url = new EdgeUrl("https://www.marginalia.nu/");
+
+ loader.load(loaderData, List.of(new LoadProcessedDocument(
+ url,
+ EdgeUrlState.OK,
+ "TITLE",
+ "DESCR",
+ HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)),
+ EdgeHtmlStandard.HTML5,
+ 100,
+ 12345,
+ -3.14
+ )));
+
+ var details = dataStoreDao.getUrlDetailsMulti(List.of(new EdgeId<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))));
+ assertEquals(1, details.size());
+
+ var urlDetails = details.get(0);
+
+ assertEquals("TITLE", urlDetails.getTitle());
+ assertEquals("DESCR", urlDetails.getDescription());
+ assertTrue(urlDetails.isAffiliate());
+ assertEquals(100, urlDetails.words);
+ assertEquals(12345, urlDetails.dataHash);
+ assertEquals(-3.14, urlDetails.getUrlQuality());
+ }
+
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java
new file mode 100644
index 00000000..eb66da92
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDomainTest.java
@@ -0,0 +1,54 @@
+package nu.marginalia.wmsa.edge.converting.loader;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.util.TestUtil;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+@Testcontainers
+class SqlLoadProcessedDomainTest {
+ @Container
+ static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+ .withDatabaseName("WMSA_prod")
+ .withUsername("wmsa")
+ .withPassword("wmsa")
+ .withInitScript("sql/edge-crawler-cache.sql")
+ .withNetworkAliases("mariadb");
+
+ HikariDataSource dataSource;
+ LoaderData loaderData;
+ @BeforeEach
+ public void setUp() {
+
+ dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
+
+ var loadDomains = new SqlLoadDomains(dataSource);
+ loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain[]{ new EdgeDomain("www.marginalia.nu"), new EdgeDomain("memex.marginalia.nu") });
+ }
+
+ @AfterEach
+ public void tearDown() {
+ dataSource.close();
+ }
+
+ @Test
+ public void loadProcessedDomain() {
+ var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
+ loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), EdgeDomainIndexingState.BLOCKED, "127.0.0.1");
+ }
+ @Test
+ public void loadDomainAlias() {
+ var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource));
+ loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu")));
+ }
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java
new file mode 100644
index 00000000..5afac733
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrlsTest.java
@@ -0,0 +1,50 @@
+package nu.marginalia.wmsa.edge.converting.loader;
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.util.TestUtil;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.containers.MariaDBContainer;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import java.net.URISyntaxException;
+
+@Testcontainers
+class SqlLoadUrlsTest {
+ @Container
+ static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
+ .withDatabaseName("WMSA_prod")
+ .withUsername("wmsa")
+ .withPassword("wmsa")
+ .withInitScript("sql/edge-crawler-cache.sql")
+ .withNetworkAliases("mariadb");
+
+ HikariDataSource dataSource;
+ LoaderData loaderData;
+ @BeforeEach
+ public void setUp() {
+ dataSource = TestUtil.getConnection(mariaDBContainer.getJdbcUrl());
+
+ var loadDomains = new SqlLoadDomains(dataSource);
+ loaderData = new LoaderData(10);
+
+ loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
+ loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
+ }
+
+ @AfterEach
+ public void tearDown() {
+ dataSource.close();
+ }
+
+ @Test
+ public void loadUrl() throws URISyntaxException {
+ var loadUrls = new SqlLoadUrls(dataSource);
+ loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") });
+ }
+
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
index 180576fc..961d8304 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
@@ -1,11 +1,11 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
index 6b029da9..2b2da0fd 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
@@ -3,14 +3,14 @@ package nu.marginalia.wmsa.edge.index.service;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestUtil;
-import nu.marginalia.wmsa.client.exception.RemoteException;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.EdgeIndexService;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.EdgeId;
@@ -23,7 +23,6 @@ import org.junit.jupiter.api.parallel.ResourceAccessMode;
import org.junit.jupiter.api.parallel.ResourceLock;
import spark.Spark;
-import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
@@ -31,7 +30,6 @@ import java.util.List;
import java.util.stream.Collectors;
import static nu.marginalia.util.TestUtil.getConnection;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
deleted file mode 100644
index f42f2d36..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Path;
-
-class SearchIndexConverterTest {
-
- private final Logger logger = LoggerFactory.getLogger(getClass());
-
- @Test @Disabled @SneakyThrows
- public void test() {
- // File dictFile = new File("/home/vlofgren/dictionary.dat");
- File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat");
-
- new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), inFile,
- new File("/home/vlofgren/Work/converter/words.dat"),
- new File("/home/vlofgren/Work/converter/urls.dat"), new SearchIndexPartitioner(null), val -> false);
-
- // sanityCheck();
- }
-
- @Test @Disabled
- public void sanityCheck() {
- File inFile = new File("/home/vlofgren/write/6/page-index.dat");
-
-// SearchIndexReader sir = new SearchIndexReader(new SearchIndex[]{
-// new SearchIndex("body", Path.of("/tmp"),
-// new File("/home/vlofgren/data/urls.dat"),
-// new File("/home/vlofgren/data/words.dat")),
-// new SearchIndex("body", Path.of("/tmp"),
-// new File("/home/vlofgren/data/urls.dat"),
-// new File("/home/vlofgren/data/words.dat"))
-// ,
-// new SearchIndex("body", Path.of("/tmp"),
-// new File("/home/vlofgren/data/urls.dat"),
-// new File("/home/vlofgren/data/words.dat"))
-// ,
-// new SearchIndex("body", Path.of("/tmp"),
-// new File("/home/vlofgren/data/urls.dat"),
-// new File("/home/vlofgren/data/words.dat"))
-// });
-
-// getQuery(sir, new EdgeIndexSearchTerms(List.of(152, 106), Collections.emptyList())).stream().forEach(System.out::println);
-// sir.findWord(152).also(106).stream().forEach(System.out::println);
-// scanFile(inFile, (url, word) -> {
-// //System.out.println(url + " " + word);
-// if (!sir.findWord(word).stream().anyMatch(url::equals)) {
-// logger.error("Can't find word {} in {}", word, url);
-// }
-// });
-
-
- }
-/*
- private SearchIndexReader.Query getQuery(SearchIndexReader indexReader, EdgeIndexSearchTerms searchTerms) {
- var orderedIncludes = searchTerms.includes
- .stream()
- .sorted(Comparator.comparingLong(indexReader::numHits))
- .distinct()
- .mapToInt(Integer::intValue)
- .toArray();
-
- logger.info("Includes: ({}); excludes: ({})", Arrays.
- stream(orderedIncludes)
- .mapToObj(String::valueOf)
- .collect(Collectors.joining(",")),
- searchTerms.excludes.stream().map(String::valueOf).collect(Collectors.joining(",")));
- SearchIndexReader.Query query = indexReader.findWord(orderedIncludes[0]);
- for (int i = 1; i < orderedIncludes.length; i++) {
- query = query.also(orderedIncludes[i]);
- }
- for (int term : searchTerms.excludes) {
- query = query.not(term);
- }
- return query;
- }
-
-*/
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
index 5f1d2a0f..edcfa71f 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
@@ -1,14 +1,14 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.service.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndex;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.service.index.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.service.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeId;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
index ee84472e..e780ed62 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
@@ -1,6 +1,6 @@
package nu.marginalia.wmsa.edge.index.service;
-import nu.marginalia.wmsa.edge.index.service.dictionary.TokenCompressor;
+import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
index 4aa9bceb..65b1ad57 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
@@ -13,6 +13,7 @@ class QueryVariantsTest {
QueryVariants variants;
QueryParser parser;
SentenceExtractor se;
+
@BeforeEach
public void setUp() {
LanguageModels lm = TestLanguageModels.getLanguageModels();
@@ -24,7 +25,7 @@ class QueryVariantsTest {
parser = new QueryParser(new EnglishDictionary(dict), variants);
}
- @Test
+ @Test @SuppressWarnings("unchecked")
void getQueryVariants() {
System.out.println(se.extractSentence("we are alone"));
testCase("DOS", List.of("DOS"));
@@ -50,7 +51,5 @@ class QueryVariantsTest {
private void testCase(String input, List<String>... expected) {
var tokens = variants.getQueryVariants(parser.extractBasicTokens(input));
System.out.println(tokens);
-// var result = tokens.stream().map(lst -> lst.terms).collect(Collectors.toSet());
-// assertEquals(Set.of(expected), result, "Case failed: " + input);
}
}
\ No newline at end of file
From f76af4ca79e8d8b84deb2883824d69a568ba62b6 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sat, 18 Jun 2022 15:54:58 +0200
Subject: [PATCH 03/40] Refactoring conversion
---
.../java/nu/marginalia/util/ListChunker.java | 31 ++
.../nu/marginalia/util/RandomWriteFunnel.java | 50 +--
.../util/multimap/MultimapFileLong.java | 53 ++-
.../multimap/MultimapFileLongOffsetSlice.java | 3 +
.../util/multimap/MultimapFileLongSlice.java | 2 +
.../wmsa/client/AbstractClient.java | 8 +-
.../loader/SqlLoadProcessedDocument.java | 2 -
.../CrawlJobExtractorPageRankMain.java | 6 +-
.../edge/data/dao/EdgeDataStoreDaoImpl.java | 15 +-
.../data/dao/task/EdgeDomainBlacklist.java | 2 +-
.../wmsa/edge/index/EdgeIndexBucket.java | 6 +-
.../wmsa/edge/index/EdgeIndexControl.java | 6 +-
.../wmsa/edge/index/EdgeIndexService.java | 59 ++-
.../wmsa/edge/index/IndexServicesFactory.java | 27 +-
.../conversion/SearchIndexConverter.java | 367 ++++++------------
.../conversion/SearchIndexPartitioner.java | 2 +-
.../conversion/SearchIndexPreconverter.java | 65 +---
.../words/WordIndexOffsetsTable.java | 6 +-
.../journal/SearchIndexJournalEntry.java | 49 +++
.../SearchIndexJournalEntryHeader.java | 16 +
.../journal/SearchIndexJournalFileHeader.java | 4 +
.../journal/SearchIndexJournalReader.java | 123 ++++++
.../journal/SearchIndexJournalWriter.java | 10 +
...java => SearchIndexJournalWriterImpl.java} | 36 +-
.../edge/index/journal/SearchIndexWriter.java | 16 -
.../wmsa/edge/index/reader/SearchIndexes.java | 8 +-
.../nu/marginalia/wmsa/edge/model/EdgeId.java | 11 +-
.../model/search/EdgeSearchResultItem.java | 2 +-
.../wmsa/edge/search/EdgeSearchOperator.java | 2 +-
.../command/commands/SiteSearchCommand.java | 4 +-
.../search/results/SearchResultDecorator.java | 4 +-
.../siteinfo/DomainInformationService.java | 21 +-
.../index/service/EdgeIndexClientTest.java | 5 +-
.../service/SearchIndexJournalWriterTest.java | 76 ++++
.../index/service/SearchIndexWriterTest.java | 90 -----
.../service/util/RandomWriteFunnelTest.java | 33 ++
.../com/upserve/uppend/blobs/NativeIO.java | 3 -
37 files changed, 658 insertions(+), 565 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{SearchIndexWriterImpl.java => SearchIndexJournalWriterImpl.java} (68%)
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java
create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
new file mode 100644
index 00000000..ef27ba1d
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/ListChunker.java
@@ -0,0 +1,31 @@
+package nu.marginalia.util;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class ListChunker {
+
+ /** Chops data into a list of lists of max length size
+ *
+ * Caveat: Relies on subList and does not clone "data", so
+ * changes to the original list may affect the sub-lists
+ * in unspecified ways
+ *
+ * @see List#subList
+ */
+ public static <T> List<List<T>> chopList(List<T> data, int size) {
+ if (data.isEmpty())
+ return Collections.emptyList();
+ else if (data.size() < size)
+ return List.of(data);
+
+ final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+
+ for (int i = 0; i < data.size(); i+=size) {
+ ret.add(data.subList(i, Math.min(data.size(), i+size)));
+ }
+
+ return ret;
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
index 55c83464..0c274c2b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
@@ -1,6 +1,6 @@
package nu.marginalia.util;
-import io.prometheus.client.Gauge;
+import lombok.SneakyThrows;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -18,10 +18,6 @@ import java.nio.file.Path;
* */
public class RandomWriteFunnel implements AutoCloseable {
- private final static Gauge write_rate = Gauge.build("wmsa_rwf_write_bytes", "Bytes/s")
- .register();
- private final static Gauge transfer_rate = Gauge.build("wmsa_rwf_transfer_bytes", "Bytes/s")
- .register();
private static final Logger logger = LoggerFactory.getLogger(RandomWriteFunnel.class);
private final DataBin[] bins;
@@ -34,7 +30,7 @@ public class RandomWriteFunnel implements AutoCloseable {
int binCount = (int) (size / binSize + ((size % binSize) != 0L ? 1 : 0));
bins = new DataBin[binCount];
for (int i = 0; i < binCount; i++) {
- bins[i] = new DataBin(tempDir, (int) Math.min(size - binSize * i, binSize));
+ bins[i] = new DataBin(tempDir, Math.min((int) (size - binSize * i), binSize));
}
}
else {
@@ -42,25 +38,25 @@ public class RandomWriteFunnel implements AutoCloseable {
}
}
- public void put(long address, long data) throws IOException {
- bins[((int)(address / binSize))].put((int)(address%binSize), data);
+ @SneakyThrows
+ public void put(long address, long data) {
+ int bin = (int)(address / binSize);
+ int offset = (int)(address%binSize);
+
+ bins[bin].put(offset, data);
}
public void write(FileChannel o) throws IOException {
ByteBuffer buffer = ByteBuffer.allocateDirect(binSize*8);
- logger.debug("Writing from RWF");
- for (int i = 0; i < bins.length; i++) {
- var bin = bins[i];
+ for (var bin : bins) {
buffer.clear();
bin.eval(buffer);
while (buffer.hasRemaining()) {
- int wb = o.write(buffer);
- write_rate.set(wb);
+ o.write(buffer);
}
}
- logger.debug("Done");
}
@Override
@@ -84,12 +80,12 @@ public class RandomWriteFunnel implements AutoCloseable {
}
void put(int address, long data) throws IOException {
- buffer.putInt(address);
- buffer.putLong(data);
-
- if (buffer.capacity() - buffer.position() < 12) {
+ if (buffer.remaining() < 12) {
flushBuffer();
}
+
+ buffer.putInt(address);
+ buffer.putLong(data);
}
private void flushBuffer() throws IOException {
@@ -97,12 +93,15 @@ public class RandomWriteFunnel implements AutoCloseable {
return;
buffer.flip();
- while (channel.write(buffer) > 0);
+ while (buffer.hasRemaining())
+ channel.write(buffer);
+
buffer.clear();
}
private void eval(ByteBuffer dest) throws IOException {
flushBuffer();
+ channel.force(false);
channel.position(0);
buffer.clear();
@@ -117,14 +116,17 @@ public class RandomWriteFunnel implements AutoCloseable {
if (rb < 0) {
break;
}
- else {
- transfer_rate.set(rb);
- }
buffer.flip();
while (buffer.limit() - buffer.position() >= 12) {
- int addr = buffer.getInt();
+ int addr = 8 * buffer.getInt();
long data = buffer.getLong();
- dest.putLong(8*addr, data);
+
+ try {
+ dest.putLong(addr, data);
+ }
+ catch (IndexOutOfBoundsException ex) {
+ logger.info("!!!bad[{}]={}", addr, data);
+ }
}
buffer.compact();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
index f381a977..e9a9b4fe 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
@@ -36,9 +36,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
private long mappedSize;
final static long WORD_SIZE = 8;
- private boolean loadAggressively;
-
- private final NativeIO.Advice advice = null;
+ private NativeIO.Advice defaultAdvice = null;
public static MultimapFileLong forReading(Path file) throws IOException {
long fileSize = Files.size(file);
@@ -70,12 +68,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
long mapSize,
int bufferSize) throws IOException {
- this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize, false);
- }
-
- public MultimapFileLong loadAggressively(boolean v) {
- this.loadAggressively = v;
- return this;
+ this(new RandomAccessFile(file, translateToRAFMode(mode)), mode, mapSize, bufferSize);
}
private static String translateToRAFMode(FileChannel.MapMode mode) {
@@ -91,13 +84,11 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
public MultimapFileLong(RandomAccessFile file,
FileChannel.MapMode mode,
long mapSizeBytes,
- int bufferSizeWords,
- boolean loadAggressively) throws IOException {
+ int bufferSizeWords) throws IOException {
this.mode = mode;
this.bufferSize = bufferSizeWords;
this.mapSize = mapSizeBytes;
this.fileLength = file.length();
- this.loadAggressively = loadAggressively;
channel = file.getChannel();
mappedSize = 0;
@@ -115,6 +106,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
@SneakyThrows
public void advice(NativeIO.Advice advice) {
+ this.defaultAdvice = advice;
for (var buffer : mappedByteBuffers) {
NativeIO.madvise(buffer, advice);
}
@@ -157,7 +149,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
@SneakyThrows
- private void grow(long posIdxRequired) {
+ public void grow(long posIdxRequired) {
if (posIdxRequired*WORD_SIZE > mapSize && mode == READ_ONLY) {
throw new IndexOutOfBoundsException(posIdxRequired + " (max " + mapSize + ")");
}
@@ -182,11 +174,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
var buffer = channel.map(mode, posBytes, bzBytes);
- if (loadAggressively)
- buffer.load();
-
- if (advice != null) {
- NativeIO.madvise(buffer, advice);
+ if (defaultAdvice != null) {
+ NativeIO.madvise(buffer, defaultAdvice);
}
buffers.add(buffer.asLongBuffer());
@@ -262,6 +251,32 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
+ @Override
+ public void read(LongBuffer vals, long idx) {
+ int n = vals.limit() - vals.position();
+ if (idx+n >= mappedSize) {
+ grow(idx+n);
+ }
+ int iN = (int)((idx + n) / bufferSize);
+
+ for (int i = 0; i < n; ) {
+ int i0 = (int)((idx + i) / bufferSize);
+
+ int bufferOffset = (int) ((idx+i) % bufferSize);
+ var buffer = buffers.get(i0);
+
+ final int l;
+
+ if (i0 < iN) l = bufferSize - bufferOffset;
+ else l = Math.min(n - i, bufferSize - bufferOffset);
+
+ vals.put(vals.position() + i, buffer, bufferOffset, l);
+ i+=l;
+ }
+
+ }
+
+
@Override
public void write(long[] vals, long idx) {
write(vals, vals.length, idx);
@@ -363,8 +378,10 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
@Override
public void close() throws IOException {
force();
+
mappedByteBuffers.clear();
buffers.clear();
+
channel.close();
// I want to believe
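The new read(LongBuffer, long) overload has to stitch one logical read together from several mapped buffers whenever the range crosses a buffer boundary. A simplified sketch of that chunking loop over a plain array-of-arrays, with growth and buffer management elided:

    public class ChunkedRead {
        // Copies dest.length longs starting at logical index idx out of
        // fixed-size pages, mirroring how MultimapFileLong walks its buffers.
        static void read(long[][] pages, int pageSize, long idx, long[] dest) {
            int n = dest.length;
            for (int i = 0; i < n; ) {
                int page = (int) ((idx + i) / pageSize);
                int off  = (int) ((idx + i) % pageSize);
                int len  = Math.min(n - i, pageSize - off);
                System.arraycopy(pages[page], off, dest, i, len);
                i += len;
            }
        }
    }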
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
index c2630ddc..bd35bd9b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
@@ -38,6 +38,9 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
map.read(vals, n, idx+off);
}
+ @Override
+ public void read(LongBuffer vals, long idx) { map.read(vals, idx+off); }
+
@Override
public void write(long[] vals, long idx) {
map.write(vals, idx+off);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
index abf29f51..27d6ae06 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
@@ -15,6 +15,8 @@ public interface MultimapFileLongSlice {
void read(long[] vals, int n, long idx);
+ void read(LongBuffer vals, long idx);
+
void write(long[] vals, long idx);
void write(long[] vals, int n, long idx);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java
index 5091b75e..603f57e5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java
@@ -6,6 +6,7 @@ import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
import lombok.SneakyThrows;
+import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.client.exception.LocalException;
import nu.marginalia.wmsa.client.exception.NetworkException;
import nu.marginalia.wmsa.client.exception.RemoteException;
@@ -30,9 +31,12 @@ import java.util.zip.GZIPOutputStream;
public abstract class AbstractClient implements AutoCloseable {
public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";
- private final Gson gson = new GsonBuilder().create();
+
+ private final Gson gson = new GsonBuilder()
+ .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+ .create();
+
private final Logger logger = LoggerFactory.getLogger(getClass());
- private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
private final OkHttpClient client;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
index e2e25fff..fb8a6303 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java
@@ -94,8 +94,6 @@ public class SqlLoadProcessedDocument {
} catch (SQLException ex) {
logger.warn("SQL error inserting document", ex);
}
-
-
}
public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
index ea1946fc..ef3bf39f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorPageRankMain.java
@@ -105,7 +105,7 @@ public class CrawlJobExtractorPageRankMain {
try (var domainQuery = conn.prepareStatement(specificDomainSqlFromId);
var urlQuery = conn.prepareStatement(urlsSql))
{
- domainQuery.setInt(1, domainId.getId());
+ domainQuery.setInt(1, domainId.id());
ResultSet rsp = domainQuery.executeQuery();
domainName = rsp.next() ? rsp.getString(1) : "";
@@ -113,10 +113,10 @@ public class CrawlJobExtractorPageRankMain {
spec.id = createId(new EdgeDomain(domainName));
spec.urls = new ArrayList<>(1000);
- spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.getId()));
+ spec.crawlDepth = getCrawlDepth(new DomainWithId(domainName, domainId.id()));
urlQuery.setString(1, domainName.toString());
- urlQuery.setInt(2, domainId.getId());
+ urlQuery.setInt(2, domainId.id());
urlQuery.setFetchSize(1000);
rsp = urlQuery.executeQuery();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
index 30ea2256..c73089b0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/EdgeDataStoreDaoImpl.java
@@ -17,13 +17,8 @@ import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.sql.Connection;
import java.sql.SQLException;
-import java.sql.Types;
import java.util.*;
-import java.util.function.Function;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
@@ -71,7 +66,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
private String idList(List<EdgeId<EdgeUrl>> ids) {
StringJoiner j = new StringJoiner(",", "(", ")");
for (var id : ids) {
- j.add(Integer.toString(id.getId()));
+ j.add(Integer.toString(id.id()));
}
return j.toString();
}
@@ -154,7 +149,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@@ -183,7 +178,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var stmt = connection.prepareStatement(q2)) {
stmt.setFetchSize(count/2);
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
while (rsp.next() && domains.size() < count/2) {
@@ -214,7 +209,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
LIMIT ?""";
try (var stmt = connection.prepareStatement(q3)) {
stmt.setFetchSize(count/2);
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
@@ -275,7 +270,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
- stmt.setInt(1, id.getId());
+ stmt.setInt(1, id.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeDomain(rsp.getString(1));
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java
index fa1899b1..df265a5f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/data/dao/task/EdgeDomainBlacklist.java
@@ -9,7 +9,7 @@ import nu.marginalia.wmsa.edge.model.EdgeId;
public interface EdgeDomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
- return isBlacklisted(domainId.getId());
+ return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java
index 05bcfe75..09890252 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java
@@ -1,11 +1,9 @@
package nu.marginalia.wmsa.edge.index;
-import nu.marginalia.wmsa.edge.index.EdgeIndexControl;
-import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriter;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import org.jetbrains.annotations.NotNull;
@@ -31,7 +29,7 @@ public class EdgeIndexBucket {
@NotNull
private final IndexServicesFactory servicesFactory;
private final EdgeIndexControl indexControl;
- private final SearchIndexWriter writer;
+ private final SearchIndexJournalWriter writer;
private final int id;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
index ab7c73fe..8df32c0a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
@@ -23,7 +23,7 @@ public class EdgeIndexControl {
for (IndexBlock block : IndexBlock.values()) {
try {
- servicesFactory.getIndexConverter(id, block);
+ servicesFactory.convertIndex(id, block);
System.runFinalization();
System.gc();
@@ -40,10 +40,6 @@ public class EdgeIndexControl {
System.gc();
}
- public long wordCount(int id) {
- return servicesFactory.wordCount(id);
- }
-
public void switchIndexFiles(int id) throws Exception {
servicesFactory.switchFilesJob(id).call();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
index de6276a8..829a59af 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
@@ -11,12 +11,17 @@ import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
+import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
+import nu.marginalia.util.ListChunker;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.model.*;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.model.*;
@@ -48,8 +53,11 @@ public class EdgeIndexService extends Service {
@NotNull
private final Initialization init;
private final SearchIndexes indexes;
+ private final DictionaryWriter dictionaryWriter;
- private final Gson gson = new GsonBuilder().create();
+ private final Gson gson = new GsonBuilder()
+ .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+ .create();
private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
@@ -66,12 +74,13 @@ public class EdgeIndexService extends Service {
@Named("service-port") Integer port,
Initialization init,
MetricsServer metricsServer,
- SearchIndexes indexes
- ) {
+ SearchIndexes indexes,
+ IndexServicesFactory servicesFactory) {
super(ip, port, init, metricsServer);
this.init = init;
this.indexes = indexes;
+ this.dictionaryWriter = servicesFactory.getDictionaryWriter();
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
@@ -173,29 +182,19 @@ public class EdgeIndexService extends Service {
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
) {
- SearchIndexWriterImpl indexWriter = indexes.getIndexWriter(idx);
+ SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
- if (!words.words.isEmpty()) {
- if (words.size() < 1000) {
- indexWriter.put(domainId, urlId, words.block, words.words);
- } else {
- chunks(words.words, 1000).forEach(chunk -> {
- indexWriter.put(domainId, urlId, words.block, chunk);
- });
- }
- }
+ for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
+
+ var entry = new SearchIndexJournalEntry(getWordIds(chunk));
+ var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
+
+ indexWriter.put(header, entry);
+ };
}
-
- private List<List<String>> chunks(Collection<String> coll, int size) {
- List<List<String>> ret = new ArrayList<>();
- List<String> data = List.copyOf(coll);
-
- for (int i = 0; i < data.size(); i+=size) {
- ret.add(data.subList(i, Math.min(data.size(), i+size)));
- }
-
- return ret;
+ private long[] getWordIds(List<String> words) {
+ return words.stream().filter(w -> w.length() < Byte.MAX_VALUE).mapToLong(dictionaryWriter::get).toArray();
}
private Object search(Request request, Response response) {
@@ -341,7 +340,7 @@ public class EdgeIndexService extends Service {
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
- .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri))
+ .filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
.limit(specs.limitTotal * 3L)
.distinct()
.limit(Math.min(specs.limitByBucket
@@ -350,7 +349,7 @@ public class EdgeIndexService extends Service {
for (var result : resultsForBucket) {
- seenResults.add(result.url.getId());
+ seenResults.add(result.url.id());
}
for (var result : resultsForBucket) {
for (var searchTerm : sq.searchTermsInclude) {
@@ -401,7 +400,7 @@ public class EdgeIndexService extends Service {
public boolean filterRawValue(int bucket, long value) {
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
- if (domain.getId() == Integer.MAX_VALUE) {
+ if (domain.id() == Integer.MAX_VALUE) {
return true;
}
@@ -409,11 +408,11 @@ public class EdgeIndexService extends Service {
}
long getKey(int bucket, EdgeId<EdgeDomain> id) {
- return ((long)bucket) << 32 | id.getId();
+ return ((long)bucket) << 32 | id.id();
}
public boolean test(int bucket, EdgeSearchResultItem item) {
- if (item.domain.getId() == Integer.MAX_VALUE) {
+ if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
@@ -431,7 +430,7 @@ public class EdgeIndexService extends Service {
}
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
- if (item.domain.getId() == Integer.MAX_VALUE) {
+ if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
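putWords now funnels every request through ListChunker.chopList and resolves words to ids up front, skipping words whose length is Byte.MAX_VALUE or more. A sketch of that id-mapping step in isolation, with the dictionary lookup faked by a hash (the real code calls dictionaryWriter::get):

    import java.util.List;

    public class WordIdDemo {
        static long[] getWordIds(List<String> words) {
            return words.stream()
                    .filter(w -> w.length() < Byte.MAX_VALUE)   // same guard as the diff
                    .mapToLong(w -> w.hashCode() & 0xFFFFFFFFL) // stand-in for dictionaryWriter::get
                    .toArray();
        }

        public static void main(String[] args) {
            System.out.println(getWordIds(List.of("hello", "world")).length); // 2
        }
    }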
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
index 61e64b41..40c733e2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -8,7 +8,7 @@ import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
@@ -44,7 +44,8 @@ public class IndexServicesFactory {
private final DoublePartitionedDataFile indexWriteUrlsFile;
private volatile static DictionaryWriter dictionaryWriter;
private final Long dictionaryHashMapSize;
- private final SearchIndexPartitioner partitoner;
+ private final SearchIndexPartitioner partitioner;
+
@Inject
public IndexServicesFactory(
@Named("tmp-file-dir") Path tmpFileDir,
@@ -59,7 +60,7 @@ public class IndexServicesFactory {
@Named("edge-index-write-urls-file") String indexWriteUrlsFile,
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
EdgeDomainBlacklist domainBlacklist,
- SearchIndexPartitioner partitoner
+ SearchIndexPartitioner partitioner
) {
this.tmpFileDir = tmpFileDir;
@@ -73,11 +74,11 @@ public class IndexServicesFactory {
this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
this.indexWriteUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteUrlsFile);
this.preconverterOutputFile = new PartitionedDataFile(partitionRootSlowTmp, "preconverted.dat");
- this.partitoner = partitoner;
+ this.partitioner = partitioner;
}
- public SearchIndexWriterImpl getIndexWriter(int idx) {
- return new SearchIndexWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx));
+ public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
+ return new SearchIndexJournalWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx));
}
public DictionaryWriter getDictionaryWriter() {
@@ -93,15 +94,17 @@ public class IndexServicesFactory {
}
- public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
- return new SearchIndexConverter(block, id, tmpFileDir,
+ public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
+ var converter = new SearchIndexConverter(block, id, tmpFileDir,
preconverterOutputFile.get(id),
indexWriteWordsFile.get(id, block.id),
indexWriteUrlsFile.get(id, block.id),
- partitoner,
+ partitioner,
domainBlacklist
);
+ converter.convert();
}
+
@SneakyThrows
public SearchIndexPreconverter getIndexPreconverter() {
File[] outputFiles = new File[DYNAMIC_BUCKET_LENGTH+1];
@@ -110,7 +113,7 @@ public class IndexServicesFactory {
}
return new SearchIndexPreconverter(writerIndexFile.get(0),
outputFiles,
- partitoner,
+ partitioner,
domainBlacklist
);
}
@@ -119,10 +122,6 @@ public class IndexServicesFactory {
return preconverterOutputFile.get(i);
}
- public long wordCount(int id) {
- return SearchIndexConverter.wordCount(writerIndexFile.get(0));
- }
-
@SneakyThrows
public SearchIndexReader getIndexReader(int id) {
EnumMap<IndexBlock, SearchIndex> indexMap = new EnumMap<>(IndexBlock.class);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
index 0827b4e7..2d12d0f4 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
@@ -1,331 +1,222 @@
package nu.marginalia.wmsa.edge.index.conversion;
-import com.google.inject.Inject;
-import com.google.inject.name.Named;
-import gnu.trove.set.hash.TIntHashSet;
-import lombok.RequiredArgsConstructor;
-import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.RandomWriteFunnel;
-import nu.marginalia.util.multimap.MultimapSorter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
-import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.concurrent.locks.Lock;
+
+import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry.MAX_LENGTH;
public class SearchIndexConverter {
- private static final long FILE_HEADER_SIZE = 12;
- private static final int CHUNK_HEADER_SIZE = 16;
-
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
- private final long fileLength;
- private final long urlsFileSize;
+ private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
+
private final Path tmpFileDir;
- private final FileChannel urlsTmpFileChannel;
- private final int wordCount;
- private final MultimapFileLong urlsTmpFileMap;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexBlock block;
private final int bucketId;
+ private final File inputFile;
+ private final File outputFileWords;
+ private final File outputFileUrls;
- private final File urlsFile;
private final SearchIndexPartitioner partitioner;
- private final TIntHashSet spamDomains;
- private final MultimapSorter urlTmpFileSorter;
+ private final EdgeDomainBlacklist blacklist;
private final static int internalSortLimit =
Boolean.getBoolean("small-ram") ? 1024*1024 : 1024*1024*256;
- @SneakyThrows
- public static long wordCount(File inputFile) {
- try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) {
- raf.readLong();
- return raf.readInt();
- }
- }
-
- @Inject
public SearchIndexConverter(IndexBlock block,
- int bucketId, @Named("tmp-file-dir") Path tmpFileDir,
- @Named("edge-writer-page-index-file") File inputFile,
- @Named("edge-index-write-words-file") File outputFileWords,
- @Named("edge-index-write-urls-file") File outputFileUrls,
+ int bucketId,
+ Path tmpFileDir,
+ File inputFile,
+ File outputFileWords,
+ File outputFileUrls,
SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist)
- throws ConversionUnnecessaryException, IOException
{
this.block = block;
this.bucketId = bucketId;
this.tmpFileDir = tmpFileDir;
- this.urlsFile = outputFileUrls;
+ this.inputFile = inputFile;
+ this.outputFileWords = outputFileWords;
+ this.outputFileUrls = outputFileUrls;
this.partitioner = partitioner;
- this.spamDomains = blacklist.getSpamDomains();
-
- logger.info("Converting {} ({}) {}", block.id, block, inputFile);
+ this.blacklist = blacklist;
+ }
+ public void convert() throws IOException {
Files.deleteIfExists(outputFileWords.toPath());
Files.deleteIfExists(outputFileUrls.toPath());
- final RandomAccessFile raf = new RandomAccessFile(inputFile, "r");
+ SearchIndexJournalReader journalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
- this.fileLength = raf.readLong();
- this.wordCount = raf.readInt();
-
- if (fileLength <= FILE_HEADER_SIZE) {
- throw new ConversionUnnecessaryException();
+ if (journalReader.fileHeader.fileSize() <= SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES) {
+ return;
}
- var inputChannel = raf.getChannel();
+ logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
- ByteBuffer buffer = ByteBuffer.allocateDirect(10_000);
+ var lock = partitioner.getReadLock();
+ try {
+ lock.lock();
- urlsFileSize = getUrlsSize(buffer, inputChannel);
+ var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
- var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
- var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
- urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
- urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
- urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
+ logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
+ WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
- logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
- WordIndexOffsetsTable wordIndexTable = createWordIndexTable(outputFileWords, inputChannel);
+ logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
+ createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
- logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
- createUrlTable(buffer, raf, wordIndexTable);
-
- Files.delete(tmpUrlsFile);
- raf.close();
-
- urlsTmpFileChannel.close();
- urlsTmpFileMap.force();
-
- }
-
- private boolean isUrlAllowed(long url) {
- return !spamDomains.contains((int)(url >>> 32));
- }
-
- public long translateUrl(long url) {
- int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
- return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
- }
-
-
- private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException {
- channel.position(FILE_HEADER_SIZE);
-
- var reader = new IndexReader(buffer, channel) {
- public long size;
-
- @Override
- public void eachWord(long urlId, int wordId) {
- size++;
- }
- };
-
- reader.read();
-
- logger.info("Blacklist filtered {} URLs", reader.filtered);
- logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
-
- return reader.size;
- }
-
- private void createUrlTable(ByteBuffer buffer, RandomAccessFile raf, WordIndexOffsetsTable wordOffsetsTable) throws IOException {
- logger.info("Table size = {}", wordOffsetsTable.length());
-
- raf.seek(FILE_HEADER_SIZE);
-
- var channel = raf.getChannel();
-
- try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
- int[] wordWriteOffset = new int[wordOffsetsTable.length()];
-
- new IndexReader(buffer, channel) {
- @Override
- public void eachWord(long urlId, int wordId) throws IOException {
- if (wordId >= wordWriteOffset.length)
- return;
-
- if (wordId > 0) {
- rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, translateUrl(urlId));
- } else {
- rwf.put(wordWriteOffset[wordId]++, translateUrl(urlId));
- }
- }
- }.read();
-
- rwf.write(urlsTmpFileChannel);
+ Files.delete(tmpUrlsFile);
}
-
- urlsTmpFileChannel.force(false);
- logger.info("URL TMP Table: {} Mb", channel.position()/(1024*1024));
-
- if (wordOffsetsTable.length() > 0) {
- logger.info("Sorting urls table");
-
- wordOffsetsTable.forEach(urlTmpFileSorter::sort);
-
- urlsTmpFileMap.force();
- }
- else {
- logger.warn("urls table empty -- nothing to sort");
- }
-
- logger.info("Writing BTree");
- try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
- var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
-
- wordOffsetsTable.fold((accumulatorIdx, start, length) -> {
- // Note: The return value is accumulated into accumulatorIdx!
-
- return writer.write(accumulatorIdx, length,
- slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
- });
-
- } catch (Exception e) {
- logger.error("Error while writing BTree", e);
+ finally {
+ lock.unlock();
}
}
- private WordIndexOffsetsTable createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws IOException {
- inputChannel.position(FILE_HEADER_SIZE);
- logger.debug("Table size = {}", wordCount);
- WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
- ByteBuffer buffer = ByteBuffer.allocateDirect(8* SearchIndexWriterImpl.MAX_BLOCK_SIZE);
+
+ private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
+ File outputFileWords) throws IOException
+ {
+ final int topWord = (int) journalReader.fileHeader.wordCount();
+
+ logger.debug("Table size = {}", topWord);
+ WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
logger.debug("Reading words");
- var reader = new IndexReader(buffer, inputChannel) {
- @Override
- public void eachWord(long urlId, int wordId) {
+ for (var entry : journalReader) {
+ if (!isRelevantEntry(entry)) {
+ continue;
+ }
+
+ final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
+
+ for (int i = 0; i < entryData.size(); i++) {
+ int wordId = (int) entryData.get(i);
+ if (wordId < 0 || wordId >= topWord) {
+ logger.warn("Bad wordId {}", wordId);
+ }
wordsTableWriter.acceptWord(wordId);
}
- };
- reader.read();
+ }
logger.debug("Rearranging table");
- inputChannel.position(FILE_HEADER_SIZE);
-
wordsTableWriter.write(outputFileWords);
return wordsTableWriter.getTable();
}
- @RequiredArgsConstructor
- private class IndexReader {
- private final ByteBuffer buffer;
- private final FileChannel channel;
- public long filtered;
+ private void createUrlTable(SearchIndexJournalReader journalReader,
+ Path tmpUrlsFile,
+ WordIndexOffsetsTable wordOffsetsTable) throws IOException
+ {
+ logger.info("Table size = {}", wordOffsetsTable.length());
- public void read() throws IOException {
- var lock = partitioner.getReadLock();
- try {
- lock.lock();
- outer:
- while (channel.position() < fileLength) {
- buffer.clear();
- buffer.limit(CHUNK_HEADER_SIZE);
- channel.read(buffer);
- buffer.flip();
- long urlId = buffer.getLong();
- int chunkBlock = buffer.getInt();
- int count = buffer.getInt();
+ long numberOfWordsTotal = 0;
+ for (var entry : journalReader) {
+ if (isRelevantEntry(entry))
+ numberOfWordsTotal += entry.wordCount();
+ }
- if (count > 1000) {
- int tries = 0;
- logger.warn("Terminating garbage @{}b, attempting repair", channel.position());
+ try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
+ FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
- for (; ; ) {
- tries++;
- long p = channel.position();
- buffer.clear();
- buffer.limit(8);
- if (channel.read(buffer) != 8) {
- break outer; // EOF...?
- }
+ try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
+ int[] wordWriteOffset = new int[wordOffsetsTable.length()];
- buffer.flip();
- int pcb = buffer.getInt();
- int pct = buffer.getInt();
- if (pcb == 0 || pcb == 1 && pct >= 0 && pct <= 1000) {
- chunkBlock = pcb;
- count = pct;
- break;
- } else {
- channel.position(p + 1);
- }
+ for (var entry : journalReader) {
+ if (!isRelevantEntry(entry)) continue;
+
+ var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
+
+ for (int i = 0; i < entryData.size(); i++) {
+ int wordId = (int) entryData.get(i);
+
+ if (wordId >= wordWriteOffset.length)
+ continue;
+ if (wordId < 0) {
+ logger.warn("Negative wordId {}", wordId);
}
- logger.warn("Skipped {}b", tries);
- }
- buffer.clear();
- buffer.limit(count * 4);
-
- int trb = 0;
- while (trb < count * 4) {
- int rb = channel.read(buffer);
- if (rb <= 0) {
- throw new ArrayIndexOutOfBoundsException(trb + " - " + count * 4 + " " + rb);
+ final long urlInternal = translateUrl(entry.docId());
+ if (wordId > 0) {
+ rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
+ } else {
+ rwf.put(wordWriteOffset[wordId]++, urlInternal);
}
- trb += rb;
- }
-
- buffer.flip();
-
- if (isUrlAllowed(urlId)) {
- if (block.id == chunkBlock) {
- eachUrl(lock, count, urlId);
- }
- } else {
- filtered++;
}
}
- }
- finally {
- lock.unlock();
- }
- }
- public void eachUrl(Lock lock, int count, long urlId) throws IOException {
- for (int i = 0; i < count; i++) {
- int wordId = buffer.getInt();
- if (acceptWord(lock, urlId)) {
- eachWord(urlId, wordId);
+
+ rwf.write(urlsTmpFileChannel);
+ }
+
+ urlsTmpFileChannel.force(false);
+
+ try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
+ if (wordOffsetsTable.length() > 0) {
+ logger.info("Sorting urls table");
+
+ var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
+
+ wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
+
+ urlsTmpFileMap.force();
+ } else {
+ logger.warn("urls table empty -- nothing to sort");
}
}
- }
- public void eachWord(long urlId, int wordId) throws IOException {
- }
+ logger.info("Writing BTree");
+ try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
+ var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
- boolean acceptWord(Lock lock, long urlId) {
- int domainId = (int) (urlId >>> 32L);
+ wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
+ // Note: The return value is accumulated into accumulatorIdx!
- if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
- return false;
+ return writer.write(accumulatorIdx, length,
+ slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
+ });
+
+ } catch (Exception e) {
+ logger.error("Error while writing BTree", e);
}
- return true;
}
}
+
+
+ private long translateUrl(long url) {
+ int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
+ return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
+ }
+
+ private boolean isRelevantEntry(SearchIndexJournalReader.JournalEntry entry) {
+ return block.equals(entry.header.block())
+ && !blacklist.isBlacklisted(entry.domainId())
+ && partitioner.filterUnsafe(entry.domainId(), bucketId);
+ }
+
}
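
Throughout this patch a document id is one long with the domain id in the high 32 bits and the url id in the low 32 bits; translateUrl() above only swaps the high half for a bucket-local domain id. A minimal, self-contained sketch of the packing arithmetic (class and method names here are illustrative, not part of the patch):

    public class DocIdPackingSketch {
        // high 32 bits: domain id; low 32 bits: url id
        static long pack(int domainId, int urlId) {
            return ((long) domainId << 32) | (urlId & 0xFFFF_FFFFL);
        }
        static int domainId(long docId) { return (int) (docId >>> 32); }
        static int urlId(long docId)    { return (int) (docId & 0xFFFF_FFFFL); }

        public static void main(String[] args) {
            long docId = pack(1234, 5678);
            System.out.println(domainId(docId) == 1234 && urlId(docId) == 5678); // true
        }
    }
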
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java
index bf5a1d74..2f2e9d47 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPartitioner.java
@@ -122,7 +122,7 @@ public class SearchIndexPartitioner {
public Lock getReadLock() {
return rwl.readLock();
}
- public boolean filterUnsafe(Lock lock, int domainId, int bucketId) {
+ public boolean filterUnsafe(int domainId, int bucketId) {
return partitionSet.test(domainId, bucketId);
}
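
With the Lock parameter gone, the "Unsafe" suffix now means the caller must hold the partitioner's read lock for the duration of the scan, which is what convert() and the preconverter do above. The calling pattern, sketched with journalReader and bucketId as in SearchIndexConverter:

    var lock = partitioner.getReadLock();
    try {
        lock.lock();
        for (var entry : journalReader) {
            if (partitioner.filterUnsafe(entry.domainId(), bucketId)) {
                // process the entry while the read lock is held
            }
        }
    }
    finally {
        lock.unlock();
    }
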
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
index 9e851025..5357fc1f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexPreconverter.java
@@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index.conversion;
import com.google.inject.Inject;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
+import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -46,23 +48,16 @@ public class SearchIndexPreconverter {
}
}
- final RandomAccessFile raf = new RandomAccessFile(inputFile, "r");
+ SearchIndexJournalReader indexJournalReader = new SearchIndexJournalReader(MultimapFileLong.forReading(inputFile.toPath()));
- var fileLength = raf.readLong();
- var wordCount = raf.readInt();
- final int wordCountOriginal = wordCount;
+ final long wordCountOriginal = indexJournalReader.fileHeader.wordCount();
- logger.info("Word Count: {}", wordCount);
- logger.info("File Length: {}", fileLength);
-
- var channel = raf.getChannel();
-
- ByteBuffer inByteBuffer = ByteBuffer.allocateDirect(10_000);
+ logger.info("{}", indexJournalReader.fileHeader);
RandomAccessFile[] randomAccessFiles = new RandomAccessFile[outputFiles.length];
for (int i = 0; i < randomAccessFiles.length; i++) {
randomAccessFiles[i] = new RandomAccessFile(outputFiles[i], "rw");
- randomAccessFiles[i].seek(12);
+ randomAccessFiles[i].seek(SearchIndexJournalReader.FILE_HEADER_SIZE_BYTES);
}
FileChannel[] fileChannels = new FileChannel[outputFiles.length];
for (int i = 0; i < fileChannels.length; i++) {
@@ -73,33 +68,24 @@ public class SearchIndexPreconverter {
var lock = partitioner.getReadLock();
try {
lock.lock();
+ ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
- while (channel.position() < fileLength) {
- inByteBuffer.clear();
- inByteBuffer.limit(CHUNK_HEADER_SIZE);
- channel.read(inByteBuffer);
- inByteBuffer.flip();
- long urlId = inByteBuffer.getLong();
- int chunkBlock = inByteBuffer.getInt();
- int count = inByteBuffer.getInt();
- // inByteBuffer.clear();
- inByteBuffer.limit(count * 4 + CHUNK_HEADER_SIZE);
- channel.read(inByteBuffer);
- inByteBuffer.position(CHUNK_HEADER_SIZE);
-
- for (int i = 0; i < count; i++) {
- wordCount = Math.max(wordCount, 1 + inByteBuffer.getInt());
+ for (var entry : indexJournalReader) {
+ if (!partitioner.isGoodUrl(entry.urlId())
+ || spamDomains.contains(entry.domainId())) {
+ continue;
}
- inByteBuffer.position(count * 4 + CHUNK_HEADER_SIZE);
+ int domainId = entry.domainId();
+ buffer.clear();
+ entry.copyToBuffer(buffer);
+ for (int i = 0; i < randomAccessFiles.length; i++) {
+ if (partitioner.filterUnsafe(domainId, i)) {
+ buffer.flip();
- if (isUrlAllowed(urlId)) {
- for (int i = 0; i < randomAccessFiles.length; i++) {
- if (partitioner.filterUnsafe(lock, (int) (urlId >>> 32L), i)) {
- inByteBuffer.flip();
- fileChannels[i].write(inByteBuffer);
- }
+ while (buffer.position() < buffer.limit())
+ fileChannels[i].write(buffer);
}
}
}
@@ -108,27 +94,16 @@ public class SearchIndexPreconverter {
lock.unlock();
}
- if (wordCountOriginal < wordCount) {
- logger.warn("Raised word count {} => {}", wordCountOriginal, wordCount);
- }
-
for (int i = 0; i < randomAccessFiles.length; i++) {
long pos = randomAccessFiles[i].getFilePointer();
randomAccessFiles[i].seek(0);
randomAccessFiles[i].writeLong(pos);
- randomAccessFiles[i].writeInt(wordCount);
+ randomAccessFiles[i].writeLong(wordCountOriginal);
fileChannels[i].force(true);
fileChannels[i].close();
randomAccessFiles[i].close();
}
}
- private boolean isUrlAllowed(long url) {
- int urlId = (int)(url & 0xFFFF_FFFFL);
- int domainId = (int)(url >>> 32);
-
- return partitioner.isGoodUrl(urlId) && !spamDomains.contains(domainId);
- }
-
}
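
The rewritten write loop also closes a subtle hazard: FileChannel.write() may drain only part of the buffer, so the preconverter now loops until nothing remains. A hypothetical helper (not in the patch) capturing the same short-write guard:

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;

    public class ChannelUtil {
        public static void writeFully(FileChannel channel, ByteBuffer buffer) throws IOException {
            // write() gives no guarantee of writing everything in one call
            while (buffer.hasRemaining()) {
                channel.write(buffer);
            }
        }
    }
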
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java
index 29b88509..f1308d6e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordIndexOffsetsTable.java
@@ -16,7 +16,7 @@ public class WordIndexOffsetsTable {
return table.length;
}
- public void forEach(OffsetTableEntryConsumer o) throws IOException {
+ public void forEachRange(OffsetTableEntryConsumer o) throws IOException {
if (table[0] > 0) {
o.accept(0, (int) table[0]);
}
@@ -32,9 +32,9 @@ public class WordIndexOffsetsTable {
}
/**
- * Fold over each span in the file, left to right
+ * Fold over each span in the file, left to right, accumulating the return value
*/
- public long fold(OffsetTableEntryFoldConsumer o) throws IOException {
+ public long foldRanges(OffsetTableEntryFoldConsumer o) throws IOException {
long total = 0;
if (table[0] > 0) {
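
The renames spell out the contract: the table stores cumulative counts, so consecutive entries delimit (start, length) spans, and in foldRanges the callback's return values accumulate into its first argument. That is what lets SearchIndexConverter above chain its BTree writes, where each writer.write() returns the size of the block it wrote and the running total becomes the next block's offset. The shape of the fold, restated as a sketch (Range is a stand-in for walking the table's consecutive pairs):

    long total = 0;
    for (Range r : ranges) {
        total += o.accept(total, r.start, r.length);
    }
    return total;
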
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java
new file mode 100644
index 00000000..493eea40
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java
@@ -0,0 +1,49 @@
+package nu.marginalia.wmsa.edge.index.journal;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+public class SearchIndexJournalEntry {
+ private final int size;
+ private final long[] underlyingArray;
+
+ public static final int MAX_LENGTH = 1000;
+
+ public SearchIndexJournalEntry(long[] underlyingArray) {
+ this.size = underlyingArray.length;
+ this.underlyingArray = underlyingArray;
+ }
+
+ public SearchIndexJournalEntry(int size, long[] underlyingArray) {
+ this.size = size;
+ this.underlyingArray = underlyingArray;
+ }
+
+ public void write(ByteBuffer buffer) {
+ for (int i = 0; i < size; i++) {
+ buffer.putLong(underlyingArray[i]);
+ }
+ }
+
+ public long get(int idx) {
+ if (idx >= size)
+ throw new ArrayIndexOutOfBoundsException();
+ return underlyingArray[idx];
+ }
+
+ public int size() {
+ return size;
+ }
+
+ public long[] toArray() {
+ if (size == underlyingArray.length)
+ return underlyingArray;
+ else
+ return Arrays.copyOf(underlyingArray, size);
+ }
+
+ public String toString() {
+ return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
+ }
+
+}
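
The two-argument constructor exists so one oversized long[] can be reused across entries (SearchIndexConverter's tmpWordsBuffer above does exactly this), with size marking how much of the array is live. Usage sketch, values illustrative:

    long[] buffer = new long[SearchIndexJournalEntry.MAX_LENGTH];
    buffer[0] = 5; buffer[1] = 6; buffer[2] = 7;

    var entry = new SearchIndexJournalEntry(3, buffer);  // only the first 3 slots are live
    System.out.println(entry.size());                    // 3
    System.out.println(entry.toArray().length);          // 3, copied since size < buffer.length
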
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java
new file mode 100644
index 00000000..f635b1d4
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java
@@ -0,0 +1,16 @@
+package nu.marginalia.wmsa.edge.index.journal;
+
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+
+public record SearchIndexJournalEntryHeader(int entrySize, long documentId, IndexBlock block) {
+
+ public static final int HEADER_SIZE_LONGS = 2;
+
+ public SearchIndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block) {
+ this(-1, (long) domainId.id() << 32 | urlId.id(), block);
+ }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java
new file mode 100644
index 00000000..49ac5009
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java
@@ -0,0 +1,4 @@
+package nu.marginalia.wmsa.edge.index.journal;
+
+public record SearchIndexJournalFileHeader(long fileSize, long wordCount) {
+}
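
These two fields mirror the journal's 16-byte on-disk header: one long holding the committed file size in bytes (header included), then one long holding the dictionary word count. That layout is why the writer below widens its lock range from 12 to 16 bytes and switches writeInt to writeLong. A sketch of reading the header directly, assuming file points at a journal produced by SearchIndexJournalWriterImpl:

    try (var raf = new java.io.RandomAccessFile(file, "r")) {
        long fileSizeBytes = raf.readLong(); // committed size, header included
        long wordCount     = raf.readLong(); // dictionary size at the last flush
        var header = new SearchIndexJournalFileHeader(fileSizeBytes, wordCount);
    }
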
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
new file mode 100644
index 00000000..0e11646a
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
@@ -0,0 +1,123 @@
+package nu.marginalia.wmsa.edge.index.journal;
+
+import com.upserve.uppend.blobs.NativeIO;
+import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import org.jetbrains.annotations.NotNull;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
+
+public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
+ public static final long FILE_HEADER_SIZE_LONGS = 2;
+ public static final long FILE_HEADER_SIZE_BYTES = 8*FILE_HEADER_SIZE_LONGS;
+
+ public final SearchIndexJournalFileHeader fileHeader;
+
+ private final MultimapFileLongSlice map;
+ private final long committedSize;
+
+ public SearchIndexJournalReader(MultimapFileLong map) {
+ fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
+ committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
+
+ map.advice(NativeIO.Advice.Sequential);
+
+ this.map = map.atOffset(FILE_HEADER_SIZE_LONGS);
+ }
+
+ @NotNull
+ @Override
+ public Iterator<JournalEntry> iterator() {
+ return new JournalEntryIterator();
+ }
+
+ private class JournalEntryIterator implements Iterator<JournalEntry> {
+ private JournalEntry entry;
+
+ @Override
+ public boolean hasNext() {
+ if (entry == null) {
+ return committedSize > 0;
+ }
+
+ return entry.hasNext();
+ }
+
+ @Override
+ public JournalEntry next() {
+ if (entry == null) {
+ entry = new JournalEntry(0);
+ }
+ else {
+ entry = entry.next();
+ }
+ return entry;
+ }
+ }
+
+ public class JournalEntry {
+ private final long offset;
+ public final SearchIndexJournalEntryHeader header;
+
+ JournalEntry(long offset) {
+ final long sizeBlock = map.get(offset);
+ final long docId = map.get(offset + 1);
+
+ this.offset = offset;
+ this.header = new SearchIndexJournalEntryHeader(
+ (int)(sizeBlock >>> 32L),
+ docId,
+ IndexBlock.byId((int)(sizeBlock & 0xFFFF_FFFFL)));
+ }
+
+ public boolean hasNext() {
+ return nextId() < committedSize;
+ }
+ public long docId() {
+ return header.documentId();
+ }
+ public int domainId() {
+ return (int) (docId() >>> 32L);
+ }
+ public int urlId() {
+ return (int)(docId() & 0xFFFF_FFFFL);
+ }
+ public IndexBlock block() {
+ return header.block();
+ }
+ public int wordCount() { return header.entrySize(); }
+
+ public SearchIndexJournalEntry readEntry() {
+ long[] dest = new long[header.entrySize()];
+ map.read(dest, offset + HEADER_SIZE_LONGS);
+ return new SearchIndexJournalEntry(header.entrySize(), dest);
+ }
+
+ public SearchIndexJournalEntry readEntryUsingBuffer(long[] dest) {
+ if (dest.length >= header.entrySize()) {
+ map.read(dest, header.entrySize(), offset + HEADER_SIZE_LONGS);
+ return new SearchIndexJournalEntry(header.entrySize(), dest);
+ }
+ else {
+ return readEntry();
+ }
+ }
+
+ public long nextId() {
+ return offset + HEADER_SIZE_LONGS + header.entrySize();
+ }
+ public JournalEntry next() { return new JournalEntry(nextId()); }
+
+ public void copyToBuffer(ByteBuffer buffer) {
+ var dest = buffer.asLongBuffer();
+ dest.position(buffer.position() / 8);
+ dest.limit(buffer.position() / 8 + header.entrySize() + HEADER_SIZE_LONGS);
+ map.read(dest, offset);
+ buffer.position(dest.limit()*8);
+ }
+ }
+}
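
Each record the reader walks is a two-long header followed by entrySize longs of word ids: the first long packs the entry size into its high 32 bits and the IndexBlock id into its low 32 bits, and the second long is the packed document id. This agrees with the writer below, which emits putInt(size), putInt(block.id), putLong(docId) into a big-endian ByteBuffer. A self-contained check of that packing, with illustrative values:

    import java.nio.ByteBuffer;

    public class HeaderPackingCheck {
        public static void main(String[] args) {
            ByteBuffer bb = ByteBuffer.allocate(8);  // big-endian by default
            bb.putInt(4);                            // entry size, as the writer emits it
            bb.putInt(7);                            // IndexBlock id
            long sizeBlock = bb.flip().getLong();

            System.out.println((sizeBlock >>> 32L) == 4);        // true: the entry size
            System.out.println((sizeBlock & 0xFFFF_FFFFL) == 7); // true: the block id
        }
    }
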
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
new file mode 100644
index 00000000..4567a428
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
@@ -0,0 +1,10 @@
+package nu.marginalia.wmsa.edge.index.journal;
+
+public interface SearchIndexJournalWriter {
+ void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry);
+
+ void forceWrite();
+
+ void flushWords();
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
similarity index 68%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
index cf76ada2..f5ba8b31 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriterImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
@@ -3,11 +3,7 @@ package nu.marginalia.wmsa.edge.index.journal;
import io.reactivex.rxjava3.disposables.Disposable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -17,10 +13,9 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
-import java.util.List;
import java.util.concurrent.TimeUnit;
-public class SearchIndexWriterImpl implements SearchIndexWriter {
+public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private final DictionaryWriter dictionaryWriter;
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -28,12 +23,12 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
private RandomAccessFile raf;
private FileChannel channel;
- public static final int MAX_BLOCK_SIZE = 1000*32*8*4;
+ public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
private final ByteBuffer byteBuffer;
private long pos;
@SneakyThrows
- public SearchIndexWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) {
+ public SearchIndexJournalWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) {
this.dictionaryWriter = dictionaryWriter;
initializeIndexFile(indexFile);
@@ -61,23 +56,16 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
@Override
@SneakyThrows
- public synchronized void put(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block, List<String> wordsSuspect) {
- int numGoodWords = 0;
- for (String word : wordsSuspect) {
- if (word.length() < Byte.MAX_VALUE) numGoodWords++;
- }
+ public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
byteBuffer.clear();
- long url_id = ((long) domainId.getId() << 32) | urlId.getId();
- byteBuffer.putLong(url_id);
- byteBuffer.putInt(block.id);
- byteBuffer.putInt(numGoodWords);
- for (String word : wordsSuspect) {
- if (word.length() < Byte.MAX_VALUE) {
- byteBuffer.putInt(dictionaryWriter.get(word));
- }
- }
+ byteBuffer.putInt(entryData.size());
+ byteBuffer.putInt(header.block().id);
+ byteBuffer.putLong(header.documentId());
+
+ entryData.write(byteBuffer);
+
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
@@ -104,11 +92,11 @@ public class SearchIndexWriterImpl implements SearchIndexWriter {
}
private void writePositionMarker() throws IOException {
- var lock = channel.lock(0, 12, false);
+ var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
- raf.writeInt(dictionaryWriter.size());
+ raf.writeLong(dictionaryWriter.size());
raf.seek(pos);
lock.release();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java
deleted file mode 100644
index 11fc186a..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexWriter.java
+++ /dev/null
@@ -1,16 +0,0 @@
-package nu.marginalia.wmsa.edge.index.journal;
-
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-
-import java.util.List;
-
-public interface SearchIndexWriter {
- void put(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block, List<String> words);
- void forceWrite();
-
- void flushWords();
-
-}
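
The deleted interface resolved words against the dictionary inside the writer; its replacement, SearchIndexJournalWriter above, shifts that to the caller, which now supplies a prebuilt header and entry. A hedged sketch of migrating a call site, assuming word ids come from DictionaryWriter.get as in the removed implementation (and eliding its old word-length filter):

    // before: writer.put(domainId, urlId, block, words);
    long[] wordIds = words.stream().mapToLong(dictionaryWriter::get).toArray();
    writer.put(new SearchIndexJournalEntryHeader(domainId, urlId, block),
               new SearchIndexJournalEntry(wordIds));
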
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
index 863c0c65..01ad1e20 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
@@ -7,7 +7,7 @@ import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -27,8 +27,8 @@ public class SearchIndexes {
private final ReentrantLock opsLock = new ReentrantLock(false);
- private final SearchIndexWriterImpl primaryIndexWriter;
- private final SearchIndexWriterImpl secondaryIndexWriter;
+ private final SearchIndexJournalWriterImpl primaryIndexWriter;
+ private final SearchIndexJournalWriterImpl secondaryIndexWriter;
private DictionaryReader dictionaryReader = null;
@Inject
@@ -134,7 +134,7 @@ public class SearchIndexes {
}
}
- public SearchIndexWriterImpl getIndexWriter(int idx) {
+ public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
if (idx == 0) {
return primaryIndexWriter;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java
index f2be15fa..0ee908ef 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeId.java
@@ -1,15 +1,10 @@
package nu.marginalia.wmsa.edge.model;
-import lombok.AllArgsConstructor;
-import lombok.EqualsAndHashCode;
-import lombok.Getter;
-import lombok.ToString;
-/** This exists entirely for strengthening the typing of IDs
+/**
+ * This exists entirely for strengthening the typing of IDs
*
* @param <T>
*/
-@AllArgsConstructor @Getter @EqualsAndHashCode @ToString
-public class EdgeId<T> {
- private final int id;
+public record EdgeId<T>(int id) {
}
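
Turning EdgeId into a record is what drives the mechanical getId() to id() renames throughout the rest of this patch: a record generates id(), equals(), hashCode() and toString(), making all four Lombok annotations redundant. For example:

    var domainId = new EdgeId<EdgeDomain>(1234);
    int raw = domainId.id();  // was domainId.getId() under the Lombok @Getter
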
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
index c6f4fbc5..66438279 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java
@@ -32,7 +32,7 @@ public class EdgeSearchResultItem {
}
public long getCombinedId() {
- return ((long) domain.getId() << 32L) | url.getId();
+ return ((long) domain.id() << 32L) | url.id();
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
index 66004dee..add46ef4 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
@@ -121,7 +121,7 @@ public class EdgeSearchOperator {
int domainId = -1;
try {
if (domain != null) {
- return edgeDataStoreDao.getDomainId(new EdgeDomain(domain)).getId();
+ return edgeDataStoreDao.getDomainId(new EdgeDomain(domain)).id();
}
}
catch (NoSuchElementException ex) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java
index 6e341721..193f1a1c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java
@@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.search.EdgeSearchOperator;
@@ -12,7 +11,6 @@ import nu.marginalia.wmsa.edge.search.command.ResponseType;
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
-import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
@@ -69,7 +67,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain);
- screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).getId());
+ screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}
else {
resultSet = new DecoratedSearchResultSet(Collections.emptyList());
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java
index 22b24aca..12d358bf 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java
@@ -78,8 +78,8 @@ public class SearchResultDecorator {
TIntArrayList missedIds = new TIntArrayList();
for (var resultItem : resultItems) {
- var did = resultItem.getDomain().getId();
- var uid = resultItem.getUrl().getId();
+ var did = resultItem.getDomain().id();
+ var uid = resultItem.getUrl().id();
var details = detailsById.get(uid);
if (details == null) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java
index 2f79a9ea..d3eb8061 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/siteinfo/DomainInformationService.java
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search.siteinfo;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
@@ -98,7 +97,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@@ -115,7 +114,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@@ -133,7 +132,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@@ -150,7 +149,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@@ -166,7 +165,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@@ -183,7 +182,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
@@ -199,7 +198,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return EdgeDomainIndexingState.valueOf(rsp.getString(1));
@@ -216,8 +215,8 @@ public class DomainInformationService {
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
- try (var stmt = connection.prepareStatement("SELECT SOURCE_URL FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
- stmt.setInt(1, domainId.getId());
+ try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
@@ -237,7 +236,7 @@ public class DomainInformationService {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
- stmt.setInt(1, domainId.getId());
+ stmt.setInt(1, domainId.id());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
index 2b2da0fd..55015d13 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java
@@ -81,7 +81,8 @@ public class EdgeIndexClientTest {
service = new EdgeIndexService("127.0.0.1",
testPort,
init, null,
- indexes);
+ indexes,
+ servicesFactory);
Spark.awaitInitialization();
init.setReady();
@@ -113,7 +114,7 @@ public class EdgeIndexClientTest {
indexes.reindexAll();
var rsp = client.query(Context.internal(), EdgeSearchSpecification.justIncludes("trapphus"));
System.out.println(rsp);
- assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.getId());
+ assertEquals(5, rsp.resultsList.get(IndexBlock.Title).get(0).results.get(0).get(0).url.id());
}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
new file mode 100644
index 00000000..39a62033
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
@@ -0,0 +1,76 @@
+package nu.marginalia.wmsa.edge.index.service;
+
+import lombok.SneakyThrows;
+import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+class SearchIndexJournalWriterTest {
+ DictionaryWriter dictionaryWriter;
+ SearchIndexJournalWriterImpl writer;
+
+ Path indexFile;
+ Path wordsFile1;
+ Path urlsFile1;
+ Path dictionaryFile;
+
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ @BeforeEach @SneakyThrows
+ void setUp() {
+ dictionaryFile = Files.createTempFile("tmp", ".dict");
+ dictionaryFile.toFile().deleteOnExit();
+
+ dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false);
+
+ indexFile = Files.createTempFile("tmp", ".idx");
+ indexFile.toFile().deleteOnExit();
+ writer = new SearchIndexJournalWriterImpl(dictionaryWriter, indexFile.toFile());
+
+ wordsFile1 = Files.createTempFile("words1", ".idx");
+ urlsFile1 = Files.createTempFile("urls1", ".idx");
+ }
+
+ @SneakyThrows
+ @AfterEach
+ void tearDown() {
+ dictionaryWriter.close();
+ writer.close();
+ indexFile.toFile().delete();
+ dictionaryFile.toFile().delete();
+ urlsFile1.toFile().delete();
+ wordsFile1.toFile().delete();
+ }
+
+ @Test
+ void put() throws IOException {
+ writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link),
+ new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 }));
+ writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words),
+ new SearchIndexJournalEntry(new long[] { 5, 6, 7 }));
+ writer.forceWrite();
+
+ var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(indexFile));
+ reader.forEach(entry -> {
+ logger.info("{}, {} {}", entry, entry.urlId(), entry.domainId());
+ logger.info("{}", entry.readEntry().toArray());
+ });
+ }
+
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
deleted file mode 100644
index edcfa71f..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
+++ /dev/null
@@ -1,90 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexWriterImpl;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.model.EdgeId;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.EnumMap;
-
-import static nu.marginalia.util.dict.DictionaryHashMap.NO_VALUE;
-import static org.junit.jupiter.api.Assertions.*;
-
-class SearchIndexWriterTest {
- DictionaryWriter dictionaryWriter;
- SearchIndexWriterImpl writer;
-
- Path indexFile;
- Path wordsFile1;
- Path urlsFile1;
- Path dictionaryFile;
-
- @BeforeEach @SneakyThrows
- void setUp() {
- dictionaryFile = Files.createTempFile("tmp", ".dict");
- dictionaryFile.toFile().deleteOnExit();
-
- dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false);
-
- indexFile = Files.createTempFile("tmp", ".idx");
- indexFile.toFile().deleteOnExit();
- writer = new SearchIndexWriterImpl(dictionaryWriter, indexFile.toFile());
-
- wordsFile1 = Files.createTempFile("words1", ".idx");
- urlsFile1 = Files.createTempFile("urls1", ".idx");
- }
-
- @SneakyThrows
- @AfterEach
- void tearDown() {
- dictionaryWriter.close();
- writer.close();
- indexFile.toFile().delete();
- dictionaryFile.toFile().delete();
- urlsFile1.toFile().delete();
- wordsFile1.toFile().delete();
- }
-
- public long[] findWord(SearchIndexReader reader, String word, IndexBlock block) {
- IndexSearchBudget budget = new IndexSearchBudget(100);
- return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray();
- }
-
- @Test @SneakyThrows
- void put() throws IOException {
- writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob"));
- writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello"));
- writer.forceWrite();
-
- new SearchIndexConverter(IndexBlock.Words, 0, Path.of("/tmp"), indexFile.toFile(), wordsFile1.toFile(), urlsFile1.toFile(), new SearchIndexPartitioner(null), val -> false);
-
- EnumMap<IndexBlock, SearchIndex> indices = new EnumMap(IndexBlock.class);
- indices.put(IndexBlock.Words, new SearchIndex("0", urlsFile1.toFile(), wordsFile1.toFile()));
-
- var reader = new SearchIndexReader(indices);
-
- int bobId = dictionaryWriter.getReadOnly("Bob");
- assertNotEquals(NO_VALUE, bobId);
-
- assertEquals(2, reader.numHits(IndexBlock.Words, bobId));
- assertArrayEquals(new long[] { 1, 2 }, findWord(reader,"Bob", IndexBlock.Words));
- assertArrayEquals(new long[] { 2 }, findWord(reader,"sum", IndexBlock.Words));
- assertArrayEquals(new long[] { }, findWord(reader,"New Word", IndexBlock.Words));
-
- writer.close();
- }
-
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
index 1780b6bb..8e58b117 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/RandomWriteFunnelTest.java
@@ -67,4 +67,37 @@ class RandomWriteFunnelTest {
}
}
}
+
+
+ @Test
+ public void testYuge() {
+ new File("/tmp/test.bin").delete();
+ for (int j = 1; j <= 20; j++) {
+ try (var funnel = new RandomWriteFunnel(Path.of("/tmp"), 10, j);
+ var out = new RandomAccessFile("/tmp/test.bin", "rw")) {
+ for (int i = 10 - 1; i >= 0; i -= 2) {
+ funnel.put(i, Long.MAX_VALUE - i);
+ }
+ funnel.write(out.getChannel());
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ try (var in = new RandomAccessFile("/tmp/test.bin", "r")) {
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ in.readLong();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
}
\ No newline at end of file
diff --git a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java
index 80e05c64..0698c5c3 100644
--- a/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java
+++ b/third_party/src/main/java/com/upserve/uppend/blobs/NativeIO.java
@@ -3,18 +3,15 @@ package com.upserve.uppend.blobs;
import jnr.ffi.*;
import jnr.ffi.types.size_t;
-import org.slf4j.Logger;
import com.kenai.jffi.MemoryIO;
import java.io.IOException;
-import java.lang.invoke.MethodHandles;
import java.nio.*;
// https://github.com/upserve/uppend/blob/70967c6f24d7f1a3bbc18799f485d981da93f53b/src/main/java/com/upserve/uppend/blobs/NativeIO.java
// MIT License
public class NativeIO {
- private static final Logger log = org.slf4j.LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final NativeC nativeC = LibraryLoader.create(NativeC.class).load("c");
public static final int pageSize = nativeC.getpagesize(); // 4096 on most Linux
From 420b9bb7e0483f6047c8eaf2360bf44cc223965e Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 20 Jun 2022 12:02:01 +0200
Subject: [PATCH 04/40] Refactoring BTreeReader and binary search code
---
.../nu/marginalia/util/btree/BTreeReader.java | 110 ++++------
.../nu/marginalia/util/btree/BTreeWriter.java | 88 ++++----
.../util/btree/model/BTreeContext.java | 11 +-
.../util/btree/model/BTreeHeader.java | 13 +-
.../util/multimap/MultimapFileLong.java | 32 ++-
.../multimap/MultimapFileLongOffsetSlice.java | 5 +
.../util/multimap/MultimapFileLongSlice.java | 2 +
.../util/multimap/MultimapSearcher.java | 192 +++++++-----------
.../util/multimap/MultimapSearcherBase.java | 143 +++++++++++++
.../edge/index/reader/IndexWordsTable.java | 6 +-
.../wmsa/edge/index/reader/SearchIndex.java | 2 +-
.../util/btree/BTreeWriterTest.java | 32 +--
.../util/hash/LongPairHashMapTest.java | 4 +-
.../edge/index/service/MultimapFileTest.java | 14 +-
14 files changed, 380 insertions(+), 274 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index ec8f204b..de675776 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -11,94 +11,68 @@ public class BTreeReader {
private final MultimapFileLong file;
private final BTreeContext ctx;
+
private final Logger logger = LoggerFactory.getLogger(BTreeReader.class);
- private final long mask;
- private final MultimapSearcher searcher;
+
+ private final MultimapSearcher indexSearcher;
+ private final MultimapSearcher dataSearcher;
public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
this.file = file;
- this.searcher = file.createSearcher();
+ this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
+ this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
+
this.ctx = ctx;
- this.mask = ctx.equalityMask();
}
- public long fileSize() {
- return file.size();
+ public BTreeHeader getHeader(long fileOffset) {
+ return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}
- public BTreeHeader getHeader(long offset) {
- return new BTreeHeader(file.get(offset), file.get(offset+1), file.get(offset+2));
- }
+ /**
+ *
+ * @return file offset of entry matching keyRaw, negative if absent
+ */
+ public long findEntry(BTreeHeader header, final long keyRaw) {
+ final long key = keyRaw & ctx.equalityMask();
- public long offsetForEntry(BTreeHeader header, final long keyRaw) {
- final long key = keyRaw & mask;
+ final long dataAddress = header.dataOffsetLongs();
+ final int entrySize = ctx.entrySize();
+ final int blockSize = ctx.BLOCK_SIZE_WORDS();
- if (header.layers() == 0) {
- return trivialSearch(header, key);
+ if (header.layers() == 0) { // For small data, we only have a data block
+ return dataSearcher.binarySearchUpperBound(key, dataAddress, header.numEntries());
}
- long p = searchEntireTopLayer(header, key);
- if (p < 0) return -1;
+ final long indexOffset = header.indexOffsetLongs();
- long cumOffset = p * ctx.BLOCK_SIZE_WORDS();
+ // Search the top layer
+ long layerOffset = indexSearch(key, indexOffset, blockSize);
+ if (layerOffset < 0) return -1;
+
+ // Search intermediary layers
for (int i = header.layers() - 2; i >= 0; --i) {
- long offsetBase = header.indexOffsetLongs() + header.relativeLayerOffset(ctx, i);
- p = searchLayerBlock(key, offsetBase+cumOffset);
- if (p < 0)
+ final long layerAddressBase = indexOffset + header.relativeIndexLayerOffset(ctx, i);
+ final long layerBlockOffset = layerAddressBase + layerOffset;
+
+ final long nextLayerOffset = indexSearch(key, layerBlockOffset, blockSize);
+ if (nextLayerOffset < 0)
return -1;
- cumOffset = ctx.BLOCK_SIZE_WORDS()*(p + cumOffset);
+
+ layerOffset = blockSize*(nextLayerOffset + layerOffset);
}
- long dataMax = header.dataOffsetLongs() + (long) header.numEntries() * ctx.entrySize();
- return searchDataBlock(key,
- header.dataOffsetLongs() + ctx.entrySize()*cumOffset,
- dataMax);
+ // Search the corresponding data block
+ final long searchStart = dataAddress + layerOffset * entrySize;
+ final long lastDataAddress = dataAddress + (long) header.numEntries() * entrySize;
+ final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize;
+ final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress);
+
+ return dataSearcher.binarySearchUpperBound(key, searchStart, (searchEnd - searchStart) / entrySize);
}
-
- private long searchEntireTopLayer(BTreeHeader header, long key) {
- long offset = header.indexOffsetLongs();
-
- return searcher.binarySearchUpperBound(key, offset, offset + ctx.BLOCK_SIZE_WORDS()) - offset;
- }
-
- private long searchLayerBlock(long key, long blockOffset) {
- if (blockOffset < 0)
- return blockOffset;
-
- return searcher.binarySearchUpperBound(key, blockOffset, blockOffset + ctx.BLOCK_SIZE_WORDS()) - blockOffset;
- }
-
-
- private long searchDataBlock(long key, long blockOffset, long dataMax) {
- if (blockOffset < 0)
- return blockOffset;
-
- long lastOffset = Math.min(blockOffset+ctx.BLOCK_SIZE_WORDS()*(long)ctx.entrySize(), dataMax);
- int length = (int)(lastOffset - blockOffset);
-
- if (ctx.entrySize() == 1) {
- if (mask == ~0L) return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length);
- return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length, mask);
- }
-
- return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, ctx.entrySize(), length/ctx.entrySize(), mask);
- }
-
- private long trivialSearch(BTreeHeader header, long key) {
- long offset = header.dataOffsetLongs();
-
- if (ctx.entrySize() == 1) {
- if (mask == ~0L) {
- return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries());
- }
- else {
- return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries(), mask);
- }
- }
-
- return searcher.binarySearchUpperBoundNoMiss(key, offset, ctx.entrySize(), header.numEntries(), mask);
-
+ private long indexSearch(long key, long start, long n) {
+ return indexSearcher.binarySearch(key, start, n) - start;
}
}
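The descent arithmetic is the crux of the refactored findEntry: each index slot fans out to one block in the layer below, so the running offset is rescaled by the block size at every step. A toy illustration of one step, with made-up numbers (the real block size comes from ctx.BLOCK_SIZE_WORDS()):

    // One descent step, as in layerOffset = blockSize*(nextLayerOffset + layerOffset)
    static long descendOneLayer(long blockSize, long layerOffset, long slotInBlock) {
        return blockSize * (slotInBlock + layerOffset);
    }
    // e.g. with blockSize 4: start at offset 0, hit slot 2 -> offset 8;
    // next layer: offset 8, hit slot 1 -> offset 36;
    // the data block to search then starts at dataAddress + 36 * entrySize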
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
index b43faca7..0c1f0789 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeWriter.java
@@ -2,16 +2,12 @@ package nu.marginalia.util.btree;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
-import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import java.io.IOException;
public class BTreeWriter {
- private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
private final BTreeContext ctx;
private final MultimapFileLongSlice map;
@@ -27,7 +23,7 @@ public class BTreeWriter {
long size = 0;
for (int layer = 0; layer < numLayers; layer++) {
- size += ctx.layerSize(numWords, layer);
+ size += ctx.indexLayerSize(numWords, layer);
}
return size;
}
@@ -45,17 +41,17 @@ public class BTreeWriter {
writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
- if (header.layers() < 1) {
+ if (header.layers() < 1) { // The data is too small to benefit from indexing
+ return ctx.calculateSize(numEntries);
+ }
+ else {
+ writeIndex(header);
return ctx.calculateSize(numEntries);
}
-
- writeIndex(header);
-
- return ctx.calculateSize(numEntries);
}
public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
- final int numLayers = ctx.numLayers(numEntries);
+ final int numLayers = ctx.numIndexLayers(numEntries);
final int padding = BTreeHeader.getPadding(ctx, offset, numLayers);
@@ -71,46 +67,50 @@ public class BTreeWriter {
private void writeIndex(BTreeHeader header) {
- var layerOffsets = getRelativeLayerOffsets(header);
+ var layerOffsets = header.getRelativeLayerOffsets(ctx);
- long stride = ctx.BLOCK_SIZE_WORDS();
+ long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS();
+
+ /* Index layer 0 indexes the data itself
+ Index layer 1 indexes layer 0
+ Index layer 2 indexes layer 1
+ And so on
+ */
for (int layer = 0; layer < header.layers(); layer++,
- stride*=ctx.BLOCK_SIZE_WORDS()) {
- long indexWord = 0;
- long offsetBase = layerOffsets[layer] + header.indexOffsetLongs();
- long numEntries = header.numEntries();
- for (long idx = 0; idx < numEntries; idx += stride, indexWord++) {
- long dataOffset = header.dataOffsetLongs() + (idx + (stride-1)) * ctx.entrySize();
- long val;
+ indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) {
- if (idx + (stride-1) < numEntries) {
- val = map.get(dataOffset) & ctx.equalityMask();
- }
- else {
- val = Long.MAX_VALUE;
- }
- if (offsetBase + indexWord < 0) {
- logger.error("bad put @ {}", offsetBase + indexWord);
- logger.error("layer{}", layer);
- logger.error("layer offsets {}", layerOffsets);
- logger.error("offsetBase = {}", offsetBase);
- logger.error("numEntries = {}", numEntries);
- logger.error("indexWord = {}", indexWord);
- }
- map.put(offsetBase + indexWord, val);
- }
- for (; (indexWord % ctx.BLOCK_SIZE_WORDS()) != 0; indexWord++) {
- map.put(offsetBase + indexWord, Long.MAX_VALUE);
- }
+ writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer);
}
}
- private long[] getRelativeLayerOffsets(BTreeHeader header) {
- long[] layerOffsets = new long[header.layers()];
- for (int i = 0; i < header.layers(); i++) {
- layerOffsets[i] = header.relativeLayerOffset(ctx, i);
+ private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
+ final long indexedDataStepSize,
+ final int layer) {
+
+ final long indexOffsetBase = layerOffsets[layer] + header.indexOffsetLongs();
+ final long dataOffsetBase = header.dataOffsetLongs();
+
+ final long dataEntriesMax = header.numEntries();
+ final int entrySize = ctx.entrySize();
+
+ final long lastDataEntryOffset = indexedDataStepSize - 1;
+
+ long indexWord = 0;
+
+ for (long dataPtr = 0;
+ dataPtr + lastDataEntryOffset < dataEntriesMax;
+ dataPtr += indexedDataStepSize)
+ {
+ long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
+ map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
}
- return layerOffsets;
+
+ // Fill the remaining block with LONG_MAX
+ map.setRange(indexOffsetBase+indexWord,
+ (int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
+ Long.MAX_VALUE);
}
+
+
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
index 4655946c..a7d6b22b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
@@ -10,7 +10,6 @@ public record BTreeContext(int MAX_LAYERS,
public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
-
}
public long calculateSize(int numEntries) {
@@ -19,7 +18,7 @@ public record BTreeContext(int MAX_LAYERS,
return header.dataOffsetLongs() + (long)numEntries * entrySize;
}
- public int numLayers(int numEntries) {
+ public int numIndexLayers(int numEntries) {
if (numEntries <= BLOCK_SIZE_WORDS*2) {
return 0;
}
@@ -36,11 +35,7 @@ public record BTreeContext(int MAX_LAYERS,
return MAX_LAYERS;
}
- public long layerSize(int numEntries, int level) {
- return BLOCK_SIZE_WORDS * numBlocks(numEntries, level);
- }
-
- private long numBlocks(int numWords, int level) {
+ public long indexLayerSize(int numWords, int level) {
long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
int numBlocks = 0;
@@ -50,7 +45,7 @@ public record BTreeContext(int MAX_LAYERS,
numBlocks++;
}
- return numBlocks;
+ return (long) BLOCK_SIZE_WORDS * numBlocks;
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
index 8d68b424..8cdcd355 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeHeader.java
@@ -1,6 +1,5 @@
package nu.marginalia.util.btree.model;
-import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
@@ -36,12 +35,20 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
}
- public long relativeLayerOffset(BTreeContext ctx, int n) {
+ public long relativeIndexLayerOffset(BTreeContext ctx, int n) {
long offset = 0;
for (int i = n+1; i < layers; i++) {
- offset += ctx.layerSize( numEntries, i);
+ offset += ctx.indexLayerSize( numEntries, i);
}
return offset;
}
+ public long[] getRelativeLayerOffsets(BTreeContext ctx) {
+ long[] layerOffsets = new long[layers()];
+ for (int i = 0; i < layers(); i++) {
+ layerOffsets[i] = relativeIndexLayerOffset(ctx, i);
+ }
+ return layerOffsets;
+ }
+
}
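relativeIndexLayerOffset lays the index layers out top-down: the topmost (smallest) layer sits first in the index area, and each layer's offset is the summed size of every layer above it. With toy layer sizes (illustrative, not computed from a real BTreeContext):

    // Assume indexLayerSize(n, 2) = 4, (n, 1) = 16, (n, 0) = 64 words, layers = 3:
    // relativeIndexLayerOffset(ctx, 2) == 0            // top layer, nothing above it
    // relativeIndexLayerOffset(ctx, 1) == 4            // size of layer 2
    // relativeIndexLayerOffset(ctx, 0) == 4 + 16 == 20 // layers 2 and 1 combined
    // getRelativeLayerOffsets(ctx)     == {20, 4, 0}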
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
index e9a9b4fe..00ccd82c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java
@@ -97,8 +97,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
}
- public MultimapSearcher createSearcher() {
- return new MultimapSearcher(this);
+ public MultimapSearcherBase createSearcher() {
+ return new MultimapSearcherBase(this);
}
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
return new MultimapSorter(this, tmpFile, internalSortLimit);
@@ -332,6 +332,34 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
}
+ @Override
+ public void setRange(long idx, int n, long val) {
+ if (n == 0) return;
+
+ if (idx+n >= mappedSize) {
+ grow(idx+n);
+ }
+ int iN = (int)((idx + n) / bufferSize);
+
+ for (int i = 0; i < n; ) {
+ int i0 = (int)((idx + i) / bufferSize);
+
+ int bufferOffset = (int) ((idx+i) % bufferSize);
+ var buffer = buffers.get(i0);
+
+ final int l;
+
+ if (i0 < iN) l = bufferSize - bufferOffset;
+ else l = Math.min(n - i, bufferSize - bufferOffset);
+
+ for (int p = 0; p < l; p++) {
+ buffer.put(bufferOffset + p, val);
+ }
+
+ i+=l;
+ }
+ }
+
@Override
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
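The new setRange fills a run of longs with a single value, chunking the fill so no write crosses the boundary of the mapped buffer backing the current position. Its caller in this patch is the block-padding step in BTreeWriter; a usage sketch with illustrative sizes:

    // Pad the unused tail of an index block with Long.MAX_VALUE, so binary
    // search treats the empty slots as +infinity (sizes here are made up):
    long blockStart = header.indexOffsetLongs();
    int wordsWritten = 37;           // index words actually emitted
    int blockSizeWords = 1024;       // ctx.BLOCK_SIZE_WORDS() in the real code
    mmf.setRange(blockStart + wordsWritten, blockSizeWords - wordsWritten, Long.MAX_VALUE);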
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
index bd35bd9b..f379d1c6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongOffsetSlice.java
@@ -23,6 +23,11 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
map.put(off+idx, val);
}
+ @Override
+ public void setRange(long idx, int n, long val) {
+ map.setRange(off+idx, n, val);
+ }
+
@Override
public long get(long idx) {
return map.get(off+idx);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
index 27d6ae06..29f9994d 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLongSlice.java
@@ -9,6 +9,8 @@ public interface MultimapFileLongSlice {
void put(long idx, long val);
+ void setRange(long idx, int n, long val);
+
long get(long idx);
void read(long[] vals, long idx);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
index 005888d8..886912c5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
@@ -1,128 +1,80 @@
package nu.marginalia.util.multimap;
-import lombok.experimental.Delegate;
+public interface MultimapSearcher {
+ long binarySearch(long key, long fromIndex, long n);
+ long binarySearchUpperBound(long key, long fromIndex, long n);
-public class MultimapSearcher {
- @Delegate
- private final MultimapFileLongSlice mmf;
-
- public MultimapSearcher(MultimapFileLongSlice mmf) {
- this.mmf = mmf;
- }
-
- public boolean binarySearch(long key, long fromIndex, long toIndex) {
-
- long low = fromIndex;
- long high = toIndex - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(mid);
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return true; // key found
+ static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
+ if (mask == ~0L && stepSize == 1) {
+ return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
}
- return false; // key not found.
- }
-
- public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
-
- long low = fromIndex;
- long high = toIndex - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(mid);
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return mid;
+ else if (stepSize == 1) {
+ return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
}
- return low;
- }
-
- public long binarySearchUpperBound(long key, long fromIndex, long toIndex, long mask) {
-
- long low = fromIndex;
- long high = toIndex - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(mid) & mask;
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return mid;
+ else {
+ return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
}
- return low;
- }
-
- public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex) {
-
- long low = fromIndex;
- long high = toIndex - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(mid);
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return mid;
- }
- return -1;
- }
-
-
- public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex, long mask) {
-
- long low = fromIndex;
- long high = toIndex - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(mid) & mask;
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return mid;
- }
- return -1;
- }
-
-
- public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long step, long steps, long mask) {
-
- long low = 0;
- long high = steps - 1;
-
- while (low <= high) {
- long mid = (low + high) >>> 1;
- long midVal = get(fromIndex + mid*step) & mask;
-
- if (midVal < key)
- low = mid + 1;
- else if (midVal > key)
- high = mid - 1;
- else
- return fromIndex + mid*step;
- }
- return -1;
}
}
+
+class SimpleMultimapSearcher implements MultimapSearcher {
+ private final MultimapSearcherBase base;
+
+ SimpleMultimapSearcher(MultimapSearcherBase base) {
+ this.base = base;
+ }
+
+ @Override
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearchOffset(key, fromIndex, n);
+ }
+
+ @Override
+ public long binarySearchUpperBound(long key, long fromIndex, long n) {
+ return base.binarySearchUpperBound(key, fromIndex, n);
+ }
+}
+
+
+class MaskedMultimapSearcher implements MultimapSearcher {
+ private final MultimapSearcherBase base;
+ private final long mask;
+
+ MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
+ this.base = base;
+ this.mask = mask;
+ }
+
+ @Override
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearchOffset(key, fromIndex, n, mask);
+ }
+
+ @Override
+ public long binarySearchUpperBound(long key, long fromIndex, long n) {
+ return base.binarySearchUpperBound(key, fromIndex, n, mask);
+ }
+}
+
+
+class SteppingMaskedMultimapSearcher implements MultimapSearcher {
+ private final MultimapSearcherBase base;
+ private final long mask;
+ private final int step;
+
+ SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
+ this.base = base;
+ this.mask = mask;
+ this.step = step;
+ }
+
+ @Override
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearchOffset(key, fromIndex, step, n, mask);
+ }
+
+ @Override
+ public long binarySearchUpperBound(long key, long fromIndex, long n) {
+ return base.binarySearchUpperBound(key, fromIndex, step, n, mask);
+ }
+}
\ No newline at end of file
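The forContext factory picks the cheapest searcher that can honor the context: unit stride with no mask needs neither extra, a mask alone adds the AND, and a multi-word entry adds the stride. The BTreeReader constructor from this patch exercises two of the three variants:

    // Index blocks are plain longs at stride 1 -> SimpleMultimapSearcher
    MultimapSearcher index = MultimapSearcher.forContext(file, ~0L, 1);
    // Data entries are masked and two words wide -> SteppingMaskedMultimapSearcher
    // (mask value illustrative; the reader passes ctx.equalityMask() and ctx.entrySize())
    MultimapSearcher data = MultimapSearcher.forContext(file, 0x0000FFFFFFFFFFFFL, 2);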
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
new file mode 100644
index 00000000..2bd8c166
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
@@ -0,0 +1,143 @@
+package nu.marginalia.util.multimap;
+
+import lombok.experimental.Delegate;
+
+public class MultimapSearcherBase {
+ @Delegate
+ private final MultimapFileLongSlice mmf;
+
+ public MultimapSearcherBase(MultimapFileLongSlice mmf) {
+ this.mmf = mmf;
+ }
+
+ public boolean binarySearchTest(long key, long fromIndex, long n) {
+
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid);
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return true;
+ }
+ return false;
+ }
+
+ public long binarySearchOffset(long key, long fromIndex, long n) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid);
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid;
+ }
+ return fromIndex + low;
+ }
+
+
+ public long binarySearchOffset(long key, long fromIndex, long n, long mask) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid) & mask;
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid;
+ }
+ return fromIndex + low;
+ }
+
+
+ public long binarySearchOffset(long key, long fromIndex, int step, long n, long mask) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid*step) & mask;
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid*step;
+ }
+ return fromIndex + low;
+ }
+
+ public long binarySearchUpperBound(long key, long fromIndex, long n) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid);
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid;
+ }
+ return -1;
+ }
+
+
+ public long binarySearchUpperBound(long key, long fromIndex, long n, long mask) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid) & mask;
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid;
+ }
+ return -1;
+ }
+
+
+ public long binarySearchUpperBound(long key, long fromIndex, int step, long n, long mask) {
+ long low = 0;
+ long high = n - 1;
+
+ while (low <= high) {
+ long mid = (low + high) >>> 1;
+ long midVal = get(fromIndex + mid*step) & mask;
+
+ if (midVal < key)
+ low = mid + 1;
+ else if (midVal > key)
+ high = mid - 1;
+ else
+ return fromIndex + mid*step;
+ }
+ return -1;
+ }
+}
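The base class carries two families of loops that differ only in the miss case: binarySearchOffset falls back to the insertion point (fromIndex + low), which the index layers use to find the first covering key >= the search key, while binarySearchUpperBound returns -1 so data lookups can signal absence. The same pair over a plain array, as a standalone reference (not part of the patch):

    // Exact search: position of key, or -1 when absent.
    static long searchExact(long[] a, long key) {
        long low = 0, high = a.length - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long v = a[(int) mid];
            if (v < key) low = mid + 1;
            else if (v > key) high = mid - 1;
            else return mid;
        }
        return -1;
    }

    // Insertion-point search: position of key if present, otherwise the index
    // of the first element greater than key (== a.length when none is).
    static long searchInsertionPoint(long[] a, long key) {
        long low = 0, high = a.length - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long v = a[(int) mid];
            if (v < key) low = mid + 1;
            else if (v > key) high = mid - 1;
            else return mid;
        }
        return low;
    }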
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java
index 2bde1aa7..681e42ea 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexWordsTable.java
@@ -45,12 +45,12 @@ public class IndexWordsTable implements AutoCloseable {
private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
return new MultimapFileLong(wordsFile,
- FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
+ FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE);
}
public long positionForWord(int wordId) {
- long offset = reader.offsetForEntry(header, wordId);
+ long offset = reader.findEntry(header, wordId);
if (offset < 0) {
return -1L;
}
@@ -60,7 +60,7 @@ public class IndexWordsTable implements AutoCloseable {
public int wordLength(int wordId) {
- long offset = reader.offsetForEntry(header, wordId);
+ long offset = reader.findEntry(header, wordId);
if (offset < 0) {
return -1;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java
index 042f8f54..0ab4d80b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java
@@ -82,7 +82,7 @@ public class SearchIndex implements AutoCloseable {
if (!range.isPresent())
return false;
- return bTreeReader.offsetForEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0;
+ return bTreeReader.findEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0;
}
public class UrlIndexTree {
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
index 875cda37..73aa4dc3 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java
@@ -48,9 +48,9 @@ class BTreeWriterTest {
@Test
void testLayerOffset() {
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
- System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 0));
- System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 1));
- System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 2));
+ System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0));
+ System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1));
+ System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2));
for (int i = 0; i < 1024; i++) {
var header = writer.makeHeader(0, i);
@@ -59,7 +59,7 @@ class BTreeWriterTest {
printTreeLayout(i, header, ctx);
if (header.layers() >= 1) {
- assertEquals(1, ctx.layerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
+ assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
}
}
}
@@ -67,7 +67,7 @@ class BTreeWriterTest {
private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) {
StringJoiner sj = new StringJoiner(",");
for (int l = 0; l < header.layers(); l++) {
- sj.add(""+ctx.layerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
+ sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
}
System.out.println(numEntries + ":" + sj);
}
@@ -86,7 +86,7 @@ class BTreeWriterTest {
try {
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
- MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
+ MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
{
var writer = new BTreeWriter(mmf, ctx);
@@ -103,7 +103,7 @@ class BTreeWriterTest {
var reader = new BTreeReader(mmf, ctx);
var header = reader.getHeader(0);
for (int i = 0; i < data.length; i++) {
- long offset = reader.offsetForEntry(header, data[i]);
+ long offset = reader.findEntry(header, data[i]);
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
assertEquals(i, mmf.get(offset+1));
}
@@ -129,7 +129,7 @@ class BTreeWriterTest {
try {
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
- MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
+ MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
{
var writer = new BTreeWriter(mmf, ctx);
@@ -146,7 +146,7 @@ class BTreeWriterTest {
var reader = new BTreeReader(mmf, ctx);
var header = reader.getHeader(0);
for (int i = 0; i < data.length; i++) {
- long offset = reader.offsetForEntry(header, data[i]);
+ long offset = reader.findEntry(header, data[i]);
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
assertEquals(i, mmf.get(offset+1));
}
@@ -154,7 +154,7 @@ class BTreeWriterTest {
for (int i = 0; i < 500; i++) {
long val = (long)(Long.MAX_VALUE * Math.random());
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
- assertEquals(-1, reader.offsetForEntry(header, val));
+ assertEquals(-1, reader.findEntry(header, val));
}
}
} catch (Exception e) {
@@ -197,7 +197,7 @@ class BTreeWriterTest {
printTreeLayout(toPut.size(), header, ctx);
for (int i = 0; i < data.length; i++) {
- long offset = reader.offsetForEntry(header, data[i]);
+ long offset = reader.findEntry(header, data[i]);
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
assertEquals(data[i], mmf.get(offset));
}
@@ -205,7 +205,7 @@ class BTreeWriterTest {
for (int i = 0; i < 500; i++) {
long val = (long) (Long.MAX_VALUE * Math.random());
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
- assertEquals(-1, reader.offsetForEntry(header, val));
+ assertEquals(-1, reader.findEntry(header, val));
}
}
} catch (Exception e) {
@@ -250,7 +250,7 @@ class BTreeWriterTest {
printTreeLayout(toPut.size(), header, ctx);
for (int i = 0; i < data.length; i++) {
- long offset = reader.offsetForEntry(header, data[i] & mask);
+ long offset = reader.findEntry(header, data[i] & mask);
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
assertEquals(data[i], mmf.get(offset));
}
@@ -258,7 +258,7 @@ class BTreeWriterTest {
for (int i = 0; i < 500; i++) {
long val = (long) (Long.MAX_VALUE * Math.random());
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
- assertEquals(-1, reader.offsetForEntry(header, val & mask));
+ assertEquals(-1, reader.findEntry(header, val & mask));
}
}
} catch (Exception e) {
@@ -304,7 +304,7 @@ class BTreeWriterTest {
printTreeLayout(toPut.size(), header, ctx);
for (int i = 0; i < data.length; i++) {
- long offset = reader.offsetForEntry(header, data[i] & mask);
+ long offset = reader.findEntry(header, data[i] & mask);
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
assertEquals(data[i], mmf.get(offset));
assertEquals(i, mmf.get(offset+1));
@@ -313,7 +313,7 @@ class BTreeWriterTest {
for (int i = 0; i < 500; i++) {
long val = (long) (Long.MAX_VALUE * Math.random());
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
- assertEquals(-1, reader.offsetForEntry(header, val & mask));
+ assertEquals(-1, reader.findEntry(header, val & mask));
}
}
} catch (Exception e) {
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
index 9331a998..d2bec272 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/hash/LongPairHashMapTest.java
@@ -26,7 +26,7 @@ class LongPairHashMapTest {
try {
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
- MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
+ MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
var lphm = LongPairHashMap.createNew(mmf, 1024);
toPut.forEach(i -> {
lphm.put(new LongPairHashMap.CellData(i, i));
@@ -35,7 +35,7 @@ class LongPairHashMapTest {
lphm.close();
RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
- MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
+ MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000);
var lphm2 = LongPairHashMap.loadExisting(mmf2);
toPut.forEach(i -> {
Assertions.assertTrue(lphm2.get(i).isSet());
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java
index 44e4207a..bb7b360e 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java
@@ -56,7 +56,7 @@ class MultimapFileTest {
@SneakyThrows
@Test
void put() {
- var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
+ var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
for (int i = 0; i < 32; i++) {
file.put(i, i);
}
@@ -68,7 +68,7 @@ class MultimapFileTest {
@SneakyThrows
@Test
void read() {
- var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
+ var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
for (int i = 0; i < 32; i++) {
file.put(i, i);
}
@@ -85,7 +85,7 @@ class MultimapFileTest {
@Test
void write() throws IOException {
- var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
+ var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
for (int i = 0; i < 32-6; i++) {
file.write(new long[] { 0,1,2,3,4,5}, i);
@@ -98,7 +98,7 @@ class MultimapFileTest {
@Test
void sortInternal() throws IOException {
- var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
+ var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
var sorter = file.createSorter(Path.of("/tmp"), 16);
var searcher = file.createSearcher();
for (int i = 0; i < 32; i++) {
@@ -109,13 +109,13 @@ class MultimapFileTest {
for (int i = 2+1; i < 16; i++) {
assertTrue(file.get(i) > file.get(i-1));
- assertTrue(searcher.binarySearch(file.get(i), 2, 18));
+ assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
}
}
@Test
void sortExternal() throws IOException {
- var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
+ var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
var sorter = file.createSorter(Path.of("/tmp"), 2);
var searcher = file.createSearcher();
@@ -128,7 +128,7 @@ class MultimapFileTest {
for (int i = 2+1; i < 16; i++) {
assertTrue(file.get(i) > file.get(i-1));
- assertTrue(searcher.binarySearch(file.get(i), 2, 18));
+ assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
}
}
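Note the changed test calls in the two hunks above: the old searcher API took a half-open [fromIndex, toIndex) range, while the new MultimapSearcherBase takes fromIndex plus an element count, so (2, 18) becomes (2, 16):

    // Before: boolean binarySearch(key, fromIndex, toIndex), range [2, 18)
    searcher.binarySearch(file.get(i), 2, 18);
    // After: boolean binarySearchTest(key, fromIndex, n), same 16 elements
    searcher.binarySearchTest(file.get(i), 2, 16);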
From c324c80efca4abe9c5cd23db365231333c0293da Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 20 Jun 2022 12:04:06 +0200
Subject: [PATCH 05/40] Refactoring BTreeReader and binary search code
---
.../nu/marginalia/util/btree/BTreeReader.java | 6 ++---
.../util/multimap/MultimapSearcher.java | 26 +++++++++----------
.../util/multimap/MultimapSearcherBase.java | 12 ++++-----
3 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index de675776..e0f3851a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -41,7 +41,7 @@ public class BTreeReader {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
if (header.layers() == 0) { // For small data, we only have a data block
- return dataSearcher.binarySearchUpperBound(key, dataAddress, header.numEntries());
+ return dataSearcher.binarySearch(key, dataAddress, header.numEntries());
}
final long indexOffset = header.indexOffsetLongs();
@@ -68,11 +68,11 @@ public class BTreeReader {
final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize;
final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress);
- return dataSearcher.binarySearchUpperBound(key, searchStart, (searchEnd - searchStart) / entrySize);
+ return dataSearcher.binarySearch(key, searchStart, (searchEnd - searchStart) / entrySize);
}
private long indexSearch(long key, long start, long n) {
- return indexSearcher.binarySearch(key, start, n) - start;
+ return indexSearcher.binarySearchUpper(key, start, n) - start;
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
index 886912c5..dd339e40 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcher.java
@@ -1,8 +1,8 @@
package nu.marginalia.util.multimap;
public interface MultimapSearcher {
+ long binarySearchUpper(long key, long fromIndex, long n);
long binarySearch(long key, long fromIndex, long n);
- long binarySearchUpperBound(long key, long fromIndex, long n);
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
if (mask == ~0L && stepSize == 1) {
@@ -25,13 +25,13 @@ class SimpleMultimapSearcher implements MultimapSearcher {
}
@Override
- public long binarySearch(long key, long fromIndex, long n) {
- return base.binarySearchOffset(key, fromIndex, n);
+ public long binarySearchUpper(long key, long fromIndex, long n) {
+ return base.binarySearchUpper(key, fromIndex, n);
}
@Override
- public long binarySearchUpperBound(long key, long fromIndex, long n) {
- return base.binarySearchUpperBound(key, fromIndex, n);
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearch(key, fromIndex, n);
}
}
@@ -46,13 +46,13 @@ class MaskedMultimapSearcher implements MultimapSearcher {
}
@Override
- public long binarySearch(long key, long fromIndex, long n) {
- return base.binarySearchOffset(key, fromIndex, n, mask);
+ public long binarySearchUpper(long key, long fromIndex, long n) {
+ return base.binarySearchUpper(key, fromIndex, n, mask);
}
@Override
- public long binarySearchUpperBound(long key, long fromIndex, long n) {
- return base.binarySearchUpperBound(key, fromIndex, n, mask);
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearch(key, fromIndex, n, mask);
}
}
@@ -69,12 +69,12 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher {
}
@Override
- public long binarySearch(long key, long fromIndex, long n) {
- return base.binarySearchOffset(key, fromIndex, step, n, mask);
+ public long binarySearchUpper(long key, long fromIndex, long n) {
+ return base.binarySearchUpper(key, fromIndex, step, n, mask);
}
@Override
- public long binarySearchUpperBound(long key, long fromIndex, long n) {
- return base.binarySearchUpperBound(key, fromIndex, step, n, mask);
+ public long binarySearch(long key, long fromIndex, long n) {
+ return base.binarySearch(key, fromIndex, step, n, mask);
}
}
\ No newline at end of file
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
index 2bd8c166..30549a8c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSearcherBase.java
@@ -29,7 +29,7 @@ public class MultimapSearcherBase {
return false;
}
- public long binarySearchOffset(long key, long fromIndex, long n) {
+ public long binarySearchUpper(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
@@ -48,7 +48,7 @@ public class MultimapSearcherBase {
}
- public long binarySearchOffset(long key, long fromIndex, long n, long mask) {
+ public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
long low = 0;
long high = n - 1;
@@ -67,7 +67,7 @@ public class MultimapSearcherBase {
}
- public long binarySearchOffset(long key, long fromIndex, int step, long n, long mask) {
+ public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
long low = 0;
long high = n - 1;
@@ -85,7 +85,7 @@ public class MultimapSearcherBase {
return fromIndex + low;
}
- public long binarySearchUpperBound(long key, long fromIndex, long n) {
+ public long binarySearch(long key, long fromIndex, long n) {
long low = 0;
long high = n - 1;
@@ -104,7 +104,7 @@ public class MultimapSearcherBase {
}
- public long binarySearchUpperBound(long key, long fromIndex, long n, long mask) {
+ public long binarySearch(long key, long fromIndex, long n, long mask) {
long low = 0;
long high = n - 1;
@@ -123,7 +123,7 @@ public class MultimapSearcherBase {
}
- public long binarySearchUpperBound(long key, long fromIndex, int step, long n, long mask) {
+ public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
long low = 0;
long high = n - 1;
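Patch 05 is a pure rename pass, making the return convention legible at the call site (no behavioral change):

    // binarySearchOffset(...)     -> binarySearchUpper(...)  // match, else insertion point
    // binarySearchUpperBound(...) -> binarySearch(...)       // match, else -1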
From b1eff0107ce9a75c856e90deae5dd732035bf57a Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 20 Jun 2022 12:25:34 +0200
Subject: [PATCH 06/40] Refactoring BTreeReader and binary search code
---
.../nu/marginalia/util/btree/BTreeReader.java | 57 ++++++++++---------
.../util/btree/model/BTreeContext.java | 11 ++--
2 files changed, 33 insertions(+), 35 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index e0f3851a..ab1b7a97 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -4,16 +4,15 @@ import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.jetbrains.annotations.Nullable;
+
+import static java.lang.Math.min;
public class BTreeReader {
private final MultimapFileLong file;
private final BTreeContext ctx;
- private final Logger logger = LoggerFactory.getLogger(BTreeReader.class);
-
private final MultimapSearcher indexSearcher;
private final MultimapSearcher dataSearcher;
@@ -35,40 +34,42 @@ public class BTreeReader {
*/
public long findEntry(BTreeHeader header, final long keyRaw) {
final long key = keyRaw & ctx.equalityMask();
-
- final long dataAddress = header.dataOffsetLongs();
- final int entrySize = ctx.entrySize();
final int blockSize = ctx.BLOCK_SIZE_WORDS();
+ final long dataAddress = header.dataOffsetLongs();
if (header.layers() == 0) { // For small data, we only have a data block
return dataSearcher.binarySearch(key, dataAddress, header.numEntries());
}
- final long indexOffset = header.indexOffsetLongs();
-
- // Search the top layer
- long layerOffset = indexSearch(key, indexOffset, blockSize);
- if (layerOffset < 0) return -1;
-
- // Search intermediary layers
- for (int i = header.layers() - 2; i >= 0; --i) {
- final long layerAddressBase = indexOffset + header.relativeIndexLayerOffset(ctx, i);
- final long layerBlockOffset = layerAddressBase + layerOffset;
-
- final long nextLayerOffset = indexSearch(key, layerBlockOffset, blockSize);
- if (nextLayerOffset < 0)
- return -1;
-
- layerOffset = blockSize*(nextLayerOffset + layerOffset);
+ // Search index layers
+ long dataLayerOffset = searchIndex(header, key);
+ if (dataLayerOffset < 0) {
+ return dataLayerOffset;
}
// Search the corresponding data block
- final long searchStart = dataAddress + layerOffset * entrySize;
- final long lastDataAddress = dataAddress + (long) header.numEntries() * entrySize;
- final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize;
- final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress);
+ final long searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
+ final long numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
- return dataSearcher.binarySearch(key, searchStart, (searchEnd - searchStart) / entrySize);
+ return dataSearcher.binarySearch(key, searchStart, numEntries);
+ }
+
+ private long searchIndex(BTreeHeader header, long key) {
+ final int blockSize = ctx.BLOCK_SIZE_WORDS();
+ final long indexAddress = header.indexOffsetLongs();
+
+ long layerOffset = 0;
+
+ for (int i = header.layers() - 1; i >= 0; --i) {
+ final long layerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
+
+ final long nextLayerOffset = indexSearch(key, indexAddress + layerBlockOffset, blockSize);
+ if (nextLayerOffset < 0)
+ return -1;
+
+ layerOffset = blockSize *(nextLayerOffset + layerOffset);
+ }
+ return layerOffset;
}
private long indexSearch(long key, long start, long n) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
index a7d6b22b..e91b71fd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/model/BTreeContext.java
@@ -36,16 +36,13 @@ public record BTreeContext(int MAX_LAYERS,
}
public long indexLayerSize(int numWords, int level) {
+ final long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
+ final long numBlocks = numWords / layerSize;
- long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
- int numBlocks = 0;
-
- numBlocks += numWords / layerSize;
if (numWords % layerSize != 0) {
- numBlocks++;
+ return BLOCK_SIZE_WORDS * (numBlocks + 1);
}
-
- return (long) BLOCK_SIZE_WORDS * numBlocks;
+ return BLOCK_SIZE_WORDS * numBlocks;
}
}
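The tightened indexLayerSize still rounds up to whole blocks. Working the arithmetic with an assumed BLOCK_SIZE_BITS of 10, i.e. BLOCK_SIZE_WORDS = 1024 (illustrative only):

    // indexLayerSize(1_000_000, 0): layerSize = 1L << 10 = 1024
    //   numBlocks = 1_000_000 / 1024 = 976, remainder 576 != 0
    //   -> 1024 * (976 + 1) = 1_000_448 words
    // indexLayerSize(1_000_000, 1): layerSize = 1L << 20 = 1_048_576
    //   numBlocks = 0, remainder != 0 -> 1024 * 1 = 1024 words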
From 8139ab0d1d37b88f19662e2c527344b850236b8c Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 20 Jun 2022 12:28:15 +0200
Subject: [PATCH 07/40] Refactoring BTreeReader and binary search code
---
.../src/main/java/nu/marginalia/util/btree/BTreeReader.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index ab1b7a97..42605c04 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -4,7 +4,6 @@ import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
-import org.jetbrains.annotations.Nullable;
import static java.lang.Math.min;
@@ -33,8 +32,9 @@ public class BTreeReader {
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(BTreeHeader header, final long keyRaw) {
- final long key = keyRaw & ctx.equalityMask();
final int blockSize = ctx.BLOCK_SIZE_WORDS();
+
+ final long key = keyRaw & ctx.equalityMask();
final long dataAddress = header.dataOffsetLongs();
if (header.layers() == 0) { // For small data, we only have a data block
From 1068694db681bc0624eff917300c4ba3d1769e6e Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 20 Jun 2022 12:35:58 +0200
Subject: [PATCH 08/40] Refactoring BTreeReader and binary search code
---
.../nu/marginalia/util/btree/BTreeReader.java | 34 +++++++++++--------
1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index 42605c04..5d86c4d2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -5,6 +5,8 @@ import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
+import javax.annotation.CheckReturnValue;
+
import static java.lang.Math.min;
public class BTreeReader {
@@ -37,19 +39,22 @@ public class BTreeReader {
final long key = keyRaw & ctx.equalityMask();
final long dataAddress = header.dataOffsetLongs();
- if (header.layers() == 0) { // For small data, we only have a data block
- return dataSearcher.binarySearch(key, dataAddress, header.numEntries());
- }
+ final long searchStart;
+ final long numEntries;
- // Search index layers
- long dataLayerOffset = searchIndex(header, key);
- if (dataLayerOffset < 0) {
- return dataLayerOffset;
+ if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
+ searchStart = dataAddress;
+ numEntries = header.numEntries();
}
+ else {
+ long dataLayerOffset = searchIndex(header, key);
+ if (dataLayerOffset < 0) {
+ return dataLayerOffset;
+ }
- // Search the corresponding data block
- final long searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
- final long numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
+ searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
+ numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
+ }
return dataSearcher.binarySearch(key, searchStart, numEntries);
}
@@ -61,14 +66,15 @@ public class BTreeReader {
long layerOffset = 0;
for (int i = header.layers() - 1; i >= 0; --i) {
- final long layerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
+ final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
- final long nextLayerOffset = indexSearch(key, indexAddress + layerBlockOffset, blockSize);
+ final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize);
if (nextLayerOffset < 0)
- return -1;
+ return nextLayerOffset;
- layerOffset = blockSize *(nextLayerOffset + layerOffset);
+ layerOffset = blockSize * (nextLayerOffset + layerOffset);
}
+
return layerOffset;
}
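After this patch the control flow of findEntry is flat enough to summarize in a few lines (a comment sketch, not a verbatim copy of the method):

    // key = keyRaw & ctx.equalityMask()
    // layers == 0: searchStart = dataAddress, numEntries = header.numEntries()
    // layers  > 0: descend the index; a negative offset propagates out as "absent",
    //              else searchStart = dataAddress + offset * entrySize and
    //              numEntries = min(header.numEntries() - offset, blockSize)
    // return dataSearcher.binarySearch(key, searchStart, numEntries)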
From 35878c510244aaac83fca373003a22a5d30d212a Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 22 Jun 2022 12:57:58 +0200
Subject: [PATCH 09/40] Anchor text capture work-in-progress
---
.../java/nu/marginalia/util/DenseBitMap.java | 37 ++++
.../nu/marginalia/util/RandomWriteFunnel.java | 2 +-
.../nu/marginalia/util/btree/BTreeReader.java | 6 +-
.../processing/model/DocumentSentence.java | 37 +++-
.../wmsa/edge/converting/ConverterMain.java | 16 +-
.../converting/LinkKeywordExtractorMain.java | 194 ++++++++++++++++++
.../processor/logic/LinkParser.java | 9 +-
.../conversion/SearchIndexConverter.java | 15 --
.../conversion/words/WordsTableWriter.java | 4 +-
.../wmsa/edge/model/EdgeCrawlPlan.java | 11 +
.../nu/marginalia/util/DenseBitMapTest.java | 56 +++++
11 files changed, 343 insertions(+), 44 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
create mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java
new file mode 100644
index 00000000..39b34048
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/DenseBitMap.java
@@ -0,0 +1,37 @@
+package nu.marginalia.util;
+
+import java.nio.ByteBuffer;
+
+public class DenseBitMap {
+ public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
+
+ public final long cardinality;
+ private final ByteBuffer buffer;
+
+ public DenseBitMap(long cardinality) {
+ this.cardinality = cardinality;
+
+ boolean misaligned = (cardinality & 7) > 0;
+ this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
+ }
+
+ public boolean get(long pos) {
+ return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
+ }
+
+ /** Set the bit indexed by pos, returns
+ * its previous value.
+ */
+ public boolean set(long pos) {
+ int offset = (int) (pos >>> 3);
+ int oldVal = buffer.get(offset);
+ int mask = (byte) 1 << (int) (pos & 7);
+ buffer.put(offset, (byte) (oldVal | mask));
+ return (oldVal & mask) != 0;
+ }
+
+ public void clear(long pos) {
+ int offset = (int)(pos >>> 3);
+ buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
+ }
+}
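DenseBitMap.set returns the bit's previous value, which is what makes single-pass deduplication work: only the first caller to set a position sees false. A sketch of the pattern the class is built for (variable names ours; murmur3_128 and padToLong are the Guava calls used later in this patch):

    DenseBitMap seen = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
    HashFunction hf = Hashing.murmur3_128();

    long hash = hf.hashString(url + ":" + keyword, StandardCharsets.UTF_8).padToLong();
    long pos = Long.remainderUnsigned(hash, seen.cardinality);
    if (!seen.set(pos)) {
        // first sighting of this (url, keyword) pair, modulo bloom-style false positives
        emit(url, keyword); // hypothetical sink
    }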
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
index 0c274c2b..ada8de71 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java
@@ -125,7 +125,7 @@ public class RandomWriteFunnel implements AutoCloseable {
dest.putLong(addr, data);
}
catch (IndexOutOfBoundsException ex) {
- logger.info("!!!bad[{}]={}", addr, data);
+ logger.info("Bad poke[{}]={}, this happens if an RWF is allocated with insufficient size", addr, data);
}
}
buffer.compact();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
index 5d86c4d2..388eb175 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/BTreeReader.java
@@ -5,8 +5,6 @@ import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
-import javax.annotation.CheckReturnValue;
-
import static java.lang.Math.min;
public class BTreeReader {
@@ -68,7 +66,7 @@ public class BTreeReader {
for (int i = header.layers() - 1; i >= 0; --i) {
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
- final long nextLayerOffset = indexSearch(key, indexAddress + indexLayerBlockOffset, blockSize);
+ final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
if (nextLayerOffset < 0)
return nextLayerOffset;
@@ -78,7 +76,7 @@ public class BTreeReader {
return layerOffset;
}
- private long indexSearch(long key, long start, long n) {
+ private long relativePositionInIndex(long key, long start, long n) {
return indexSearcher.binarySearchUpper(key, start, n) - start;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
index b4406954..5630939f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
@@ -2,15 +2,17 @@ package nu.marginalia.util.language.processing.model;
import nu.marginalia.util.language.WordPatterns;
+import org.jetbrains.annotations.NotNull;
import java.lang.ref.SoftReference;
import java.util.BitSet;
+import java.util.Iterator;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
-public class DocumentSentence {
+public class DocumentSentence implements Iterable<DocumentSentence.SentencePos> {
public final String originalSentence;
public final String[] words;
public final int[] separators;
@@ -85,4 +87,37 @@ public class DocumentSentence {
public String toString() {
return IntStream.range(0, length()).mapToObj(i -> String.format("%s[%s]", words[i], posTags[i])).collect(Collectors.joining(" "));
}
+
+ @NotNull
+ @Override
+ public Iterator<SentencePos> iterator() {
+ return new Iterator<>() {
+ int i = -1;
+ @Override
+ public boolean hasNext() {
+ return i+1 < length();
+ }
+
+ @Override
+ public SentencePos next() {
+ return new SentencePos(++i);
+ }
+ };
+ }
+
+ public class SentencePos {
+ public final int pos;
+
+ public SentencePos(int pos) {
+ this.pos = pos;
+ }
+
+ public String word() { return words[pos]; }
+ public String wordLowerCase() { return wordsLowerCase[pos]; }
+ public String posTag() { return posTags[pos]; }
+ public String stemmed() { return stemmedWords[pos]; }
+ public int separator() { return separators[pos]; }
+ public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
+ }
}
+
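With the Iterable implementation in place, per-token loops over a sentence collapse to an enhanced for (a minimal sketch, where sentence is a DocumentSentence):

    for (var token : sentence) {
        if (token.isStopWord()) continue;
        System.out.println(token.wordLowerCase() + "/" + token.posTag());
    }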
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
index 5d6f2762..61ff0b00 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
@@ -52,13 +52,6 @@ public class ConverterMain {
injector.getInstance(ConverterMain.class);
}
- private static void requireArgs(String[] args, String... help) {
- if (args.length != help.length) {
- System.out.println("Usage: " + String.join(", ", help));
- System.exit(255);
- }
- }
-
@Inject
public ConverterMain(
EdgeCrawlPlan plan,
@@ -103,7 +96,8 @@ public class ConverterMain {
domainToId.forEach((domain, id) -> {
String fileName = idToFileName.get(id);
- Path dest = getFilePath(plan.crawl.getDir(), fileName);
+ Path dest = plan.getCrawledFilePath(fileName);
+
logger.info("{} - {} - {}", domain, id, dest);
if (!processLog.isJobFinished(id)) {
@@ -128,10 +122,4 @@ public class ConverterMain {
record ProcessingInstructions(String id, List<Instruction> instructions) {}
- private Path getFilePath(Path dir, String fileName) {
- String sp1 = fileName.substring(0, 2);
- String sp2 = fileName.substring(2, 4);
- return dir.resolve(sp1).resolve(sp2).resolve(fileName);
- }
-
}
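The deleted getFilePath helper migrates to EdgeCrawlPlan, which appears in this patch's diffstat. Going by the removed code, getCrawledFilePath presumably shards crawl data two directory levels deep by file-name prefix (a sketch, not the actual EdgeCrawlPlan source):

    public Path getCrawledFilePath(String fileName) {
        String sp1 = fileName.substring(0, 2);
        String sp2 = fileName.substring(2, 4);
        return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
    }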
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
new file mode 100644
index 00000000..63c26200
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -0,0 +1,194 @@
+package nu.marginalia.wmsa.edge.converting;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import com.google.inject.Injector;
+import nu.marginalia.util.DenseBitMap;
+import nu.marginalia.util.language.WordPatterns;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
+import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
+import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
+import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
+import nu.marginalia.wmsa.edge.crawling.WorkLog;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import org.jsoup.Jsoup;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+public class LinkKeywordExtractorMain {
+ private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
+
+ public static void main(String... args) throws IOException {
+
+ if (args.length != 1) {
+ System.err.println("Arguments: crawl-plan.yaml");
+ System.exit(0);
+ }
+ var plan = new CrawlPlanLoader().load(Path.of(args[0]));
+
+ Injector injector = Guice.createInjector(
+ new ConverterModule(plan)
+ );
+
+ injector.getInstance(LinkKeywordExtractorMain.class);
+ }
+
+ private final HashSet<String> crawledDomains = new HashSet<>();
+ private final List<String> fileNames = new ArrayList<>();
+ private final LinkParser linkParser = new LinkParser();
+ private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
+
+ private final HashFunction hashFunction = Hashing.murmur3_128();
+
+ // This bit map is used as a bloom filter to deduplicate url-keyword combinations
+ // false positives are expected, but that's an acceptable trade-off to not have to deal with
+ // de-duplicating billions of shuffled (url, word) tuples on limited hardware
+ private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
+
+ @Inject
+ public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
+ logger.info("Loading input spec");
+
+ CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
+ spec -> crawledDomains.add(spec.domain));
+
+ logger.info("Replaying crawl log");
+ WorkLog.readLog(plan.crawl.getLogFile(),
+ entry -> fileNames.add(entry.path()));
+
+ logger.info("Reading files");
+ for (var fn : fileNames) {
+ CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
+ var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
+ if (crawledDomain.doc == null) continue;
+
+ System.out.println("# " + crawledDomain.domain);
+
+ for (var doc : crawledDomain.doc) {
+ try {
+ if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
+ processDocument(doc.url, doc.documentBody);
+ }
+ }
+ catch (URISyntaxException ex) {
+ // This Shouldn't Happen (TM) as the URL that we're failing to process
+ // is expected to have already been parsed by this code successfully
+ // in the process of getting here.
+ //
+ // But also, if it does happen, it's no big deal
+
+ logger.warn("Bad URL format", ex);
+ }
+ }
+ }
+ }
+
+ private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
+
+ private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
+ var processed = Jsoup.parse(documentBody);
+
+ EdgeUrl documentUrl = new EdgeUrl(docUrl);
+
+ for (var link : processed.getElementsByTag("a")) {
+ if (link.hasAttr("href")) {
+ String href = link.attr("href");
+ String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();
+
+ processAnchor(documentUrl, href, text);
+ }
+ }
+ }
+
+ private void processAnchor(EdgeUrl documentUrl, String href, String text) {
+ if (!isInterestingAnchorText(text)) {
+ return;
+ }
+
+ var optLinkUrl = linkParser.parseLink(documentUrl, href);
+ if (optLinkUrl.isEmpty()) return;
+
+ var linkUrl = optLinkUrl.get();
+
+ if (!isInterestingAnchorLink(linkUrl)) {
+ return;
+ }
+
+ DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
+ for (var sent : languageData.sentences) {
+ for (var wordPos : sent) {
+ if (wordPos.isStopWord())
+ continue;
+
+ String word = wordPos.wordLowerCase();
+ if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word))
+ continue;
+
+
+ if (!linkUrl.domain.equals(documentUrl.domain)) {
+ if (isNewKeywordForLink(word, linkUrl.toString())) {
+ System.out.println(linkUrl + "\t" + word);
+ }
+ }
+ }
+ }
+ }
+
+ // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
+ private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
+
+ private boolean isInterestingAnchorText(String text) {
+ if (text.isBlank()) return false;
+ if (text.length() > 32) return false;
+
+ // Google loves questions, and so do SEO spammers
+ if (text.endsWith("?")) return false;
+
+ if (text.startsWith("http:") || text.startsWith("https:")) return false;
+
+ if (looksLikeAnURL.test(text)) return false;
+
+ return switch (text) {
+ case "this", "here", "click", "click here", "download", "source" -> false;
+ default -> true;
+ };
+ }
+
+ private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
+ if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
+ return false;
+ }
+
+ return crawledDomains.contains(linkUrl.domain.toString());
+ }
+
+ private boolean isNewKeywordForLink(String href, String text) {
+ long hash = 0;
+
+ hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
+ hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
+
+ // Remove sign bit because we don't want a negative index in deduplicateHashBitset
+ hash &= 0x7FFF_FFFF_FFFF_FFFFL;
+
+ return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
+ }
+}
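The deduplication scheme above is easy to reproduce in isolation. Below is a minimal sketch of the same idea, assuming only Guava's Hashing; a plain long[] stands in for DenseBitMap and the capacity is shrunk for illustration:

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;

class UrlWordDeduper {
    private static final HashFunction fn = Hashing.murmur3_128();

    private final long[] bits = new long[1 << 20];   // 2^26 bits (~8 MB)
    private final long cardinality = bits.length * 64L;

    /** Returns true the first time a given (url, word) pair is seen;
     *  hash collisions may yield false "already seen" answers. */
    boolean isNew(String url, String word) {
        long hash = fn.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ fn.hashString(word, StandardCharsets.UTF_8).asLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;              // clear the sign bit
        long idx = hash % cardinality;
        int word64 = (int) (idx >>> 6);              // which 64-bit word
        long mask = 1L << (idx & 63);                // which bit within it
        boolean wasSet = (bits[word64] & mask) != 0;
        bits[word64] |= mask;
        return !wasSet;
    }
}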
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
index 378182f2..0a2bdf45 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
@@ -145,13 +145,8 @@ public class LinkParser {
}
private boolean isRelRelevant(String rel) {
- if (null == rel) {
- return true;
- }
- return switch (rel) {
- case "noindex" -> false;
- default -> true;
- };
+ // this is null safe
+ return !"noindex".equalsIgnoreCase(rel);
}
private boolean isUrlRelevant(String href) {
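The simplification is safe because String.equalsIgnoreCase is specified to return false for a null argument, so the removed null check was redundant:

// rel == null now falls through to "relevant", exactly as before:
boolean relevant = !"noindex".equalsIgnoreCase(null);   // true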
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
index 2d12d0f4..afa319f4 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
@@ -91,18 +91,13 @@ public class SearchIndexConverter {
}
}
-
-
private WordIndexOffsetsTable createWordIndexTable(SearchIndexJournalReader journalReader,
File outputFileWords) throws IOException
{
final int topWord = (int) journalReader.fileHeader.wordCount();
- logger.debug("Table size = {}", topWord);
WordsTableWriter wordsTableWriter = new WordsTableWriter(topWord);
- logger.debug("Reading words");
-
for (var entry : journalReader) {
if (!isRelevantEntry(entry)) {
continue;
@@ -119,8 +114,6 @@ public class SearchIndexConverter {
}
}
- logger.debug("Rearranging table");
-
wordsTableWriter.write(outputFileWords);
return wordsTableWriter.getTable();
@@ -130,15 +123,12 @@ public class SearchIndexConverter {
Path tmpUrlsFile,
WordIndexOffsetsTable wordOffsetsTable) throws IOException
{
- logger.info("Table size = {}", wordOffsetsTable.length());
-
long numberOfWordsTotal = 0;
for (var entry : journalReader) {
if (isRelevantEntry(entry))
numberOfWordsTotal += entry.wordCount();
}
-
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
@@ -168,7 +158,6 @@ public class SearchIndexConverter {
}
}
-
rwf.write(urlsTmpFileChannel);
}
@@ -176,8 +165,6 @@ public class SearchIndexConverter {
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
if (wordOffsetsTable.length() > 0) {
- logger.info("Sorting urls table");
-
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
@@ -188,7 +175,6 @@ public class SearchIndexConverter {
}
}
- logger.info("Writing BTree");
try (var urlsFileMap = MultimapFileLong.forOutput(outputFileUrls.toPath(), numberOfWordsTotal)) {
var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
@@ -206,7 +192,6 @@ public class SearchIndexConverter {
}
}
-
private long translateUrl(long url) {
int domainId = partitioner.translateId(bucketId, (int) (url >>> 32));
return ((long)domainId << 32) | (url & 0xFFFFFFFFL);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java
index 7f762ff3..15ad0cd3 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/words/WordsTableWriter.java
@@ -43,11 +43,11 @@ public class WordsTableWriter {
var writer = new BTreeWriter(mmf, wordsBTreeContext);
- writer.write(offset, tableSize, this::writeBTreeBlock);
+ writer.write(offset, tableSize, this::writeBTreeDataBlock);
}
}
- private void writeBTreeBlock(MultimapFileLongSlice mapSlice) {
+ private void writeBTreeDataBlock(MultimapFileLongSlice mapSlice) {
long urlFileOffset = 0;
int idx = 0;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
index 4e237908..264c1051 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeCrawlPlan.java
@@ -27,4 +27,15 @@ public class EdgeCrawlPlan {
}
}
+ public Path getCrawledFilePath(String fileName) {
+ String sp1 = fileName.substring(0, 2);
+ String sp2 = fileName.substring(2, 4);
+ return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
+ }
+
+ public Path getProcessedFilePath(String fileName) {
+ String sp1 = fileName.substring(0, 2);
+ String sp2 = fileName.substring(2, 4);
+ return process.getDir().resolve(sp1).resolve(sp2).resolve(fileName);
+ }
}
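Both helpers shard files into two levels of subdirectories keyed on the first four characters of the file name, which keeps any single directory from accumulating an unmanageable number of entries. With a hypothetical file name:

Path dest = plan.getCrawledFilePath("4a7bdeadbeef.zstd");
// => <crawl dir>/4a/7b/4a7bdeadbeef.zstd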
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java
new file mode 100644
index 00000000..20857947
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/DenseBitMapTest.java
@@ -0,0 +1,56 @@
+package nu.marginalia.util;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class DenseBitMapTest {
+
+ @Test
+ public void testSetAll() {
+ var dbm = new DenseBitMap(129);
+ for (int i = 0; i < dbm.cardinality; i++) {
+ dbm.set(i);
+ }
+
+ for (int i = 0; i < dbm.cardinality; i++) {
+ assertTrue(dbm.get(i));
+ }
+ }
+
+ @Test
+ public void testSetEven() {
+ var dbm = new DenseBitMap(131);
+ for (int i = 0; i < dbm.cardinality; i+=2) {
+ dbm.set(i);
+ }
+
+ for (int i = 0; i < dbm.cardinality; i+=2) {
+ assertTrue(dbm.get(i));
+ }
+
+ for (int i = 1; i < dbm.cardinality; i+=2) {
+ assertFalse(dbm.get(i));
+ }
+ }
+
+ @Test
+ public void testSetAllClearSome() {
+ var dbm = new DenseBitMap(129);
+
+ for (int i = 0; i < dbm.cardinality; i++) {
+ dbm.set(i);
+ }
+ for (int i = 1; i < dbm.cardinality; i+=2) {
+ dbm.clear(i);
+ }
+
+ for (int i = 0; i < dbm.cardinality; i+=2) {
+ assertTrue(dbm.get(i), "Expected " + i + " to be set");
+ }
+
+ for (int i = 1; i < dbm.cardinality; i+=2) {
+ assertFalse(dbm.get(i), "Expected " + i + " to be clear");
+ }
+ }
+}
\ No newline at end of file
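The sizes 129 and 131 are deliberate: they sit just past the 64-bit word boundary at 128 = 2 x 64, so the tests exercise bits in a third backing word. Assuming the conventional word/mask layout for a dense bit map (DenseBitMap's internals are not shown in this patch), the addressing works like this:

int word = (int) (idx >>> 6);    // which 64-bit word holds bit idx
long mask = 1L << (idx & 63);    // which bit within that word
// set:   words[word] |= mask;
// get:   (words[word] & mask) != 0;
// clear: words[word] &= ~mask;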
From 48e4aa3ee848e44ab7868adbea51ac9341990e36 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 22 Jun 2022 13:01:46 +0200
Subject: [PATCH 10/40] Clean up old junk from the WordPatterns class
---
.../util/language/WordPatterns.java | 22 +++----------------
.../converting/LinkKeywordExtractorMain.java | 4 ++--
2 files changed, 5 insertions(+), 21 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java
index 391558f4..3a95072b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java
@@ -3,7 +3,9 @@ package nu.marginalia.util.language;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@@ -13,21 +15,13 @@ public class WordPatterns {
public static final String WORD_TOKEN_JOINER = "_";
public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
- public static final Pattern wordPatternRestrictive = Pattern.compile("[#]?[@a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
- public static final Pattern keyWordPattern = Pattern.compile("[A-Z\\u00C0-\\u00D6][_a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{0,32}('[a-zA-Z])?");
public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?");
- public static final Pattern joinWord = Pattern.compile("(as|an|the|of|in|a)");
- public static final Pattern keywordAppendixPattern = Pattern.compile("([0-9A-Z][A-Z0-9]{0,3})");
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
- public static final Predicate<String> restrictivePredicate = wordPatternRestrictive.asMatchPredicate();
public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
- public static final Predicate<String> keywordPredicate = keyWordPattern.asMatchPredicate();
- public static final Predicate<String> keywordAppendixPredicate = keywordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
- public static final Predicate<String> keywordPredicateEither = keywordPredicate.or(keywordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
public static final Set<String> topWords;
@@ -88,16 +82,6 @@ public class WordPatterns {
return true;
}
- public static boolean filterStrict(String word) {
-
- int numDigits = (int) word.chars().filter(Character::isDigit).count();
- if (numDigits == word.length()) {
- return false;
- }
-
- return true;
- }
-
public static boolean isStopWord(String s) {
if (s.length() < MIN_WORD_LENGTH) {
return true;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index 63c26200..f60541e3 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -139,9 +139,9 @@ public class LinkKeywordExtractorMain {
continue;
String word = wordPos.wordLowerCase();
- if (!WordPatterns.wordQualitiesPredicate.test(word) || !WordPatterns.filter(word))
- continue;
+ if (!WordPatterns.filter(word))
+ continue;
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
From 4516b23f90806671dda7256c7891642f5f29f839 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Wed, 22 Jun 2022 13:12:44 +0200
Subject: [PATCH 11/40] Also grab alt text for images in a-tags in anchor text
extractor
---
.../converting/LinkKeywordExtractorMain.java | 31 ++++++++++++++++---
1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index f60541e3..570c47b5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -19,6 +19,8 @@ import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -101,23 +103,42 @@ public class LinkKeywordExtractorMain {
}
}
- private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
- var processed = Jsoup.parse(documentBody);
-
- EdgeUrl documentUrl = new EdgeUrl(docUrl);
+ final Document processed = Jsoup.parse(documentBody);
+ final EdgeUrl documentUrl = new EdgeUrl(docUrl);
for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
- String text = anchorTextNoise.matcher(link.text().toLowerCase()).replaceAll(" ").trim();
+ String text = getLinkText(link);
processAnchor(documentUrl, href, text);
}
}
}
+ private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
+
+ private String getLinkText(Element link) {
+ String text = link.text();
+
+ if (link.text().isBlank()) {
+ text = getLinkTextByImgAltTag(link);
+ }
+
+ return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
+ }
+
+ private String getLinkTextByImgAltTag(Element link) {
+ for (var img: link.getElementsByTag("img")) {
+ if (img.hasAttr("alt")) {
+ return img.attr("alt");
+ }
+ }
+ return "";
+ }
+
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
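A quick illustration of the alt-text fallback, using jsoup on a made-up snippet of markup:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

Element link = Jsoup.parse("<a href='/x'><img src='logo.png' alt='project logo'></a>")
                    .getElementsByTag("a").first();

link.text();                                        // "" -- blank, so...
link.getElementsByTag("img").first().attr("alt");   // "project logo"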
From e1b34771156a6f2c64ad71993a35213e6424129f Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 23 Jun 2022 17:02:28 +0200
Subject: [PATCH 12/40] Experiments in keyword extraction
---
.../converting/LinkKeywordExtractorMain.java | 294 ++++++++----------
.../converting/atags/AnchorTextExtractor.java | 149 +++++++++
.../java/org/openzim/ZIMTypes/ZIMReader.java | 206 +-----------
3 files changed, 292 insertions(+), 357 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index 570c47b5..792dac6f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -1,215 +1,193 @@
package nu.marginalia.wmsa.edge.converting;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.google.inject.Guice;
-import com.google.inject.Inject;
-import com.google.inject.Injector;
-import nu.marginalia.util.DenseBitMap;
-import nu.marginalia.util.language.WordPatterns;
-import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
+import gnu.trove.set.hash.TIntHashSet;
+import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
+import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
+import java.io.OutputStream;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
- public static void main(String... args) throws IOException {
+ public static void main(String... args) throws IOException, InterruptedException {
- if (args.length != 1) {
- System.err.println("Arguments: crawl-plan.yaml");
+ if (args.length < 2) {
+ System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
System.exit(0);
}
- var plan = new CrawlPlanLoader().load(Path.of(args[0]));
- Injector injector = Guice.createInjector(
- new ConverterModule(plan)
- );
+ String command = args[0];
+ var plan = new CrawlPlanLoader().load(Path.of(args[1]));
+
+ switch (command) {
+ case "crawl": getKeywordsFromCrawl(plan); break;
+ case "so": getKeywordsFromSo(plan, args[2]); break;
+ case "wiki": getKeywordsFromWiki(plan, args[2]); break;
+ default: System.err.println("Unrecognized command");
+ }
- injector.getInstance(LinkKeywordExtractorMain.class);
}
- private final HashSet crawledDomains = new HashSet<>();
- private final List fileNames = new ArrayList<>();
- private final LinkParser linkParser = new LinkParser();
- private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
+ private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
- private final HashFunction hashFunction = Hashing.murmur3_128();
- // This bit map is used as a bloom filter to deduplicate url-keyword combinations
- // false positives are expected, but that's an acceptable trade-off to not have to deal with
- // de-duplicating billions of shuffled (url, word) tuples on limited hardware
- private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
+ HashSet<String> crawledDomains = new HashSet<>();
+ TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
+
+ logger.info("Loading URLs");
+ Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
+ .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
+ .mapToInt(String::hashCode)
+ .forEach(crawledUrls::add);
+
+ logger.info("Loading input spec");
+ CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
+ spec -> { crawledDomains.add(spec.domain); });
+
+ try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
+ AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
+ && !domain.contains("wiki")
+ && !domain.contains("isni")
+ && !domain.contains("wiktionary"),
+ url -> crawledUrls.contains(url.toString().hashCode()),
+ output::write);
+
+ new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
+ anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
+ }).join();
+ }
+ catch (IOException ex) {
+ ex.printStackTrace();
+ }
+
+
+
+ }
+
+ private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
+ TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
+
+ logger.info("Loading URLs");
+ Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
+ .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
+ .mapToInt(String::hashCode)
+ .forEach(crawledUrls::add);
- @Inject
- public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
logger.info("Loading input spec");
+ HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
+ crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
+ crawledDomains.remove("jsbin.com");
+ crawledDomains.remove("codepad.org");
+
+
+ try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
+ AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
+ url -> crawledUrls.contains(url.toString().hashCode()),
+ output::write);
+
+ new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
+ anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
+ }).join();
+ }
+ catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ }
+
+
+ public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
+
+ TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
+
+ logger.info("Loading URLs");
+ Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
+ .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
+ .mapToInt(String::hashCode)
+ .forEach(crawledUrls::add);
+
+
+ logger.info("Loading input spec");
+
+ HashSet<String> crawledDomains = new HashSet<>();
+ CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
+ spec -> crawledDomains.add(spec.domain));
+
+ List<String> fileNames = new ArrayList<>();
+
logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path()));
- logger.info("Reading files");
- for (var fn : fileNames) {
- CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
- var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
- if (crawledDomain.doc == null) continue;
+ try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
+ AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
+ url -> crawledUrls.contains(url.toString().hashCode()),
+ output::write);
- System.out.println("# " + crawledDomain.domain);
+ logger.info("Reading files");
+ for (var fn : fileNames) {
+ CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
+ var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
+ if (crawledDomain.doc == null) continue;
- for (var doc : crawledDomain.doc) {
- try {
+ System.out.println("# " + crawledDomain.domain);
+
+ for (var doc : crawledDomain.doc) {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
- processDocument(doc.url, doc.documentBody);
- }
- }
- catch (URISyntaxException ex) {
- // This Shouldn't Happen (TM) as the URL that we're failing to process
- // is expected to have already been parsed by this code successfully
- // in the process of getting here.
- //
- // But also, if it does happen, it's no big deal
-
- logger.warn("Bad URL format", ex);
- }
- }
- }
- }
-
-
- private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
- final Document processed = Jsoup.parse(documentBody);
- final EdgeUrl documentUrl = new EdgeUrl(docUrl);
-
- for (var link : processed.getElementsByTag("a")) {
- if (link.hasAttr("href")) {
- String href = link.attr("href");
- String text = getLinkText(link);
-
- processAnchor(documentUrl, href, text);
- }
- }
- }
-
- private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
-
- private String getLinkText(Element link) {
- String text = link.text();
-
- if (link.text().isBlank()) {
- text = getLinkTextByImgAltTag(link);
- }
-
- return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
- }
-
- private String getLinkTextByImgAltTag(Element link) {
- for (var img: link.getElementsByTag("img")) {
- if (img.hasAttr("alt")) {
- return img.attr("alt");
- }
- }
- return "";
- }
-
- private void processAnchor(EdgeUrl documentUrl, String href, String text) {
- if (!isInterestingAnchorText(text)) {
- return;
- }
-
- var optLinkUrl = linkParser.parseLink(documentUrl, href);
- if (optLinkUrl.isEmpty()) return;
-
- var linkUrl = optLinkUrl.get();
-
- if (!isInterestingAnchorLink(linkUrl)) {
- return;
- }
-
- DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
- for (var sent : languageData.sentences) {
- for (var wordPos : sent) {
- if (wordPos.isStopWord())
- continue;
-
- String word = wordPos.wordLowerCase();
-
- if (!WordPatterns.filter(word))
- continue;
-
- if (!linkUrl.domain.equals(documentUrl.domain)) {
- if (isNewKeywordForLink(word, linkUrl.toString())) {
- System.out.println(linkUrl + "\t" + word);
+ anchorTextExtractor.processDocument(doc.url, doc.documentBody);
}
}
}
}
+
}
- // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
- private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
+ private static class UrlKeywordTsvWriter implements AutoCloseable {
- private boolean isInterestingAnchorText(String text) {
- if (text.isBlank()) return false;
- if (text.length() > 32) return false;
+ private final OutputStream stream;
- // Google loves questions, and so do SEO spammers
- if (text.endsWith("?")) return false;
-
- if (text.startsWith("http:") || text.startsWith("https:")) return false;
-
- if (looksLikeAnURL.test(text)) return false;
-
- return switch (text) {
- case "this", "here", "click", "click here", "download", "source" -> false;
- default -> true;
- };
- }
-
- private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
- if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
- return false;
+ UrlKeywordTsvWriter(Path outputFile) throws IOException {
+ this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
}
- return crawledDomains.contains(linkUrl.domain.toString());
+ void write(EdgeUrl url, String keyword) {
+ try {
+ stream.write(url.toString().getBytes());
+ stream.write('\t');
+ stream.write(keyword.getBytes());
+ stream.write('\n');
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ stream.close();
+ }
}
- private boolean isNewKeywordForLink(String href, String text) {
- long hash = 0;
-
- hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
- hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
-
- // Remove sign bit because we don't want a negative index in deduplicateHashBitset
- hash &= 0x7FFF_FFFF_FFFF_FFFFL;
-
- return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
- }
}
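Worth noting: crawledUrls holds String.hashCode() values in a TIntHashSet rather than the URLs themselves, trading exactness for memory. Distinct URLs can share a hash code, so the membership test can yield false positives -- the same trade-off the bloom-filter dedup makes. A sketch of the idea:

import gnu.trove.set.hash.TIntHashSet;

// Lossy membership test over int hash codes (capacity illustrative)
TIntHashSet seen = new TIntHashSet(1_000_000);
seen.add("https://example.com/a".hashCode());

seen.contains("https://example.com/a".hashCode());  // true
// A URL whose hashCode collides would also report true; here that only
// means anchor keywords are emitted for a URL that was never crawled.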
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
new file mode 100644
index 00000000..c96fd400
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
@@ -0,0 +1,149 @@
+package nu.marginalia.wmsa.edge.converting.atags;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import lombok.SneakyThrows;
+import nu.marginalia.util.DenseBitMap;
+import nu.marginalia.util.language.WordPatterns;
+import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.nio.charset.StandardCharsets;
+import java.util.function.BiConsumer;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+public class AnchorTextExtractor {
+ private final Predicate<String> includeDomainPredicate;
+ private final Predicate<EdgeUrl> includeUrlPredicate;
+ private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;
+
+ private final LinkParser linkParser = new LinkParser();
+
+ private final HashFunction hashFunction = Hashing.murmur3_128();
+
+ // This bit map is used as a bloom filter to deduplicate url-keyword combinations
+ // false positives are expected, but that's an acceptable trade-off to not have to deal with
+ // de-duplicating billions of shuffled (url, word) tuples on limited hardware
+ private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
+
+ public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
+ Predicate<EdgeUrl> includeUrlPredicate,
+ BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
+ this.includeDomainPredicate = includeDomainPredicate;
+ this.includeUrlPredicate = includeUrlPredicate;
+ this.linkKeywordConsumer = linkKeywordConsumer;
+ }
+
+ @SneakyThrows
+ public void processDocument(String docUrl, String documentBody) {
+ final Document processed = Jsoup.parse(documentBody);
+ final EdgeUrl documentUrl = new EdgeUrl(docUrl);
+
+ for (var link : processed.getElementsByTag("a")) {
+ if (link.hasAttr("href")) {
+ String href = link.attr("href");
+ String text = getLinkText(link);
+
+ processAnchor(documentUrl, href, text);
+ }
+ }
+ }
+
+ private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");
+
+ private String getLinkText(Element link) {
+ String text = link.text();
+
+ if (link.text().isBlank()) {
+ for (var img: link.getElementsByTag("img")) {
+ if (img.hasAttr("alt")) {
+ text = img.attr("alt");
+ break;
+ }
+ }
+ }
+
+ return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
+ }
+
+ private void processAnchor(EdgeUrl documentUrl, String href, String text) {
+ if (!isInterestingAnchorText(text)) {
+ return;
+ }
+ if (href.contains("?")) {
+ return;
+ }
+
+ var optLinkUrl = linkParser.parseLink(documentUrl, href);
+ if (optLinkUrl.isEmpty()) return;
+
+ var linkUrl = optLinkUrl.get();
+
+ if (!isInterestingAnchorLink(linkUrl)) {
+ return;
+ }
+
+ for (String word: anchorTextNoise.split(text)) {
+ if (WordPatterns.isStopWord(word))
+ continue;
+
+ word = word.toLowerCase();
+ if (!WordPatterns.filter(word))
+ continue;
+
+ if (!linkUrl.domain.equals(documentUrl.domain)) {
+ if (isNewKeywordForLink(word, linkUrl.toString())) {
+ linkKeywordConsumer.accept(linkUrl, word);
+ }
+ }
+ }
+ }
+
+ // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
+ private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
+
+ private boolean isInterestingAnchorText(String text) {
+ if (text.isBlank()) return false;
+ if (text.length() > 32) return false;
+
+ // Google loves questions, and so do SEO spammers
+ if (text.endsWith("?")) return false;
+
+ if (text.startsWith("http:") || text.startsWith("https:")) return false;
+
+ if (looksLikeAnURL.test(text)) return false;
+
+ return switch (text) {
+ case "this", "here", "click", "click here", "download", "source" -> false;
+ default -> true;
+ };
+ }
+
+ private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
+ if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
+ return false;
+ }
+
+ if (!includeUrlPredicate.test(linkUrl)) {
+ return false;
+ }
+
+ return includeDomainPredicate.test(linkUrl.domain.toString());
+ }
+
+ private boolean isNewKeywordForLink(String href, String text) {
+ long hash = 0;
+
+ hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
+ hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
+
+ // Remove sign bit because we don't want a negative index in deduplicateHashBitset
+ hash &= 0x7FFF_FFFF_FFFF_FFFFL;
+
+ return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
+ }
+}
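Wiring the extractor up takes two predicates and a consumer. A minimal sketch with stand-in collaborators (see LinkKeywordExtractorMain above for the real wiring); whether the words actually reach the consumer also depends on the WordPatterns filters:

Set<String> domains = Set.of("example.com");

var extractor = new AnchorTextExtractor(
        domains::contains,          // destination domains to keep
        url -> true,                // destination URLs to keep
        (url, word) -> System.out.println(url + "\t" + word));

// cross-domain link: keywords "user" and "manual" are candidates
extractor.processDocument("https://blog.example.org/",
        "<a href='https://example.com/docs'>user manual</a>");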
diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java
index d97c3c73..7706e8d1 100644
--- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java
+++ b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java
@@ -18,20 +18,20 @@
package org.openzim.ZIMTypes;
-import java.io.*;
-import java.util.*;
-import java.util.function.BiConsumer;
-import java.util.function.Consumer;
-import java.util.function.Predicate;
-
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
-import org.tukaani.xz.SingleXZInputStream;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
+import org.tukaani.xz.SingleXZInputStream;
+
+import java.io.*;
+import java.util.*;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
/**
* @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader {
}
- public String getArticleData(DirectoryEntry mainEntry) throws IOException {
-
- byte[] buffer = new byte[8];
-
- if (mainEntry != null) {
-
- // Check what kind of an entry mainEntry was
- if (mainEntry.getClass() == ArticleEntry.class) {
-
- // Cast to ArticleEntry
- ArticleEntry article = (ArticleEntry) mainEntry;
-
- // Get the cluster and blob numbers from the article
- long clusterNumber = article.getClusterNumber();
- int blobNumber = article.getBlobnumber();
-
- // Move to the cluster entry in the clusterPtrPos
- mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);
-
- // Read the location of the cluster
- long clusterPos = mReader
- .readEightLittleEndianBytesValue(buffer);
-
- // Move to the cluster
- mReader.seek(clusterPos);
-
- // Read the first byte, for compression information
- int compressionType = mReader.read();
-
- // Reference declaration
- SingleXZInputStream xzReader = null;
- int firstOffset, numberOfBlobs, offset1,
- offset2,
- location,
- differenceOffset;
-
- ByteArrayOutputStream baos;
-
- // Check the compression type that was read
- switch (compressionType) {
-
- // TODO: Read uncompressed data directly
- case 0:
- case 1:
-
- // Read the first 4 bytes to find out the number of articles
- buffer = new byte[4];
-
- // Create a dictionary with size 40MiB, the zimlib uses this
- // size while creating
-
- // Read the first offset
- mReader.read(buffer);
-
- // The first four bytes are the offset of the zeroth blob
- firstOffset = Utilities
- .toFourLittleEndianInteger(buffer);
-
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
-
- // The blobNumber has to be lesser than the numberOfBlobs
- assert blobNumber < numberOfBlobs;
-
-
- if (blobNumber == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
-
- location = (blobNumber - 1) * 4;
- Utilities.skipFully(mReader, location);
- mReader.read(buffer);
- offset1 = Utilities.toFourLittleEndianInteger(buffer);
- }
-
- mReader.read(buffer);
- offset2 = Utilities.toFourLittleEndianInteger(buffer);
-
- differenceOffset = offset2 - offset1;
- buffer = new byte[differenceOffset];
-
- Utilities.skipFully(mReader,
- (offset1 - 4 * (blobNumber + 2)));
-
- mReader.read(buffer, 0, differenceOffset);
-
- return new String(buffer);
-
- // LZMA2 compressed data
- case 4:
-
- // Read the first 4 bytes to find out the number of articles
- buffer = new byte[4];
-
- // Create a dictionary with size 40MiB, the zimlib uses this
- // size while creating
- xzReader = new SingleXZInputStream(mReader, 4194304);
-
- // Read the first offset
- xzReader.read(buffer);
-
- // The first four bytes are the offset of the zeroth blob
- firstOffset = Utilities
- .toFourLittleEndianInteger(buffer);
-
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
-
- // The blobNumber has to be lesser than the numberOfBlobs
- assert blobNumber < numberOfBlobs;
-
- if(blobNumber == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
-
- location = (blobNumber - 1) * 4;
- Utilities.skipFully(xzReader, location);
- xzReader.read(buffer);
- offset1 = Utilities.toFourLittleEndianInteger(buffer);
- }
-
- xzReader.read(buffer);
- offset2 = Utilities.toFourLittleEndianInteger(buffer);
-
- differenceOffset = offset2 - offset1;
- buffer = new byte[differenceOffset];
-
- Utilities.skipFully(xzReader,
- (offset1 - 4 * (blobNumber + 2)));
-
- xzReader.read(buffer, 0, differenceOffset);
- return new String(buffer);
-
- case 5:
- // Read the first 4 bytes to find out the number of articles
- buffer = new byte[4];
-
- // Create a dictionary with size 40MiB, the zimlib uses this
- // size while creating
- var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));
-
- // Read the first offset
- zstdInputStream.read(buffer);
-
- // The first four bytes are the offset of the zeroth blob
- firstOffset = Utilities
- .toFourLittleEndianInteger(buffer);
-
- // The number of blobs
- numberOfBlobs = firstOffset / 4;
-
- // The blobNumber has to be lesser than the numberOfBlobs
- assert blobNumber < numberOfBlobs;
-
- if(blobNumber == 0) {
- // The first offset is what we read earlier
- offset1 = firstOffset;
- } else {
-
- location = (blobNumber - 1) * 4;
- Utilities.skipFully(zstdInputStream, location);
- zstdInputStream.read(buffer);
- offset1 = Utilities.toFourLittleEndianInteger(buffer);
- }
-
- zstdInputStream.read(buffer);
- offset2 = Utilities.toFourLittleEndianInteger(buffer);
-
- differenceOffset = offset2 - offset1;
- buffer = new byte[differenceOffset];
-
- Utilities.skipFully(zstdInputStream,
- (offset1 - 4 * (blobNumber + 2)));
-
- zstdInputStream.read(buffer, 0, differenceOffset);
-
- return new String(buffer);
-
- default:
- System.err.print("What is compression = " + compressionType);
-
- }
-
- }
- }
-
- return null;
-
- }
-
public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
throws IOException {
From ee07c4d94ae12b8a988dbcdff8bf4032004245bf Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 26 Jun 2022 16:44:08 +0200
Subject: [PATCH 13/40] Refactored s/DictionaryWriter/KeywordLexicon/g to use
significantly less memory and (potentially) support UTF-8.
---
.../marginalia/util/dict/DictionaryData.java | 144 ++-----
.../util/dict/DictionaryHashMap.java | 54 +--
.../wmsa/edge/index/EdgeIndexService.java | 31 +-
.../wmsa/edge/index/IndexServicesFactory.java | 39 +-
.../conversion/SearchIndexConverter.java | 20 +-
.../index/dictionary/DictionaryWriter.java | 367 ------------------
.../index/dictionary/TokenCompressor.java | 83 ----
.../journal/SearchIndexJournalReader.java | 5 +-
.../journal/SearchIndexJournalWriter.java | 3 +
.../journal/SearchIndexJournalWriterImpl.java | 8 +-
.../{ => model}/SearchIndexJournalEntry.java | 2 +-
.../SearchIndexJournalEntryHeader.java | 2 +-
.../SearchIndexJournalFileHeader.java | 2 +-
.../edge/index/lexicon/KeywordLexicon.java | 117 ++++++
.../KeywordLexiconReadOnlyView.java} | 13 +-
.../journal/KeywordLexiconJournal.java | 69 ++++
.../KeywordLexiconJournalCommitQueue.java | 41 ++
.../journal/KeywordLexiconJournalFile.java | 157 ++++++++
.../wmsa/edge/index/reader/SearchIndexes.java | 12 +-
.../index/service/DictionaryWriterTest.java | 164 ++++----
.../service/SearchIndexJournalWriterTest.java | 21 +-
.../index/service/TokenCompressorTest.java | 28 --
.../service/util/DictionaryDataTest.java | 28 +-
.../service/util/DictionaryHashMapTest.java | 115 +++---
24 files changed, 671 insertions(+), 854 deletions(-)
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java
delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalEntry.java (95%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalEntryHeader.java (90%)
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/{ => model}/SearchIndexJournalFileHeader.java (59%)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{dictionary/DictionaryReader.java => lexicon/KeywordLexiconReadOnlyView.java} (61%)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
index 847259db..c36c10d2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
@@ -5,7 +5,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
-import java.util.Arrays;
+import java.nio.LongBuffer;
public class DictionaryData {
@@ -17,22 +17,22 @@ public class DictionaryData {
public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
- banks.add(new DictionaryDataBank(0));
+ banks.add(new DictionaryDataBank(0, bankSize));
}
public int size() {
return banks.end();
}
- public int add(byte[] data, int value) {
+ public int add(long key) {
var activeBank = banks.last();
- int rb = activeBank.add(data, value);
+ int rb = activeBank.add(key);
if (rb == -1) {
int end = activeBank.getEnd();
logger.debug("Switching bank @ {}", end);
- var newBank = new DictionaryDataBank(end);
- rb = newBank.add(data, value);
+ var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
+ rb = newBank.add(key);
banks.add(newBank);
}
@@ -41,33 +41,30 @@ public class DictionaryData {
}
- public byte[] getBytes(int offset) {
- return banks.bankForOffset(offset).getBytes(offset);
+ public long getKey(int offset) {
+ return banks.bankForOffset(offset).getKey(offset);
}
- public boolean keyEquals(int offset, byte[] data) {
- return banks.bankForOffset(offset).keyEquals(offset, data);
+ public boolean keyEquals(int offset, long otherKey) {
+ return banks.bankForOffset(offset).keyEquals(offset, otherKey);
}
- public int getValue(int offset) {
- return banks.bankForOffset(offset).getValue(offset);
- }
-
- public class DictionaryDataBank {
+ private static class DictionaryDataBank {
private final int start_idx;
- private final ByteBuffer data;
+
+ // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and
+ // can make the GC behave poorly. Using off-heap memory seems preferred when their
+ // lifetime is "forever"
+
+ private final LongBuffer keys;
private int size;
- private int[] offset;
- private int[] value;
- public DictionaryDataBank(int start_idx) {
+
+ public DictionaryDataBank(int start_idx, int sz) {
this.start_idx = start_idx;
- data = ByteBuffer.allocateDirect(DICTIONARY_BANK_SIZE);
-
- offset = new int[DICTIONARY_BANK_SIZE/16];
- value = new int[DICTIONARY_BANK_SIZE/16];
+ keys = ByteBuffer.allocateDirect(8*sz).asLongBuffer();
size = 0;
}
@@ -79,102 +76,23 @@ public class DictionaryData {
return start_idx + size;
}
- public byte[] getBytes(int idx) {
+ public long getKey(int idx) {
+ if (idx < start_idx || idx - start_idx >= size) {
+ throw new IndexOutOfBoundsException(idx);
+ }
+ return keys.get(idx - start_idx);
+ }
+
+ public boolean keyEquals(int idx, long other) {
if (idx < start_idx || idx - start_idx >= size) {
throw new IndexOutOfBoundsException(idx);
}
- idx = idx - start_idx;
-
- final int start;
- final int end = offset[idx];
-
- if (idx == 0) start = 0;
- else start = offset[idx-1];
-
- byte[] dst = new byte[end-start];
- data.get(start, dst);
- return dst;
+ return keys.get(idx - start_idx) == other;
}
- public int getValue(int idx) {
- if (idx < start_idx || idx - start_idx >= size) {
- throw new IndexOutOfBoundsException(idx);
- }
- return value[idx - start_idx];
- }
-
- public boolean keyEquals(int idx, byte[] data) {
- if (idx < start_idx || idx - start_idx >= size) {
- throw new IndexOutOfBoundsException(idx);
- }
-
- idx = idx - start_idx;
- int start;
- int end = offset[idx];
-
- if (idx == 0) {
- start = 0;
- }
- else {
- start = offset[idx-1];
- }
- if (data.length != end - start) {
- return false;
- }
- for (int i = 0; i < data.length; i++) {
- if (this.data.get(start + i) != data[i]) {
- return false;
- }
- }
- return true;
- }
-
- public long longHashCode(int idx) {
- if (idx < start_idx || idx - start_idx >= size) {
- throw new IndexOutOfBoundsException(idx);
- }
-
- idx = idx - start_idx;
- int start;
- int end = offset[idx];
-
- if (idx == 0) {
- start = 0;
- }
- else {
- start = offset[idx-1];
- }
-
- long result = 1;
- for (int i = start; i < end; i++)
- result = 31 * result + data.get(i);
-
- return result;
- }
-
- public int add(byte[] newData, int newValue) {
- if (size == offset.length) {
- logger.debug("Growing bank from {} to {}", offset.length, offset.length*2);
- offset = Arrays.copyOf(offset, offset.length*2);
- value = Arrays.copyOf(value, value.length*2);
- }
-
- if (size > 0 && offset[size-1]+newData.length >= DICTIONARY_BANK_SIZE) {
- if (offset.length > size+1) {
- logger.debug("Shrinking bank from {} to {}", offset.length, size - 1);
- offset = Arrays.copyOf(offset, size + 1);
- value = Arrays.copyOf(value, size + 1);
- }
- return -1; // Full
- }
-
- int dataOffset = size > 0 ? offset[size-1] : 0;
-
- data.put(dataOffset, newData);
-
- offset[size] = dataOffset + newData.length;
- value[size] = newValue;
+ public int add(long newKey) {
+ keys.put(size, newKey);
return start_idx + size++;
}
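The off-heap trick above is plain NIO: a direct ByteBuffer viewed as a LongBuffer lives outside the Java heap, so a bank that lives "forever" adds nothing to the GC's scanning work. A minimal sketch using only the standard library:

import java.nio.ByteBuffer;
import java.nio.LongBuffer;

LongBuffer keys = ByteBuffer.allocateDirect(8 * 1024).asLongBuffer(); // 1024 longs
keys.put(0, 42L);
long k = keys.get(0);   // 42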
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
index d655b6a2..5544545a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
@@ -82,9 +82,6 @@ public class DictionaryHashMap {
}
}
- public int memSz() {
- return dictionaryData.size();
- }
public int size() {
return sz.get();
}
@@ -101,20 +98,20 @@ public class DictionaryHashMap {
buffers[buffer].put(bufferIdx, val);
}
- public int put(byte[] data, int value) {
+ public int put(long key) {
- long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
+ long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
long idx = hash % hashTableSize;
if (getCell(idx) == NO_VALUE) {
- return setValue(data, value, idx);
+ return setValue(key, idx);
}
- return putRehash(data, value, idx, hash);
+ return putRehash(key, idx, hash);
}
- private int putRehash(byte[] data, int value, long idx, long hash) {
+ private int putRehash(long key, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
@@ -129,9 +126,9 @@ public class DictionaryHashMap {
if (val == NO_VALUE) {
probe_count_metrics.set(j);
- return setValue(data, value, idx);
+ return setValue(key, idx);
}
- else if (dictionaryData.keyEquals(val, data)) {
+ else if (dictionaryData.keyEquals(val, key)) {
return val;
}
}
@@ -139,16 +136,16 @@ public class DictionaryHashMap {
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
}
- private int setValue(byte[] data, int value, long cell) {
+ private int setValue(long key, long cell) {
sz.incrementAndGet();
- int di = dictionaryData.add(data, value);
+ int di = dictionaryData.add(key);
setCell(cell, di);
return di;
}
- public int get(byte[] data) {
- final long hash = longHash(data) & 0x7FFF_FFFF_FFFF_FFFFL;
+ public int get(long key) {
+ final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
final long cell = hash % hashTableSize;
if (getCell(cell) == NO_VALUE) {
@@ -157,15 +154,15 @@ public class DictionaryHashMap {
else {
int val = getCell(cell);
- if (dictionaryData.keyEquals(val, data)) {
- return dictionaryData.getValue(val);
+ if (dictionaryData.keyEquals(val, key)) {
+ return val;
}
}
- return getRehash(data, cell, hash);
+ return getRehash(key, cell, hash);
}
- private int getRehash(byte[] data, long idx, long hash) {
+ private int getRehash(long key, long idx, long hash) {
final long pStride = 1 + (hash % (hashTableSize - 2));
for (long j = 1; j < maxProbeLength; j++) {
@@ -180,29 +177,12 @@ public class DictionaryHashMap {
if (val == NO_VALUE) {
return NO_VALUE;
}
- else if (dictionaryData.keyEquals(val, data)) {
- return dictionaryData.getValue(val);
+ else if (dictionaryData.keyEquals(val, key)) {
+ return val;
}
}
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
}
- private long longHash(byte[] bytes) {
- if (bytes == null)
- return 0;
-
- // https://cp-algorithms.com/string/string-hashing.html
- int p = 127;
- long m = (1L<<61)-1;
- long p_power = 1;
- long hash_val = 0;
-
- for (byte element : bytes) {
- hash_val = (hash_val + (element+1) * p_power) % m;
- p_power = (p_power * p) % m;
- }
- return hash_val;
- }
-
}
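Both putRehash and getRehash use double hashing: the probe stride is derived from the hash itself, so keys that collide on the initial slot still follow different probe sequences. A sketch of the lookup loop, assuming the index advances by pStride modulo the table size (the advance itself is elided from the hunks above):

long idx = hash % hashTableSize;
final long pStride = 1 + (hash % (hashTableSize - 2));   // stride is never zero

for (long j = 1; j < maxProbeLength; j++) {
    idx = (idx + pStride) % hashTableSize;    // assumed advance
    int val = getCell(idx);
    if (val == NO_VALUE)
        return NO_VALUE;                      // empty slot: key is absent
    if (dictionaryData.keyEquals(val, key))
        return val;                           // found the key's slot
}
throw new IllegalStateException("Probe length exceeded");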
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
index 829a59af..96f1fb72 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
@@ -13,18 +13,22 @@ import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.ListChunker;
+import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader;
-import nu.marginalia.wmsa.edge.index.model.*;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
+import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import nu.marginalia.wmsa.edge.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
@@ -53,7 +57,7 @@ public class EdgeIndexService extends Service {
@NotNull
private final Initialization init;
private final SearchIndexes indexes;
- private final DictionaryWriter dictionaryWriter;
+ private final KeywordLexicon keywordLexicon;
private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
@@ -80,7 +84,7 @@ public class EdgeIndexService extends Service {
this.init = init;
this.indexes = indexes;
- this.dictionaryWriter = servicesFactory.getDictionaryWriter();
+ this.keywordLexicon = servicesFactory.getKeywordLexicon();
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
@@ -186,15 +190,18 @@ public class EdgeIndexService extends Service {
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
- var entry = new SearchIndexJournalEntry(getWordIds(chunk));
+ var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
indexWriter.put(header, entry);
};
}
- private long[] getWordIds(List<String> words) {
- return words.stream().filter(w -> w.length() < Byte.MAX_VALUE).mapToLong(dictionaryWriter::get).toArray();
+ private long[] getOrInsertWordIds(List<String> words) {
+ return words.stream()
+ .filter(w -> w.length() < Byte.MAX_VALUE)
+ .mapToLong(keywordLexicon::getOrInsert)
+ .toArray();
}
private Object search(Request request, Response response) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
index 40c733e2..b3b4d45e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -4,17 +4,19 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
+import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPreconverter;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,13 +38,13 @@ public class IndexServicesFactory {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final PartitionedDataFile writerIndexFile;
- private final RootDataFile writerDictionaryFile;
+ private final RootDataFile keywordLexiconFile;
private final PartitionedDataFile preconverterOutputFile;
private final DoublePartitionedDataFile indexReadWordsFile;
private final DoublePartitionedDataFile indexReadUrlsFile;
private final DoublePartitionedDataFile indexWriteWordsFile;
private final DoublePartitionedDataFile indexWriteUrlsFile;
- private volatile static DictionaryWriter dictionaryWriter;
+ private volatile static KeywordLexicon keywordLexicon;
private final Long dictionaryHashMapSize;
private final SearchIndexPartitioner partitioner;
@@ -53,7 +55,7 @@ public class IndexServicesFactory {
@Named("partition-root-slow-tmp") Path partitionRootSlowTmp,
@Named("partition-root-fast") Path partitionRootFast,
@Named("edge-writer-page-index-file") String writerIndexFile,
- @Named("edge-writer-dictionary-file") String writerDictionaryFile,
+ @Named("edge-writer-dictionary-file") String keywordLexiconFile,
@Named("edge-index-read-words-file") String indexReadWordsFile,
@Named("edge-index-read-urls-file") String indexReadUrlsFile,
@Named("edge-index-write-words-file") String indexWriteWordsFile,
@@ -68,7 +70,7 @@ public class IndexServicesFactory {
this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, writerIndexFile);
- this.writerDictionaryFile = new RootDataFile(partitionRootSlow, writerDictionaryFile);
+ this.keywordLexiconFile = new RootDataFile(partitionRootSlow, keywordLexiconFile);
this.indexReadWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadWordsFile);
this.indexReadUrlsFile = new DoublePartitionedDataFile(partitionRootFast, indexReadUrlsFile);
this.indexWriteWordsFile = new DoublePartitionedDataFile(partitionRootFast, indexWriteWordsFile);
@@ -78,19 +80,22 @@ public class IndexServicesFactory {
}
public SearchIndexJournalWriterImpl getIndexWriter(int idx) {
- return new SearchIndexJournalWriterImpl(getDictionaryWriter(), writerIndexFile.get(idx));
- }
-
- public DictionaryWriter getDictionaryWriter() {
- if (dictionaryWriter == null) {
- dictionaryWriter = new DictionaryWriter(writerDictionaryFile.get(), dictionaryHashMapSize, true);
- }
- return dictionaryWriter;
+ return new SearchIndexJournalWriterImpl(getKeywordLexicon(), writerIndexFile.get(idx));
}
@SneakyThrows
- public DictionaryReader getDictionaryReader() {
- return new DictionaryReader(getDictionaryWriter());
+ public KeywordLexicon getKeywordLexicon() {
+ if (keywordLexicon == null) {
+ final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
+ keywordLexicon = new KeywordLexicon(journal,
+ new DictionaryHashMap(dictionaryHashMapSize));
+ }
+ return keywordLexicon;
+ }
+
+ @SneakyThrows
+ public KeywordLexiconReadOnlyView getDictionaryReader() {
+ return new KeywordLexiconReadOnlyView(getKeywordLexicon());
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
index afa319f4..adce8747 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
@@ -1,24 +1,26 @@
package nu.marginalia.wmsa.edge.index.conversion;
-import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
-import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
+import nu.marginalia.util.RandomWriteFunnel;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.util.RandomWriteFunnel;
+import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
+import nu.marginalia.wmsa.edge.index.conversion.words.WordIndexOffsetsTable;
+import nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.*;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
-import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry.MAX_LENGTH;
+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
public class SearchIndexConverter {
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java
deleted file mode 100644
index 906231be..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryWriter.java
+++ /dev/null
@@ -1,367 +0,0 @@
-package nu.marginalia.wmsa.edge.index.dictionary;
-
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import com.google.inject.name.Named;
-import io.prometheus.client.Gauge;
-import lombok.SneakyThrows;
-import nu.marginalia.util.language.WordPatterns;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-@Singleton
-public class DictionaryWriter implements AutoCloseable {
- private final ArrayList<byte[]> commitQueue = new ArrayList<>(10_000);
-
- private final DictionaryHashMap reverseIndex;
- private final boolean prepopulate;
-
- private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
- private final ReadWriteLock diskLock = new ReentrantReadWriteLock();
- private final RandomAccessFile raf;
-
- private final Logger logger = LoggerFactory.getLogger(getClass());
-
- static final AtomicInteger instances = new AtomicInteger();
-
- private final TokenCompressor readOnlyTokenCompressor = new TokenCompressor(this::getReadOnly);
- private final TokenCompressor tokenCompressor = new TokenCompressor(this::get);
-
- private static final Gauge request_time_metrics
- = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size")
- .register();
-
- private volatile boolean running = true;
- private final Thread commitToDiskThread;
- @SneakyThrows
- public long getPos() {
- return raf.getFilePointer();
- }
-
- @SneakyThrows @Inject
- public DictionaryWriter(
- @Named("edge-writer-dictionary-file") File dictionaryFile,
- @Named("edge-dictionary-hash-map-size") Long hashMapSize,
- boolean prepopulate) {
- logger.info("Creating dictionary writer");
- raf = new RandomAccessFile(dictionaryFile, "rw");
- reverseIndex = new DictionaryHashMap(hashMapSize);
- this.prepopulate = prepopulate;
-
- Lock writeLock = diskLock.writeLock();
- try {
- writeLock.lock();
- loadFile(dictionaryFile);
- }
- finally {
- writeLock.unlock();
- }
-
- commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread");
- commitToDiskThread.start();
-
- Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk));
-
- if (!instances.compareAndSet(0, 1)) {
- logger.error("MULTIPLE WRITER INSTANCES!");
- }
- logger.info("Done creating dictionary writer");
- }
-
-
- public void commitToDiskRunner() {
- while (running) {
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
-
- commitToDisk();
- }
- }
-
- public void prepare() {
- if (!prepopulate)
- return;
-
- try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/word-frequency"),
- "Could not load word frequency table");
- var br = new BufferedReader(new InputStreamReader(resource))
- ) {
- for (;;) {
- var line = br.readLine();
- if (line == null) {
- break;
- }
- if (WordPatterns.wordPredicateEither.test(line)) {
- get(line);
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- }
- @SneakyThrows
- private void loadFile(File dictionaryFile) {
- if (!dictionaryFile.exists()) {
- logger.info("File {} does not exist, can't load", dictionaryFile);
- return;
- }
-
- logger.info("Reading {}", dictionaryFile);
-
- long pos;
- if (raf.length() < 8) {
- pos = 8;
- raf.writeLong(pos);
- }
- else {
- pos = raf.readLong();
- }
-
- logger.info("Length {} ({})", pos, raf.length());
- if (pos == 8) {
- logger.info("Empty DB, prepopulating");
- prepare();
- }
-
- ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
-
- var channel = raf.getChannel();
-
- long cp = channel.position();
- int debugNext = 0;
- try {
- buffer.limit(0);
- long loaded = 0;
-
- while (cp < pos || buffer.hasRemaining()) {
- if (buffer.limit() - buffer.position() < 4) {
- buffer.compact();
-
- long rb = channel.read(buffer);
- if (rb <= 0) {
- break;
- }
- cp += rb;
-
- buffer.flip();
- }
-
- int len = buffer.get();
- if (debugNext > 0) {
- logger.warn("NextLen: {} ({})", len, (char) len);
- }
- while (buffer.limit() - buffer.position() < len) {
- buffer.compact();
- int rb = channel.read(buffer);
- if (rb <= 0) break;
- cp += rb;
- buffer.flip();
- }
-
- if (buffer.limit() < len) {
-
- logger.warn("Partial write at end-of-file!");
-
- if (cp >= pos) {
- logger.info("... but it's ok");
- }
- break;
- }
-
- boolean negativeLen = false;
- if (len < 0) {
- len = (len&0xFF);
- negativeLen = true;
-
- }
-
- byte[] data = new byte[len];
- buffer.get(data);
- if ((++loaded % 10_000_000) == 0L) {
- logger.info("Loaded {} million items", loaded/1_000_000);
- }
-
- if (debugNext > 0) {
- logger.warn("Next word {}", new String(data));
- if (--debugNext == 0) {
- logger.info(" ");
- }
- }
- if (negativeLen) {
- logger.warn("Negative length of word {} {}@{}", len, new String(data), reverseIndex.size());
- debugNext = 10;
- }
-
-// if (reverseIndex.get(data) != DictionaryHashMap.NO_VALUE) {
-// logger.error("Duplicate insert");
-// }
- reverseIndex.put(data, reverseIndex.size());
- }
- }
- catch (Exception ex) {
- logger.error("IO Exception", ex);
- }
-
- raf.seek(pos);
- request_time_metrics.set(reverseIndex.size());
-
- logger.info("Initial loading done, dictionary size {}", reverseIndex.size());
- }
-
- private final ByteBuffer commitBuffer = ByteBuffer.allocateDirect(4096);
- public volatile boolean noCommit = false;
- @SneakyThrows
- public void commitToDisk() {
- if (noCommit) return;
-
- if (!raf.getChannel().isOpen()) {
- logger.error("commitToDisk() with closed channel! Cannot commit!");
- return;
- }
-
- Lock memLock = memoryLock.readLock();
- List<byte[]> data;
- try {
- memLock.lock();
- if (commitQueue.isEmpty())
- return;
- data = new ArrayList<>(commitQueue);
- commitQueue.clear();
- }
- finally {
- memLock.unlock();
- }
-
- var channel = raf.getChannel();
- commitBuffer.clear();
-
- Lock writeLock = diskLock.writeLock();
- // Only acquire memory lock if there's a risk of backpressure
- if (data.size() < 1000) {
- memLock = null;
- }
-
- try {
- if (memLock != null) memLock.lock();
- writeLock.lock();
-
- long start = System.currentTimeMillis();
- int ct = data.size();
-
- for (byte[] item : data) {
- commitBuffer.clear();
- commitBuffer.put((byte) item.length);
- commitBuffer.put(item);
- commitBuffer.flip();
-
- while (commitBuffer.position() < commitBuffer.limit())
- channel.write(commitBuffer, channel.size());
- }
-
- long pos = channel.size();
- commitBuffer.clear();
- commitBuffer.putLong(pos);
- commitBuffer.flip();
- channel.write(commitBuffer, 0);
-
- channel.force(false);
-
- logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start);
- }
- catch (Exception ex) {
- logger.error("Error during dictionary commit!!!", ex);
- }
- finally {
- writeLock.unlock();
- if (memLock != null) {
- memLock.unlock();
- }
- }
- }
-
- public int get(String macroWord) {
- byte[] word = tokenCompressor.getWordBytes(macroWord);
-
- Lock lock = memoryLock.readLock();
- try {
- lock.lock();
- int idx = reverseIndex.get(word);
- if (idx >= 0) {
- return idx;
- }
- }
- finally {
- lock.unlock();
- }
-
- lock = memoryLock.writeLock();
- try {
- lock.lock();
- int idx = reverseIndex.get(word);
- if (idx >= 0) {
- return idx;
- }
-
- if (!noCommit) {
- commitQueue.add(word);
- }
-
- idx = reverseIndex.size();
-
- reverseIndex.put(word, idx);
-
- request_time_metrics.set(reverseIndex.size());
-
- return idx;
- }
- finally {
-
- lock.unlock();
- }
- }
-
- public int getReadOnly(String word) {
- var bytes = readOnlyTokenCompressor.getWordBytes(word);
- if (bytes.length == 0) {
- return DictionaryHashMap.NO_VALUE;
- }
- return reverseIndex.get(bytes);
- }
-
- public int size() {
- Lock lock = memoryLock.readLock();
- try {
- lock.lock();
- return reverseIndex.size();
- }
- finally {
- lock.unlock();
- }
- }
-
- @Override
- public void close() throws Exception {
- logger.warn("Closing DictionaryWriter");
-
- running = false;
- commitToDiskThread.join();
- commitToDisk();
-
- raf.close();
- }
-
-}
-
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java
deleted file mode 100644
index 5a3d73ab..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/TokenCompressor.java
+++ /dev/null
@@ -1,83 +0,0 @@
-package nu.marginalia.wmsa.edge.index.dictionary;
-
-import nu.marginalia.util.ByteFolder;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Arrays;
-import java.util.function.Predicate;
-import java.util.function.ToIntFunction;
-import java.util.regex.Pattern;
-
-public class TokenCompressor {
- private final ToIntFunction<String> mapper;
- private final ByteFolder folder = new ByteFolder();
- public static final byte[] EMPTY = new byte[0];
-
- private static final Logger logger = LoggerFactory.getLogger(TokenCompressor.class);
-
- private static final Predicate<String> intPatternMatcher = Pattern.compile("[1-9][0-9]{1,8}").asMatchPredicate();
-
-
- public TokenCompressor(ToIntFunction<String> mapper) {
- this.mapper = mapper;
- }
- final char[] separators = new char[] { '_', '-', '.', '/' };
- public synchronized byte[] getWordBytes(String macroWord) {
- int ui = -1;
-
- for (char c : separators) {
- int ui2 = macroWord.indexOf(c);
- if (ui < 0) ui = ui2;
- else if (ui2 >= 0) ui = Math.min(ui, ui2);
- }
-
- if (ui <= 0 || ui >= macroWord.length()-1) {
- return getByteRepresentation(macroWord);
- }
-
- String car = macroWord.substring(0, ui);
- String cdr = macroWord.substring(ui+1);
-
- int carId = mapper.applyAsInt(car);
- int cdrId = mapper.applyAsInt(cdr);
-
- if (carId == DictionaryHashMap.NO_VALUE || cdrId == DictionaryHashMap.NO_VALUE) {
- return EMPTY;
- }
-
- return folder.foldBytes(carId, cdrId);
- }
-
- private byte[] getByteRepresentation(String word) {
- if (intPatternMatcher.test(word)) {
- long val = Long.parseLong(word);
- if (val < 0x100) {
- return new byte[] { 'A', (byte) (val & 0xFF)};
- }
- else if (val < 0x10000) {
- return new byte[] { 'B', (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)};
- }
- else if (val < 0x1000000) {
- return new byte[] { 'C', (byte)((val & 0xFF0000)>>12), (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)};
- }
- else if (val < 0x100000000L) {
- return new byte[] { 'D', (byte)((val & 0xFF0000)>>16), (byte)((val & 0xFF0000)>>12), (byte)((val & 0xFF00)>>8), (byte) (val & 0xFF)};
- }
- }
-
- var bytes = word.getBytes();
- for (int i = 0; i < bytes.length; i++) {
- if (bytes[i] < 32 && (bytes[i] & 0x80) == 0) {
- logger.error("Bad byte in {} -> {} ({})", word, bytes[i], (char) bytes[i]);
- bytes[i] = '?';
- }
- }
- if (bytes.length >= Byte.MAX_VALUE) {
- return Arrays.copyOf(bytes, Byte.MAX_VALUE);
- }
- return bytes;
- }
-
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
index 0e11646a..94ebeacf 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalReader.java
@@ -3,13 +3,16 @@ package nu.marginalia.wmsa.edge.index.journal;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalFileHeader;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.jetbrains.annotations.NotNull;
import java.nio.ByteBuffer;
import java.util.Iterator;
-import static nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
public class SearchIndexJournalReader implements Iterable {
public static final long FILE_HEADER_SIZE_LONGS = 2;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
index 4567a428..bad8d4e7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriter.java
@@ -1,5 +1,8 @@
package nu.marginalia.wmsa.edge.index.journal;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+
public interface SearchIndexJournalWriter {
void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entry);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
index f5ba8b31..23c4b481 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java
@@ -3,7 +3,9 @@ package nu.marginalia.wmsa.edge.index.journal;
import io.reactivex.rxjava3.disposables.Disposable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -16,7 +18,7 @@ import java.nio.channels.FileChannel;
import java.util.concurrent.TimeUnit;
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
- private final DictionaryWriter dictionaryWriter;
+ private final KeywordLexicon dictionaryWriter;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Disposable writerTask;
@@ -28,7 +30,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private long pos;
@SneakyThrows
- public SearchIndexJournalWriterImpl(DictionaryWriter dictionaryWriter, File indexFile) {
+ public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
this.dictionaryWriter = dictionaryWriter;
initializeIndexFile(indexFile);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java
similarity index 95%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java
index 493eea40..c370ecd0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntry.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntry.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.journal;
+package nu.marginalia.wmsa.edge.index.journal.model;
import java.nio.ByteBuffer;
import java.util.Arrays;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java
similarity index 90%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java
index f635b1d4..745a1a21 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalEntryHeader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalEntryHeader.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.journal;
+package nu.marginalia.wmsa.edge.index.journal.model;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java
similarity index 59%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java
index 49ac5009..62fea842 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalFileHeader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/model/SearchIndexJournalFileHeader.java
@@ -1,4 +1,4 @@
-package nu.marginalia.wmsa.edge.index.journal;
+package nu.marginalia.wmsa.edge.index.journal.model;
public record SearchIndexJournalFileHeader(long fileSize, long wordCount) {
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
new file mode 100644
index 00000000..6485f381
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
@@ -0,0 +1,117 @@
+package nu.marginalia.wmsa.edge.index.lexicon;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import io.prometheus.client.Gauge;
+import lombok.SneakyThrows;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+public class KeywordLexicon implements AutoCloseable {
+ private final DictionaryHashMap reverseIndex;
+
+ private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ private static final AtomicInteger instances = new AtomicInteger();
+ private final HashFunction hashFunction = Hashing.murmur3_128();
+
+ private static final Gauge request_time_metrics
+ = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size")
+ .register();
+ private final KeywordLexiconJournal journal;
+
+ @SneakyThrows
+ public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) {
+
+ journal = keywordLexiconJournal;
+ reverseIndex = reverseIndexHashMap;
+
+ logger.info("Creating dictionary writer");
+
+ if (!instances.compareAndSet(0, 1)) {
+ logger.error("MULTIPLE WRITER INSTANCES!");
+ }
+
+ journal.loadFile(this::loadJournalEntry);
+
+ logger.info("Done creating dictionary writer");
+ }
+
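+ // Journal replay on startup: every word that was ever enqueued is re-hashed
+ // and re-inserted, rebuilding the id assignments of the original run.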
+ private void loadJournalEntry(byte[] bytes) {
+ final long key = hashFunction.hashBytes(bytes).asLong();
+ reverseIndex.put(key);
+ }
+
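+ // Only a 64-bit murmur3 hash of each word is kept in memory; the raw word
+ // is persisted via the journal so the map can be rebuilt on restart.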
+ @SneakyThrows
+ public int getOrInsert(String macroWord) {
+ final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong();
+
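+ // Optimistic read-locked lookup before falling back to the write path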
+ int idx = getReadOnly(key);
+ if (idx >= 0)
+ return idx;
+
+ Lock lock = memoryLock.writeLock();
+ try {
+ lock.lock();
+
+ // Check again to prevent race condition
+ if ((idx = reverseIndex.get(key)) >= 0)
+ return idx;
+
+ journal.enqueue(macroWord);
+ idx = reverseIndex.put(key);
+ request_time_metrics.set(reverseIndex.size());
+
+ return idx;
+ }
+ finally {
+ lock.unlock();
+ }
+ }
+
+ public int getReadOnly(String word) {
+ return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong());
+ }
+
+ public int getReadOnly(long hashedKey) {
+ Lock lock = memoryLock.readLock();
+ try {
+ lock.lock();
+ return reverseIndex.get(hashedKey);
+ }
+ finally {
+ lock.unlock();
+ }
+ }
+
+ public int size() {
+ Lock lock = memoryLock.readLock();
+ try {
+ lock.lock();
+ return reverseIndex.size();
+ }
+ finally {
+ lock.unlock();
+ }
+ }
+
+ @Override
+ public void close() throws Exception {
+ logger.warn("Closing DictionaryWriter");
+
+ journal.close();
+ }
+
+ public void commitToDisk() {
+ journal.commitToDisk();
+ }
+}
+
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java
similarity index 61%
rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java
rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java
index ca10c000..485bb423 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/dictionary/DictionaryReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexiconReadOnlyView.java
@@ -1,21 +1,18 @@
-package nu.marginalia.wmsa.edge.index.dictionary;
+package nu.marginalia.wmsa.edge.index.lexicon;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
import lombok.SneakyThrows;
import java.util.concurrent.TimeUnit;
-@Singleton
-public class DictionaryReader {
- private final DictionaryWriter writer;
+public class KeywordLexiconReadOnlyView {
+ private final KeywordLexicon writer;
private final Cache<String, Integer> cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build();
- @SneakyThrows @Inject
- public DictionaryReader(DictionaryWriter writer) {
+ @SneakyThrows
+ public KeywordLexiconReadOnlyView(KeywordLexicon writer) {
this.writer = writer;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
new file mode 100644
index 00000000..02d50862
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
@@ -0,0 +1,69 @@
+package nu.marginalia.wmsa.edge.index.lexicon.journal;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.function.Consumer;
+
+public class KeywordLexiconJournal {
+
+ private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit");
+
+ private final KeywordLexiconJournalCommitQueue commitQueue;
+ private final KeywordLexiconJournalFile journalFile;
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ private final Thread commitToDiskThread;
+
+ private volatile boolean running = true;
+
+ public KeywordLexiconJournal(File file) throws IOException {
+ commitQueue = new KeywordLexiconJournalCommitQueue();
+ journalFile = new KeywordLexiconJournalFile(file);
+
+ commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread");
+ commitToDiskThread.start();
+
+ Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk));
+ }
+
+ public void enqueue(String word) throws InterruptedException {
+ commitQueue.enqueue(word);
+ }
+
+
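+ // Background flush loop: drains the commit queue to disk roughly once a
+ // second until close() clears the running flag.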
+ public void commitToDiskRunner() {
+ if (noCommit) return;
+
+ while (running) {
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ commitToDisk();
+ }
+ }
+
+ public void commitToDisk() {
+ List<String> entries = commitQueue.getQueuedEntries();
+
+ journalFile.writeEntriesToJournal(entries);
+ }
+
+ public void close() throws Exception {
+ logger.info("Closing Journal");
+ running = false;
+ commitToDiskThread.join();
+ commitToDisk();
+
+ journalFile.close();
+ }
+
+ public void loadFile(Consumer<byte[]> loadJournalEntry) throws IOException {
+ journalFile.loadFile(loadJournalEntry);
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
new file mode 100644
index 00000000..6baef0e1
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
@@ -0,0 +1,41 @@
+package nu.marginalia.wmsa.edge.index.lexicon.journal;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class KeywordLexiconJournalCommitQueue {
+ private final ArrayList<String> commitQueue = new ArrayList<>(10_000);
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+ private static final long BACK_PRESSURE_LIMIT = 25_000;
+
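+ // Blocks the writer while the queue is at the backpressure limit;
+ // getQueuedEntries() calls notifyAll() once it has drained the queue.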
+ public synchronized void enqueue(String word) throws InterruptedException {
+ for (int queueSize = commitQueue.size();
+ queueSize >= BACK_PRESSURE_LIMIT;
+ queueSize = commitQueue.size())
+ {
+ wait();
+ }
+
+ commitQueue.add(word);
+ }
+
+
+ public synchronized List<String> getQueuedEntries() {
+ if (commitQueue.isEmpty())
+ return Collections.emptyList();
+ var data = new ArrayList<>(commitQueue);
+ commitQueue.clear();
+
+ notifyAll();
+
+ if (data.size() > BACK_PRESSURE_LIMIT) {
+ logger.warn("Dictionary Journal Backpressure: {}", data.size());
+ }
+
+ return data;
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
new file mode 100644
index 00000000..a97eee6c
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
@@ -0,0 +1,157 @@
+package nu.marginalia.wmsa.edge.index.lexicon.journal;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.List;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.function.Consumer;
+
+public class KeywordLexiconJournalFile {
+ private final RandomAccessFile journalFileRAF;
+ private final File journalFile;
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ private final ReadWriteLock diskLock = new ReentrantReadWriteLock();
+
+
+ public KeywordLexiconJournalFile(File journalFile) throws IOException {
+ this.journalFileRAF = new RandomAccessFile(journalFile, "rw");
+ this.journalFile = journalFile;
+ }
+
+ public void loadFile(Consumer<byte[]> acceptEntry) throws IOException {
+ if (!journalFile.exists()) {
+ logger.info("File {} does not exist, can't load", journalFile);
+ return;
+ }
+
+ logger.info("Reading {}", journalFile);
+
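+ // The first long of the file records the end offset of committed data;
+ // a fresh file is initialized to 8, i.e. just past the header itself.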
+ long pos;
+ if (journalFileRAF.length() < 8) {
+ pos = 8;
+ journalFileRAF.writeLong(pos);
+ }
+ else {
+ pos = journalFileRAF.readLong();
+ }
+
+ logger.info("Length {} ({})", pos, journalFileRAF.length());
+ if (pos == 8) {
+ logger.info("Empty DB");
+ }
+
+ ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
+
+ var channel = journalFileRAF.getChannel();
+
+ long cp = channel.position();
+ try {
+ buffer.limit(0);
+ long loaded = 0;
+
+ while (cp < pos || buffer.hasRemaining()) {
+ if (buffer.limit() - buffer.position() < 4) {
+ buffer.compact();
+
+ long rb = channel.read(buffer);
+ if (rb <= 0) {
+ break;
+ }
+ cp += rb;
+ buffer.flip();
+ }
+
+ int len = buffer.get();
+ while (buffer.limit() - buffer.position() < len) {
+ buffer.compact();
+ int rb = channel.read(buffer);
+ if (rb <= 0) break;
+ cp += rb;
+ buffer.flip();
+ }
+
+ if (buffer.limit() < len) {
+ logger.warn("Partial write at end-of-file!");
+
+ if (cp >= pos) {
+ logger.info("... but it's ok");
+ }
+ break;
+ }
+
+ byte[] data = new byte[len];
+ buffer.get(data);
+ if ((++loaded % 10_000_000) == 0L) {
+ logger.info("Loaded {} million items", loaded/1_000_000);
+ }
+
+ acceptEntry.accept(data);
+ }
+ }
+ catch (Exception ex) {
+ logger.error("IO Exception", ex);
+ }
+
+ journalFileRAF.seek(pos);
+ }
+
+
+ private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096);
+
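+ // Appends each entry as a one-byte length followed by the word's bytes,
+ // then rewrites the header long at offset 0 with the new end-of-data
+ // position and forces the channel to disk.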
+ public void writeEntriesToJournal(List<String> data) {
+ if (data.isEmpty())
+ return;
+
+ final FileChannel channel = journalFileRAF.getChannel();
+
+ if (!channel.isOpen()) {
+ throw new IllegalStateException("commitToDisk() with closed channel! Cannot commit!");
+ }
+
+ Lock writeLock = diskLock.writeLock();
+ try {
+ writeLock.lock();
+
+ long start = System.currentTimeMillis();
+ int ct = data.size();
+
+ for (String item : data) {
+ writeBuffer.clear();
+ writeBuffer.put((byte) item.length());
+ writeBuffer.put(item.getBytes());
+ writeBuffer.flip();
+
+ while (writeBuffer.position() < writeBuffer.limit())
+ channel.write(writeBuffer, channel.size());
+ }
+
+ writeBuffer.clear();
+ writeBuffer.putLong(channel.size());
+ writeBuffer.flip();
+ channel.write(writeBuffer, 0);
+
+ channel.force(false);
+
+ logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start);
+ }
+ catch (Exception ex) {
+ logger.error("Error during dictionary commit!!!", ex);
+ }
+ finally {
+ writeLock.unlock();
+ }
+ }
+
+ public void close() throws IOException {
+ journalFileRAF.close();
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
index 01ad1e20..f9e2bfac 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java
@@ -3,11 +3,11 @@ package nu.marginalia.wmsa.edge.index.reader;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.configuration.server.Initialization;
+import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -29,7 +29,7 @@ public class SearchIndexes {
private final SearchIndexJournalWriterImpl primaryIndexWriter;
private final SearchIndexJournalWriterImpl secondaryIndexWriter;
- private DictionaryReader dictionaryReader = null;
+ private KeywordLexiconReadOnlyView keywordLexiconReadOnlyView = null;
@Inject
public SearchIndexes(IndexServicesFactory servicesFactory, SearchIndexPartitioner partitioner) {
@@ -105,8 +105,8 @@ public class SearchIndexes {
}
@Nullable
- public DictionaryReader getDictionaryReader() {
- return dictionaryReader;
+ public KeywordLexiconReadOnlyView getDictionaryReader() {
+ return keywordLexiconReadOnlyView;
}
@@ -127,7 +127,7 @@ public class SearchIndexes {
}
logger.info("Initializing dictionary reader");
- dictionaryReader = servicesFactory.getDictionaryReader();
+ keywordLexiconReadOnlyView = servicesFactory.getDictionaryReader();
}
finally {
opsLock.unlock();
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
index 961d8304..48ee7c83 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
@@ -1,15 +1,18 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryReader;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
+import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
-import java.io.*;
+import java.io.File;
+import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -51,11 +54,16 @@ class DictionaryWriterTest {
new SearchIndexPartitioner(null),
val -> false);
}
+
+ KeywordLexiconJournal createJournal(File f) throws IOException {
+ return new KeywordLexiconJournal(f);
+ }
+
@SneakyThrows
@Test
@Disabled
void test() {
- try (var dict = new DictionaryWriter(Path.of("/home/vlofgren/Code/data/dictionary.dat").toFile(), 1L<<16, false)) {
+ try (var dict = new KeywordLexicon(createJournal(Path.of("/home/vlofgren/Code/data/dictionary.dat").toFile()), new DictionaryHashMap(1L<<16))) {
wait();
}
}
@@ -65,33 +73,33 @@ class DictionaryWriterTest {
@Test
void getFold() {
var path = Files.createTempFile("dict", ".tmp");
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("hic");
- dict.get("hac");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("hic");
+ dict.getOrInsert("hac");
dict.commitToDisk();
- dict.get("quae");
- dict.get("quis");
- dict.get("quem1");
- dict.get("quem2");
- dict.get("quem3");
- dict.get("quem4");
- dict.get("quem5");
- dict.get("quem6");
- dict.get("quem7");
- dict.get("quem8");
- dict.get("quem9");
- dict.get("quem10");
- dict.get("cuis");
- dict.get("haec_hic");
- dict.get("hoc_hac_cuis");
+ dict.getOrInsert("quae");
+ dict.getOrInsert("quis");
+ dict.getOrInsert("quem1");
+ dict.getOrInsert("quem2");
+ dict.getOrInsert("quem3");
+ dict.getOrInsert("quem4");
+ dict.getOrInsert("quem5");
+ dict.getOrInsert("quem6");
+ dict.getOrInsert("quem7");
+ dict.getOrInsert("quem8");
+ dict.getOrInsert("quem9");
+ dict.getOrInsert("quem10");
+ dict.getOrInsert("cuis");
+ dict.getOrInsert("haec_hic");
+ dict.getOrInsert("hoc_hac_cuis");
dict.commitToDisk();
- assertNotEquals(0, dict.get("hac"));
- assertEquals(0, dict.get("hic"));
+ assertNotEquals(0, dict.getOrInsert("hac"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- assertNotEquals(0, dict.get("hoc"));
- assertEquals(0, dict.get("hic"));
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ assertNotEquals(0, dict.getOrInsert("hoc"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
path.toFile().delete();
@@ -101,24 +109,24 @@ class DictionaryWriterTest {
@Test
void get() {
var path = Files.createTempFile("dict", ".tmp");
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("hic");
- dict.get("hac");
- dict.get("haec");
- dict.get("hoc");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("hic");
+ dict.getOrInsert("hac");
+ dict.getOrInsert("haec");
+ dict.getOrInsert("hoc");
dict.commitToDisk();
- dict.get("quae");
- dict.get("quis");
- dict.get("quem");
- dict.get("cuis");
+ dict.getOrInsert("quae");
+ dict.getOrInsert("quis");
+ dict.getOrInsert("quem");
+ dict.getOrInsert("cuis");
dict.commitToDisk();
- assertNotEquals(0, dict.get("hac"));
- assertEquals(0, dict.get("hic"));
+ assertNotEquals(0, dict.getOrInsert("hac"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- assertNotEquals(0, dict.get("hoc"));
- assertEquals(0, dict.get("hic"));
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ assertNotEquals(0, dict.getOrInsert("hoc"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
path.toFile().delete();
@@ -129,25 +137,25 @@ class DictionaryWriterTest {
void getDoubleWrite() {
var path = Files.createTempFile("dict", ".tmp");
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
dict.commitToDisk();
}
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("hic");
- dict.get("hac");
- dict.get("haec");
- dict.get("hoc");
- dict.get("quae");
- dict.get("quis");
- dict.get("quem");
- dict.get("cuis");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("hic");
+ dict.getOrInsert("hac");
+ dict.getOrInsert("haec");
+ dict.getOrInsert("hoc");
+ dict.getOrInsert("quae");
+ dict.getOrInsert("quis");
+ dict.getOrInsert("quem");
+ dict.getOrInsert("cuis");
dict.commitToDisk();
- assertNotEquals(0, dict.get("hac"));
- assertEquals(0, dict.get("hic"));
+ assertNotEquals(0, dict.getOrInsert("hac"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
- var dict = new DictionaryReader(new DictionaryWriter(path.toFile(), 1L<<16, false));
+ var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16)));
assertNotEquals(0, dict.get("hoc"));
assertEquals(0, dict.get("hic"));
@@ -160,38 +168,38 @@ class DictionaryWriterTest {
void getDoubleWrite2() {
var path = Files.createTempFile("dict", ".tmp");
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("hic");
- dict.get("hac");
- dict.get("haec");
- dict.get("hoc");
- dict.get("quae");
- dict.get("quis");
- dict.get("quem");
- dict.get("cuis");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("hic");
+ dict.getOrInsert("hac");
+ dict.getOrInsert("haec");
+ dict.getOrInsert("hoc");
+ dict.getOrInsert("quae");
+ dict.getOrInsert("quis");
+ dict.getOrInsert("quem");
+ dict.getOrInsert("cuis");
dict.commitToDisk();
- assertNotEquals(0, dict.get("hac"));
- assertEquals(0, dict.get("hic"));
+ assertNotEquals(0, dict.getOrInsert("hac"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("fe");
- dict.get("fi");
- dict.get("fo");
- dict.get("fum");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("fe");
+ dict.getOrInsert("fi");
+ dict.getOrInsert("fo");
+ dict.getOrInsert("fum");
dict.commitToDisk();
- assertNotEquals(0, dict.get("hac"));
- assertEquals(0, dict.get("hic"));
+ assertNotEquals(0, dict.getOrInsert("hac"));
+ assertEquals(0, dict.getOrInsert("hic"));
}
- try (var dict = new DictionaryWriter(path.toFile(), 1L<<16, false)) {
- dict.get("bip");
- dict.get("bap");
+ try (var dict = new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16))) {
+ dict.getOrInsert("bip");
+ dict.getOrInsert("bap");
dict.commitToDisk();
}
- var dict = new DictionaryReader(new DictionaryWriter(path.toFile(), 1L<<16, false));
+ var dict = new KeywordLexiconReadOnlyView(new KeywordLexicon(createJournal(path.toFile()), new DictionaryHashMap(1L<<16)));
assertEquals(0, dict.get("hic"));
assertEquals(1, dict.get("hac"));
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
index 39a62033..c900f0f6 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java
@@ -1,15 +1,15 @@
package nu.marginalia.wmsa.edge.index.service;
import lombok.SneakyThrows;
+import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.dictionary.DictionaryWriter;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -17,12 +17,11 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
-import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
class SearchIndexJournalWriterTest {
- DictionaryWriter dictionaryWriter;
+ KeywordLexicon keywordLexicon;
SearchIndexJournalWriterImpl writer;
Path indexFile;
@@ -37,11 +36,11 @@ class SearchIndexJournalWriterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
- dictionaryWriter = new DictionaryWriter(dictionaryFile.toFile(), 1L<<16, false);
+ keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16));
indexFile = Files.createTempFile("tmp", ".idx");
indexFile.toFile().deleteOnExit();
- writer = new SearchIndexJournalWriterImpl(dictionaryWriter, indexFile.toFile());
+ writer = new SearchIndexJournalWriterImpl(keywordLexicon, indexFile.toFile());
wordsFile1 = Files.createTempFile("words1", ".idx");
urlsFile1 = Files.createTempFile("urls1", ".idx");
@@ -50,7 +49,7 @@ class SearchIndexJournalWriterTest {
@SneakyThrows
@AfterEach
void tearDown() {
- dictionaryWriter.close();
+ keywordLexicon.close();
writer.close();
indexFile.toFile().delete();
dictionaryFile.toFile().delete();
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
deleted file mode 100644
index e780ed62..00000000
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/TokenCompressorTest.java
+++ /dev/null
@@ -1,28 +0,0 @@
-package nu.marginalia.wmsa.edge.index.service;
-
-import nu.marginalia.wmsa.edge.index.dictionary.TokenCompressor;
-import org.junit.jupiter.api.Test;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
-class TokenCompressorTest {
-
- @Test
- public void getWordBytes() {
- final Map<String, Integer> map = new HashMap<>();
- TokenCompressor tc = new TokenCompressor(word -> {
- map.put(word, map.size());
- return map.size()-1;
- });
-
- System.out.println(Arrays.toString(tc.getWordBytes("308")));
- System.out.println(Arrays.toString(tc.getWordBytes(".308")));
- System.out.println(Arrays.toString(tc.getWordBytes("308.")));
- System.out.println(Arrays.toString(tc.getWordBytes("30.8.")));
- System.out.println(Arrays.toString(tc.getWordBytes("30...")));
-
- map.entrySet().forEach(System.out::println);
- }
-}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java
index cd063ea8..a88715a2 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryDataTest.java
@@ -1,21 +1,17 @@
package nu.marginalia.wmsa.edge.index.service.util;
-import nu.marginalia.util.dict.DictionaryData;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
class DictionaryDataTest {
- @Test
- public void testDataBankGrow2() {
- var dataBank = new DictionaryData(65535);
- for (int i = 0; i < 64; i++) {
- String s = "" + i;
- int offset = dataBank.add(s.getBytes(), i);
- System.out.println(s + " " + offset + " " + new String(dataBank.getBytes(i)) + " " + dataBank.getValue(i));
-
- Assertions.assertEquals(s, new String(dataBank.getBytes(i)));
- Assertions.assertEquals(i, dataBank.getValue(i));
- }
- }
+// @Test
+// public void testDataBankGrow2() {
+// var dataBank = new DictionaryData(65535);
+// for (int i = 0; i < 64; i++) {
+// String s = "" + i;
+// int offset = dataBank.add(s.getBytes(), i);
+// System.out.println(s + " " + offset + " " + new String(dataBank.getKey(i)) + " " + dataBank.getValue(i));
+//
+// Assertions.assertEquals(s, new String(dataBank.getKey(i)));
+// Assertions.assertEquals(i, dataBank.getValue(i));
+// }
+// }
}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java
index b9a54237..c39d5c03 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/util/DictionaryHashMapTest.java
@@ -1,67 +1,58 @@
package nu.marginalia.wmsa.edge.index.service.util;
-import nu.marginalia.util.dict.DictionaryHashMap;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-import java.util.HashSet;
-import java.util.Set;
-
-import static org.junit.jupiter.api.Assertions.*;
-
class DictionaryHashMapTest {
-
- @Test
- public void testDictionaryHashMap() {
- var dhm = new DictionaryHashMap(1<<6);
- System.out.println(dhm.put("hello".getBytes(), 23));
- System.out.println(dhm.put("hello".getBytes(), 23));
- System.out.println(dhm.put("world".getBytes(), 54));
- assertEquals(23, dhm.get("hello".getBytes()));
- assertEquals(54, dhm.get("world".getBytes()));
-
- }
-
- @Test
- public void testDictionaryHashMapMissing() {
- var dhm = new DictionaryHashMap(1<<8);
- assertEquals(DictionaryHashMap.NO_VALUE, dhm.get(new byte[] { 1,2,3}));
-
- }
-
- @Test
- public void randomTest() {
- Set<String> strings = new HashSet<>();
- var dhm = new DictionaryHashMap(1<<14);
-
- for (int i = 0; i < 10000; i++) {
- strings.add(Double.toString(Math.random()));
- }
-
- for (String s : strings) {
- dhm.put(s.getBytes(), s.hashCode());
- }
-
- for (String s : strings) {
- assertEquals(s.hashCode(), dhm.get(s.getBytes()));
- }
-
- assertEquals(strings.size(), dhm.size());
- }
-
- @Test
- public void fillHerUp2() {
- var dhm = new DictionaryHashMap(1<<13);
-
- try {
- for (int i = 0; i < 10000; i++) {
- dhm.put(Double.toString(Math.random()).getBytes(), i);
- }
- Assertions.fail("Expected exception");
- }
- catch (IllegalStateException ex) {
- ex.printStackTrace();
- }
- }
+//
+// @Test
+// public void testDictionaryHashMap() {
+// var dhm = new DictionaryHashMap(1<<6);
+// System.out.println(dhm.put("hello".getBytes(), 23));
+// System.out.println(dhm.put("hello".getBytes(), 23));
+// System.out.println(dhm.put("world".getBytes(), 54));
+// assertEquals(23, dhm.get("hello".getBytes()));
+// assertEquals(54, dhm.get("world".getBytes()));
+//
+// }
+//
+// @Test
+// public void testDictionaryHashMapMissing() {
+// var dhm = new DictionaryHashMap(1<<8);
+// assertEquals(DictionaryHashMap.NO_VALUE, dhm.get(new byte[] { 1,2,3}));
+//
+// }
+//
+// @Test
+// public void randomTest() {
+// Set<String> strings = new HashSet<>();
+// var dhm = new DictionaryHashMap(1<<14);
+//
+// for (int i = 0; i < 10000; i++) {
+// strings.add(Double.toString(Math.random()));
+// }
+//
+// for (String s : strings) {
+// dhm.put(s.getBytes(), s.hashCode());
+// }
+//
+// for (String s : strings) {
+// assertEquals(s.hashCode(), dhm.get(s.getBytes()));
+// }
+//
+// assertEquals(strings.size(), dhm.size());
+// }
+//
+// @Test
+// public void fillHerUp2() {
+// var dhm = new DictionaryHashMap(1<<13);
+//
+// try {
+// for (int i = 0; i < 10000; i++) {
+// dhm.put(Double.toString(Math.random()).getBytes(), i);
+// }
+// Assertions.fail("Expected exception");
+// }
+// catch (IllegalStateException ex) {
+// ex.printStackTrace();
+// }
+// }
}
\ No newline at end of file
From 853108028e5ab53697160e64fcdf7a0dfea8a58f Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 4 Jul 2022 14:47:16 +0200
Subject: [PATCH 14/40] WIP: Selective URL param strings
---
.../converting/LinkKeywordExtractorMain.java | 3 +-
.../converting/atags/AnchorTextExtractor.java | 16 ++++----
.../edge/converting/loader/SqlLoadUrls.java | 11 +++--
.../processor/logic/LinkParser.java | 41 ++++++++++++++++---
.../crawling/retreival/CrawlerRetreiver.java | 6 +--
.../edge/crawling/retreival/HttpFetcher.java | 4 +-
.../StackOverflowPostsReader.java | 2 +-
.../wikipedia/WikipediaReader.java | 2 +-
.../wmsa/edge/model/EdgeDomain.java | 2 +-
.../marginalia/wmsa/edge/model/EdgeUrl.java | 14 +++++--
.../main/resources/sql/edge-crawler-cache.sql | 22 ++++++----
.../wmsa/edge/model/EdgeUrlTest.java | 9 ++++
12 files changed, 93 insertions(+), 39 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index 792dac6f..156dbdaa 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -144,7 +144,8 @@ public class LinkKeywordExtractorMain {
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
- url -> crawledUrls.contains(url.toString().hashCode()),
+ url -> url.params != null,
+ //url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
logger.info("Reading files");
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
index c96fd400..c44e7f18 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
@@ -74,9 +74,6 @@ public class AnchorTextExtractor {
if (!isInterestingAnchorText(text)) {
return;
}
- if (href.contains("?")) {
- return;
- }
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
@@ -92,13 +89,16 @@ public class AnchorTextExtractor {
continue;
word = word.toLowerCase();
- if (!WordPatterns.filter(word))
+ if (!WordPatterns.filter(word)) {
continue;
+ }
- if (!linkUrl.domain.equals(documentUrl.domain)) {
- if (isNewKeywordForLink(word, linkUrl.toString())) {
- linkKeywordConsumer.accept(linkUrl, word);
- }
+ if (linkUrl.domain.equals(documentUrl.domain)) {
+ continue;
+ }
+
+ if (isNewKeywordForLink(word, linkUrl.toString())) {
+ linkKeywordConsumer.accept(linkUrl, word);
}
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
index ba9ae43a..04c9735f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
@@ -30,6 +30,7 @@ public class SqlLoadUrls {
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
IN PORT INT,
IN PATH VARCHAR(255),
+ IN PARAM VARCHAR(255),
IN PATH_HASH BIGINT
)
BEGIN
@@ -45,8 +46,8 @@ public class SqlLoadUrls {
public void load(LoaderData data, EdgeUrl[] urls) {
try (var conn = dataSource.getConnection();
- var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?, ?)");
- var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH FROM EC_URL WHERE DOMAIN_ID=?")
+ var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
+ var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
@@ -61,7 +62,8 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
- insertCall.setLong(5, hashPath(url.path));
+ insertCall.setString(5, url.params);
+ insertCall.setLong(6, hashPath(url.path));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@@ -84,8 +86,9 @@ public class SqlLoadUrls {
int urlId = rsp.getInt(1);
String proto = rsp.getString(2);
String path = rsp.getString(3);
+ String param = rsp.getString(4);
- data.addUrl(new EdgeUrl(proto, targetDomain, null, path), urlId);
+ data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
index 0a2bdf45..c14e31cb 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
@@ -13,9 +13,12 @@ import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.Arrays;
import java.util.List;
import java.util.Optional;
+import java.util.function.Predicate;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -107,21 +110,30 @@ public class LinkParser {
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
- s = paramRegex.matcher(s).replaceAll("");
// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
return s;
}
- // url looks like /my-page
- if (s.startsWith("/")) {
- return baseUrl.withPath(s).toString();
+ String[] parts = s.split("\\?", 2);
+ String path = parts[0];
+ String param;
+ if (parts.length > 1) {
+ param = queryParamsSanitizer(parts[1]);
+ }
+ else {
+ param = null;
}
- final String partFromNewLink = spaceRegex.matcher(s).replaceAll("%20");
+ // url looks like /my-page
+ if (path.startsWith("/")) {
+ return baseUrl.withPathAndParam(path, param).toString();
+ }
- return baseUrl.withPath(relativeNavigation(baseUrl) + partFromNewLink).toString();
+ final String partFromNewLink = spaceRegex.matcher(path).replaceAll("%20");
+
+ return baseUrl.withPathAndParam(relativeNavigation(baseUrl) + partFromNewLink, param).toString();
}
// for a relative url that looks like /foo or /foo/bar; return / or /foo
@@ -183,4 +195,21 @@ public class LinkParser {
return documentUrl;
}
+
+ private static final Pattern paramSplitterPattern = Pattern.compile("&");
+ private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate();
+
+ public static String queryParamsSanitizer(String queryParams) {
+ if (queryParams == null) {
+ return null;
+ }
+
+ var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
+ .filter(paramPatternPredicate)
+ .sorted()
+ .collect(Collectors.joining("&"));
+ if (ret.isBlank())
+ return null;
+ return ret;
+ }
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
index 2b27ed4d..c275ad6f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
- var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/");
+ var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
if (known.add(root))
queue.addFirst(root);
}
@@ -110,7 +110,7 @@ public class CrawlerRetreiver {
.build());
}
- var fetchResult = fetcher.probeDomain(new EdgeUrl(fst.proto, fst.domain, fst.port, "/"));
+ var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
if (!fetchResult.ok()) {
logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult));
@@ -232,7 +232,7 @@ public class CrawlerRetreiver {
}
private Optional<EdgeUrl> findCanonicalUrl(EdgeUrl baseUrl, Document parsed) {
- baseUrl = baseUrl.withPath("/");
+ baseUrl = baseUrl.domain.toRootUrl();
for (var link : parsed.select("link[rel=canonical]")) {
return linkParser.parseLink(baseUrl, link);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
index 40728294..53180137 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java
@@ -109,7 +109,7 @@ public class HttpFetcher {
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
- .url(new EdgeUrl(url.proto, url.domain, url.port, "/").toString())
+ .url(url.domain.toRootUrl().toString())
.build();
var call = client.newCall(head);
@@ -293,7 +293,7 @@ public class HttpFetcher {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
- var url = new EdgeUrl(proto, domain, null, "/robots.txt");
+ var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url)));
}
catch (Exception ex) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java
index 0fecf63a..88921be1 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsReader.java
@@ -64,7 +64,7 @@ public class StackOverflowPostsReader extends DefaultHandler {
}
private StackOverflowPost createPost(StackOverflowQuestionData data) {
- EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId());
+ EdgeUrl url = new EdgeUrl("https", domain, null, "/questions/"+data.getId(), null);
StringBuilder body = new StringBuilder();
body.append(data.getQuestion());
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java
index 12bfec3f..fa5904c9 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaReader.java
@@ -37,7 +37,7 @@ public class WikipediaReader {
}
private EdgeUrl synthesizeUrl(String originalUrl) {
- return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl);
+ return new EdgeUrl("https", domain, null, "/wiki/"+originalUrl, null);
}
public void join() throws InterruptedException {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java
index d1945c9e..658184c0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java
@@ -59,7 +59,7 @@ public class EdgeDomain implements WideHashable {
public EdgeUrl toRootUrl() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
- return new EdgeUrl("http", this, null, "/");
+ return new EdgeUrl("http", this, null, "/", null);
}
public String toString() {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
index e82d4b7c..b7681951 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
@@ -4,6 +4,7 @@ import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
+import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import java.net.URI;
import java.net.URISyntaxException;
@@ -15,12 +16,14 @@ public class EdgeUrl implements WideHashable {
public final EdgeDomain domain;
public final Integer port;
public final String path;
+ public final String params;
- public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path) {
+ public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
this.proto = proto;
this.domain = domain;
this.port = port(port, proto);
this.path = path;
+ this.params = params;
}
public EdgeUrl(String url) throws URISyntaxException {
@@ -77,8 +80,10 @@ public class EdgeUrl implements WideHashable {
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto);
+ this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
}
+
private static Integer port(Integer port, String protocol) {
if (null == port || port < 1) {
return null;
@@ -94,8 +99,9 @@ public class EdgeUrl implements WideHashable {
public String toString() {
String portPart = port == null ? "" : (":" + port);
+ String queryPart = params == null ? "" : ("?" + params);
- return proto + "://" + domain + portPart + "" + path;
+ return proto + "://" + domain + portPart + path + queryPart;
}
public String dir() {
@@ -115,7 +121,7 @@ public class EdgeUrl implements WideHashable {
return (int) path.chars().filter(c -> c=='/').count();
}
- public EdgeUrl withPath(String s) {
- return new EdgeUrl(proto, domain, port, s);
+ public EdgeUrl withPathAndParam(String path, String param) {
+ return new EdgeUrl(proto, domain, port, path, param);
}
}
diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
index 36ab040a..120a1ce2 100644
--- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
+++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
@@ -46,20 +46,23 @@ COLLATE utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS EC_URL (
ID INT PRIMARY KEY AUTO_INCREMENT,
DOMAIN_ID INT NOT NULL,
- PROTO ENUM('http','https','gemini') NOT NULL,
- PATH VARCHAR(255) NOT NULL COLLATE utf8mb4_bin,
+
+ PROTO ENUM('http','https','gemini') NOT NULL COLLATE utf8mb4_unicode_ci,
+ PATH VARCHAR(255) NOT NULL,
PORT INT,
+ PARAM VARCHAR(255),
PATH_HASH BIGINT NOT NULL COMMENT "Hash of PATH for uniqueness check by domain",
+
VISITED BOOLEAN NOT NULL DEFAULT FALSE,
- STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok',
+ STATE ENUM('ok', 'redirect', 'dead', 'archived', 'disqualified') NOT NULL DEFAULT 'ok' COLLATE utf8mb4_unicode_ci,
CONSTRAINT CONS UNIQUE (DOMAIN_ID, PATH_HASH),
FOREIGN KEY (DOMAIN_ID) REFERENCES EC_DOMAIN(ID) ON DELETE CASCADE
)
CHARACTER SET utf8mb4
-COLLATE utf8mb4_unicode_ci;
+COLLATE utf8mb4_bin;
CREATE TABLE IF NOT EXISTS EC_PAGE_DATA (
ID INT PRIMARY KEY AUTO_INCREMENT,
@@ -113,10 +116,13 @@ CREATE TABLE IF NOT EXISTS EC_DOMAIN_LINK (
CREATE OR REPLACE VIEW EC_URL_VIEW AS
SELECT
- IF(PORT IS NULL,
- CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, EC_URL.PATH),
- CONCAT(EC_URL.PROTO, "://", EC_DOMAIN.DOMAIN_NAME, ":", EC_URL.PORT, EC_URL.PATH))
- AS URL,
+ CONCAT(EC_URL.PROTO,
+ '://',
+ EC_DOMAIN.DOMAIN_NAME,
+ IF(EC_URL.PORT IS NULL, '', CONCAT(':', EC_URL.PORT)),
+ EC_URL.PATH,
+ IF(EC_URL.PARAM IS NULL, '', CONCAT('?', EC_URL.PARAM))
+ ) AS URL,
EC_URL.PATH_HASH AS PATH_HASH,
EC_URL.PATH AS PATH,
EC_DOMAIN.DOMAIN_NAME AS DOMAIN_NAME,
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java
index dac8dd97..c16f1f08 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/model/EdgeUrlTest.java
@@ -17,4 +17,13 @@ class EdgeUrlTest {
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
}
+
+ @Test
+ void testParms() throws URISyntaxException {
+ System.out.println(new EdgeUrl("https://search.marginalia.nu/?id=123"));
+ System.out.println(new EdgeUrl("https://search.marginalia.nu/?t=123"));
+ System.out.println(new EdgeUrl("https://search.marginalia.nu/?v=123"));
+ System.out.println(new EdgeUrl("https://search.marginalia.nu/?m=123"));
+ System.out.println(new EdgeUrl("https://search.marginalia.nu/?follow=123"));
+ }
}
\ No newline at end of file
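A note on the sanitizer added in this patch: parameters are split on '&', filtered against a whitelist of keys (id, i, p, t, v, m, name, view, post), sorted, and rejoined, so equivalent URLs serialize identically regardless of parameter order. A minimal sketch of the expected behavior, assuming the single-argument patch-14 signature; the inputs are hypothetical:

    // Illustrative only, not part of the patch
    String a = LinkParser.queryParamsSanitizer("t=9&id=123");     // "id=123&t=9": both keys whitelisted, sorted
    String b = LinkParser.queryParamsSanitizer("utm_source=rss"); // null: no permitted parameter remains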
From f3be865293a792c8cc343e5f32f240b9b7e46733 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 8 Jul 2022 16:36:09 +0200
Subject: [PATCH 15/40] Allow query params for *some* path/param combinations,
 aimed at allowing forums to be crawled.
---
.../converting/LinkKeywordExtractorMain.java | 2 +-
.../converting/atags/AnchorTextExtractor.java | 4 +-
.../edge/converting/loader/SqlLoadUrls.java | 17 +++++--
.../processor/logic/LinkParser.java | 22 +-------
.../processor/logic/LinkProcessor.java | 2 +-
.../processor/logic/QueryParams.java | 50 +++++++++++++++++++
.../edge/crawling/blocklist/UrlBlocklist.java | 8 +--
.../crawling/retreival/CrawlerRetreiver.java | 10 ++--
.../edge/index/lexicon/KeywordLexicon.java | 6 +--
.../marginalia/wmsa/edge/model/EdgeUrl.java | 12 ++---
10 files changed, 84 insertions(+), 49 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index 156dbdaa..99c93740 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -144,7 +144,7 @@ public class LinkKeywordExtractorMain {
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
- url -> url.params != null,
+ url -> url.param != null,
//url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
index c44e7f18..8c5fc6c1 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java
@@ -138,8 +138,8 @@ public class AnchorTextExtractor {
private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;
- hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
- hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
+ hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).padToLong();
+ hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).padToLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
index 04c9735f..d09fac4a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
@@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.converting.loader;
+import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@@ -62,8 +63,8 @@ public class SqlLoadUrls {
insertCall.setNull(3, Types.INTEGER);
}
insertCall.setString(4, url.path);
- insertCall.setString(5, url.params);
- insertCall.setLong(6, hashPath(url.path));
+ insertCall.setString(5, url.param);
+ insertCall.setLong(6, hashPath(url.path, url.param));
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
@@ -97,7 +98,15 @@ public class SqlLoadUrls {
}
}
- private long hashPath(String path) {
- return Hashing.murmur3_128().hashString(path, StandardCharsets.UTF_8).asLong();
+ private static final HashFunction murmur3_128 = Hashing.murmur3_128();
+ private long hashPath(String path, String queryParam) {
+ long pathHash = murmur3_128.hashString(path, StandardCharsets.UTF_8).padToLong();
+
+ if (queryParam == null) {
+ return pathHash;
+ }
+ else {
+ return pathHash + murmur3_128.hashString(queryParam, StandardCharsets.UTF_8).padToLong();
+ }
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
index c14e31cb..d58b15bf 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java
@@ -13,12 +13,9 @@ import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
-import java.util.Arrays;
import java.util.List;
import java.util.Optional;
-import java.util.function.Predicate;
import java.util.regex.Pattern;
-import java.util.stream.Collectors;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -105,7 +102,6 @@ public class LinkParser {
return url;
}
- private static final Pattern paramRegex = Pattern.compile("\\?.*$");
private static final Pattern spaceRegex = Pattern.compile(" ");
@SneakyThrows
@@ -120,7 +116,7 @@ public class LinkParser {
String path = parts[0];
String param;
if (parts.length > 1) {
- param = queryParamsSanitizer(parts[1]);
+ param = QueryParams.queryParamsSanitizer(parts[0], parts[1]);
}
else {
param = null;
@@ -196,20 +192,4 @@ public class LinkParser {
return documentUrl;
}
- private static final Pattern paramSplitterPattern = Pattern.compile("&");
- private static final Predicate<String> paramPatternPredicate = Pattern.compile("((id|i|p|t|v|m|name|view|post)=[a-zA-Z\\d]+)|(view=(/[a-zA-Z\\d\\-])+)").asMatchPredicate();
-
- public static String queryParamsSanitizer(String queryParams) {
- if (queryParams == null) {
- return null;
- }
-
- var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
- .filter(paramPatternPredicate)
- .sorted()
- .collect(Collectors.joining("&"));
- if (ret.isBlank())
- return null;
- return ret;
- }
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java
index 24c9229d..54c47e4c 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkProcessor.java
@@ -72,7 +72,7 @@ public class LinkProcessor {
return false;
}
- if (urlBlocklist.isForumLink(link)) {
+ if (urlBlocklist.isMailingListLink(link)) {
return false;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java
new file mode 100644
index 00000000..ad52e347
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/QueryParams.java
@@ -0,0 +1,50 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import javax.annotation.Nullable;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+public class QueryParams {
+
+ private static final Pattern paramSplitterPattern = Pattern.compile("&");
+
+ @Nullable
+ public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
+ if (queryParams == null) {
+ return null;
+ }
+
+ var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
+ .filter(param -> QueryParams.isPermittedParam(path, param))
+ .sorted()
+ .collect(Collectors.joining("&"));
+
+ if (ret.isBlank())
+ return null;
+
+ return ret;
+ }
+
+ public static boolean isPermittedParam(String path, String param) {
+ if (path.endsWith("index.php")) {
+ if (param.startsWith("showtopic"))
+ return true;
+ if (param.startsWith("showforum"))
+ return true;
+ }
+ if (path.endsWith("viewtopic.php")) {
+ return (param.startsWith("t=") || param.startsWith("p="));
+ }
+ if (path.endsWith("viewforum.php")) {
+ return param.startsWith("v=");
+ }
+ if (path.endsWith("showthread.php")) {
+ return (param.startsWith("t=") || param.startsWith("p="));
+ }
+ if (path.endsWith("showforum.php")) {
+ return param.startsWith("v=");
+ }
+ return false;
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
index f81ca0db..b70e4ab0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
@@ -33,20 +33,14 @@ public class UrlBlocklist {
}
}
- public boolean isForumLink(EdgeUrl linkUrl) {
+ public boolean isMailingListLink(EdgeUrl linkUrl) {
var path = linkUrl.path;
- if (path.startsWith("/forum")) {
- return true;
- }
if (path.startsWith("/lists/")) {
return true;
}
if (path.startsWith("mailinglist")) {
return true;
}
- if (path.contains("phpbb")) {
- return true;
- }
return false;
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
index c275ad6f..b9fb79c5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -63,7 +63,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
- var root = new EdgeUrl(fst.proto, fst.domain, fst.port, "/", null);
+ var root = fst.domain.toRootUrl();
if (known.add(root))
queue.addFirst(root);
}
@@ -121,6 +121,8 @@ public class CrawlerRetreiver {
private CrawledDomain crawlDomain() {
String ip = findIp(domain);
+ assert !queue.isEmpty();
+
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
long crawlDelay = robotsRules.getCrawlDelay();
@@ -209,7 +211,7 @@ public class CrawlerRetreiver {
linkParser.parseLink(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
- .filter(u -> !urlBlocklist.isForumLink(u))
+ .filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@@ -217,7 +219,7 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
- .filter(u -> !urlBlocklist.isForumLink(u))
+ .filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
@@ -225,7 +227,7 @@ public class CrawlerRetreiver {
linkParser.parseFrame(baseUrl, link)
.filter(this::isSameDomain)
.filter(u -> !urlBlocklist.isUrlBlocked(u))
- .filter(u -> !urlBlocklist.isForumLink(u))
+ .filter(u -> !urlBlocklist.isMailingListLink(u))
.filter(known::add)
.ifPresent(queue::addLast);
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
index 6485f381..8d15f8f3 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
@@ -46,13 +46,13 @@ public class KeywordLexicon implements AutoCloseable {
}
private void loadJournalEntry(byte[] bytes) {
- final long key = hashFunction.hashBytes(bytes).asLong();
+ final long key = hashFunction.hashBytes(bytes).padToLong();
reverseIndex.put(key);
}
@SneakyThrows
public int getOrInsert(String macroWord) {
- final long key = hashFunction.hashBytes(macroWord.getBytes()).asLong();
+ final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong();
int idx = getReadOnly(key);
if (idx >= 0)
@@ -78,7 +78,7 @@ public class KeywordLexicon implements AutoCloseable {
}
public int getReadOnly(String word) {
- return getReadOnly(hashFunction.hashBytes(word.getBytes()).asLong());
+ return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong());
}
public int getReadOnly(long hashedKey) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
index b7681951..123bd95a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java
@@ -4,7 +4,7 @@ import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
+import nu.marginalia.wmsa.edge.converting.processor.logic.QueryParams;
import java.net.URI;
import java.net.URISyntaxException;
@@ -16,14 +16,14 @@ public class EdgeUrl implements WideHashable {
public final EdgeDomain domain;
public final Integer port;
public final String path;
- public final String params;
+ public final String param;
- public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String params) {
+ public EdgeUrl(String proto, EdgeDomain domain, Integer port, String path, String param) {
this.proto = proto;
this.domain = domain;
this.port = port(port, proto);
this.path = path;
- this.params = params;
+ this.param = param;
}
public EdgeUrl(String url) throws URISyntaxException {
@@ -80,7 +80,7 @@ public class EdgeUrl implements WideHashable {
this.path = URI.getPath().isEmpty() ? "/" : URI.getPath();
this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto);
- this.params = LinkParser.queryParamsSanitizer(URI.getQuery());
+ this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
}
@@ -99,7 +99,7 @@ public class EdgeUrl implements WideHashable {
public String toString() {
String portPart = port == null ? "" : (":" + port);
- String queryPart = params == null ? "" : ("?" + params);
+ String queryPart = param == null ? "" : ("?" + param);
return proto + "://" + domain + portPart + path + queryPart;
}
From 2b83e0d75408b5fb8833628794dba010b12a3eb2 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 8 Jul 2022 16:50:00 +0200
Subject: [PATCH 16/40] Block websites with "acceptable ads", as this seems a
 strong indicator that the domain is either parked or spam.
---
.../model/DisqualifiedException.java | 7 +++++-
.../converting/processor/AcceptableAds.java | 22 +++++++++++++++++++
.../processor/DocumentProcessor.java | 21 +++++++++++-------
3 files changed, 41 insertions(+), 9 deletions(-)
create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java
index 1c785371..c252f315 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/model/DisqualifiedException.java
@@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception {
}
public enum DisqualificationReason {
- LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY
+ LENGTH,
+ CONTENT_TYPE,
+ LANGUAGE,
+ STATUS,
+ QUALITY,
+ ACCEPTABLE_ADS
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java
new file mode 100644
index 00000000..2814eea7
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/AcceptableAds.java
@@ -0,0 +1,22 @@
+package nu.marginalia.wmsa.edge.converting.processor;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import org.jsoup.nodes.Document;
+
+
+public class AcceptableAds {
+ /* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers.
+ *
+ * In practice, from looking at crawled data, the only sites in the crawled corpus that seem to
+ * follow this standard are domain squatters and other nuisance sites.
+ *
+ */
+
+ public static boolean hasAcceptableAdsTag(Document parsedDocument) {
+ return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey");
+ }
+
+ public static boolean hasAcceptableAdsHeader(CrawledDocument document) {
+ return document.headers.contains("X-Adblock-Key");
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index b205cdea..d6cf2e46 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.hash.HashCode;
import com.google.inject.Inject;
import com.google.inject.name.Named;
+import nu.marginalia.util.language.LanguageFilter;
+import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
-import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor;
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
-import nu.marginalia.util.language.LanguageFilter;
-import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
-import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@@ -81,6 +77,10 @@ public class DocumentProcessor {
if (ret.state == EdgeUrlState.OK) {
+ if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
+ throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+ }
+
if (isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);
@@ -128,6 +128,11 @@ public class DocumentProcessor {
throws DisqualifiedException, URISyntaxException {
var doc = Jsoup.parse(crawledDocument.documentBody);
+
+ if (AcceptableAds.hasAcceptableAdsTag(doc)) {
+ throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+ }
+
var dld = sentenceExtractor.extractSentences(doc.clone());
checkDocumentLanguage(dld);
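Both new checks key off the markers that Acceptable Ads participation leaves behind: a data-adblockkey attribute on the html element, and an X-Adblock-Key response header. A minimal jsoup sketch of the tag-based check; the HTML snippet and key are made up:

    import org.jsoup.Jsoup;

    var doc = Jsoup.parse("<html data-adblockkey=\"MFww...\"><body>parked</body></html>");
    boolean flagged = AcceptableAds.hasAcceptableAdsTag(doc); // true, so the document is disqualified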
From 7dea94d36d5f90883a8982e21450eb3afb161d7d Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 8 Jul 2022 17:25:16 +0200
Subject: [PATCH 17/40] Cleaned up HTML features code a bit.
---
.../processor/DocumentProcessor.java | 18 ++++--------------
.../processor/logic/HtmlFeature.java | 18 ++++++++++++------
2 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index d6cf2e46..d04415fd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -163,7 +163,6 @@ public class DocumentProcessor {
var edgeDomain = url.domain;
tagWords.add("format:"+ret.standard.toString().toLowerCase());
-
tagWords.add("site:" + edgeDomain.toString().toLowerCase());
if (!Objects.equals(edgeDomain.toString(), edgeDomain.domain)) {
tagWords.add("site:" + edgeDomain.domain.toLowerCase());
@@ -172,18 +171,7 @@ public class DocumentProcessor {
tagWords.add("proto:"+url.proto.toLowerCase());
tagWords.add("js:" + Boolean.toString(ret.features.contains(HtmlFeature.JS)).toLowerCase());
- if (ret.features.contains(HtmlFeature.MEDIA)) {
- tagWords.add("special:media");
- }
- if (ret.features.contains(HtmlFeature.TRACKING)) {
- tagWords.add("special:tracking");
- }
- if (ret.features.contains(HtmlFeature.AFFILIATE_LINK)) {
- tagWords.add("special:affiliate");
- }
- if (ret.features.contains(HtmlFeature.COOKIES)) {
- tagWords.add("special:cookies");
- }
+ ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
@@ -201,7 +189,9 @@ public class DocumentProcessor {
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
}
-
+ for (var frame : doc.getElementsByTag("iframe")) {
+ linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
+ }
for (var link : doc.select("link[rel=alternate]")) {
feedExtractor
.getFeedFromAlternateTag(baseUrl, link)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
index c8a839ac..ff835dc7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
@@ -3,17 +3,23 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection;
public enum HtmlFeature {
- MEDIA(0),
- JS(1),
- AFFILIATE_LINK(2),
- TRACKING(3),
- COOKIES(4)
+ MEDIA(0, "special:media"),
+ JS(1, "special:scripts"),
+ AFFILIATE_LINK(2, "special:affiliate"),
+ TRACKING(3, "special:tracking"),
+ COOKIES(4, "special:cookies")
;
public final int bit;
+ private final String keyword;
- HtmlFeature(int bit) {
+ HtmlFeature(int bit, String keyword) {
this.bit = bit;
+ this.keyword = keyword;
+ }
+
+ public String getKeyword() {
+ return keyword;
}
public static int encode(Collection<HtmlFeature> featuresAll) {
From b0c40136caa323ad4b93f760c01edbb73966eab9 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 8 Jul 2022 19:52:12 +0200
Subject: [PATCH 18/40] Cleaned up HTML features code a bit.
---
.../processor/logic/HtmlFeature.java | 26 +++++++++----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
index ff835dc7..032315dd 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/HtmlFeature.java
@@ -3,18 +3,16 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import java.util.Collection;
public enum HtmlFeature {
- MEDIA(0, "special:media"),
- JS(1, "special:scripts"),
- AFFILIATE_LINK(2, "special:affiliate"),
- TRACKING(3, "special:tracking"),
- COOKIES(4, "special:cookies")
+ MEDIA( "special:media"),
+ JS("special:scripts"),
+ AFFILIATE_LINK( "special:affiliate"),
+ TRACKING("special:tracking"),
+ COOKIES("special:cookies")
;
- public final int bit;
private final String keyword;
- HtmlFeature(int bit, String keyword) {
- this.bit = bit;
+ HtmlFeature(String keyword) {
this.keyword = keyword;
}
@@ -23,12 +21,14 @@ public enum HtmlFeature {
}
public static int encode(Collection<HtmlFeature> featuresAll) {
- return featuresAll.stream().mapToInt(f -> 1 << f.bit).reduce(0, (l, r) -> (l|r));
+ int ret = 0;
+ for (var feature : featuresAll) {
+ ret |= (1 << (feature.ordinal()));
+ }
+ return ret;
}
+
public static boolean hasFeature(int value, HtmlFeature feature) {
- return (value & (1<< feature.bit)) != 0;
- }
- public static int addFeature(int value, HtmlFeature feature) {
- return (value | (1<< feature.bit));
+ return (value & (1<< feature.ordinal())) != 0;
}
}
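Dropping the explicit bit field means the bitmask is now derived from enum declaration order, so the order of the constants above becomes part of the stored format and must not be rearranged once data has been encoded. A small sketch of how features pack into an int, following the ordinals above:

    import java.util.EnumSet;

    int mask = HtmlFeature.encode(EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING));
    // JS has ordinal 1 and TRACKING ordinal 3, so mask == (1<<1)|(1<<3) == 10
    boolean js = HtmlFeature.hasFeature(mask, HtmlFeature.JS);           // true
    boolean cookies = HtmlFeature.hasFeature(mask, HtmlFeature.COOKIES); // false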
From fed2fa9397b33f0d81934a87bb52738c207377dd Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Mon, 11 Jul 2022 23:25:03 +0200
Subject: [PATCH 19/40] Fix tiny NPE in converting
---
.../wmsa/edge/converting/ConverterMain.java | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
index 61ff0b00..973554d2 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java
@@ -1,18 +1,19 @@
package nu.marginalia.wmsa.edge.converting;
-import com.google.gson.*;
+import com.google.common.base.Strings;
+import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
+import nu.marginalia.util.ParallelPipe;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
-import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
+import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
-import nu.marginalia.util.ParallelPipe;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -96,6 +97,10 @@ public class ConverterMain {
domainToId.forEach((domain, id) -> {
String fileName = idToFileName.get(id);
+
+ if (Strings.isNullOrEmpty(fileName))
+ return;
+
Path dest = plan.getCrawledFilePath(fileName);
logger.info("{} - {} - {}", domain, id, dest);
From 20970a6161d92b2059e697d29b89f573e26e07d6 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 14 Jul 2022 12:37:06 +0200
Subject: [PATCH 20/40] Make processor more lenient toward quality; accept
 content-types that specify a charset
---
.../wmsa/edge/converting/ConverterModule.java | 2 +-
.../processor/DocumentProcessor.java | 14 +++++++-
.../converting/processor/DomainProcessor.java | 32 +++++++++++++++++--
.../processor/InstructionsCompiler.java | 15 +++++----
.../processor/logic/DocumentValuator.java | 14 +++-----
5 files changed, 57 insertions(+), 20 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java
index 4bf6eaea..1177c1a7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java
@@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.net.URISyntaxException;
-import java.nio.file.Path;
public class ConverterModule extends AbstractModule {
@@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule {
bind(Gson.class).toInstance(createGson());
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
+ bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index d04415fd..618e5efb 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -113,7 +113,19 @@ public class DocumentProcessor {
}
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
- return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase());
+ if (crawledDocument.contentType == null) {
+ return false;
+ }
+
+ var ct = crawledDocument.contentType;
+
+ if (acceptedContentTypes.contains(ct))
+ return true;
+
+ if (ct.contains(";")) {
+ return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';')));
+ }
+ return false;
}
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
index 4343b0c3..b8b53f9d 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
@@ -1,21 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor;
import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.List;
public class DomainProcessor {
private final DocumentProcessor documentProcessor;
+ private final Double minAvgDocumentQuality;
@Inject
- public DomainProcessor(DocumentProcessor documentProcessor) {
+ public DomainProcessor(DocumentProcessor documentProcessor,
+ @Named("min-avg-document-quality") Double minAvgDocumentQuality
+ ) {
this.documentProcessor = documentProcessor;
+ this.minAvgDocumentQuality = minAvgDocumentQuality;
}
public ProcessedDomain process(CrawledDomain crawledDomain) {
@@ -37,17 +45,37 @@ public class DomainProcessor {
ret.documents.add(processedDoc);
}
}
-
}
else {
ret.documents = Collections.emptyList();
}
+ double averageQuality = getAverageQuality(ret.documents);
+ if (averageQuality < minAvgDocumentQuality) {
+ ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
+ }
+
ret.state = getState(crawledDomain.crawlerStatus);
return ret;
}
+ private double getAverageQuality(List<ProcessedDocument> documents) {
+ int n = 0;
+ double q = 0.;
+ for (var doc : documents) {
+ if (doc.quality().isPresent()) {
+ n++;
+ q += doc.quality().getAsDouble();
+ }
+ }
+
+ if (n > 0) {
+ return q / n;
+ }
+ return -5.;
+ }
+
private EdgeDomainIndexingState getState(String crawlerStatus) {
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
case OK -> EdgeDomainIndexingState.ACTIVE;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
index b75de436..07f1705a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java
@@ -42,15 +42,16 @@ public class InstructionsCompiler {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
- documents.stream().map(doc -> doc.url).forEach(seenUrls::add);
-
for (var doc : documents) {
- if (doc.details == null) continue;
- for (var url : doc.details.linksExternal) {
- seenDomains.add(url.domain);
+ seenUrls.add(doc.url);
+
+ if (doc.details != null) {
+ for (var url : doc.details.linksExternal) {
+ seenDomains.add(url.domain);
+ }
+ seenUrls.addAll(doc.details.linksExternal);
+ seenUrls.addAll(doc.details.linksInternal);
}
- seenUrls.addAll(doc.details.linksExternal);
- seenUrls.addAll(doc.details.linksInternal);
}
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java
index 6f015ef6..b0423efa 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java
@@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import crawlercommons.utils.Strings;
-import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
@@ -35,7 +35,7 @@ public class DocumentValuator {
throw new DisqualifiedException(LENGTH);
}
- return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
+ return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty
- smutCoefficient;
@@ -52,17 +52,13 @@ public class DocumentValuator {
double scriptPenalty = 0;
for (var tag : scriptTags) {
- String srcTag = tag.attr("src");
- if (Strings.isBlank(srcTag)) {
- scriptPenalty += 1;
- }
- else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
+ String srcAttr = tag.attr("src");
+ if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
scriptPenalty += 0.49;
}
- else {
+ else if (!Strings.isBlank(srcAttr)) {
scriptPenalty += 1;
}
-
}
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
}
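The relaxed content-type check compares only the media type before the first ';', so headers that append a charset now pass. A sketch of the comparison, assuming the accepted set contains "text/html":

    // Illustrative only, not part of the patch
    String ct = "text/html; charset=UTF-8";
    String base = ct.contains(";") ? ct.substring(0, ct.indexOf(';')) : ct;
    // base == "text/html", which is accepted; "application/pdf" is still rejected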
From 661577b456529af56058f3775544a104e23bd38a Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 14 Jul 2022 14:45:31 +0200
Subject: [PATCH 21/40] Add Fossil SCM commits to URL blocklist
---
.../marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java | 4 ++--
.../nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java | 4 +++-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
index b70e4ab0..a7dce9ed 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
@@ -11,8 +11,8 @@ public class UrlBlocklist {
private final List<Predicate<String>> patterns = new ArrayList<>();
public UrlBlocklist() {
- patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate());
- patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
+ patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git
+ patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java
index c357f83c..c93e1ffb 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java
@@ -6,7 +6,8 @@ import org.junit.jupiter.api.Test;
import java.net.URISyntaxException;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
class UrlBlocklistTest {
@@ -19,5 +20,6 @@ class UrlBlocklistTest {
assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/_module/slide/pqPan/library/american-sour-beer-innovative-techniques-for-mixed-fermentations/")));
assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://w-m-p.de/images/book/download-firstborn-starcraft-dark-templar-book-1.php")));
+ assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://sqlite.org/src/info/6376abec766e9a0785178b1823b5a587e9f1ccbc")));
}
}
\ No newline at end of file
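Both digest patterns in this patch anchor on full path segments of lowercase hex: 40 characters for git's SHA-1 object names and 64 for Fossil's newer SHA3-256 artifact IDs (older Fossil repositories used SHA-1 as well, which is why the sqlite.org URL in the test matches the 40-character pattern). A quick sketch:

    import java.util.function.Predicate;
    import java.util.regex.Pattern;

    Predicate<String> sha1 = Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate();
    boolean hit = sha1.test("/src/info/6376abec766e9a0785178b1823b5a587e9f1ccbc"); // true: 40 hex chars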
From c71cc3d43a3313bdaf526b6ca536b39d46bb8d72 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sat, 16 Jul 2022 18:58:19 +0200
Subject: [PATCH 22/40] Fix overflow bugs in DictionaryHashMap that only
 surfaced when running with larger amounts of RAM
---
.../java/nu/marginalia/util/dict/DictionaryData.java | 9 +++++++--
.../java/nu/marginalia/util/dict/DictionaryHashMap.java | 3 +--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
index c36c10d2..9aa953dc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java
@@ -59,12 +59,14 @@ public class DictionaryData {
private final LongBuffer keys;
private int size;
+ private final int capacity;
public DictionaryDataBank(int start_idx, int sz) {
this.start_idx = start_idx;
+ this.capacity = sz;
- keys = ByteBuffer.allocateDirect(8*sz).asLongBuffer();
+ keys = ByteBuffer.allocateDirect(8*capacity).asLongBuffer();
size = 0;
}
@@ -88,10 +90,13 @@ public class DictionaryData {
throw new IndexOutOfBoundsException(idx);
}
- return keys.get(idx - start_idx) == other;
+ return keys.get(idx - start_idx) == other;
}
public int add(long newKey) {
+ if (size >= capacity)
+ return -1;
+
keys.put(size, newKey);
return start_idx + size++;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
index 5544545a..1c76b116 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java
@@ -66,8 +66,7 @@ public class DictionaryHashMap {
logger.debug("Buffer size sanity checked passed");
}
-
- dictionaryData = new DictionaryData(Math.min(1<<30, Math.max(32, (int)(sizeMemory/4))));
+ dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4)));
initializeBuffers();
}
From 80b3ac3dd8f30cc2b54c0fb1562906066f923ee2 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sat, 16 Jul 2022 21:19:13 +0200
Subject: [PATCH 23/40] Tweaking the URL block list to exclude git noise better
---
.../edge/crawling/blocklist/UrlBlocklist.java | 30 ++++++++++++++-----
1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
index a7dce9ed..b8064952 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java
@@ -4,15 +4,26 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.ArrayList;
import java.util.List;
+import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class UrlBlocklist {
private final List<Predicate<String>> patterns = new ArrayList<>();
+ // domains that have a lot of links but we know we don't want to crawl
+ private final Set<String> badDomains = Set.of("t.co", "facebook.com",
+ "instagram.com", "youtube.com",
+ "youtu.be", "amzn.to");
+
public UrlBlocklist() {
- patterns.add(Pattern.compile(".*/[a-f0-9]{40}(/|$)").asPredicate()); // git
- patterns.add(Pattern.compile(".*/[a-f0-9]{64}(/|$)").asPredicate()); // fossil SCM
+ // Don't deep-crawl git repos
+ patterns.add(Pattern.compile("\\.git/.+").asPredicate());
+
+ // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
+ patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
+
+ // link farms &c
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
@@ -22,15 +33,23 @@ public class UrlBlocklist {
public boolean isUrlBlocked(EdgeUrl url) {
try {
+ if (badDomains.contains(url.domain.domain)) {
+ return true;
+ }
+
if ("github.com".equals(url.domain.domain)) {
return url.path.chars().filter(c -> c == '/').count() > 2;
}
- return patterns.stream().anyMatch(p -> p.test(url.path));
+ for (var p : patterns) {
+ if (p.test(url.path))
+ return true;
+ }
}
catch (StackOverflowError ex) {
return true;
}
+ return false;
}
public boolean isMailingListLink(EdgeUrl linkUrl) {
@@ -38,12 +57,9 @@ public class UrlBlocklist {
if (path.startsWith("/lists/")) {
return true;
}
- if (path.startsWith("mailinglist")) {
+ if (path.contains("mailinglist")) {
return true;
}
return false;
}
-
-
-
}
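To see what the reworked rules catch, here is a small standalone check against paths from the test suite; it assumes, per the p.test(url.path) call above, that the patterns only ever see the path component of the URL:

    import java.util.function.Predicate;
    import java.util.regex.Pattern;

    public class BlocklistCheck {
        public static void main(String[] args) {
            Predicate<String> hexRun = Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate();
            Predicate<String> gitDir = Pattern.compile("\\.git/.+").asPredicate();

            // a 40-char hex object id, as in the sqlite.org/src test case
            System.out.println(hexRun.test("/src/info/6376abec766e9a0785178b1823b5a587e9f1ccbc")); // true
            // anything beneath a .git/ directory
            System.out.println(gitDir.test("/marginalia.git/tree/master"));                        // true
            // an ordinary page passes
            System.out.println(hexRun.test("/nope/x-a-course-in-algebra.html"));                   // false
        }
    }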
From 89cca4dbff4f007969ac65205e011b37c035be1d Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sat, 16 Jul 2022 21:27:04 +0200
Subject: [PATCH 24/40] Better logging for rare parsing exception
---
.../marginalia/wmsa/edge/converting/ConvertedDomainReader.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java
index 9e61c682..eca74633 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConvertedDomainReader.java
@@ -45,7 +45,7 @@ public class ConvertedDomainReader {
try {
ret.add(gson.fromJson(parts[1], type));
}
- catch (JsonParseException ex) {
+ catch (NullPointerException|JsonParseException ex) {
logger.warn("Failed to deserialize {} {}", type.getSimpleName(), StringUtils.abbreviate(parts[1], 255));
logger.warn("Json error", ex);
}
From c5dbe269f716f780e5cffb87166d72394c34e52c Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 17 Jul 2022 15:17:39 +0200
Subject: [PATCH 25/40] Better logging for URL errors
---
.../marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
index d09fac4a..a3fd2797 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java
@@ -53,6 +53,10 @@ public class SqlLoadUrls {
{
conn.setAutoCommit(false);
for (var url : urls) {
+ if (url.path.length() >= 255) {
+ logger.warn("Skipping bad URL {}", url);
+ continue;
+ }
insertCall.setString(1, url.proto);
insertCall.setString(2, url.domain.toString());
@@ -68,7 +72,7 @@ public class SqlLoadUrls {
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
- for (int rv = 0; rv < urls.length; rv++) {
+ for (int rv = 0; rv < ret.length; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
}
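The loop-bound change matters because the length guard added above shrinks the batch: executeBatch() returns one status per row actually batched, so indexing the result by urls.length overruns the array whenever a URL was skipped. A sketch of the mismatch, with hypothetical sizes:

    // 3 URLs, 1 skipped for length: only 2 rows reach the batch
    String[] urls = {"/a", "/a-path-longer-than-the-limit", "/b"};
    int[] ret = new int[2];                    // what executeBatch() hands back

    for (int rv = 0; rv < urls.length; rv++) { // old bound: reads ret[2] -> ArrayIndexOutOfBounds
        // ...
    }
    for (int rv = 0; rv < ret.length; rv++) {  // fixed bound
        // ...
    }

One caveat survives the fix: once a row has been skipped, urls[rv] and ret[rv] no longer refer to the same URL, so the warning can name the wrong one.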
From f4966cf1f9e6b3a29007d28916c4b33e55492bd2 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 17 Jul 2022 15:18:16 +0200
Subject: [PATCH 26/40] Fix bug in keyword loading when keywords have non-ASCII
symbols
---
.../lexicon/journal/KeywordLexiconJournalFile.java | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
index a97eee6c..241ddefb 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
@@ -70,7 +70,10 @@ public class KeywordLexiconJournalFile {
buffer.flip();
}
- int len = buffer.get();
+ int len = buffer.get() & 0xFF;
+ if (len > Byte.MAX_VALUE) {
+ logger.warn("Found keyword with impossible length {} near {}, likely corruption", len, cp);
+ }
while (buffer.limit() - buffer.position() < len) {
buffer.compact();
int rb = channel.read(buffer);
@@ -126,8 +129,9 @@ public class KeywordLexiconJournalFile {
for (String item : data) {
writeBuffer.clear();
- writeBuffer.put((byte) item.length());
- writeBuffer.put(item.getBytes());
+ byte[] itemBytes = item.getBytes();
+ writeBuffer.put((byte)itemBytes.length);
+ writeBuffer.put(itemBytes);
writeBuffer.flip();
while (writeBuffer.position() < writeBuffer.limit())
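Two distinct problems are patched here. On read, the length prefix is a signed byte, so stored lengths over 127 came back negative without the & 0xFF mask; on write, the prefix was taken from String.length() while the payload came from getBytes(), and the two disagree for non-ASCII input. A quick illustration:

    import java.nio.charset.StandardCharsets;

    public class LengthPrefixDemo {
        public static void main(String[] args) {
            // signed-byte read: a stored length of 200 is negative unmasked
            byte stored = (byte) 200;
            System.out.println(stored);           // -56
            System.out.println(stored & 0xFF);    // 200

            // char count vs byte count diverge outside ASCII
            String word = "smörgåsbord";
            byte[] utf8 = word.getBytes(StandardCharsets.UTF_8);
            System.out.println(word.length());    // 11 chars
            System.out.println(utf8.length);      // 13 bytes -- the correct prefix
        }
    }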
From e30a20bb7447f108861d4af766c16b66bd7e1211 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 17 Jul 2022 19:31:49 +0200
Subject: [PATCH 27/40] Fix bug in keyword loading when keywords have non-ASCII
symbols, cleaner solution
---
.../wmsa/edge/index/EdgeIndexService.java | 2 +-
.../edge/index/lexicon/KeywordLexicon.java | 22 ++++++++++++-------
.../journal/KeywordLexiconJournal.java | 4 ++--
.../KeywordLexiconJournalCommitQueue.java | 6 ++---
.../journal/KeywordLexiconJournalFile.java | 7 +++---
5 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
index 96f1fb72..b4915df7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java
@@ -199,7 +199,7 @@ public class EdgeIndexService extends Service {
private long[] getOrInsertWordIds(List<String> words) {
return words.stream()
- .filter(w -> w.length() < Byte.MAX_VALUE)
+ .filter(w -> w.getBytes().length < Byte.MAX_VALUE)
.mapToLong(keywordLexicon::getOrInsert)
.toArray();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
index 8d15f8f3..667ea6b1 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java
@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
@@ -40,19 +41,23 @@ public class KeywordLexicon implements AutoCloseable {
logger.error("MULTIPLE WRITER INSTANCES!");
}
- journal.loadFile(this::loadJournalEntry);
+ journal.loadFile(bytes -> reverseIndex.put(hashFunction.hashBytes(bytes).padToLong()));
logger.info("Done creating dictionary writer");
}
- private void loadJournalEntry(byte[] bytes) {
- final long key = hashFunction.hashBytes(bytes).padToLong();
- reverseIndex.put(key);
+ public int getOrInsert(String macroWord) {
+ return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8));
}
@SneakyThrows
- public int getOrInsert(String macroWord) {
- final long key = hashFunction.hashBytes(macroWord.getBytes()).padToLong();
+ private int getOrInsert(byte[] bytes) {
+ if (bytes.length >= Byte.MAX_VALUE) {
+ logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length);
+ return DictionaryHashMap.NO_VALUE;
+ }
+
+ final long key = hashFunction.hashBytes(bytes).padToLong();
int idx = getReadOnly(key);
if (idx >= 0)
@@ -66,7 +71,7 @@ public class KeywordLexicon implements AutoCloseable {
if ((idx = reverseIndex.get(key)) >= 0)
return idx;
- journal.enqueue(macroWord);
+ journal.enqueue(bytes);
idx = reverseIndex.put(key);
request_time_metrics.set(reverseIndex.size());
@@ -78,7 +83,8 @@ public class KeywordLexicon implements AutoCloseable {
}
public int getReadOnly(String word) {
- return getReadOnly(hashFunction.hashBytes(word.getBytes()).padToLong());
+ final byte[] bytes = word.getBytes(StandardCharsets.UTF_8);
+ return getReadOnly(hashFunction.hashBytes(bytes).padToLong());
}
public int getReadOnly(long hashedKey) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
index 02d50862..c226c1e6 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournal.java
@@ -30,7 +30,7 @@ public class KeywordLexiconJournal {
Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk));
}
- public void enqueue(String word) throws InterruptedException {
+ public void enqueue(byte[] word) throws InterruptedException {
commitQueue.enqueue(word);
}
@@ -49,7 +49,7 @@ public class KeywordLexiconJournal {
}
public void commitToDisk() {
- List<String> entries = commitQueue.getQueuedEntries();
+ List<byte[]> entries = commitQueue.getQueuedEntries();
journalFile.writeEntriesToJournal(entries);
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
index 6baef0e1..67d4043a 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalCommitQueue.java
@@ -8,11 +8,11 @@ import java.util.Collections;
import java.util.List;
public class KeywordLexiconJournalCommitQueue {
- private final ArrayList<String> commitQueue = new ArrayList<>(10_000);
+ private final ArrayList<byte[]> commitQueue = new ArrayList<>(10_000);
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final long BACK_PRESSURE_LIMIT = 25_000;
- public synchronized void enqueue(String word) throws InterruptedException {
+ public synchronized void enqueue(byte[] word) throws InterruptedException {
for (int queueSize = commitQueue.size();
queueSize >= BACK_PRESSURE_LIMIT;
queueSize = commitQueue.size())
@@ -24,7 +24,7 @@ public class KeywordLexiconJournalCommitQueue {
}
- public synchronized List<String> getQueuedEntries() {
+ public synchronized List<byte[]> getQueuedEntries() {
if (commitQueue.isEmpty())
return Collections.emptyList();
var data = new ArrayList<>(commitQueue);
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
index 241ddefb..b68ee1fe 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/journal/KeywordLexiconJournalFile.java
@@ -110,7 +110,7 @@ public class KeywordLexiconJournalFile {
private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096);
- public void writeEntriesToJournal(List<String> data) {
+ public void writeEntriesToJournal(List<byte[]> data) {
if (data.isEmpty())
return;
@@ -127,10 +127,9 @@ public class KeywordLexiconJournalFile {
long start = System.currentTimeMillis();
int ct = data.size();
- for (String item : data) {
+ for (byte[] itemBytes : data) {
writeBuffer.clear();
- byte[] itemBytes = item.getBytes();
- writeBuffer.put((byte)itemBytes.length);
+ writeBuffer.put((byte) itemBytes.length);
writeBuffer.put(itemBytes);
writeBuffer.flip();
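The cleaner solution pushes the String-to-bytes conversion to a single point, so the journal, the commit queue, and the hash all see the same UTF-8 byte[]. It also retires the bare getBytes() calls, which use the platform default charset and can silently diverge from UTF-8. A sketch of that hazard; murmur3 stands in for the project's hash function, which the diff leaves unnamed:

    import com.google.common.hash.HashFunction;
    import com.google.common.hash.Hashing;
    import java.nio.charset.StandardCharsets;

    public class CharsetHashDemo {
        public static void main(String[] args) {
            HashFunction hf = Hashing.murmur3_128();
            String word = "naïve";

            long utf8Key    = hf.hashBytes(word.getBytes(StandardCharsets.UTF_8)).padToLong();
            long defaultKey = hf.hashBytes(word.getBytes()).padToLong(); // platform-dependent

            // equal only when the default charset happens to be UTF-8
            System.out.println(utf8Key == defaultKey);
        }
    }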
From e22748e990e2dd46fd46f1fe8077918c23e7d6a8 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 17 Jul 2022 22:08:06 +0200
Subject: [PATCH 28/40] Better error logging for IO errors during conversion
from configuration issues.
---
.../wmsa/edge/index/conversion/SearchIndexConverter.java | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
index adce8747..79c47a08 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/conversion/SearchIndexConverter.java
@@ -88,6 +88,10 @@ public class SearchIndexConverter {
Files.delete(tmpUrlsFile);
}
+ catch (IOException ex) {
+ logger.error("Failed to convert", ex);
+ throw ex;
+ }
finally {
lock.unlock();
}
From 9f7a28cbdbaa967835cf53b5a6b8212dce0fd572 Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Sun, 17 Jul 2022 22:21:41 +0200
Subject: [PATCH 29/40] Made search service more robust toward the case where
Encyclopedia or Assistant is down
---
.../wmsa/edge/EdgeSearchE2ETest.java | 7 +++--
.../assistant/client/AssistantClient.java | 30 +++++++++++++++----
.../wmsa/edge/search/EdgeSearchOperator.java | 14 +++++----
.../wmsa/encyclopedia/EncyclopediaClient.java | 8 ++++-
4 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
index 08408de2..e04dd71b 100644
--- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
+++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
@@ -12,7 +12,10 @@ import org.openqa.selenium.chrome.ChromeOptions;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
-import org.testcontainers.containers.*;
+import org.testcontainers.containers.BindMode;
+import org.testcontainers.containers.BrowserWebDriverContainer;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.containers.NginxContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
@@ -41,8 +44,6 @@ public class EdgeSearchE2ETest extends E2ETestBase {
@Container
public static GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT, mariaDB);
@Container
- public static GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
- @Container
public static GenericContainer<?> indexContainer = forService(EDGE_INDEX, mariaDB);
@Container
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java
index de0b9313..63f8e255 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/client/AssistantClient.java
@@ -4,10 +4,10 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
+import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.DictionaryResponse;
-import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import org.eclipse.jetty.util.UrlEncoded;
import java.util.List;
@@ -21,18 +21,38 @@ public class AssistantClient extends AbstractDynamicClient {
}
public Observable<DictionaryResponse> dictionaryLookup(Context ctx, String word) {
- return super.get(ctx,"/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
+ try {
+ return super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class);
+ }
+ catch (RouteNotConfiguredException ex) {
+ return Observable.empty();
+ }
}
@SuppressWarnings("unchecked")
public Observable<List<String>> spellCheck(Context ctx, String word) {
- return (Observable<List<String>>) (Object) super.get(ctx,"/spell-check/" + UrlEncoded.encodeString(word), List.class);
+ try {
+ return (Observable<List<String>>) (Object) super.get(ctx, "/spell-check/" + UrlEncoded.encodeString(word), List.class);
+ }
+ catch (RouteNotConfiguredException ex) {
+ return Observable.empty();
+ }
}
public Observable<String> unitConversion(Context ctx, String value, String from, String to) {
- return super.get(ctx,"/unit-conversion?value="+value + "&from="+from+"&to="+to);
+ try {
+ return super.get(ctx, "/unit-conversion?value=" + value + "&from=" + from + "&to=" + to);
+ }
+ catch (RouteNotConfiguredException ex) {
+ return Observable.empty();
+ }
}
public Observable<String> evalMath(Context ctx, String expression) {
- return super.get(ctx,"/eval-expression?value="+UrlEncoded.encodeString(expression));
+ try {
+ return super.get(ctx, "/eval-expression?value=" + UrlEncoded.encodeString(expression));
+ }
+ catch (RouteNotConfiguredException ex) {
+ return Observable.empty();
+ }
}
}
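Each wrapper above repeats the same guard. For reference, the pattern factors into a small helper (hypothetical; the patch keeps it inline per method): a RouteNotConfiguredException thrown by the client layer becomes an empty stream, so a missing assistant degrades to "no answer" instead of an error page.

    import io.reactivex.rxjava3.core.Observable;
    import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException;
    import java.util.function.Supplier;

    final class Fallbacks {
        // route not configured (service absent) -> no items, not a failure
        static <T> Observable<T> orEmpty(Supplier<Observable<T>> call) {
            try {
                return call.get();
            }
            catch (RouteNotConfiguredException ex) {
                return Observable.empty();
            }
        }
    }

    // e.g.: return Fallbacks.orEmpty(() -> super.get(ctx, "/dictionary/" + UrlEncoded.encodeString(word), DictionaryResponse.class));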
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
index add46ef4..a6dff7fc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java
@@ -11,17 +11,19 @@ import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
-import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
+import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
-import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
-import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.edge.search.results.SearchResultDecorator;
+import nu.marginalia.wmsa.edge.search.results.SearchResultValuator;
import nu.marginalia.wmsa.edge.search.results.UrlDeduplicator;
+import nu.marginalia.wmsa.edge.search.results.model.AccumulatedQueryResults;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
import org.apache.logging.log4j.util.Strings;
import org.jetbrains.annotations.NotNull;
@@ -251,7 +253,9 @@ public class EdgeSearchOperator {
.encyclopediaLookup(ctx,
humanQuery.replaceAll("\\s+", "_")
.replaceAll("\"", "")
- ).subscribeOn(Schedulers.io());
+ )
+ .onErrorReturn(e -> new WikiArticles())
+ .subscribeOn(Schedulers.io());
}
private void fetchResultsMulti(Context ctx, EdgeSearchQuery processedQuery, AccumulatedQueryResults queryResults, UrlDeduplicator deduplicator) {
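The encyclopedia path needs a different guard than the assistant calls: a configured-but-down service fails inside the stream rather than at call time, and onErrorReturn maps that in-stream failure to a blank WikiArticles so the rest of the search result still renders. In RxJava terms:

    // any error in the lookup stream is replaced by a fallback item, so the
    // subscriber sees an empty sidebar instead of a failed search
    Observable<WikiArticles> safe = lookup.onErrorReturn(e -> new WikiArticles());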
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java
index dd382220..0b7e5491 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaClient.java
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.encyclopedia;
import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
+import nu.marginalia.wmsa.client.exception.RouteNotConfiguredException;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
@@ -28,7 +29,12 @@ public class EncyclopediaClient extends AbstractDynamicClient {
@CheckReturnValue
public Observable