diff --git a/build.gradle b/build.gradle
index 11ef88b5..1ed11dc9 100644
--- a/build.gradle
+++ b/build.gradle
@@ -13,6 +13,10 @@ tasks.register('dist', Copy) {
     from subprojects.collect { it.tasks.withType(Tar) }
     into "$buildDir/dist"
 
+    // For local development, each process that is to be triggerable
+    // from the control-service needs to go here so it ends up somewhere
+    // the control-service can find it
+
     doLast {
         copy {
             from tarTree("$buildDir/dist/converter-process.tar")
@@ -34,10 +38,18 @@ tasks.register('dist', Copy) {
             from tarTree("$buildDir/dist/crawl-job-extractor-process.tar")
             into "$projectDir/run/dist/"
         }
+        copy {
+            from tarTree("$buildDir/dist/index-construction-process.tar")
+            into "$projectDir/run/dist/"
+        }
     }
 }
 idea {
     module {
+        // Exclude these directories from being indexed by IntelliJ,
+        // as they tend to bring the IDE to its knees and use up all
+        // the inotify watches in a hurry
+        excludeDirs.add(file("$projectDir/run/backup"))
         excludeDirs.add(file("$projectDir/run/model"))
         excludeDirs.add(file("$projectDir/run/dist"))
         excludeDirs.add(file("$projectDir/run/samples"))
diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java
index f8349eb7..627ffd4f 100644
--- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java
+++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexMqEndpoints.java
@@ -3,8 +3,6 @@ package nu.marginalia.index.client;
 
 public class IndexMqEndpoints {
     public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
     public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
-
-    public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
-    public static final String INDEX_REINDEX = "INDEX-REINDEX";
+    public static final String SWITCH_INDEX = "SWITCH-INDEX";
 }
diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java
index 9890b3aa..19f879ca 100644
--- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java
+++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java
@@ -2,16 +2,17 @@ package nu.marginalia.index.client.model.results;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
-import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.id.EdgeId;
+import nu.marginalia.model.id.UrlIdCodec;
+import org.jetbrains.annotations.NotNull;
 
 import java.util.ArrayList;
 import java.util.List;
 
 /** Represents a document matching a search query */
 @AllArgsConstructor @Getter
-public class SearchResultItem {
-    /** Encoded ID that contains both the URL id and its ranking */
+public class SearchResultItem implements Comparable<SearchResultItem> {
+    /** Encoded ID that contains both the URL id and its ranking. This is
+     * probably not what you want, use getDocumentId() instead */
     public final long combinedId;
 
     /** How did the subqueries match against the document ?
*/ @@ -20,20 +21,18 @@ public class SearchResultItem { /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long val) { - this.combinedId = val; + public SearchResultItem(long combinedId) { + this.combinedId = combinedId; this.keywordScores = new ArrayList<>(16); } - public EdgeId getUrlId() { - return new EdgeId<>(getUrlIdInt()); + + public long getDocumentId() { + return UrlIdCodec.removeRank(combinedId); } - public int getUrlIdInt() { - return (int)(combinedId & 0xFFFF_FFFFL); - } public int getRanking() { - return (int)(combinedId >>> 32); + return UrlIdCodec.getRank(combinedId); } /* Used for evaluation */ @@ -45,20 +44,16 @@ public class SearchResultItem { return scoreValue; } - private transient int domainId = Integer.MIN_VALUE; - public void setDomainId(int domainId) { - this.domainId = domainId; - } public int getDomainId() { - return this.domainId; + return UrlIdCodec.getDomainId(this.combinedId); } public int hashCode() { - return getUrlIdInt(); + return Long.hashCode(combinedId); } public String toString() { - return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]"; + return getClass().getSimpleName() + "[ url= " + getDocumentId() + ", rank=" + getRanking() + "]"; } public boolean equals(Object other) { @@ -67,18 +62,18 @@ public class SearchResultItem { if (other == this) return true; if (other instanceof SearchResultItem o) { - return o.getUrlIdInt() == getUrlIdInt(); + return o.getDocumentId() == getDocumentId(); } return false; } - public long deduplicationKey() { - final int domainId = getDomainId(); + @Override + public int compareTo(@NotNull SearchResultItem o) { + // this looks like a bug, but we actually want this in a reversed order + int diff = o.getScore().compareTo(getScore()); + if (diff != 0) + return diff; - if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) { - return 0; - } - - return domainId; + return Long.compare(this.combinedId, o.combinedId); } } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java index b8696a4c..a46e14b1 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java @@ -6,7 +6,6 @@ import static java.lang.Boolean.compare; import static java.lang.Double.compare; public record SearchResultPreliminaryScore( - boolean disqualified, boolean hasPriorityTerm, double searchRankingScore) implements Comparable @@ -25,7 +24,4 @@ public record SearchResultPreliminaryScore( return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore); } - public boolean isDisqualified() { - return disqualified; - } } diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java index 9ca91fe6..b7906c22 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/ProcessInboxNames.java @@ -4,4 +4,6 @@ public class ProcessInboxNames { public static final String CONVERTER_INBOX = "converter"; public static final String LOADER_INBOX = "loader"; public static final String CRAWLER_INBOX = 
"crawler"; + + public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor"; } diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/CreateIndexRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/CreateIndexRequest.java new file mode 100644 index 00000000..c0ab45a1 --- /dev/null +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/CreateIndexRequest.java @@ -0,0 +1,5 @@ +package nu.marginalia.mqapi.index; + +public record CreateIndexRequest(IndexName indexName) +{ +} diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/IndexName.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/IndexName.java new file mode 100644 index 00000000..c7925e50 --- /dev/null +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/index/IndexName.java @@ -0,0 +1,7 @@ +package nu.marginalia.mqapi.index; + +public enum IndexName { + FORWARD, + REVERSE_FULL, + REVERSE_PRIO +} diff --git a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java index 1c546b3e..aaa1c838 100644 --- a/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java +++ b/code/api/search-api/src/main/java/nu/marginalia/search/client/SearchMqEndpoints.java @@ -3,4 +3,5 @@ package nu.marginalia.search.client; public class SearchMqEndpoints { /** Flushes the URL caches, run if significant changes have occurred in the URLs database */ public static final String FLUSH_CACHES = "FLUSH_CACHES"; + public static final String SWITCH_LINKDB = "SWITCH_LINKDB"; } diff --git a/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java b/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java index d923a82a..1664ccef 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DbDomainQueries.java @@ -9,16 +9,16 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.id.EdgeId; import java.util.NoSuchElementException; import java.util.Optional; +import java.util.OptionalInt; @Singleton public class DbDomainQueries { private final HikariDataSource dataSource; - private final Cache> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); + private final Cache domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build(); @Inject public DbDomainQueries(HikariDataSource dataSource) @@ -28,7 +28,7 @@ public class DbDomainQueries { @SneakyThrows - public EdgeId getDomainId(EdgeDomain domain) { + public Integer getDomainId(EdgeDomain domain) { try (var connection = dataSource.getConnection()) { return domainIdCache.get(domain, () -> { @@ -36,7 +36,7 @@ public class DbDomainQueries { stmt.setString(1, domain.toString()); var rsp = stmt.executeQuery(); if (rsp.next()) { - return new EdgeId<>(rsp.getInt(1)); + return rsp.getInt(1); } } throw new NoSuchElementException(); @@ -48,12 +48,12 @@ public class DbDomainQueries { } @SneakyThrows - public Optional> tryGetDomainId(EdgeDomain domain) { + public OptionalInt tryGetDomainId(EdgeDomain domain) { - var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain)); - - if (maybe.isPresent()) - return maybe; + Integer maybeId = domainIdCache.getIfPresent(domain); + if (maybeId != null) { + return OptionalInt.of(maybeId); + } 
try (var connection = dataSource.getConnection()) { @@ -61,25 +61,25 @@ public class DbDomainQueries { stmt.setString(1, domain.toString()); var rsp = stmt.executeQuery(); if (rsp.next()) { - var id = new EdgeId(rsp.getInt(1)); + var id = rsp.getInt(1); domainIdCache.put(domain, id); - return Optional.of(id); + return OptionalInt.of(id); } } - return Optional.empty(); + return OptionalInt.empty(); } catch (UncheckedExecutionException ex) { - return Optional.empty(); + return OptionalInt.empty(); } } @SneakyThrows - public Optional getDomain(EdgeId id) { + public Optional getDomain(int id) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, id.id()); + stmt.setInt(1, id); var rsp = stmt.executeQuery(); if (rsp.next()) { return Optional.of(new EdgeDomain(rsp.getString(1))); diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklist.java b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklist.java index 2e8a7b4c..b8562e88 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklist.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainBlacklist.java @@ -2,15 +2,10 @@ package nu.marginalia.db; import com.google.inject.ImplementedBy; import gnu.trove.set.hash.TIntHashSet; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.id.EdgeId; @ImplementedBy(DomainBlacklistImpl.class) public interface DomainBlacklist { boolean isBlacklisted(int domainId); - default boolean isBlacklisted(EdgeId domainId) { - return isBlacklisted(domainId.id()); - } default TIntHashSet getSpamDomains() { return new TIntHashSet(); } diff --git a/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java index 60b42030..6e2ae909 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java +++ b/code/common/db/src/main/java/nu/marginalia/db/DomainTypes.java @@ -1,13 +1,14 @@ package nu.marginalia.db; import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.list.TIntList; +import gnu.trove.list.array.TIntArrayList; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.id.EdgeIdList; import org.slf4j.LoggerFactory; import org.slf4j.Logger; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -58,10 +59,10 @@ public class DomainTypes { return ret; } - /** Retrieve the EdgeId of all domains of a certain type, + /** Retrieve the domain id of all domains of a certain type, * ignoring entries that are not in the EC_DOMAIN table */ - public EdgeIdList getKnownDomainsByType(Type type) { - EdgeIdList ret = new EdgeIdList<>(); + public TIntList getKnownDomainsByType(Type type) { + TIntList ret = new TIntArrayList(); try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java index 813d1c57..2a22cbd2 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/FileStorageService.java @@ -5,8 +5,8 @@ import nu.marginalia.db.storage.model.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import 
javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; diff --git a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java index 9f512d06..b2665ef1 100644 --- a/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java +++ b/code/common/db/src/main/java/nu/marginalia/db/storage/model/FileStorageType.java @@ -5,9 +5,9 @@ public enum FileStorageType { CRAWL_DATA, PROCESSED_DATA, INDEX_STAGING, - LEXICON_STAGING, + LINKDB_STAGING, + LINKDB_LIVE, INDEX_LIVE, - LEXICON_LIVE, BACKUP, EXPORT, SEARCH_SETS diff --git a/code/common/db/src/main/resources/db/migration/V23_09_0_000__filestorage_livedb.sql b/code/common/db/src/main/resources/db/migration/V23_09_0_000__filestorage_livedb.sql new file mode 100644 index 00000000..ed4e4e74 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_0_000__filestorage_livedb.sql @@ -0,0 +1,9 @@ +ALTER TABLE FILE_STORAGE MODIFY COLUMN TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT', 'LINKDB_LIVE', 'LINKDB_STAGING') NOT NULL; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ldbr', "Linkdb Current", 'LINKDB_LIVE' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; + +INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE) +SELECT ID, 'ldbw', "Linkdb Staging Area", 'LINKDB_STAGING' +FROM FILE_STORAGE_BASE WHERE NAME='Index Storage'; \ No newline at end of file diff --git a/code/common/db/src/main/resources/db/migration/V23_09_1_000__drop_ecurl.sql b/code/common/db/src/main/resources/db/migration/V23_09_1_000__drop_ecurl.sql new file mode 100644 index 00000000..ad3775ec --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_1_000__drop_ecurl.sql @@ -0,0 +1,3 @@ +DROP VIEW EC_URL_VIEW; +DROP TABLE EC_PAGE_DATA; +DROP TABLE EC_URL; \ No newline at end of file diff --git a/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql b/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql new file mode 100644 index 00000000..15016501 --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_2_000__filestorage_backup.sql @@ -0,0 +1,3 @@ +INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP) +VALUES +('Backup Storage', '/backup', 'BACKUP', true); \ No newline at end of file diff --git a/code/common/db/src/main/resources/db/migration/V23_09_2_001__filestorage_no_lexicon.sql b/code/common/db/src/main/resources/db/migration/V23_09_2_001__filestorage_no_lexicon.sql new file mode 100644 index 00000000..7ed41b0a --- /dev/null +++ b/code/common/db/src/main/resources/db/migration/V23_09_2_001__filestorage_no_lexicon.sql @@ -0,0 +1 @@ +DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE'); diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle new file mode 100644 index 00000000..19caa529 --- /dev/null +++ b/code/common/linkdb/build.gradle @@ -0,0 +1,56 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} + +configurations { + flywayMigration.extendsFrom(implementation) 
+} + +dependencies { + implementation project(':code:common:model') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + + implementation libs.guice + implementation libs.bundles.gson + + implementation libs.notnull + + implementation libs.sqlite + implementation libs.commons.lang3 + + implementation libs.trove + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' +} + + +test { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform() +} + +task fastTests(type: Test) { + maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1 + maxHeapSize = "8G" + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java new file mode 100644 index 00000000..8fcd4a99 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java @@ -0,0 +1,102 @@ +package nu.marginalia.linkdb; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.inject.name.Named; +import gnu.trove.list.TLongList; +import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; + +import java.nio.file.StandardCopyOption; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +@Singleton +public class LinkdbReader { + private Path dbFile; + private volatile Connection connection; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException { + this.dbFile = dbFile; + + if (Files.exists(dbFile)) { + try { + connection = createConnection(); + } + catch (SQLException ex) { + connection = null; + logger.error("Failed to load linkdb file", ex); + } + } + else { + logger.warn("No linkdb file {}", dbFile); + } + } + + private Connection createConnection() throws SQLException { + String connStr = "jdbc:sqlite:" + dbFile.toString(); + return DriverManager.getConnection(connStr); + } + + public void switchInput(Path newDbFile) throws IOException, SQLException { + if (connection != null) { + connection.close(); + } + + Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING); + + connection = createConnection(); + } + + public List getUrlDetails(TLongList ids) throws SQLException { + List ret = new ArrayList<>(ids.size()); + + if (connection == null || + connection.isClosed()) + { + throw new RuntimeException("URL query temporarily unavailable due to database switch"); + } + + try (var stmt = connection.prepareStatement(""" + SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR + FROM DOCUMENT WHERE ID = ? 
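+                    -- note: the statement is prepared once and executed once per requested id;
+                    -- the loop below re-binds ID and re-runs this query for each entry in the list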
+ """)) { + for (int i = 0; i < ids.size(); i++) { + long id = ids.get(i); + stmt.setLong(1, id); + var rs = stmt.executeQuery(); + if (rs.next()) { + var url = new EdgeUrl(rs.getString("URL")); + ret.add(new LdbUrlDetail( + rs.getLong("ID"), + url, + rs.getString("TITLE"), + rs.getString("DESCRIPTION"), + rs.getDouble("QUALITY"), + rs.getString("FORMAT"), + rs.getInt("FEATURES"), + rs.getInt("PUB_YEAR"), + rs.getLong("DATA_HASH"), + rs.getInt("WORDS_TOTAL") + )); + } + } + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + return ret; + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java new file mode 100644 index 00000000..ff82194d --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java @@ -0,0 +1,64 @@ +package nu.marginalia.linkdb; + +import nu.marginalia.linkdb.model.UrlStatus; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Types; +import java.util.List; + +public class LinkdbStatusWriter { + + private final Connection connection; + + public LinkdbStatusWriter(Path outputFile) throws SQLException { + String connStr = "jdbc:sqlite:" + outputFile.toString(); + connection = DriverManager.getConnection(connStr); + + try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-status.sql"); + var stmt = connection.createStatement() + ) { + var sql = new String(stream.readAllBytes()); + stmt.executeUpdate(sql); + + // Disable synchronous writing as this is a one-off operation with no recovery + stmt.execute("PRAGMA synchronous = OFF"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void add(List statuses) throws SQLException { + try (var stmt = connection.prepareStatement(""" + INSERT INTO STATUS(ID, URL, STATUS, DESCRIPTION) + VALUES (?, ?, ?, ?) 
+ """)) { + int count = 0; + for (var status : statuses) { + stmt.setLong(1, status.id()); + stmt.setString(2, status.url().toString()); + stmt.setString(3, status.status()); + if (status.description() == null) { + stmt.setNull(4, Types.VARCHAR); + } else { + stmt.setString(4, status.description()); + } + stmt.addBatch(); + if (++count > 1000) { + count = 0; + stmt.executeBatch(); + } + } + if (count != 0) { + stmt.executeBatch(); + } + } + } + + public void close() throws SQLException { + connection.close(); + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java new file mode 100644 index 00000000..67727219 --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java @@ -0,0 +1,80 @@ +package nu.marginalia.linkdb; + +import nu.marginalia.linkdb.model.LdbUrlDetail; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Types; +import java.util.List; + +public class LinkdbWriter { + + private final Connection connection; + + public LinkdbWriter(Path outputFile) throws SQLException { + String connStr = "jdbc:sqlite:" + outputFile.toString(); + connection = DriverManager.getConnection(connStr); + + try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql"); + var stmt = connection.createStatement() + ) { + var sql = new String(stream.readAllBytes()); + stmt.executeUpdate(sql); + + // Disable synchronous writing as this is a one-off operation with no recovery + stmt.execute("PRAGMA synchronous = OFF"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void add(LdbUrlDetail ldbUrlDetail) throws SQLException { + add(List.of(ldbUrlDetail)); + } + + public void add(List ldbUrlDetail) throws SQLException { + + try (var stmt = connection.prepareStatement(""" + INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """)) { + + int i = 0; + for (var document : ldbUrlDetail) { + var url = document.url(); + + stmt.setLong(1, document.urlId()); + stmt.setString(2, url.toString()); + + stmt.setString(3, document.title()); + stmt.setString(4, document.description()); + stmt.setInt(5, document.wordsTotal()); + stmt.setString(6, document.format()); + stmt.setInt(7, document.features()); + stmt.setLong(8, document.dataHash()); + stmt.setDouble(9, document.urlQuality()); + if (document.pubYear() == null) { + stmt.setNull(10, Types.INTEGER); + } else { + stmt.setInt(10, document.pubYear()); + } + + stmt.addBatch(); + + if (++i > 1000) { + stmt.executeBatch(); + i = 0; + } + } + + if (i != 0) stmt.executeBatch(); + } + } + + public void close() throws SQLException { + connection.close(); + } +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java new file mode 100644 index 00000000..9b743c9c --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/LdbUrlDetail.java @@ -0,0 +1,18 @@ +package nu.marginalia.linkdb.model; + +import nu.marginalia.model.EdgeUrl; + +public record LdbUrlDetail(long urlId, + EdgeUrl url, + String title, + String description, + double urlQuality, + String format, + int features, + Integer pubYear, + long dataHash, + int wordsTotal + ) + +{ +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java new file mode 100644 index 00000000..a090a51a --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java @@ -0,0 +1,24 @@ +package nu.marginalia.linkdb.model; + +public enum UrlProtocol { + HTTP, + HTTPS; + + public static int encode(String str) { + if ("http".equalsIgnoreCase(str)) { + return HTTP.ordinal(); + } + else if ("https".equalsIgnoreCase(str)) { + return HTTPS.ordinal(); + } + + throw new IllegalArgumentException(str); + } + + public static String decode(int ordinal) { + return switch (values()[ordinal]) { + case HTTP -> "http"; + case HTTPS -> "https"; + }; + }; +} diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java new file mode 100644 index 00000000..dbb5334a --- /dev/null +++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java @@ -0,0 +1,8 @@ +package nu.marginalia.linkdb.model; + +import nu.marginalia.model.EdgeUrl; + +import javax.annotation.Nullable; + +public record UrlStatus(long id, EdgeUrl url, String status, @Nullable String description) { +} diff --git a/code/common/linkdb/src/main/resources/db/linkdb-document.sql b/code/common/linkdb/src/main/resources/db/linkdb-document.sql new file mode 100644 index 00000000..a15aeb5e --- /dev/null +++ b/code/common/linkdb/src/main/resources/db/linkdb-document.sql @@ -0,0 +1,17 @@ +CREATE TABLE DOCUMENT ( + ID INT8 PRIMARY KEY, + + URL TEXT, + + STATE INT, + TITLE TEXT NOT NULL, + DESCRIPTION TEXT NOT NULL, + + WORDS_TOTAL INTEGER NOT NULL, + FORMAT TEXT NOT NULL, + FEATURES INTEGER NOT NULL, + + DATA_HASH INTEGER NOT NULL, + QUALITY REAL NOT NULL, + PUB_YEAR INTEGER NOT NULL +); \ No newline at end of file diff --git a/code/common/linkdb/src/main/resources/db/linkdb-status.sql b/code/common/linkdb/src/main/resources/db/linkdb-status.sql new file mode 100644 index 00000000..93fb9a40 --- /dev/null +++ 
b/code/common/linkdb/src/main/resources/db/linkdb-status.sql @@ -0,0 +1,6 @@ +CREATE TABLE STATUS ( + ID INT8 PRIMARY KEY, + URL TEXT, + STATUS TEXT NOT NULL, + DESCRIPTION TEXT +); diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java new file mode 100644 index 00000000..e6c4b4b6 --- /dev/null +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.linkdb; + +import nu.marginalia.linkdb.model.UrlStatus; +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; + +public class LinkdbStatusWriterTest { + @Test + public void testCreate() throws IOException { + Path tempPath = Files.createTempFile("linkdb-status", ".db"); + try { + var writer = new LinkdbStatusWriter(tempPath); + writer.add(List.of( + new UrlStatus(5, new EdgeUrl("https://www.marginalia.nu/x"), "y", null), + new UrlStatus(6, new EdgeUrl("https://www.marginalia.nu/y"), "y", "z") + )); + writer.close(); + } catch (SQLException e) { + throw new RuntimeException(e); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } finally { + Files.deleteIfExists(tempPath); + } + } +} diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java new file mode 100644 index 00000000..598e6b67 --- /dev/null +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java @@ -0,0 +1,42 @@ +package nu.marginalia.linkdb; + +import gnu.trove.list.array.TLongArrayList; +import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.model.EdgeDomain; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; + +public class LinkdbWriterTest { + @Test + public void testCreate() throws IOException { + Path tempPath = Files.createTempFile("linkdb", ".db"); + try { + var writer = new LinkdbWriter(tempPath); + writer.add(new LdbUrlDetail( + 1, + new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null), + "Test", + "This is a test", + -4., + "XHTML", + 5, + 2020, + 0xF00BA3, + 444 + )); + writer.close(); + + var reader = new LinkdbReader(tempPath); + var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1})); + System.out.println(deets); + } catch (SQLException e) { + throw new RuntimeException(e); + } finally { + Files.deleteIfExists(tempPath); + } + } +} diff --git a/code/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java b/code/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java index ec5a9a74..38f509dd 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java +++ b/code/common/model/src/main/java/nu/marginalia/model/gson/GsonFactory.java @@ -6,7 +6,6 @@ import nu.marginalia.bigstring.BigString; import nu.marginalia.bigstring.CompressedBigString; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; import java.net.URISyntaxException; @@ -24,8 +23,6 @@ public class GsonFactory { } }) .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer) (json, typeOfT, context) -> 
new EdgeDomain(json.getAsString())) - .registerTypeAdapter(EdgeId.class, (JsonDeserializer>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt())) - .registerTypeAdapter(EdgeId.class, (JsonSerializer>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id())) .registerTypeAdapter(BigString.class, (JsonDeserializer) (json, typeOfT, context) -> BigString.encode(json.getAsString())) .registerTypeAdapter(BigString.class, (JsonSerializer) (src, typeOfT, context) -> new JsonPrimitive(src.decode())) .registerTypeAdapter(CompressedBigString.class, (JsonSerializer) (src, typeOfT, context) -> new JsonPrimitive(src.decode())) diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeId.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeId.java deleted file mode 100644 index 9e45c78f..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeId.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.model.id; - - -/** - * This exists entirely for strengthening the typing of IDs - * - * @param - */ -public record EdgeId(int id) { -} diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java deleted file mode 100644 index 078dcdb6..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdArray.java +++ /dev/null @@ -1,34 +0,0 @@ -package nu.marginalia.model.id; - -import java.util.Arrays; -import java.util.stream.IntStream; - -public record EdgeIdArray (int... values) implements EdgeIdCollection { - - public static EdgeIdArray gather(IntStream stream) { - return new EdgeIdArray<>(stream.toArray()); - } - - @Override - public int[] values() { - return values; - } - - @Override - public boolean isEmpty() { - return values.length == 0; - } - - @Override - public int size() { - return values.length; - } - - public int get(int idx) { - return values[idx]; - } - - public void sort() { - Arrays.sort(values); - } -} \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java deleted file mode 100644 index 47cb4c62..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollection.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.model.id; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.stream.IntStream; - -public interface EdgeIdCollection extends Iterable> { - int size(); - boolean isEmpty(); - int[] values(); - - default IntStream stream() { - return Arrays.stream(values()); - } - - default Iterator> iterator() { - return Arrays.stream(values()).mapToObj(EdgeId::new).iterator(); - } - default EdgeIdArray asArray() { - return new EdgeIdArray<>(values()); - } - default EdgeIdList asList() { - return new EdgeIdList<>(values()); - } - default EdgeIdSet asSet() { - return new EdgeIdSet<>(values()); - } -} diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java deleted file mode 100644 index 0056cb28..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdCollectionMutable.java +++ /dev/null @@ -1,12 +0,0 @@ -package nu.marginalia.model.id; - -import gnu.trove.TIntCollection; - -public interface EdgeIdCollectionMutable { - TIntCollection underlyingCollection(); - - default void addAll(EdgeIdArray other) { 
underlyingCollection().addAll(other.values()); } - default void addAll(EdgeIdList other) { underlyingCollection().addAll(other.list()); } - default void addAll(EdgeIdCollection other) { underlyingCollection().addAll(other.values()); } - -} diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java deleted file mode 100644 index 98f651d1..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdList.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.model.id; - -import gnu.trove.TIntCollection; -import gnu.trove.list.array.TIntArrayList; - -import java.util.stream.IntStream; - -public record EdgeIdList (TIntArrayList list) implements - EdgeIdCollection, - EdgeIdCollectionMutable { - - public EdgeIdList(int... values) { this(new TIntArrayList(values)); } - public static EdgeIdList gather(IntStream stream) { - return stream.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); - } - - @Override - public int[] values() { - return list.toArray(); - } - - @Override - public boolean isEmpty() { - return list.isEmpty(); - } - - @Override - public int size() { - return list.size(); - } - - public int get(int idx) { - return list.get(idx); - } - - public void add(int id) { - list.add(id); - } - - public void sort() { - list.sort(); - } - - @Override - public TIntCollection underlyingCollection() { - return list; - } -} \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java b/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java deleted file mode 100644 index 5119b5c7..00000000 --- a/code/common/model/src/main/java/nu/marginalia/model/id/EdgeIdSet.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.model.id; - -import gnu.trove.TIntCollection; -import gnu.trove.set.hash.TIntHashSet; - -import java.util.stream.IntStream; - -public record EdgeIdSet (TIntHashSet set) implements EdgeIdCollection, EdgeIdCollectionMutable { - - public EdgeIdSet(int... values) { - this(new TIntHashSet(values.length, 0.5f, -1)); - - set.addAll(values); - } - - public EdgeIdSet(int initialCapacity, float loadFactor) { - this(new TIntHashSet(initialCapacity, loadFactor, -1)); - } - - @Override - public TIntCollection underlyingCollection() { - return set; - } - - public static EdgeIdSet gather(IntStream stream) { - return new EdgeIdSet<>(stream.toArray()); - } - - @Override - public int[] values() { - return set.toArray(); - } - - @Override - public boolean isEmpty() { - return set.isEmpty(); - } - - @Override - public int size() { - return set.size(); - } - - public boolean contains(int id) { - return set.contains(id); - } - public boolean add(int id) { - return set.add(id); - } - public boolean remove(int id) { return set.remove(id); } - -} \ No newline at end of file diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java new file mode 100644 index 00000000..26ac847e --- /dev/null +++ b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java @@ -0,0 +1,78 @@ +package nu.marginalia.model.id; + +/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away + * outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're + * actually sorted by rank. 
Next is the domain id part, which keeps documents from the same domain clustered.
+ * Finally is the document ordinal part, which is a non-unique sequence number within the current set of
+ * documents loaded. The same ID may be re-used over time as a new index is loaded.
+ * <p></p>
+ * <table>
+ *     <tr><th>Part</th><th>Bits</th><th>Cardinality</th></tr>
+ *     <tr><td>rank</td><td>6 bits</td><td>64</td></tr>
+ *     <tr><td>domain</td><td>31 bits</td><td>2 billion</td></tr>
+ *     <tr><td>document</td><td>26 bits</td><td>67 million</td></tr>
+ * </table>
+ * <p></p>
+ * Most significant bit is unused for now because I'm not routing Long.compareUnsigned() all over the codebase.
+ * If we end up needing more domains, we'll cross that bridge when we come to it.
+ * <p></p>
+ * <h2>Coding Scheme</h2>
+ * <pre>
+ * [    | rank | domain | url ]
+ *  0   1       7       38    64
+ * </pre>
+ */
+public class UrlIdCodec {
+    private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
+    private static final int DOCORD_MASK = 0x03FF_FFFF;
+
+    /** Encode a URL id without a ranking element */
+    public static long encodeId(int domainId, int documentOrdinal) {
+        domainId &= 0x7FFF_FFFF;
+        documentOrdinal &= 0x03FF_FFFF;
+
+        return ((long) domainId << 26) | documentOrdinal;
+    }
+
+    /** Add a ranking element to an existing combined URL id.
+     *
+     * @param rank [0,1] the importance of the domain, low is good
+     * @param urlId the combined URL id to add the rank to
+     */
+    public static long addRank(float rank, long urlId) {
+        long rankPart = (int)(rank * (1<<6));
+
+        if (rankPart >= 64) rankPart = 63;
+        if (rankPart < 0) rankPart = 0;
+
+        return (urlId&(~RANK_MASK)) | (rankPart << 57);
+    }
+
+    /** Extract the domain component from this URL id */
+    public static int getDomainId(long combinedId) {
+        return (int) ((combinedId >>> 26) & 0x7FFF_FFFFL);
+    }
+
+    /** Extract the document ordinal component from this URL id */
+    public static int getDocumentOrdinal(long combinedId) {
+        return (int) (combinedId & DOCORD_MASK);
+    }
+
+
+    /** Extract the rank component from this URL id */
+    public static int getRank(long combinedId) {
+        return (int) (combinedId >>> 57);
+    }
+
+    /** Mask out the ranking element from this URL id */
+    public static long removeRank(long combinedId) {
+        return combinedId & ~RANK_MASK;
+    }
+
+}
diff --git a/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java
new file mode 100644
index 00000000..c1966048
--- /dev/null
+++ b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java
@@ -0,0 +1,41 @@
+package nu.marginalia.model.id;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class UrlIdCodecTest {
+    @Test
+    public void testDocumentBounds() {
+        long encoded = UrlIdCodec.encodeId(0, ~0);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+    }
+
+    @Test
+    public void testDomainBounds() {
+        long encoded = UrlIdCodec.encodeId(~0, 0);
+        assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    @Test
+    public void testRankBoundsAdd() {
+        long encoded = UrlIdCodec.encodeId(0, 0);
+        encoded = UrlIdCodec.addRank(1.f, encoded);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+        assertEquals(63, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    @Test
+    public void testRemoveRank() {
+        long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0);
+        encoded = UrlIdCodec.addRank(1.f, encoded);
+        encoded = UrlIdCodec.removeRank(encoded);
+        assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+}
\ No newline at end of file
diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/FakeProcessHeartbeat.java b/code/common/process/src/main/java/nu/marginalia/process/control/FakeProcessHeartbeat.java
new file mode 100644
index 00000000..619dd101
--- /dev/null
+++ b/code/common/process/src/main/java/nu/marginalia/process/control/FakeProcessHeartbeat.java
@@ -0,0 +1,35 @@
+package nu.marginalia.process.control;
+
+/** Dummy implementation of ProcessHeartbeat that does nothing */
+public class FakeProcessHeartbeat implements ProcessHeartbeat {
+
+    @Override
+    public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
+        return new ProcessTaskHeartbeat<>() {
+            @Override
+            public void progress(T step) {}
+
+            @Override
+            public void shutDown() {}
+
+            @Override
+            public void close() {}
+        };
+    }
+
+    @Override
+    public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
+        return new ProcessAdHocTaskHeartbeat() {
+            @Override
+            public void progress(String step, int progress, int total) {}
+
+            @Override
+            public void close() {}
+        };
+    }
+
+    @Override
+    public void setProgress(double progress) {}
+
+
+}
diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java
new file mode 100644
index 00000000..e4af6bcd
--- /dev/null
+++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeat.java
@@ -0,0 +1,7 @@
+package nu.marginalia.process.control;
+
+public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
+    void progress(String step, int progress, int total);
+
+    void close();
+}
diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
new file mode 100644
index 00000000..41c963cf
--- /dev/null
+++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessAdHocTaskHeartbeatImpl.java
@@ -0,0 +1,187 @@
+package nu.marginalia.process.control;
+
+
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.ProcessConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+/** This object sends a heartbeat to the database every few seconds,
+ * updating with the progress of a task within a service. Progress is tracked
+ * by providing the name of the current step together with how many of the
+ * task's steps have been completed so far.
+ */
+public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
+    private final Logger logger = LoggerFactory.getLogger(ProcessAdHocTaskHeartbeatImpl.class);
+    private final String taskName;
+    private final String taskBase;
+    private final String instanceUUID;
+    private final HikariDataSource dataSource;
+
+
+    private final Thread runnerThread;
+    private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
+    private final String serviceInstanceUUID;
+    private int progress;
+
+    private volatile boolean running = false;
+    private volatile String step = "-";
+
+    ProcessAdHocTaskHeartbeatImpl(ProcessConfiguration configuration,
+                                  String taskName,
+                                  HikariDataSource dataSource)
+    {
+        this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node();
+        this.taskBase = configuration.processName() + "." + taskName;
+        this.dataSource = dataSource;
+
+        this.instanceUUID = UUID.randomUUID().toString();
+        this.serviceInstanceUUID = configuration.instanceUuid().toString();
+
+        heartbeatInit();
+
+        runnerThread = new Thread(this::run);
+        runnerThread.start();
+    }
+
+    /** Update the progress of the task. This is a fast function that doesn't block;
+     * the actual update is done in a separate thread.
+     *
+     * @param step The current step in the task.
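+     * @param stepProgress The number of steps completed so far.
+     * @param stepCount The total number of steps in the task.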
+ */ + @Override + public void progress(String step, int stepProgress, int stepCount) { + this.step = step; + + + // off by one since we calculate the progress based on the number of steps, + // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the + // final progress being 80% and not 100%) + + this.progress = (int) Math.round(100. * stepProgress / (double) stepCount); + + logger.info("ProcessTask {} progress: {}%", taskBase, progress); + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ProcessHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ProcessHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException ex) { + logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + SERVICE_INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, taskName); + stmt.setString(2, taskBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, serviceInstanceUUID); + stmt.setString(5, instanceUUID); + stmt.setString(6, serviceInstanceUUID); + stmt.executeUpdate(); + } + } + catch (SQLException ex) { + logger.error("ProcessHeartbeat failed to initialize", ex); + throw new RuntimeException(ex); + } + + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'RUNNING', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, progress); + stmt.setString(2, step); + stmt.setString(3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS='STOPPED', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? 
+ """) + ) + { + stmt.setInt(1, progress); + stmt.setString( 2, step); + stmt.setString( 3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + @Override + public void close() { + shutDown(); + } + +} + diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java index 82b2c95e..beb86e9b 100644 --- a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeat.java @@ -1,155 +1,11 @@ package nu.marginalia.process.control; +import com.google.inject.ImplementedBy; -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.ProcessConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +@ImplementedBy(ProcessHeartbeatImpl.class) +public interface ProcessHeartbeat { + > ProcessTaskHeartbeat createProcessTaskHeartbeat(Class steps, String processName); + ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName); -import java.sql.SQLException; -import java.util.concurrent.TimeUnit; - -/** This service sends a heartbeat to the database every 5 seconds. - */ -@Singleton -public class ProcessHeartbeat { - private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeat.class); - private final String processName; - private final String processBase; - private final String instanceUUID; - private final HikariDataSource dataSource; - - - private final Thread runnerThread; - private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); - - private volatile boolean running = false; - - private volatile int progress = -1; - - @Inject - public ProcessHeartbeat(ProcessConfiguration configuration, - HikariDataSource dataSource) - { - this.processName = configuration.processName() + ":" + configuration.node(); - this.processBase = configuration.processName(); - this.dataSource = dataSource; - - this.instanceUUID = configuration.instanceUuid().toString(); - - runnerThread = new Thread(this::run); - - Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); - } - - public void setProgress(double progress) { - this.progress = (int) (progress * 100); - } - - public void start() { - if (!running) { - runnerThread.start(); - } - } - - public void shutDown() { - if (!running) - return; - - running = false; - - try { - runnerThread.join(); - heartbeatStop(); - } - catch (InterruptedException|SQLException ex) { - logger.warn("ServiceHeartbeat shutdown failed", ex); - } - } - - private void run() { - if (!running) - running = true; - else - return; - - try { - heartbeatInit(); - - while (running) { - - try { - heartbeatUpdate(); - } - catch (SQLException ex) { - logger.warn("ServiceHeartbeat failed to update", ex); - } - - TimeUnit.SECONDS.sleep(heartbeatInterval); - } - } - catch (InterruptedException|SQLException ex) { - logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); - System.exit(255); - } - } - - private void heartbeatInit() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS) - VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') - ON DUPLICATE KEY UPDATE - INSTANCE = ?, - HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), - STATUS = 
'STARTING' - """ - )) - { - stmt.setString(1, processName); - stmt.setString(2, processBase); - stmt.setString(3, instanceUUID); - stmt.setString(4, instanceUUID); - stmt.executeUpdate(); - } - } - } - - private void heartbeatUpdate() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE PROCESS_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ? - WHERE INSTANCE = ? - """) - ) - { - stmt.setInt(1, progress); - stmt.setString(2, instanceUUID); - stmt.executeUpdate(); - } - } - } - - private void heartbeatStop() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE PROCESS_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=? - WHERE INSTANCE = ? - """) - ) - { - stmt.setInt(1, progress); - stmt.setString( 2, instanceUUID); - stmt.executeUpdate(); - } - } - } + void setProgress(double progress); } - diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java new file mode 100644 index 00000000..05fc7ae5 --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessHeartbeatImpl.java @@ -0,0 +1,170 @@ +package nu.marginalia.process.control; + + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.ProcessConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every 5 seconds. 
+ */ +@Singleton +public class ProcessHeartbeatImpl implements ProcessHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class); + private final String processName; + private final String processBase; + private final String instanceUUID; + @org.jetbrains.annotations.NotNull + private final ProcessConfiguration configuration; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + + private volatile boolean running = false; + + private volatile int progress = -1; + + @Inject + public ProcessHeartbeatImpl(ProcessConfiguration configuration, + HikariDataSource dataSource) + { + this.processName = configuration.processName() + ":" + configuration.node(); + this.processBase = configuration.processName(); + this.configuration = configuration; + this.dataSource = dataSource; + + this.instanceUUID = configuration.instanceUuid().toString(); + + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + + @Override + public > ProcessTaskHeartbeat createProcessTaskHeartbeat(Class steps, String processName) { + return new ProcessTaskHeartbeatImpl<>(steps, configuration, processName, dataSource); + } + + @Override + public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) { + return new ProcessAdHocTaskHeartbeatImpl(configuration, processName, dataSource); + } + + @Override + public void setProgress(double progress) { + this.progress = (int) (progress * 100); + } + + public void start() { + if (!running) { + runnerThread.start(); + } + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, processName); + stmt.setString(2, processBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ? + WHERE INSTANCE = ? 
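+                        -- INSTANCE is the per-process instance UUID assigned at startup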
+ """) + ) + { + stmt.setInt(1, progress); + stmt.setString(2, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE PROCESS_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, progress); + stmt.setString( 2, instanceUUID); + stmt.executeUpdate(); + } + } + } +} + diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java new file mode 100644 index 00000000..7a720b54 --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeat.java @@ -0,0 +1,9 @@ +package nu.marginalia.process.control; + +public interface ProcessTaskHeartbeat> extends AutoCloseable { + void progress(T step); + + void shutDown(); + + void close(); +} diff --git a/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java new file mode 100644 index 00000000..eb848e57 --- /dev/null +++ b/code/common/process/src/main/java/nu/marginalia/process/control/ProcessTaskHeartbeatImpl.java @@ -0,0 +1,192 @@ +package nu.marginalia.process.control; + + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.ProcessConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** This object sends a heartbeat to the database every few seconds, + * updating with the progress of a task within a service. Progress is tracked by providing + * enumerations corresponding to the steps in the task. It's important they're arranged in the same + * order as the steps in the task in order to get an accurate progress tracking. + */ +public class ProcessTaskHeartbeatImpl> implements AutoCloseable, ProcessTaskHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ProcessTaskHeartbeatImpl.class); + private final String taskName; + private final String taskBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + private final String serviceInstanceUUID; + private final int stepCount; + + private volatile boolean running = false; + private volatile int stepNum = 0; + private volatile String step = "-"; + + ProcessTaskHeartbeatImpl(Class stepClass, + ProcessConfiguration configuration, + String taskName, + HikariDataSource dataSource) + { + this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node(); + this.taskBase = configuration.processName() + "." + taskName; + this.dataSource = dataSource; + + this.instanceUUID = UUID.randomUUID().toString(); + this.serviceInstanceUUID = configuration.instanceUuid().toString(); + + this.stepCount = stepClass.getEnumConstants().length; + + heartbeatInit(); + + runnerThread = new Thread(this::run); + runnerThread.start(); + } + + /** Update the progress of the task. This is a fast function that doesn't block; + * the actual update is done in a separate thread. + * + * @param step The current step in the task. 
+ */ + @Override + public void progress(T step) { + this.step = step.name(); + + + // off by one since we calculate the progress based on the number of steps, + // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the + // final progress being 80% and not 100%) + + this.stepNum = 1 + step.ordinal(); + + logger.info("ProcessTask {} progress: {}", taskBase, step.name()); + } + + @Override + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ProcessHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ProcessHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException ex) { + logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + SERVICE_INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, taskName); + stmt.setString(2, taskBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, serviceInstanceUUID); + stmt.setString(5, instanceUUID); + stmt.setString(6, serviceInstanceUUID); + stmt.executeUpdate(); + } + } + catch (SQLException ex) { + logger.error("ProcessHeartbeat failed to initialize", ex); + throw new RuntimeException(ex); + } + + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'RUNNING', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString(2, step); + stmt.setString(3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS='STOPPED', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? 
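The off-by-one comment in progress() above deserves a worked example. With stepNum = 1 + ordinal() and PROGRESS computed as round(100 * stepNum / stepCount), a five-step task reports 20, 40, 60, 80, 100 percent rather than 0 through 80. A self-contained sketch of just that arithmetic; the enum is illustrative:

// Mirrors the progress arithmetic in ProcessTaskHeartbeatImpl.heartbeatUpdate()
public class ProgressArithmeticDemo {
    enum FiveSteps { A, B, C, D, E }

    public static void main(String[] args) {
        int stepCount = FiveSteps.values().length; // 5

        for (var step : FiveSteps.values()) {
            int stepNum = 1 + step.ordinal(); // 1..5 rather than 0..4
            int progress = (int) Math.round(100 * stepNum / (double) stepCount);
            System.out.println(step + " -> " + progress + "%"); // A -> 20% ... E -> 100%
        }
    }
}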
+ """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString( 2, step); + stmt.setString( 3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + @Override + public void close() { + shutDown(); + } + +} + diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/FakeServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/FakeServiceHeartbeat.java new file mode 100644 index 00000000..c0c732b9 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/FakeServiceHeartbeat.java @@ -0,0 +1,14 @@ +package nu.marginalia.service.control; + +/** Dummy implementation of ServiceHeartbeat that does nothing */ +public class FakeServiceHeartbeat implements ServiceHeartbeat { + @Override + public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { + return new ServiceTaskHeartbeat() { + @Override + public void progress(T step) {} + @Override + public void close() {} + }; + } +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java index c9c5085c..992e42e9 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeat.java @@ -1,157 +1,8 @@ package nu.marginalia.service.control; -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.service.module.ServiceConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.concurrent.TimeUnit; - -/** This service sends a heartbeat to the database every 5 seconds, - * updating the control service with the liveness information for the service. 
- */ -@Singleton -public class ServiceHeartbeat { - private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeat.class); - private final String serviceName; - private final String serviceBase; - private final String instanceUUID; - private final ServiceConfiguration configuration; - private final ServiceEventLog eventLog; - private final HikariDataSource dataSource; - - - private final Thread runnerThread; - private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); - - private volatile boolean running = false; - - @Inject - public ServiceHeartbeat(ServiceConfiguration configuration, - ServiceEventLog eventLog, - HikariDataSource dataSource) - { - this.serviceName = configuration.serviceName() + ":" + configuration.node(); - this.serviceBase = configuration.serviceName(); - this.configuration = configuration; - this.eventLog = eventLog; - this.dataSource = dataSource; - - this.instanceUUID = configuration.instanceUuid().toString(); - - runnerThread = new Thread(this::run); - - Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); - } - - public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { - return new ServiceTaskHeartbeat<>(steps, configuration, processName, eventLog, dataSource); - } - - - public void start() { - if (!running) { - runnerThread.start(); - } - } - - public void shutDown() { - if (!running) - return; - - running = false; - - try { - runnerThread.join(); - heartbeatStop(); - } - catch (InterruptedException|SQLException ex) { - logger.warn("ServiceHeartbeat shutdown failed", ex); - } - } - - private void run() { - if (!running) - running = true; - else - return; - - try { - heartbeatInit(); - - while (running) { - - try { - heartbeatUpdate(); - } - catch (SQLException ex) { - logger.warn("ServiceHeartbeat failed to update", ex); - } - - TimeUnit.SECONDS.sleep(heartbeatInterval); - } - } - catch (InterruptedException|SQLException ex) { - logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); - System.exit(255); - } - } - - private void heartbeatInit() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE) - VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1) - ON DUPLICATE KEY UPDATE - INSTANCE = ?, - HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), - ALIVE = 1 - """ - )) - { - stmt.setString(1, serviceName); - stmt.setString(2, serviceBase); - stmt.setString(3, instanceUUID); - stmt.setString(4, instanceUUID); - stmt.executeUpdate(); - } - } - } - - private void heartbeatUpdate() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE SERVICE_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6) - WHERE INSTANCE = ? AND ALIVE = 1 - """) - ) - { - stmt.setString(1, instanceUUID); - stmt.executeUpdate(); - } - } - } - - private void heartbeatStop() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE SERVICE_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0 - WHERE INSTANCE = ? 
- """) - ) - { - stmt.setString(1, instanceUUID); - stmt.executeUpdate(); - } - } - } +import com.google.inject.ImplementedBy; +@ImplementedBy(ServiceHeartbeatImpl.class) +public interface ServiceHeartbeat { + > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName); } diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeatImpl.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeatImpl.java new file mode 100644 index 00000000..63746567 --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceHeartbeatImpl.java @@ -0,0 +1,158 @@ +package nu.marginalia.service.control; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +/** This service sends a heartbeat to the database every 5 seconds, + * updating the control service with the liveness information for the service. + */ +@Singleton +public class ServiceHeartbeatImpl implements ServiceHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeatImpl.class); + private final String serviceName; + private final String serviceBase; + private final String instanceUUID; + private final ServiceConfiguration configuration; + private final ServiceEventLog eventLog; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5); + + private volatile boolean running = false; + + @Inject + public ServiceHeartbeatImpl(ServiceConfiguration configuration, + ServiceEventLog eventLog, + HikariDataSource dataSource) + { + this.serviceName = configuration.serviceName() + ":" + configuration.node(); + this.serviceBase = configuration.serviceName(); + this.configuration = configuration; + this.eventLog = eventLog; + this.dataSource = dataSource; + + this.instanceUUID = configuration.instanceUuid().toString(); + + runnerThread = new Thread(this::run); + + Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown)); + } + + @Override + public > ServiceTaskHeartbeat createServiceTaskHeartbeat(Class steps, String processName) { + return new ServiceTaskHeartbeatImpl<>(steps, configuration, processName, eventLog, dataSource); + } + + + public void start() { + if (!running) { + runnerThread.start(); + } + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + heartbeatInit(); + + while (running) { + + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException|SQLException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, 
ALIVE) + VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1) + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + ALIVE = 1 + """ + )) + { + stmt.setString(1, serviceName); + stmt.setString(2, serviceBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6) + WHERE INSTANCE = ? AND ALIVE = 1 + """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE SERVICE_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0 + WHERE INSTANCE = ? + """) + ) + { + stmt.setString(1, instanceUUID); + stmt.executeUpdate(); + } + } + } + +} diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java index bf0d6a9f..a9dd5072 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeat.java @@ -1,196 +1,8 @@ package nu.marginalia.service.control; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.service.module.ServiceConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.UUID; -import java.util.concurrent.TimeUnit; - -/** This object sends a heartbeat to the database every few seconds, - * updating with the progress of a task within a service. Progress is tracked by providing - * enumerations corresponding to the steps in the task. It's important they're arranged in the same - * order as the steps in the task in order to get an accurate progress tracking. - */ -public class ServiceTaskHeartbeat> implements AutoCloseable { - private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeat.class); - private final String taskName; - private final String taskBase; - private final String instanceUUID; - private final HikariDataSource dataSource; - - - private final Thread runnerThread; - private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); - private final String serviceInstanceUUID; - private final int stepCount; - private final ServiceEventLog eventLog; - - private volatile boolean running = false; - private volatile int stepNum = 0; - private volatile String step = "-"; - - ServiceTaskHeartbeat(Class stepClass, - ServiceConfiguration configuration, - String taskName, - ServiceEventLog eventLog, - HikariDataSource dataSource) - { - this.eventLog = eventLog; - this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node(); - this.taskBase = configuration.serviceName() + "." + taskName; - this.dataSource = dataSource; - - this.instanceUUID = UUID.randomUUID().toString(); - this.serviceInstanceUUID = configuration.instanceUuid().toString(); - - this.stepCount = stepClass.getEnumConstants().length; - - heartbeatInit(); - - runnerThread = new Thread(this::run); - runnerThread.start(); - } - - /** Update the progress of the task. 
This is a fast function that doesn't block; - * the actual update is done in a separate thread. - * - * @param step The current step in the task. - */ - public void progress(T step) { - this.step = step.name(); - - - // off by one since we calculate the progress based on the number of steps, - // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the - // final progress being 80% and not 100%) - - this.stepNum = 1 + step.ordinal(); - - logger.info("ServiceTask {} progress: {}", taskBase, step.name()); - eventLog.logEvent("TASK-STEP", taskName + " = " + step.name()); - } - - public void shutDown() { - if (!running) - return; - - running = false; - - try { - runnerThread.join(); - heartbeatStop(); - } - catch (InterruptedException|SQLException ex) { - logger.warn("ServiceHeartbeat shutdown failed", ex); - } - } - - private void run() { - if (!running) - running = true; - else - return; - - try { - while (running) { - try { - heartbeatUpdate(); - } - catch (SQLException ex) { - logger.warn("ServiceHeartbeat failed to update", ex); - } - - TimeUnit.SECONDS.sleep(heartbeatInterval); - } - } - catch (InterruptedException ex) { - logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); - System.exit(255); - } - } - - private void heartbeatInit() { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) - VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') - ON DUPLICATE KEY UPDATE - INSTANCE = ?, - SERVICE_INSTANCE = ?, - HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), - STATUS = 'STARTING' - """ - )) - { - stmt.setString(1, taskName); - stmt.setString(2, taskBase); - stmt.setString(3, instanceUUID); - stmt.setString(4, serviceInstanceUUID); - stmt.setString(5, instanceUUID); - stmt.setString(6, serviceInstanceUUID); - stmt.executeUpdate(); - } - } - catch (SQLException ex) { - logger.error("ServiceHeartbeat failed to initialize", ex); - throw new RuntimeException(ex); - } - - eventLog.logEvent("TASK-STARTED", taskName); - } - - private void heartbeatUpdate() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE TASK_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), - STATUS = 'RUNNING', - PROGRESS = ?, - STAGE_NAME = ? - WHERE INSTANCE = ? - """) - ) - { - stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); - stmt.setString(2, step); - stmt.setString(3, instanceUUID); - stmt.executeUpdate(); - } - } - } - - private void heartbeatStop() throws SQLException { - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement( - """ - UPDATE TASK_HEARTBEAT - SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), - STATUS='STOPPED', - PROGRESS = ?, - STAGE_NAME = ? - WHERE INSTANCE = ? 
- """) - ) - { - stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); - stmt.setString( 2, step); - stmt.setString( 3, instanceUUID); - stmt.executeUpdate(); - } - } - eventLog.logEvent("TASK-TERMINATED", taskName); - } +public interface ServiceTaskHeartbeat> extends AutoCloseable { + void progress(T step); @Override - public void close() { - shutDown(); - } - + void close(); } - diff --git a/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeatImpl.java b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeatImpl.java new file mode 100644 index 00000000..c16bcc8c --- /dev/null +++ b/code/common/service/src/main/java/nu/marginalia/service/control/ServiceTaskHeartbeatImpl.java @@ -0,0 +1,197 @@ +package nu.marginalia.service.control; + + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.service.module.ServiceConfiguration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +/** This object sends a heartbeat to the database every few seconds, + * updating with the progress of a task within a service. Progress is tracked by providing + * enumerations corresponding to the steps in the task. It's important they're arranged in the same + * order as the steps in the task in order to get an accurate progress tracking. + */ +public class ServiceTaskHeartbeatImpl> implements ServiceTaskHeartbeat { + private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeatImpl.class); + private final String taskName; + private final String taskBase; + private final String instanceUUID; + private final HikariDataSource dataSource; + + + private final Thread runnerThread; + private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1); + private final String serviceInstanceUUID; + private final int stepCount; + private final ServiceEventLog eventLog; + + private volatile boolean running = false; + private volatile int stepNum = 0; + private volatile String step = "-"; + + ServiceTaskHeartbeatImpl(Class stepClass, + ServiceConfiguration configuration, + String taskName, + ServiceEventLog eventLog, + HikariDataSource dataSource) + { + this.eventLog = eventLog; + this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node(); + this.taskBase = configuration.serviceName() + "." + taskName; + this.dataSource = dataSource; + + this.instanceUUID = UUID.randomUUID().toString(); + this.serviceInstanceUUID = configuration.instanceUuid().toString(); + + this.stepCount = stepClass.getEnumConstants().length; + + heartbeatInit(); + + runnerThread = new Thread(this::run); + runnerThread.start(); + } + + /** Update the progress of the task. This is a fast function that doesn't block; + * the actual update is done in a separate thread. + * + * @param step The current step in the task. 
+ */ + @Override + public void progress(T step) { + this.step = step.name(); + + + // off by one since we calculate the progress based on the number of steps, + // and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the + // final progress being 80% and not 100%) + + this.stepNum = 1 + step.ordinal(); + + logger.info("ServiceTask {} progress: {}", taskBase, step.name()); + eventLog.logEvent("TASK-STEP", taskName + " = " + step.name()); + } + + public void shutDown() { + if (!running) + return; + + running = false; + + try { + runnerThread.join(); + heartbeatStop(); + } + catch (InterruptedException|SQLException ex) { + logger.warn("ServiceHeartbeat shutdown failed", ex); + } + } + + private void run() { + if (!running) + running = true; + else + return; + + try { + while (running) { + try { + heartbeatUpdate(); + } + catch (SQLException ex) { + logger.warn("ServiceHeartbeat failed to update", ex); + } + + TimeUnit.SECONDS.sleep(heartbeatInterval); + } + } + catch (InterruptedException ex) { + logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex); + System.exit(255); + } + } + + private void heartbeatInit() { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING') + ON DUPLICATE KEY UPDATE + INSTANCE = ?, + SERVICE_INSTANCE = ?, + HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'STARTING' + """ + )) + { + stmt.setString(1, taskName); + stmt.setString(2, taskBase); + stmt.setString(3, instanceUUID); + stmt.setString(4, serviceInstanceUUID); + stmt.setString(5, instanceUUID); + stmt.setString(6, serviceInstanceUUID); + stmt.executeUpdate(); + } + } + catch (SQLException ex) { + logger.error("ServiceHeartbeat failed to initialize", ex); + throw new RuntimeException(ex); + } + + eventLog.logEvent("TASK-STARTED", taskName); + } + + private void heartbeatUpdate() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS = 'RUNNING', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? + """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString(2, step); + stmt.setString(3, instanceUUID); + stmt.executeUpdate(); + } + } + } + + private void heartbeatStop() throws SQLException { + try (var connection = dataSource.getConnection()) { + try (var stmt = connection.prepareStatement( + """ + UPDATE TASK_HEARTBEAT + SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), + STATUS='STOPPED', + PROGRESS = ?, + STAGE_NAME = ? + WHERE INSTANCE = ? 
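The columns maintained by the statements above (STATUS, PROGRESS, STAGE_NAME, HEARTBEAT_TIME) are enough for simple monitoring. The following is a hypothetical read-side query in the same style, not part of this changeset; the real control service may consume these rows differently:

import javax.sql.DataSource;
import java.sql.SQLException;

public class TaskHeartbeatQueryDemo {
    // hypothetical dashboard query over the TASK_HEARTBEAT rows written above
    static void printRunningTasks(DataSource dataSource) throws SQLException {
        try (var connection = dataSource.getConnection();
             var stmt = connection.prepareStatement(
                     """
                     SELECT TASK_NAME, PROGRESS, STAGE_NAME, HEARTBEAT_TIME
                     FROM TASK_HEARTBEAT
                     WHERE STATUS = 'RUNNING'
                     ORDER BY HEARTBEAT_TIME DESC
                     """);
             var rs = stmt.executeQuery())
        {
            while (rs.next()) {
                System.out.printf("%s: %d%% (%s)%n",
                        rs.getString("TASK_NAME"),
                        rs.getInt("PROGRESS"),
                        rs.getString("STAGE_NAME"));
            }
        }
    }
}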
+ """) + ) + { + stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount)); + stmt.setString( 2, step); + stmt.setString( 3, instanceUUID); + stmt.executeUpdate(); + } + } + eventLog.logEvent("TASK-TERMINATED", taskName); + } + + @Override + public void close() { + shutDown(); + } + +} + diff --git a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java index 73706dc8..9db8d82e 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java +++ b/code/common/service/src/main/java/nu/marginalia/service/server/BaseServiceParams.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.service.control.ServiceEventLog; -import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.control.ServiceHeartbeatImpl; import nu.marginalia.service.module.ServiceConfiguration; /** This class exists to reduce Service boilerplate */ @@ -13,14 +13,14 @@ public class BaseServiceParams { public final ServiceConfiguration configuration; public final Initialization initialization; public final MetricsServer metricsServer; - public final ServiceHeartbeat heartbeat; + public final ServiceHeartbeatImpl heartbeat; public final ServiceEventLog eventLog; public final MessageQueueFactory messageQueueInboxFactory; @Inject public BaseServiceParams(ServiceConfiguration configuration, Initialization initialization, MetricsServer metricsServer, - ServiceHeartbeat heartbeat, + ServiceHeartbeatImpl heartbeat, ServiceEventLog eventLog, MessageQueueFactory messageQueueInboxFactory) { this.configuration = configuration; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 74554fd0..b3b098cf 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -7,7 +7,7 @@ import nu.marginalia.language.model.WordRep; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.EdgeUrl; -import javax.inject.Inject; +import com.google.inject.Inject; import java.util.*; import java.util.stream.Stream; diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java index b408f980..0ca2db77 100644 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java @@ -2,6 +2,7 @@ package nu.marginalia.ranking; import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap; +import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,6 +38,11 @@ public class DomainRankings { return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE); } + public float getSortRanking(long docId) { + int domainId = UrlIdCodec.getDomainId(docId); + return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE; + } + public int 
size() { return rankings.size(); } diff --git a/code/features-index/index-forward/build.gradle b/code/features-index/index-forward/build.gradle index f55a02df..8dd3e788 100644 --- a/code/features-index/index-forward/build.gradle +++ b/code/features-index/index-forward/build.gradle @@ -16,9 +16,8 @@ dependencies { implementation project(':code:features-index:domain-ranking') implementation project(':code:features-index:index-query') implementation project(':code:features-index:index-journal') - implementation project(':code:features-index:lexicon') implementation project(':code:common:model') - implementation project(':code:common:service') + implementation project(':code:common:process') implementation project(':third-party:uppend') diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 4aa083e3..edba7da6 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -6,10 +6,10 @@ import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.array.LongArray; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.service.control.ServiceHeartbeat; -import org.roaringbitmap.IntConsumer; -import org.roaringbitmap.RoaringBitmap; +import org.roaringbitmap.longlong.LongConsumer; +import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,24 +20,24 @@ import java.nio.file.Path; public class ForwardIndexConverter { - private final ServiceHeartbeat heartbeat; - private final File inputFile; + private final ProcessHeartbeat heartbeat; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final IndexJournalReader journalReader; private final Path outputFileDocsId; private final Path outputFileDocsData; private final DomainRankings domainRankings; - public ForwardIndexConverter(ServiceHeartbeat heartbeat, - File inputFile, + public ForwardIndexConverter(ProcessHeartbeat heartbeat, + IndexJournalReader journalReader, Path outputFileDocsId, Path outputFileDocsData, DomainRankings domainRankings ) { this.heartbeat = heartbeat; - this.inputFile = inputFile; + this.journalReader = journalReader; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; this.domainRankings = domainRankings; @@ -54,17 +54,9 @@ public class ForwardIndexConverter { public void convert() throws IOException { deleteOldFiles(); - IndexJournalReaderSingleCompressedFile journalReader = new IndexJournalReaderSingleCompressedFile(inputFile.toPath()); - if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) { - logger.warn("Bailing: Journal is empty!"); - return; - } - - logger.info("Converting {} {}", inputFile, journalReader.fileHeader); - logger.info("Domain Rankings size = {}", domainRankings.size()); - try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { + try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { 
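Before the body of convert(), a note on UrlIdCodec, which this changeset leans on throughout in place of hand-rolled (domainId << 32) | urlId packing. Its exact bit layout is not shown in this diff, but the calls used here imply the round-trip behavior below; a sketch in assertion form (run with -ea), where the concrete ids are arbitrary example values:

import nu.marginalia.model.id.UrlIdCodec;

public class UrlIdCodecRoundTrip {
    public static void main(String[] args) {
        int domainId = 42;
        int documentOrdinal = 7;

        long docId = UrlIdCodec.encodeId(domainId, documentOrdinal);

        // a freshly encoded id carries no rank; ForwardIndexReader asserts exactly this
        assert UrlIdCodec.getRank(docId) == 0;

        // the domain id survives the round trip; the converter test below relies on this
        assert UrlIdCodec.getDomainId(docId) == domainId;

        // with no rank bits set, removeRank() is implied to be the identity;
        // ParamMatchingQueryFilter uses it to strip ranking off combined ids
        assert UrlIdCodec.removeRank(docId) == docId;
    }
}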
progress.progress(TaskSteps.GET_DOC_IDS); LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); @@ -83,12 +75,11 @@ public class ForwardIndexConverter { LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); journalReader.forEach(entry -> { - long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId()); + long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId()); int ranking = domainRankings.getRanking(entry.domainId()); long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking); - docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures()); }); @@ -109,17 +100,18 @@ public class ForwardIndexConverter { } private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { - RoaringBitmap rbm = new RoaringBitmap(); - journalReader.forEachUrlId(rbm::add); + Roaring64Bitmap rbm = new Roaring64Bitmap(); + journalReader.forEachDocId(rbm::add); - LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality()); - rbm.forEach(new IntConsumer() { + LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality()); + rbm.forEach(new LongConsumer() { int offset; @Override - public void accept(int value) { + public void accept(long value) { ret.set(offset++, value); } }); + return ret; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexFileNames.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexFileNames.java new file mode 100644 index 00000000..89cd0d6d --- /dev/null +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexFileNames.java @@ -0,0 +1,28 @@ +package nu.marginalia.index.forward; + +import java.nio.file.Path; + +public class ForwardIndexFileNames { + public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) { + return switch (identifier) { + case DOC_ID -> switch (version) { + case NEXT -> basePath.resolve("fwd-doc-id.dat.next"); + case CURRENT -> basePath.resolve("fwd-doc-id.dat"); + }; + case DOC_DATA -> switch (version) { + case NEXT -> basePath.resolve("fwd-doc-data.dat.next"); + case CURRENT -> basePath.resolve("fwd-doc-data.dat"); + }; + }; + } + + public enum FileVersion { + CURRENT, + NEXT + }; + + public enum FileIdentifier { + DOC_DATA, + DOC_ID + } +} diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java index f9c17a71..0b306050 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,9 +1,8 @@ package nu.marginalia.index.forward; class ForwardIndexParameters { - public static final int ENTRY_SIZE = 3; - public static final int DOMAIN_OFFSET = 0; - public static final int METADATA_OFFSET = 1; - public static final int FEATURES_OFFSET = 2; + public static final int ENTRY_SIZE = 2; + public static final int METADATA_OFFSET = 0; + public static final int 
FEATURES_OFFSET = 1; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 3bdf14c8..dc888aa9 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -3,6 +3,7 @@ package nu.marginalia.index.forward; import com.upserve.uppend.blobs.NativeIO; import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.array.LongArray; +import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -71,6 +72,8 @@ public class ForwardIndexReader { } public long getDocMeta(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + long offset = idxForDoc(docId); if (offset < 0) return 0; @@ -78,20 +81,17 @@ public class ForwardIndexReader { } public int getHtmlFeatures(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + long offset = idxForDoc(docId); if (offset < 0) return 0; return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET); } - public int getDomainId(long docId) { - long offset = idxForDoc(docId); - if (offset < 0) return 0; - - return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); - } - private int idxForDoc(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + return idToOffset.get(docId); } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java index 8d22516b..d7e6a9b3 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java @@ -1,5 +1,6 @@ package nu.marginalia.index.forward; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.query.IndexQueryParams; @@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { } @Override - public boolean test(long docId) { - int urlId = (int) (docId & 0xFFFF_FFFFL); - int domainId = forwardIndexReader.getDomainId(urlId); - long meta = forwardIndexReader.getDocMeta(urlId); + public boolean test(long combinedId) { + long docId = UrlIdCodec.removeRank(combinedId); + int domainId = UrlIdCodec.getDomainId(docId); + + long meta = forwardIndexReader.getDocMeta(docId); if (!validateDomain(domainId, meta)) { return false; diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 1c6fdf1c..b3485475 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -2,14 +2,14 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; import 
nu.marginalia.index.journal.model.IndexJournalEntry; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.process.control.ProcessTaskHeartbeatImpl; import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.service.control.ServiceHeartbeat; -import nu.marginalia.service.control.ServiceTaskHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -28,7 +28,6 @@ import static org.mockito.Mockito.when; class ForwardIndexConverterTest { - KeywordLexicon keywordLexicon; IndexJournalWriter writer; Path indexFile; @@ -49,12 +48,9 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); - keywordLexicon.getOrInsert("0"); - indexFile = Files.createTempFile("tmp", ".idx"); indexFile.toFile().deleteOnExit(); - writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); + writer = new IndexJournalWriterSingleFileImpl(indexFile); wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); @@ -62,11 +58,9 @@ class ForwardIndexConverterTest { dataDir = Files.createTempDirectory(getClass().getSimpleName()); for (int i = 1; i < workSetSize; i++) { - createEntry(writer, keywordLexicon, i); + createEntry(writer, i); } - - keywordLexicon.commitToDisk(); writer.close(); @@ -84,15 +78,16 @@ class ForwardIndexConverterTest { } long createId(long url, long domain) { - return (domain << 32) | url; + return UrlIdCodec.encodeId((int) domain, (int) url); } - public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { + + public void createEntry(IndexJournalWriter writer, int id) { int[] factors = getFactorsI(id); var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5); for (int i = 0; i+1 < factors.length; i+=2) { - entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i+1]); + entryBuilder.add(factors[i], -factors[i+1]); } writer.put(entryBuilder.build()); @@ -101,18 +96,14 @@ class ForwardIndexConverterTest { @Test void testForwardIndex() throws IOException { - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); + new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleCompressedFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); for (int i = 36; i < workSetSize; i++) { - assertEquals(0x00FF000000000000L | (i % 5), 
forwardReader.getDocMeta(i)); - assertEquals(i/20, forwardReader.getDomainId(i)); + long docId = createId(i, i/20); + assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId)); + assertEquals(i/20, UrlIdCodec.getDomainId(docId)); } } diff --git a/code/features-index/index-journal/build.gradle b/code/features-index/index-journal/build.gradle index bc87a59f..b20d7157 100644 --- a/code/features-index/index-journal/build.gradle +++ b/code/features-index/index-journal/build.gradle @@ -13,7 +13,6 @@ java { dependencies { implementation project(':code:libraries:array') implementation project(':code:common:model') - implementation project(':code:features-index:lexicon') implementation libs.lombok annotationProcessor libs.lombok @@ -22,6 +21,7 @@ dependencies { implementation libs.prometheus implementation libs.notnull implementation libs.rxjava + implementation libs.guava implementation libs.trove implementation libs.zstd implementation libs.commons.lang3 diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java index c3602266..2239fc8a 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java @@ -1,8 +1,6 @@ package nu.marginalia.index.journal.model; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.UrlIdCodec; public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { @@ -15,18 +13,7 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntr long documentMeta) { - return builder(new EdgeId<>(domainId), - new EdgeId<>(urlId), - documentMeta); + return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta); } - public static IndexJournalEntryBuilder builder(EdgeId domainId, - EdgeId urlId, - long documentMeta) { - - - return new IndexJournalEntryBuilder(0, - IndexJournalEntryHeader.combineIds(domainId, urlId), - documentMeta); - } } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java index f24be823..dcf49149 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryData.java @@ -25,7 +25,7 @@ public class IndexJournalEntryData implements Iterable= size) - throw new ArrayIndexOutOfBoundsException(); + throw new ArrayIndexOutOfBoundsException(idx + " vs " + size); return underlyingArray[idx]; } @@ -58,9 +58,9 @@ public class IndexJournalEntryData implements Iterable domainId, + public IndexJournalEntryHeader(long combinedId, int documentFeatures, - EdgeId urlId, long documentMeta) { this(-1, documentFeatures, - combineIds(domainId, urlId), + combinedId, documentMeta); } - static long combineIds(EdgeId domainId, EdgeId urlId) { - long did = domainId.id(); - long uid = urlId.id(); - - return (did << 32L) | uid; - } - } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java 
b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java index 00ba3b88..42729331 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java @@ -2,11 +2,13 @@ package nu.marginalia.index.journal.reader; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.model.id.UrlIdCodec; import java.io.DataInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.util.Arrays; public class IndexJournalReadEntry { public final IndexJournalEntryHeader header; @@ -51,11 +53,7 @@ public class IndexJournalReadEntry { } public int domainId() { - return (int) (docId() >>> 32L); - } - - public int urlId() { - return (int) (docId() & 0xFFFF_FFFFL); + return UrlIdCodec.getDomainId(docId()); } public IndexJournalEntryData readEntry() { diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java index 1467c500..04cd4a62 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java @@ -1,31 +1,48 @@ package nu.marginalia.index.journal.reader; import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalFileHeader; -import nu.marginalia.index.journal.model.IndexJournalStatistics; +import nu.marginalia.model.idx.WordFlags; import org.jetbrains.annotations.NotNull; import java.io.IOException; +import java.nio.file.Path; import java.util.Iterator; -import java.util.function.IntConsumer; +import java.util.function.LongConsumer; +import java.util.function.Predicate; public interface IndexJournalReader extends Iterable { int FILE_HEADER_SIZE_LONGS = 2; int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; - IndexJournalFileHeader fileHeader(); + static IndexJournalReader singleFile(Path fileName) throws IOException { + return new IndexJournalReaderSingleCompressedFile(fileName); + } - IndexJournalStatistics getStatistics(); + static IndexJournalReader paging(Path baseDir) throws IOException { + return new IndexJournalReaderPagingImpl(baseDir); + } - void forEachWordId(IntConsumer consumer); + static IndexJournalReader singleFileWithPriorityFilters(Path path) throws IOException { - void forEachUrlIdWordId(BiIntConsumer consumer); + long highPriorityFlags = + WordFlags.Title.asBit() + | WordFlags.Subjects.asBit() + | WordFlags.TfIdfHigh.asBit() + | WordFlags.NamesWords.asBit() + | WordFlags.UrlDomain.asBit() + | WordFlags.UrlPath.asBit() + | WordFlags.Site.asBit() + | WordFlags.SiteAdjacent.asBit(); - void forEachDocIdWordId(LongIntConsumer consumer); + return new IndexJournalReaderSingleCompressedFile(path, null, + r -> (r.metadata() & highPriorityFlags) != 0); + } + + void forEachWordId(LongConsumer consumer); void forEachDocIdRecord(LongObjectConsumer consumer); - void forEachUrlId(IntConsumer consumer); + void forEachDocId(LongConsumer consumer); @NotNull @Override @@ -33,13 +50,7 @@ public interface IndexJournalReader extends Iterable { void 
close() throws IOException; - interface BiIntConsumer { - void accept(int left, int right); - } - interface LongIntConsumer { - void accept(long left, int right); - } interface LongObjectConsumer { void accept(long left, T right); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderPagingImpl.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderPagingImpl.java new file mode 100644 index 00000000..24bf02db --- /dev/null +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderPagingImpl.java @@ -0,0 +1,61 @@ +package nu.marginalia.index.journal.reader; + +import com.google.common.collect.Iterators; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalStatistics; +import nu.marginallia.index.journal.IndexJournalFileNames; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.function.LongConsumer; + +public class IndexJournalReaderPagingImpl implements IndexJournalReader { + + private final List readers; + + public IndexJournalReaderPagingImpl(Path baseDir) throws IOException { + var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir); + this.readers = new ArrayList<>(inputFiles.size()); + + for (var inputFile : inputFiles) { + readers.add(new IndexJournalReaderSingleCompressedFile(inputFile)); + } + } + + @Override + public void forEachWordId(LongConsumer consumer) { + for (var reader : readers) { + reader.forEachWordId(consumer); + } + } + + @Override + public void forEachDocIdRecord(LongObjectConsumer consumer) { + for (var reader : readers) { + reader.forEachDocIdRecord(consumer); + } + } + + @Override + public void forEachDocId(LongConsumer consumer) { + for (var reader : readers) { + reader.forEachDocId(consumer); + } + } + + @Override + public @NotNull Iterator iterator() { + return Iterators.concat(readers.stream().map(IndexJournalReader::iterator).iterator()); + } + + @Override + public void close() throws IOException { + for (var reader : readers) { + reader.close(); + } + } +} diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java index c64bccf5..04352880 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java @@ -12,21 +12,30 @@ import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import java.util.Arrays; import java.util.Iterator; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; import java.util.function.Predicate; public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader { - private static Path journalFile; + private Path journalFile; public final IndexJournalFileHeader fileHeader; + @Override + public String toString() { + return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }"; + } + private DataInputStream dataInputStream = null; final 
Predicate entryPredicate; final Predicate recordPredicate; public IndexJournalReaderSingleCompressedFile(Path file) throws IOException { + this.journalFile = file; + fileHeader = readHeader(file); this.recordPredicate = null; @@ -34,7 +43,8 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } public IndexJournalReaderSingleCompressedFile(Path file, Predicate entryPredicate, Predicate recordPredicate) throws IOException { - journalFile = file; + this.journalFile = file; + fileHeader = readHeader(file); this.recordPredicate = recordPredicate; @@ -42,8 +52,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } private static IndexJournalFileHeader readHeader(Path file) throws IOException { - journalFile = file; - try (var raf = new RandomAccessFile(file.toFile(), "r")) { long unused = raf.readLong(); long wordCount = raf.readLong(); @@ -61,10 +69,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream))); } - public IndexJournalFileHeader fileHeader() { - return fileHeader; - } - public boolean filter(IndexJournalReadEntry entry) { return entryPredicate == null || entryPredicate.test(entry); } @@ -80,31 +84,7 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade @Override - public IndexJournalStatistics getStatistics() { - int highestWord = 0; - - // Docs cardinality is a candidate for a HyperLogLog - Roaring64Bitmap docsBitmap = new Roaring64Bitmap(); - - for (var entry : this) { - var entryData = entry.readEntry(); - - if (filter(entry)) { - docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL); - - for (var item : entryData) { - if (filter(entry, item)) { - highestWord = Integer.max(item.wordId(), highestWord); - } - } - } - } - - return new IndexJournalStatistics(highestWord, docsBitmap.getIntCardinality()); - } - - @Override - public void forEachWordId(IntConsumer consumer) { + public void forEachWordId(LongConsumer consumer) { for (var entry : this) { var data = entry.readEntry(); for (var post : data) { @@ -115,32 +95,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } } - @Override - public void forEachUrlIdWordId(BiIntConsumer consumer) { - for (var entry : this) { - var data = entry.readEntry(); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.urlId(), post.wordId()); - } - } - } - } - - @Override - public void forEachDocIdWordId(LongIntConsumer consumer) { - for (var entry : this) { - var data = entry.readEntry(); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.docId(), post.wordId()); - } - } - } - } - @Override public void forEachDocIdRecord(LongObjectConsumer consumer) { for (var entry : this) { @@ -154,10 +108,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } } @Override - public void forEachUrlId(IntConsumer consumer) { + public void forEachDocId(LongConsumer consumer) { for (var entry : this) { if (filter(entry)) { - consumer.accept(entry.urlId()); + consumer.accept(entry.docId()); } } } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterPagingImpl.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterPagingImpl.java new file mode 100644 index 00000000..d1dca2d6 --- /dev/null +++ 
b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterPagingImpl.java @@ -0,0 +1,48 @@ +package nu.marginalia.index.journal.writer; + +import lombok.SneakyThrows; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginallia.index.journal.IndexJournalFileNames; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; + +public class IndexJournalWriterPagingImpl implements IndexJournalWriter { + private final Path outputDir; + private int fileNumber = 0; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + private IndexJournalWriter currentWriter = null; + private int inputsForFile = 0; + + public IndexJournalWriterPagingImpl(Path outputDir) throws IOException { + this.outputDir = outputDir; + switchToNextWriter(); + + logger.info("Creating Journal Writer {}", outputDir); + } + + private void switchToNextWriter() throws IOException { + if (currentWriter != null) + currentWriter.close(); + + currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++)); + } + + @Override + @SneakyThrows + public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { + if (++inputsForFile > 100_000) { + inputsForFile = 0; + switchToNextWriter(); + } + currentWriter.put(header, entry); + } + + public void close() throws IOException { + currentWriter.close(); + } +} diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterSingleFileImpl.java similarity index 83% rename from code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java rename to code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterSingleFileImpl.java index c1cec279..bb49b62b 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterImpl.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/writer/IndexJournalWriterSingleFileImpl.java @@ -1,12 +1,11 @@ package nu.marginalia.index.journal.writer; import com.github.luben.zstd.ZstdDirectBufferCompressingStream; -import com.github.luben.zstd.ZstdOutputStream; import lombok.SneakyThrows; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginallia.index.journal.IndexJournalFileNames; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,27 +15,34 @@ import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import java.nio.file.attribute.PosixFilePermissions; -public class IndexJournalWriterImpl implements IndexJournalWriter{ - private final KeywordLexicon lexicon; +public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ private static final int ZSTD_BUFFER_SIZE = 8192; private static final int DATA_BUFFER_SIZE = 8192; private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE); - private final ZstdDirectBufferCompressingStream 
compressingStream; - private int numEntries = 0; private final FileChannel fileChannel; - public IndexJournalWriterImpl(KeywordLexicon lexicon, Path outputFile) throws IOException { - this.lexicon = lexicon; + private int numEntries = 0; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException { + + logger.info("Creating Journal Writer {}", outputFile); + + Files.deleteIfExists(outputFile); + Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING); writeHeaderPlaceholder(fileChannel); + compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) { protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException { toFlush.flip(); @@ -64,7 +70,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{ @Override @SneakyThrows - public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { + public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { if (dataBuffer.capacity() - dataBuffer.position() < 3*8) { dataBuffer.flip(); compressingStream.compress(dataBuffer); @@ -84,6 +90,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{ dataBuffer.clear(); } else while (remaining-- > 0 && i < entry.size()) { + dataBuffer.putLong(entry.underlyingArray[i++]); } } @@ -103,7 +110,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{ ByteBuffer header = ByteBuffer.allocate(16); header.putLong(numEntries); - header.putLong(lexicon.size()); + header.putLong(0); header.flip(); while (header.position() < header.limit()) { diff --git a/code/features-index/index-journal/src/main/java/nu/marginallia/index/journal/IndexJournalFileNames.java b/code/features-index/index-journal/src/main/java/nu/marginallia/index/journal/IndexJournalFileNames.java new file mode 100644 index 00000000..433cbe2e --- /dev/null +++ b/code/features-index/index-journal/src/main/java/nu/marginallia/index/journal/IndexJournalFileNames.java @@ -0,0 +1,30 @@ +package nu.marginallia.index.journal; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class IndexJournalFileNames { + public static Path allocateName(Path base, int idx) { + return base.resolve(String.format("page-index-%04d.dat", idx)); + } + + public static List findJournalFiles(Path baseDirectory) throws IOException { + List ret = new ArrayList<>(); + + try (var listStream = Files.list(baseDirectory)) { + listStream + .filter(IndexJournalFileNames::isJournalFile) + .sorted() + .forEach(ret::add); + } + + return ret; + } + + public static boolean isJournalFile(Path file) { + return file.toFile().getName().matches("page-index-\\d{4}.dat"); + } +} diff --git a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java index 9cb96781..23814556 100644 --- a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java +++ b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java @@ -4,13 +4,12 @@ import 
nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; -import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.model.id.UrlIdCodec; import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import java.io.IOException; import java.nio.file.Files; @@ -22,15 +21,16 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class IndexJournalTest { Path tempFile; - KeywordLexicon lexicon; IndexJournalReader reader; + long firstDocId = UrlIdCodec.encodeId(44, 10); + long secondDocId = UrlIdCodec.encodeId(43, 15); + @BeforeEach public void setUp() throws IOException { tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); - lexicon = Mockito.mock(KeywordLexicon.class); - var journalWriter = new IndexJournalWriterImpl(lexicon, tempFile); + var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile); journalWriter.put(IndexJournalEntry.builder(44, 10, 55) .add(1, 2) .add(2, 3) @@ -65,11 +65,11 @@ public class IndexJournalTest { } @Test - public void forEachUrlId() { - List expected = List.of(10, 15); - List actual = new ArrayList<>(); + public void forEachDocId() { + List expected = List.of(firstDocId, secondDocId); + List actual = new ArrayList<>(); - reader.forEachUrlId(actual::add); + reader.forEachDocId(actual::add); assertEquals(expected, actual); } @@ -78,50 +78,19 @@ public class IndexJournalTest { List expected = List.of(1, 2, 3, 5, 5 ,6); List actual = new ArrayList<>(); - reader.forEachWordId(actual::add); - assertEquals(expected, actual); - } - - - @Test - public void forEachUrlIdWordId() { - List> expected = List.of( - Pair.of(10, 1), - Pair.of(10, 2), - Pair.of(10, 3), - Pair.of(10, 5), - Pair.of(15, 5), - Pair.of(15, 6)); - List> actual = new ArrayList<>(); - - reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word))); - assertEquals(expected, actual); - } - - @Test - public void forEachDocIdWordId() { - List> expected = List.of( - Pair.of(10L | (44L << 32), 1), - Pair.of(10L | (44L << 32), 2), - Pair.of(10L | (44L << 32), 3), - Pair.of(10L | (44L << 32), 5), - Pair.of(15L | (43L << 32), 5), - Pair.of(15L | (43L << 32), 6)); - List> actual = new ArrayList<>(); - - reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word))); + reader.forEachWordId(i -> actual.add((int) i)); assertEquals(expected, actual); } @Test public void forEachDocIdRecord() { List> expected = List.of( - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)), - Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)), - Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6)) + Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)), + 
Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)), + Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6)) ); List> actual = new ArrayList<>(); diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java index 274ce96b..68a88625 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -9,16 +9,16 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf; public interface IndexQueryBuilder { /** Filters documents that also contain termId, within the full index. */ - IndexQueryBuilder alsoFull(int termId); + IndexQueryBuilder alsoFull(long termId); /** * Filters documents that also contain the termId, within the priority index. */ - IndexQueryBuilder alsoPrio(int termIds); + IndexQueryBuilder alsoPrio(long termIds); /** Excludes documents that contain termId, within the full index */ - IndexQueryBuilder notFull(int termId); + IndexQueryBuilder notFull(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java index 688ef938..5e849d79 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java @@ -21,7 +21,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf { } public String describe() { - return "[NoPass]"; + return "[PassThrough]"; } } diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java index 529950e7..8fba1801 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java @@ -3,10 +3,10 @@ package nu.marginalia.index.searchset; public interface SearchSet { /** - * Returns true if the given urlId is contained in the set + * Returns true if the given domainId is contained in the set * or if the documentMetadata vibes with the set * */ - boolean contains(int urlId, long documentMetadata); + boolean contains(int domainId, long documentMetadata); } diff --git a/code/features-index/index-reverse/build.gradle b/code/features-index/index-reverse/build.gradle index 90bf9411..76c382a2 100644 --- a/code/features-index/index-reverse/build.gradle +++ b/code/features-index/index-reverse/build.gradle @@ -18,15 +18,15 @@ dependencies { implementation project(':code:features-index:domain-ranking') implementation project(':code:features-index:index-query') implementation project(':code:features-index:index-journal') - implementation project(':code:features-index:lexicon') implementation project(':code:common:model') - implementation project(':code:common:service') + implementation project(':code:common:process') + implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j - implementation libs.prometheus + implementation libs.fastutil 
testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-index/index-reverse/index.svg b/code/features-index/index-reverse/index.svg new file mode 100644 index 00000000..8c0184ea --- /dev/null +++ b/code/features-index/index-reverse/index.svg @@ -0,0 +1,4 @@ + + + +
[index.svg diagram text: "Reverse Index". A "Words" static B-tree index of (word1, offset1) ... (wordN, offsetN) entries points into per-word "Documents" static B-tree indexes 1..N of (doc, meta) entries.]
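To make the diagram's two-level structure concrete, here is a minimal sketch of the lookup flow it depicts. This is illustrative only: sorted plain arrays stand in for the words and documents B-trees, and all names here are invented for the example, not taken from the codebase.

```java
import java.util.Arrays;

/** Sketch of the lookup flow in the diagram: a words index maps each wordId to
 *  an offset, and at that offset lies the word's run of (docId, meta) pairs.
 *  Sorted arrays stand in for the static B-trees the real index uses. */
class ReverseIndexLookupSketch {
    static final int ENTRY_SIZE = 2; // each document record is (docId, meta)

    static long[] documentsFor(long wordId,
                               long[] wordIds, long[] offsets, long[] counts,
                               long[] docs) {
        int idx = Arrays.binarySearch(wordIds, wordId); // the "Words" index lookup
        if (idx < 0) return new long[0];                // word not in the index
        int start = (int) offsets[idx];                 // where this word's docs begin
        int len   = (int) counts[idx] * ENTRY_SIZE;
        return Arrays.copyOfRange(docs, start, start + len);
    }

    public static void main(String[] args) {
        long[] wordIds = { 5, 9 };             // sorted word identifiers
        long[] offsets = { 0, 6 };             // start of each word's document run
        long[] counts  = { 3, 2 };             // documents per word
        long[] docs = { 101,7, 102,3, 103,1,   // word 5: (doc, meta) pairs
                        101,2, 104,9 };        // word 9
        System.out.println(Arrays.toString(documentsFor(5L, wordIds, offsets, counts, docs)));
        // prints [101, 7, 102, 3, 103, 1]
    }
}
```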
\ No newline at end of file diff --git a/code/features-index/index-reverse/merging.svg b/code/features-index/index-reverse/merging.svg new file mode 100644 index 00000000..ed023d52 --- /dev/null +++ b/code/features-index/index-reverse/merging.svg @@ -0,0 +1,4 @@ + + + +
[merging.svg diagram text: inputs journal1..journalN are turned by the process into preindex1..preindexN, which are merged pairwise into partial merge preindexes 1..N, and finally into a single merged preindex (100s of Gb).]
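As a rough illustration of the pass structure the diagram shows, the sketch below folds a list of sorted inputs pairwise until one remains, carrying any odd input over to the next pass. In-memory arrays stand in for the file-backed preindexes that the real merge operates on.

```java
import java.util.ArrayList;
import java.util.List;

/** Toy rendition of the merge passes in the diagram: inputs are merged two at
 *  a time, roughly halving the number of inputs per pass, until one remains. */
class PairwiseMergeSketch {
    static long[] mergeTwo(long[] a, long[] b) {
        long[] out = new long[a.length + b.length];
        int i = 0, j = 0, k = 0;
        while (i < a.length && j < b.length)
            out[k++] = a[i] <= b[j] ? a[i++] : b[j++];
        while (i < a.length) out[k++] = a[i++];
        while (j < b.length) out[k++] = b[j++];
        return out;
    }

    static long[] mergeAll(List<long[]> inputs) {
        List<long[]> toMerge = new ArrayList<>(inputs);
        while (toMerge.size() > 1) {                   // one merge pass
            List<long[]> merged = new ArrayList<>();
            for (int i = 0; i + 1 < toMerge.size(); i += 2)
                merged.add(mergeTwo(toMerge.get(i), toMerge.get(i + 1)));
            if (toMerge.size() % 2 != 0)               // odd input rides along
                merged.add(toMerge.get(toMerge.size() - 1));
            toMerge = merged;
        }
        return toMerge.get(0);
    }

    public static void main(String[] args) {
        var result = mergeAll(List.of(
                new long[]{1, 4}, new long[]{2, 3}, new long[]{0, 5}));
        System.out.println(java.util.Arrays.toString(result)); // [0, 1, 2, 3, 4, 5]
    }
}
```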
\ No newline at end of file diff --git a/code/features-index/index-reverse/preindex.svg b/code/features-index/index-reverse/preindex.svg new file mode 100644 index 00000000..456f56a4 --- /dev/null +++ b/code/features-index/index-reverse/preindex.svg @@ -0,0 +1,4 @@ + + + +
[preindex.svg diagram text: a "Preindex Segment" of WordIds (Foo, Bar, Baz) with Counts (3, 2, 2), and "Preindex Documents" listing each word's documents (doc1, doc2, doc3; doc1, doc2; doc2, doc3). Offsets are 0, 0+3 = 3, 0+3+2 = 5, following the rule Offset(n) = sum(Counts; 0, n-1).]
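The offset rule in the diagram, Offset(n) = sum(Counts; 0, n-1), is an exclusive prefix sum over the per-word counts. Below is a minimal sketch of that computation; the names are illustrative, though the diff's CountToOffsetTransformer performs the equivalent transformation in place over a file-backed array during finalization.

```java
import java.util.Arrays;

/** Exclusive prefix sum turning per-word document counts into start offsets,
 *  per the diagram's rule Offset(n) = sum(Counts; 0, n-1). */
class CountsToOffsetsSketch {
    static long[] countsToOffsets(long[] counts, int entrySize) {
        long[] offsets = new long[counts.length];
        long acc = 0;
        for (int n = 0; n < counts.length; n++) {
            offsets[n] = acc;                // sum of counts[0..n-1]
            acc += counts[n] * entrySize;    // each record occupies entrySize longs
        }
        return offsets;
    }

    public static void main(String[] args) {
        // counts (3, 2, 2) as in the diagram give offsets (0, 3, 5) for entrySize = 1
        System.out.println(Arrays.toString(countsToOffsets(new long[]{3, 2, 2}, 1)));
    }
}
```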
\ No newline at end of file diff --git a/code/features-index/index-reverse/readme.md b/code/features-index/index-reverse/readme.md index 5a9db1e5..a27371d6 100644 --- a/code/features-index/index-reverse/readme.md +++ b/code/features-index/index-reverse/readme.md @@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i [1] See WordFlags in [common/model](../../common/model/) and KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction). +## Construction + +The reverse index is constructed by first building a series of preindexes. +A preindex consists of a Segment and a Documents object. The segment records +which word identifiers are present and how many documents are associated with +each; the documents object records which documents each word can be found in. + +![Memory layout illustrations](./preindex.svg) + +A full preindex would typically not fit in RAM, so the index journal is paged +and each preindex is constructed small enough to fit in memory; the preindexes +are then merged. Merging sorted arrays is a fast operation that requires no +additional RAM. + +![Illustration of successively merged preindex files](./merging.svg) + +Once everything is merged into one large preindex, B-tree indexes are +constructed over the preindex data to form the finalized reverse index. + +![Illustration of the data layout of the finalized index](./index.svg) ## Central Classes -* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index. -* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index. -* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index. -* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index. +* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) holds the intermediate reverse index state. +* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index. +* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
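To make the readme's "no additional RAM" claim concrete: merging two sorted runs needs only a destination sized for their union plus a few cursors. The sketch below merges two sorted arrays of internally distinct wordIds and returns the number of distinct values written; plain arrays stand in for the memory-mapped LongArrays that countDistinctElements and mergeArrays operate on in ReversePreindex, so treat this as an assumption-laden sketch rather than the library's actual semantics.

```java
/** Union-merge of two sorted arrays of distinct wordIds into a pre-sized
 *  destination, writing each value once; uses only a constant amount of
 *  working memory beyond the inputs and the output. */
class UnionMergeSketch {
    static int mergeDistinct(long[] left, long[] right, long[] dest) {
        int i = 0, j = 0, k = 0;
        while (i < left.length && j < right.length) {
            long next = Math.min(left[i], right[j]);
            if (left[i] == next) i++;    // consume from whichever side(s)
            if (right[j] == next) j++;   // ...hold the next value
            dest[k++] = next;
        }
        while (i < left.length) dest[k++] = left[i++];
        while (j < right.length) dest[k++] = right[j++];
        return k; // number of distinct values written
    }

    public static void main(String[] args) {
        long[] dest = new long[5];
        int n = mergeDistinct(new long[]{1, 3, 5}, new long[]{3, 4}, dest);
        System.out.println(n + " distinct values"); // 4 distinct: 1, 3, 4, 5
    }
}
```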
+ +## See Also + +* [index-journal](../index-journal) +* [index-forward](../index-forward) +* [libraries/btree](../../libraries/btree) +* [libraries/array](../../libraries/array) \ No newline at end of file diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexEntrySource.java similarity index 80% rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexEntrySource.java index 8e7c6b01..00de237b 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -1,4 +1,4 @@ -package nu.marginalia.index.full; +package nu.marginalia.index; import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.btree.BTreeReader; @@ -6,18 +6,18 @@ import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; -public class ReverseIndexFullEntrySource implements EntrySource { +public class ReverseIndexEntrySource implements EntrySource { private final BTreeReader reader; int pos; int endOffset; final int entrySize; - private final int wordId; + private final long wordId; - public ReverseIndexFullEntrySource(BTreeReader reader, - int entrySize, - int wordId) { + public ReverseIndexEntrySource(BTreeReader reader, + int entrySize, + long wordId) { this.reader = reader; this.entrySize = entrySize; this.wordId = wordId; diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexFullFileNames.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexFullFileNames.java new file mode 100644 index 00000000..c8005fdb --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexFullFileNames.java @@ -0,0 +1,28 @@ +package nu.marginalia.index; + +import java.nio.file.Path; + +public class ReverseIndexFullFileNames { + public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) { + return switch (identifier) { + case WORDS -> switch (version) { + case NEXT -> basePath.resolve("rev-words.dat.next"); + case CURRENT -> basePath.resolve("rev-words.dat"); + }; + case DOCS -> switch (version) { + case NEXT -> basePath.resolve("rev-docs.dat.next"); + case CURRENT -> basePath.resolve("rev-docs.dat"); + }; + }; + } + + public enum FileVersion { + CURRENT, + NEXT + }; + + public enum FileIdentifier { + WORDS, + DOCS + } +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexParameters.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexParameters.java new file mode 100644 index 00000000..a6df15d3 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexParameters.java @@ -0,0 +1,10 @@ +package nu.marginalia.index; + +import nu.marginalia.btree.model.BTreeBlockSize; +import nu.marginalia.btree.model.BTreeContext; + +public class ReverseIndexParameters +{ + public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); + public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); +} diff --git 
a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexPrioFileNames.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexPrioFileNames.java new file mode 100644 index 00000000..6e73df43 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexPrioFileNames.java @@ -0,0 +1,28 @@ +package nu.marginalia.index; + +import java.nio.file.Path; + +public class ReverseIndexPrioFileNames { + public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) { + return switch (identifier) { + case WORDS -> switch (version) { + case NEXT -> basePath.resolve("rev-prio-words.dat.next"); + case CURRENT -> basePath.resolve("rev-prio-words.dat"); + }; + case DOCS -> switch (version) { + case NEXT -> basePath.resolve("rev-prio-docs.dat.next"); + case CURRENT -> basePath.resolve("rev-prio-docs.dat"); + }; + }; + } + + public enum FileVersion { + CURRENT, + NEXT + }; + + public enum FileIdentifier { + WORDS, + DOCS + } +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexReader.java similarity index 54% rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexReader.java index 6342c436..bb4ab9b4 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/ReverseIndexReader.java @@ -1,11 +1,11 @@ -package nu.marginalia.index.full; +package nu.marginalia.index; -import nu.marginalia.index.query.ReverseIndexRejectFilter; -import nu.marginalia.index.query.ReverseIndexRetainFilter; import nu.marginalia.array.LongArray; import nu.marginalia.btree.BTreeReader; import nu.marginalia.index.query.EmptyEntrySource; import nu.marginalia.index.query.EntrySource; +import nu.marginalia.index.query.ReverseIndexRejectFilter; +import nu.marginalia.index.query.ReverseIndexRetainFilter; import nu.marginalia.index.query.filter.QueryFilterLetThrough; import nu.marginalia.index.query.filter.QueryFilterNoPass; import nu.marginalia.index.query.filter.QueryFilterStepIf; @@ -15,18 +15,22 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; -public class ReverseIndexFullReader { +public class ReverseIndexReader { private final LongArray words; private final LongArray documents; - + private final long wordsDataOffset; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final BTreeReader wordsBTreeReader; - public ReverseIndexFullReader(Path words, Path documents) throws IOException { + + + public ReverseIndexReader(Path words, Path documents) throws IOException { if (!Files.exists(words) || !Files.exists(documents)) { this.words = null; this.documents = null; + this.wordsBTreeReader = null; + this.wordsDataOffset = -1; return; } @@ -34,62 +38,52 @@ public class ReverseIndexFullReader { this.words = LongArray.mmapRead(words); this.documents = LongArray.mmapRead(documents); + + wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0); + wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs(); } - public boolean isWordInDoc(int wordId, long 
documentId) { - if (wordId < 0) { - return false; - } - long offset = words.get(wordId); + private long wordOffset(long wordId) { + long idx = wordsBTreeReader.findEntry(wordId); - if (offset < 0) { - return false; - } + if (idx < 0) + return -1L; - return createReaderNew(offset).findEntry(documentId) >= 0; + return words.get(wordsDataOffset + idx + 1); } - public EntrySource documents(int wordId) { + public EntrySource documents(long wordId) { if (null == words) { logger.warn("Reverse index is not ready, dropping query"); return new EmptyEntrySource(); } - if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); - - long offset = words.get(wordId); + long offset = wordOffset(wordId); if (offset < 0) return new EmptyEntrySource(); - return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, wordId); + return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId); } - public QueryFilterStepIf also(int wordId) { - if (wordId < 0) return new QueryFilterNoPass(); - - long offset = words.get(wordId); + public QueryFilterStepIf also(long wordId) { + long offset = wordOffset(wordId); if (offset < 0) return new QueryFilterNoPass(); return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId); } - public QueryFilterStepIf not(int wordId) { - if (wordId < 0) return new QueryFilterLetThrough(); - - long offset = words.get(wordId); + public QueryFilterStepIf not(long wordId) { + long offset = wordOffset(wordId); if (offset < 0) return new QueryFilterLetThrough(); return new ReverseIndexRejectFilter(createReaderNew(offset)); } - public int numDocuments(int wordId) { - if (wordId < 0) - return 0; - - long offset = words.get(wordId); + public int numDocuments(long wordId) { + long offset = wordOffset(wordId); if (offset < 0) return 0; @@ -98,23 +92,33 @@ public class ReverseIndexFullReader { } private BTreeReader createReaderNew(long offset) { - return new BTreeReader(documents, ReverseIndexFullParameters.bTreeContext, offset); + return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset); } - public long[] getTermMeta(int wordId, long[] docIds) { - if (wordId < 0) { - return new long[docIds.length]; - } + public long[] getTermMeta(long wordId, long[] docIds) { + long offset = wordOffset(wordId); - long offset = words.get(wordId); if (offset < 0) { return new long[docIds.length]; } - Arrays.sort(docIds); + assert isSorted(docIds) : "The input array docIds is assumed to be sorted"; var reader = createReaderNew(offset); return reader.queryData(docIds, 1); } + private boolean isSorted(long[] ids) { + if (ids.length == 0) + return true; + long prev = ids[0]; + + for (int i = 1; i < ids.length; i++) { + if(ids[i] <= prev) + return false; + } + + return true; + } + } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/DocIdRewriter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/DocIdRewriter.java new file mode 100644 index 00000000..3dc0b278 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/DocIdRewriter.java @@ -0,0 +1,9 @@ +package nu.marginalia.index.construction; + +public interface DocIdRewriter { + long rewriteDocId(long docId); + + static DocIdRewriter identity() { + return l -> l; + } +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/JournalReaderSource.java 
b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/JournalReaderSource.java new file mode 100644 index 00000000..b565206d --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/JournalReaderSource.java @@ -0,0 +1,10 @@ +package nu.marginalia.index.construction; + +import nu.marginalia.index.journal.reader.IndexJournalReader; + +import java.io.IOException; +import java.nio.file.Path; + +public interface JournalReaderSource { + IndexJournalReader construct(Path sourceFile) throws IOException; +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java index 5e94921d..2f5c05f4 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java @@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.functional.LongIOTransformer; import nu.marginalia.btree.BTreeWriter; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import java.io.IOException; import java.nio.channels.FileChannel; diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java new file mode 100644 index 00000000..f8cf07a7 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java @@ -0,0 +1,115 @@ +package nu.marginalia.index.construction; + +import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginallia.index.journal.IndexJournalFileNames; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +public class ReverseIndexConstructor { + + private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class); + + public enum CreateReverseIndexSteps { + CREATE_PREINDEXES, + MERGE_PREINDEXES, + FINALIZE, + FINISHED + } + public static void createReverseIndex( + ProcessHeartbeat processHeartbeat, + JournalReaderSource readerSource, + Path sourceBaseDir, + DocIdRewriter docIdRewriter, + Path tmpDir, + Path outputFileDocs, + Path outputFileWords) throws IOException + { + var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); + if (inputs.isEmpty()) { + logger.error("No journal files in base dir {}", sourceBaseDir); + return; + } + + try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, "createReverseIndex")) { + List preindexes = new ArrayList<>(); + + heartbeat.progress(CreateReverseIndexSteps.CREATE_PREINDEXES); + + try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) { + for (int i = 0; i < inputs.size(); i++) { + var input = inputs.get(i); + + preindexHeartbeat.progress(input.toFile().getName(), i, inputs.size()); + + preindexes.add(ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)); + } + + 
preindexHeartbeat.progress("FINISHED", inputs.size(), inputs.size()); + } + + heartbeat.progress(CreateReverseIndexSteps.MERGE_PREINDEXES); + ReversePreindex finalPreindex; + + try (var mergeHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("mergePreindexes")) { + finalPreindex = mergePreindexes(tmpDir, mergeHeartbeat, preindexes); + } + + heartbeat.progress(CreateReverseIndexSteps.FINALIZE); + finalPreindex.finalizeIndex(outputFileDocs, outputFileWords); + + heartbeat.progress(CreateReverseIndexSteps.FINISHED); + finalPreindex.delete(); + } + } + + private static ReversePreindex mergePreindexes(Path workDir, ProcessAdHocTaskHeartbeat mergeHeartbeat, List preindexes) throws IOException { + assert !preindexes.isEmpty(); + + if (preindexes.size() == 1) { + logger.info("Single preindex, no merge necessary"); + return preindexes.get(0); + } + + List toMerge = new ArrayList<>(preindexes); + List merged = new ArrayList<>(); + + int pass = 0; + while (toMerge.size() != 1) { + String stage = String.format("PASS[%d]: %d -> %d", ++pass, + toMerge.size(), + toMerge.size()/2 + (toMerge.size() % 2) + ); + + for (int i = 0; i + 1 < toMerge.size(); i+=2) { + mergeHeartbeat.progress(stage, i/2, toMerge.size()/2); + + var left = toMerge.get(i); + var right = toMerge.get(i+1); + + merged.add(ReversePreindex.merge(workDir, left, right)); + + left.delete(); + right.delete(); + } + + if ((toMerge.size() % 2) != 0) { + merged.add(toMerge.get(toMerge.size()-1)); + } + + toMerge.clear(); + toMerge.addAll(merged); + merged.clear(); + } + + mergeHeartbeat.progress("FINISHED", 1, 1); + + return toMerge.get(0); + } + +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java new file mode 100644 index 00000000..284f7df7 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java @@ -0,0 +1,280 @@ +package nu.marginalia.index.construction; + +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.btree.BTreeWriter; +import nu.marginalia.index.ReverseIndexParameters; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +import static nu.marginalia.array.algo.TwoArrayOperations.*; + +/** Contains the data that would go into a reverse index, + * that is, a mapping from words to documents, minus the actual + * index structure that makes the data quick to access while + * searching. + *

+ * Two preindexes can be merged into a third preindex containing + * the union of their data. This operation requires no additional + * RAM. + */ +public class ReversePreindex { + final ReversePreindexWordSegments segments; + final ReversePreindexDocuments documents; + + private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class); + + public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) { + this.segments = segments; + this.documents = documents; + } + + /** Constructs a new preindex with the data associated with reader. The backing files + * will have randomly assigned names. + */ + public static ReversePreindex constructPreindex(IndexJournalReader reader, + DocIdRewriter docIdRewriter, + Path destDir) throws IOException + { + Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat"); + Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat"); + Path docsFile = Files.createTempFile(destDir, "docs", ".dat"); + + logger.info("Segmenting"); + var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); + logger.info("Mapping docs"); + var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments); + logger.info("Done"); + return new ReversePreindex(segments, docs); + } + + /** Transform the preindex into a reverse index */ + public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException { + var offsets = segments.counts; + + Files.deleteIfExists(outputFileDocs); + Files.deleteIfExists(outputFileWords); + + // Estimate the size of the docs index data + offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2)); + IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2); + offsets.fold(0, 0, offsets.size(), sizeEstimator); + + // Write the docs file + LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); + try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { + offsets.transformEachIO(0, offsets.size(), new ReverseIndexBTreeTransformer(finalDocs, 2, ReverseIndexParameters.docsBTreeContext, intermediateDocChannel)); + intermediateDocChannel.force(false); + } + + LongArray wordIds = segments.wordIds; + + assert offsets.size() == wordIds.size() : "Offsets and word-ids of different size"; + + // Estimate the size of the words index data + long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size()); + + // Construct the tree + LongArray wordsArray = LongArray.mmapForWriting(outputFileWords, wordsSize); + + new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext) + .write(0, (int) offsets.size(), mapRegion -> { + for (long i = 0; i < offsets.size(); i++) { + mapRegion.set(2*i, wordIds.get(i)); + mapRegion.set(2*i + 1, offsets.get(i)); + } + }); + + wordsArray.force(); + + } + + /** Delete all files associated with this pre-index */ + public void delete() throws IOException { + segments.delete(); + documents.delete(); + } + + public static ReversePreindex merge(Path destDir, + ReversePreindex left, + ReversePreindex right) throws IOException { + + ReversePreindexWordSegments mergingSegment = + createMergedSegmentWordFile(destDir, left.segments, right.segments); + + var mergingIter = mergingSegment.constructionIterator(2); + var leftIter = left.segments.iterator(2); + var rightIter = right.segments.iterator(2); + + Path docsFile = 
Files.createTempFile(destDir, "docs", ".dat"); + + LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size())); + + leftIter.next(); + rightIter.next(); + + try (FileChannel leftChannel = left.documents.createDocumentsFileChannel(); + FileChannel rightChannel = right.documents.createDocumentsFileChannel()) + { + + while (mergingIter.canPutMore() + && leftIter.isPositionBeforeEnd() + && rightIter.isPositionBeforeEnd()) + { + final long currentWord = mergingIter.wordId; + + if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) + { + // both inputs have documents for the current word + mergeSegments(leftIter, rightIter, + left.documents, right.documents, + mergedDocuments, mergingIter); + } + else if (leftIter.wordId == currentWord) { + if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)) + break; + } + else if (rightIter.wordId == currentWord) { + if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)) + break; + } + else assert false : "This should never happen"; // the helvetica scenario + } + + if (leftIter.isPositionBeforeEnd()) { + while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)); + } + + if (rightIter.isPositionBeforeEnd()) { + while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)); + } + + } + + assert !leftIter.isPositionBeforeEnd() : "Left has more to go"; + assert !rightIter.isPositionBeforeEnd() : "Right has more to go"; + assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter"; + + // We may have overestimated the size of the merged docs size in the case there were + // duplicates in the data, so we need to shrink it to the actual size we wrote. + + mergedDocuments = shrinkMergedDocuments(mergedDocuments, + docsFile, 2 * mergingSegment.totalSize()); + + mergingSegment.force(); + + return new ReversePreindex( + mergingSegment, + new ReversePreindexDocuments(mergedDocuments, docsFile) + ); + } + + /** Create a segment word file with each word from both inputs, with zero counts for all the data. + * This is an intermediate product in merging. + */ + static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir, + ReversePreindexWordSegments left, + ReversePreindexWordSegments right) throws IOException { + Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat"); + Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat"); + + long segmentsSize = countDistinctElements(left.wordIds, right.wordIds, + 0, left.wordIds.size(), + 0, right.wordIds.size()); + + LongArray wordIdsFile = LongArray.mmapForWriting(segmentWordsFile, segmentsSize); + + mergeArrays(wordIdsFile, left.wordIds, right.wordIds, + 0, wordIdsFile.size(), + 0, left.wordIds.size(), + 0, right.wordIds.size()); + + LongArray counts = LongArray.mmapForWriting(segmentCountsFile, segmentsSize); + + return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile); + } + + /** It's possible we overestimated the necessary size of the documents file, + * this will permit us to shrink it down to the smallest necessary size. 
+ */ + private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException { + + mergedDocuments.force(); + + long beforeSize = mergedDocuments.size(); + try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) { + bc.truncate(sizeLongs * 8); + } + long afterSize = mergedDocuments.size(); + mergedDocuments = LongArray.mmapForWriting(docsFile, sizeLongs); + + if (beforeSize != afterSize) { + logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize); + } + + return mergedDocuments; + } + + /** Merge contents of the segments indicated by leftIter and rightIter into the destination + * segment, and advance the construction iterator with the appropriate size. + */ + private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter, + ReversePreindexWordSegments.SegmentIterator rightIter, + ReversePreindexDocuments left, + ReversePreindexDocuments right, + LongArray dest, + ReversePreindexWordSegments.SegmentConstructionIterator destIter) + { + long distinct = countDistinctElementsN(2, + left.documents, + right.documents, + leftIter.startOffset, leftIter.endOffset, + rightIter.startOffset, rightIter.endOffset); + + mergeArrays2(dest, + left.documents, + right.documents, + destIter.startOffset, + destIter.startOffset + 2*distinct, + leftIter.startOffset, leftIter.endOffset, + rightIter.startOffset, rightIter.endOffset); + + destIter.putNext(distinct); + leftIter.next(); + rightIter.next(); + } + + /** Copy the data from the source segment at the position and length indicated by sourceIter, + * into the destination segment, and advance the construction iterator. + */ + private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter, + LongArray dest, + FileChannel sourceChannel, + ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException { + + long size = sourceIter.endOffset - sourceIter.startOffset; + long start = mergingIter.startOffset; + long end = start + size; + + dest.transferFrom(sourceChannel, + sourceIter.startOffset, + mergingIter.startOffset, + end); + + boolean putNext = mergingIter.putNext(size / 2); + boolean iterNext = sourceIter.next(); + + assert putNext || !iterNext : "Source iterator ran out before dest iterator?!"; + + return iterNext; + } + + +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java new file mode 100644 index 00000000..c51a977d --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java @@ -0,0 +1,123 @@ +package nu.marginalia.index.construction; + +import lombok.SneakyThrows; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.SortingContext; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** A LongArray with document data, segmented according to + * the associated ReversePreindexWordSegments data + */ +public class ReversePreindexDocuments { + private final Path
file; + public final LongArray documents; + private static final int RECORD_SIZE_LONGS = 2; + private static final Logger logger= LoggerFactory.getLogger(ReversePreindexDocuments.class); + + public ReversePreindexDocuments(LongArray documents, Path file) { + this.documents = documents; + this.file = file; + } + + public static ReversePreindexDocuments construct( + Path docsFile, + IndexJournalReader reader, + DocIdRewriter docIdRewriter, + ReversePreindexWordSegments segments) throws IOException { + + + logger.info("Transferring data"); + createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter); + + LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile)); + logger.info("Sorting data"); + sortDocsFile(docsFileMap, segments); + + return new ReversePreindexDocuments(docsFileMap, docsFile); + } + + public FileChannel createDocumentsFileChannel() throws IOException { + return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ); + } + + + public LongArray slice(long start, long end) { + return documents.range(start, end); + } + + public long size() { + return documents.size(); + } + + private static void createUnsortedDocsFile(Path docsFile, + IndexJournalReader reader, + ReversePreindexWordSegments segments, + DocIdRewriter docIdRewriter) throws IOException { + long fileSize = 8 * segments.totalSize(); + LongArray outArray = LongArray.mmapForWriting(docsFile, fileSize); + + var offsetMap = segments.asMap(RECORD_SIZE_LONGS); + offsetMap.defaultReturnValue(0); + + for (var entry : reader) { + long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId()); + + var data = entry.readEntry(); + for (int i = 0; i + 1 < data.size(); i+=2) { + long wordId = data.get(i); + long meta = data.get(i+1); + + long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); + + outArray.set(offset + 0, rankEncodedId); + outArray.set(offset + 1, meta); + } + } + + outArray.force(); + } + + @SneakyThrows + private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException { + + var iter = segments.iterator(RECORD_SIZE_LONGS); + + ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors()); + + while (iter.next()) { + if (iter.size() < 1024) { + docsFileMap.quickSortN(RECORD_SIZE_LONGS, + iter.startOffset, + iter.endOffset); + } + else { + sortingWorkers.execute(() -> + docsFileMap.quickSortN(RECORD_SIZE_LONGS, + iter.startOffset, + iter.endOffset)); + } + } + + sortingWorkers.shutdown(); + logger.info("Awaiting shutdown"); + + while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS)); + + sortingWorkers.close(); + } + + public void delete() throws IOException { + Files.delete(this.file); + } +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java new file mode 100644 index 00000000..5acd2219 --- /dev/null +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java @@ -0,0 +1,197 @@ +package nu.marginalia.index.construction; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; +import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; +import it.unimi.dsi.fastutil.longs.LongIterator; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.algo.SortingContext; +import 
nu.marginalia.index.journal.reader.IndexJournalReader; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +/** A pair of file-backed arrays of sorted wordIds + * and the count of documents associated with each wordId. + */ +public class ReversePreindexWordSegments { + public final LongArray wordIds; + public final LongArray counts; + + private final Path wordsFile; + private final Path countsFile; + + public ReversePreindexWordSegments(LongArray wordIds, + LongArray counts, + Path wordsFile, + Path countsFile) + { + assert wordIds.size() == counts.size(); + + this.wordIds = wordIds; + this.counts = counts; + this.wordsFile = wordsFile; + this.countsFile = countsFile; + } + + /** Returns a long-long hash map where each key is a wordId, + * and each value is the start offset of the data. + */ + public Long2LongOpenHashMap asMap(int recordSize) { + Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f); + var iter = iterator(recordSize); + + while (iter.next()) { + ret.put(iter.wordId, iter.startOffset); + } + + return ret; + } + + public static ReversePreindexWordSegments construct(IndexJournalReader reader, + Path wordIdsFile, + Path countsFile) + throws IOException + { + Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); + countsMap.defaultReturnValue(0); + reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1)); + + LongArray words = LongArray.mmapForWriting(wordIdsFile, countsMap.size()); + LongArray counts = LongArray.mmapForWriting(countsFile, countsMap.size()); + + // Create the words file by iterating over the map and inserting them into + // the words file in whatever bizarro hash table order they appear in + int i = 0; + LongIterator iter = countsMap.keySet().iterator(); + while (iter.hasNext()) { + words.set(i, iter.nextLong()); + i++; + } + + // Sort the words file + words.quickSort(0, counts.size()); + + // Populate the counts + for (i = 0; i < countsMap.size(); i++) { + counts.set(i, countsMap.get(words.get(i))); + } + + return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile); + } + + public SegmentIterator iterator(int recordSize) { + return new SegmentIterator(recordSize); + } + public SegmentConstructionIterator constructionIterator(int recordSize) { + return new SegmentConstructionIterator(recordSize); + } + + public long totalSize() { + return counts.fold(0, 0, counts.size(), Long::sum); + } + + public void delete() throws IOException { + Files.delete(countsFile); + Files.delete(wordsFile); + } + + public void force() { + counts.force(); + wordIds.force(); + } + + public class SegmentIterator { + private final int recordSize; + private final long fileSize; + long wordId; + long startOffset = 0; + long endOffset = 0; + + private SegmentIterator(int recordSize) { + this.recordSize = recordSize; + this.fileSize = wordIds.size(); + } + + private int i = -1; + public int idx() { + return i; + } + public boolean next() { + if (++i >= fileSize) { + wordId = Long.MIN_VALUE; + return false; + } + + wordId = wordIds.get(i); + startOffset = endOffset; + endOffset = startOffset + recordSize * counts.get(i); + + return true; + } + + public boolean hasMorePositions() { + return i + 1 < wordIds.size(); + } + + public boolean isPositionBeforeEnd() { + return i < wordIds.size(); + } + + public long size() { + return endOffset - startOffset; + } + } + + class SegmentConstructionIterator { + private 
final int recordSize; + private final long fileSize; + long wordId; + long startOffset = 0; + long endOffset = 0; + + private SegmentConstructionIterator(int recordSize) { + this.recordSize = recordSize; + this.fileSize = wordIds.size(); + if (fileSize == 0) { + throw new IllegalArgumentException("Cannot construct zero-length word segment file"); + } + this.wordId = wordIds.get(0); + } + + private int i = 0; + public int idx() { + return i; + } + + public boolean putNext(long size) { + + if (i >= fileSize) + return false; + + endOffset = startOffset + recordSize * size; + counts.set(i, size); + startOffset = endOffset; + endOffset = -1; + + i++; + + if (i == fileSize) { + // We've reached the end of the iteration and there is no + // "next" wordId to fetch + wordId = Long.MIN_VALUE; + return false; + } + else { + wordId = wordIds.get(i); + return true; + } + } + + public boolean canPutMore() { + return i < wordIds.size(); + } + } +} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java deleted file mode 100644 index f2e3f91b..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java +++ /dev/null @@ -1,218 +0,0 @@ -package nu.marginalia.index.full; - -import lombok.SneakyThrows; -import nu.marginalia.index.construction.CountToOffsetTransformer; -import nu.marginalia.index.construction.ReverseIndexBTreeTransformer; -import nu.marginalia.index.construction.IndexSizeEstimator; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalStatistics; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.rwf.RandomWriteFunnel; -import nu.marginalia.array.IntArray; -import nu.marginalia.array.LongArray; -import nu.marginalia.array.algo.SortingContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; - -import nu.marginalia.service.control.ServiceHeartbeat; - -import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext; - -public class ReverseIndexFullConverter { - private static final int RWF_BIN_SIZE = 10_000_000; - - private final ServiceHeartbeat heartbeat; - private final Path tmpFileDir; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final IndexJournalReader journalReader; - private final DomainRankings domainRankings; - private final Path outputFileWords; - private final Path outputFileDocs; - private final SortingContext sortingContext; - - public ReverseIndexFullConverter(ServiceHeartbeat heartbeat, - Path tmpFileDir, - IndexJournalReader journalReader, - DomainRankings domainRankings, - Path outputFileWords, - Path outputFileDocs) { - this.heartbeat = heartbeat; - this.tmpFileDir = tmpFileDir; - this.journalReader = journalReader; - this.domainRankings = domainRankings; - this.outputFileWords = outputFileWords; - this.outputFileDocs = outputFileDocs; - this.sortingContext = new SortingContext(tmpFileDir, 64_000); - } - - public enum TaskSteps { - ACCUMULATE_STATISTICS, - INCREMENT_OFFSETS, - COUNT_OFFSETS, - CREATE_INTERMEDIATE_DOCS, - SORT_INTERMEDIATE_DOCS, - SIZING, - FINALIZING_DOCS, - FORCE, - 
FINISHED, - } - - public void convert() throws IOException { - deleteOldFiles(); - - if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) { - logger.warn("Bailing: Journal is empty!"); - return; - } - - final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) { - progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final long wordsFileSize = statistics.highestWord() + 1; - - progress.progress(TaskSteps.INCREMENT_OFFSETS); - - logger.debug("Words file size: {}", wordsFileSize); - // Create a count of how many documents has contains each word - final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - - journalReader.forEachWordId(wordsOffsets::increment); - progress.progress(TaskSteps.COUNT_OFFSETS); - - wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE)); - - progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); - - // Construct an intermediate representation of the reverse documents index - try (FileChannel intermediateDocChannel = - (FileChannel) Files.newByteChannel(intermediateUrlsFile, - StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) - { - - // Construct intermediate index - try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); - IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel) - ) - { - journalReader.forEachDocIdRecord(intermediateIndexConstructor); - intermediateDocumentWriteFunnel.write(intermediateDocChannel); - } - intermediateDocChannel.force(false); - progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); - - // Sort each segment of the intermediate file - { - LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile); - wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> { - intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexFullParameters.ENTRY_SIZE, s, e); - return e; - }); - intermediateDocs.force(); - } - - progress.progress(TaskSteps.SIZING); - - IndexSizeEstimator sizeEstimator = new IndexSizeEstimator( - ReverseIndexFullParameters.bTreeContext, - ReverseIndexFullParameters.ENTRY_SIZE); - - wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); - progress.progress(TaskSteps.FINALIZING_DOCS); - - LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); - // Construct the proper reverse index - wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); - wordsOffsets.write(outputFileWords); - - progress.progress(TaskSteps.FORCE); - - // Attempt to clean up before forcing (important disk space preservation) - Files.deleteIfExists(intermediateUrlsFile); - - wordsOffsets.force(); - finalDocs.force(); - - progress.progress(TaskSteps.FINISHED); - } - - } catch (IOException ex) { - logger.error("Failed to convert", ex); - throw ex; - } finally { - Files.deleteIfExists(intermediateUrlsFile); - } - } - - private void deleteOldFiles() throws IOException { - Files.deleteIfExists(outputFileWords); - Files.deleteIfExists(outputFileDocs); - } - - private class IntermediateIndexConstructor implements 
IndexJournalReader.LongObjectConsumer, AutoCloseable { - - private final LongArray wordRangeEnds; - private final IntArray wordRangeOffset; - private final RandomWriteFunnel documentsFile; - - private final Path tempFile; - - public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException { - tempFile = Files.createTempFile(tempDir, "iic", "dat"); - - this.wordRangeEnds = wordRangeEnds; - this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size()); - this.documentsFile = documentsFile; - } - - @SneakyThrows - @Override - public void accept(long docId, IndexJournalEntryData.Record record) { - - /* Encode the ID as - * - * 32 bits 32 bits - * [ ranking | url-id ] - * - * in order to get low-ranking documents to be considered first - * when sorting the items. - */ - - int domainId = (int) (docId >>> 32); - long rankingId = (long) domainRankings.getRanking(domainId) << 32; - - int urlId = (int) (docId & 0xFFFF_FFFFL); - long rankEncodedId = rankingId | urlId; - - final int wordId = record.wordId(); - long offset = startOfRange(wordId); - - documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId); - documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata()); - - } - - private long startOfRange(int wordId) { - if (wordId == 0) return 0; - - return wordRangeEnds.get(wordId - 1); - } - - public void close() throws IOException { - Files.delete(tempFile); - } - } - -} - diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java deleted file mode 100644 index fb767cb2..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.index.full; - -import nu.marginalia.btree.model.BTreeBlockSize; -import nu.marginalia.btree.model.BTreeContext; - -public class ReverseIndexFullParameters { - static final int ENTRY_SIZE = 2; - - // This is the byte size per index page on disk, the data pages are twice as large due to ENTRY_SIZE = 2. - // - // Given a hardware limit of 4k reads, 2k block size should be optimal. 
- static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048; - - - static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize); -} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java deleted file mode 100644 index 4c9cd0d0..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java +++ /dev/null @@ -1,215 +0,0 @@ -package nu.marginalia.index.priority; - -import lombok.SneakyThrows; -import nu.marginalia.array.IntArray; -import nu.marginalia.array.LongArray; -import nu.marginalia.array.algo.SortingContext; -import nu.marginalia.index.construction.CountToOffsetTransformer; -import nu.marginalia.index.construction.ReverseIndexBTreeTransformer; -import nu.marginalia.index.construction.IndexSizeEstimator; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalStatistics; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.rwf.RandomWriteFunnel; -import nu.marginalia.service.control.ServiceHeartbeat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; - -import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext; - -public class ReverseIndexPriorityConverter { - private static final int RWF_BIN_SIZE = 10_000_000; - - private final ServiceHeartbeat heartbeat; - private final Path tmpFileDir; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final IndexJournalReader journalReader; - private final DomainRankings domainRankings; - private final Path outputFileWords; - private final Path outputFileDocs; - private final SortingContext sortingContext; - - public ReverseIndexPriorityConverter(ServiceHeartbeat heartbeat, - Path tmpFileDir, - IndexJournalReader journalReader, - DomainRankings domainRankings, - Path outputFileWords, - Path outputFileDocs) { - this.heartbeat = heartbeat; - this.tmpFileDir = tmpFileDir; - this.journalReader = journalReader; - this.domainRankings = domainRankings; - this.outputFileWords = outputFileWords; - this.outputFileDocs = outputFileDocs; - this.sortingContext = new SortingContext(tmpFileDir, 64_000); - } - - public enum TaskSteps { - ACCUMULATE_STATISTICS, - INCREMENT_OFFSETS, - COUNT_OFFSETS, - CREATE_INTERMEDIATE_DOCS, - SORT_INTERMEDIATE_DOCS, - SIZING, - FINALIZING_DOCS, - FORCE, - FINISHED, - } - - public void convert() throws IOException { - deleteOldFiles(); - - if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) { - logger.warn("Bailing: Journal is empty!"); - return; - } - - final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - - try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) { - progress.progress(TaskSteps.ACCUMULATE_STATISTICS); - - final IndexJournalStatistics statistics = journalReader.getStatistics(); - final long wordsFileSize = statistics.highestWord() + 1; - - progress.progress(TaskSteps.INCREMENT_OFFSETS); - - logger.debug("Words file size: {}", wordsFileSize); - // 
Create a count of how many documents has contains each word - final LongArray wordsOffsets = LongArray.allocate(wordsFileSize); - - journalReader.forEachWordId(wordsOffsets::increment); - progress.progress(TaskSteps.COUNT_OFFSETS); - - wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE)); - - progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS); - - // Construct an intermediate representation of the reverse documents index - try (FileChannel intermediateDocChannel = - (FileChannel) Files.newByteChannel(intermediateUrlsFile, - StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE)) - { - - // Construct intermediate index - try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE); - IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel) - ) - { - journalReader.forEachDocIdRecord(intermediateIndexConstructor); - intermediateDocumentWriteFunnel.write(intermediateDocChannel); - } - intermediateDocChannel.force(false); - progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS); - - // Sort each segment of the intermediate file - { - LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile); - wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> { - intermediateDocs.sortLargeSpan(sortingContext, s, e); - return e; - }); - intermediateDocs.force(); - } - - progress.progress(TaskSteps.SIZING); - - IndexSizeEstimator sizeEstimator = new IndexSizeEstimator( - bTreeContext, - ReverseIndexPriorityParameters.ENTRY_SIZE); - - wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator); - progress.progress(TaskSteps.FINALIZING_DOCS); - - LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size); - // Construct the proper reverse index - wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel)); - wordsOffsets.write(outputFileWords); - - progress.progress(TaskSteps.FORCE); - - // Attempt to clean up before forcing (important disk space preservation) - Files.deleteIfExists(intermediateUrlsFile); - - wordsOffsets.force(); - finalDocs.force(); - - progress.progress(TaskSteps.FINISHED); - } - - } catch (IOException ex) { - logger.error("Failed to convert", ex); - throw ex; - } finally { - Files.deleteIfExists(intermediateUrlsFile); - } - } - - private void deleteOldFiles() throws IOException { - Files.deleteIfExists(outputFileWords); - Files.deleteIfExists(outputFileDocs); - } - - private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer, AutoCloseable { - - private final LongArray wordRangeEnds; - private final IntArray wordRangeOffset; - private final RandomWriteFunnel documentsFile; - - private final Path tempFile; - - public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException { - tempFile = Files.createTempFile(tempDir, "iic", "dat"); - - this.wordRangeEnds = wordRangeEnds; - this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size()); - this.documentsFile = documentsFile; - } - - @SneakyThrows - @Override - public void accept(long docId, IndexJournalEntryData.Record record) { - - /* Encode the ID as - * - * 32 bits 32 bits - * [ ranking | url-id ] - * - * in order to get low-ranking documents to be 
considered first - * when sorting the items. - */ - - int domainId = (int) (docId >>> 32); - long rankingId = (long) domainRankings.getRanking(domainId) << 32; - - int urlId = (int) (docId & 0xFFFF_FFFFL); - long rankEncodedId = rankingId | urlId; - - final int wordId = record.wordId(); - long offset = startOfRange(wordId); - - documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId); - } - - private long startOfRange(int wordId) { - if (wordId == 0) return 0; - - return wordRangeEnds.get(wordId - 1); - } - - public void close() throws IOException { - Files.delete(tempFile); - } - } - -} - diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java deleted file mode 100644 index f4cc932d..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.index.priority; - -import nu.marginalia.array.buffer.LongQueryBuffer; -import nu.marginalia.btree.BTreeReader; -import nu.marginalia.index.query.EntrySource; - -import static java.lang.Math.min; - -public class ReverseIndexPriorityEntrySource implements EntrySource { - private final BTreeReader reader; - - int pos; - int endOffset; - - private final int wordId; - - public ReverseIndexPriorityEntrySource(BTreeReader reader, int wordId) { - this.reader = reader; - this.wordId = wordId; - - pos = 0; - endOffset = pos + reader.numEntries(); - } - - @Override - public void skip(int n) { - pos += n; - } - - @Override - public void read(LongQueryBuffer buffer) { - buffer.end = min(buffer.end, endOffset - pos); - reader.readData(buffer.data, buffer.end, pos); - pos += buffer.end; - - buffer.uniq(); - } - - @Override - public boolean hasMore() { - return pos < endOffset; - } - - @Override - public String indexName() { - return "Priority:" + wordId; - } -} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java deleted file mode 100644 index 5cd09307..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.index.priority; - -import nu.marginalia.btree.model.BTreeBlockSize; -import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.model.idx.WordFlags; - -public class ReverseIndexPriorityParameters { - static final int ENTRY_SIZE = 1; - static final BTreeBlockSize blockSize = BTreeBlockSize.BS_4096; - - static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize); - - private static final long highPriorityFlags = - WordFlags.Title.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.TfIdfHigh.asBit() - | WordFlags.NamesWords.asBit() - | WordFlags.UrlDomain.asBit() - | WordFlags.UrlPath.asBit() - | WordFlags.Site.asBit() - | WordFlags.SiteAdjacent.asBit(); - - public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) { - long meta = record.metadata(); - - return (meta & highPriorityFlags) != 0; - } - - -} diff --git 
a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java deleted file mode 100644 index e314f0e5..00000000 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.index.priority; - -import nu.marginalia.index.query.EntrySource; -import nu.marginalia.array.LongArray; -import nu.marginalia.btree.BTreeReader; -import nu.marginalia.index.query.EmptyEntrySource; -import nu.marginalia.index.query.ReverseIndexRetainFilter; -import nu.marginalia.index.query.filter.QueryFilterNoPass; -import nu.marginalia.index.query.filter.QueryFilterStepIf; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public class ReverseIndexPriorityReader { - private final LongArray words; - private final LongArray documents; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public ReverseIndexPriorityReader(Path words, Path documents) throws IOException { - if (!Files.exists(words) || !Files.exists(documents)) { - this.words = null; - this.documents = null; - return; - } - - logger.info("Switching prio reverse index"); - - this.words = LongArray.mmapRead(words); - this.documents = LongArray.mmapRead(documents); - } - - public EntrySource priorityDocuments(int wordId) { - if (words == null) { - // index not loaded - return new EmptyEntrySource(); - } - - if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); - - long offset = words.get(wordId); - - if (offset < 0) return new EmptyEntrySource(); - - return new ReverseIndexPriorityEntrySource(createReaderNew(offset), wordId); - } - - private BTreeReader createReaderNew(long offset) { - return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset); - } - - public QueryFilterStepIf also(int wordId) { - if (wordId < 0) return new QueryFilterNoPass(); - - long offset = words.get(wordId); - - if (offset < 0) return new QueryFilterNoPass(); - - return new ReverseIndexRetainFilter(createReaderNew(offset), "priority", wordId); - } - - public int numDocuments(int wordId) { - if (wordId < 0) - return 0; - - long offset = words.get(wordId); - - if (offset < 0) - return 0; - - return createReaderNew(offset).numEntries(); - } - -} diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java index e5182cce..bde2ccc8 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java @@ -4,7 +4,7 @@ import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.btree.BTreeReader; import nu.marginalia.index.query.filter.QueryFilterStepIf; -public record ReverseIndexRetainFilter(BTreeReader range, String name, int wordId) implements QueryFilterStepIf { +public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf { @Override public void apply(LongQueryBuffer buffer) { diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java 
b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java
new file mode 100644
index 00000000..e05fdf78
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java
@@ -0,0 +1,109 @@
+package nu.marginalia.index;
+
+import nu.marginalia.array.algo.SortingContext;
+import nu.marginalia.array.buffer.LongQueryBuffer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.ReversePreindex;
+import nu.marginalia.index.construction.TestJournalFactory;
+import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.index.construction.TestJournalFactory.wm;
+import static org.junit.jupiter.api.Assertions.*;
+
+class ReverseIndexReaderTest {
+    TestJournalFactory journalFactory;
+    Path tempDir;
+    SortingContext sortingContext;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        tempDir = Files.createTempDirectory("sort");
+        sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    @Test
+    public void testSimple() throws IOException {
+
+        var indexReader = createIndex(
+                new EntryDataWithWordMeta(100, 101, wm(50, 51))
+        );
+
+        assertEquals(1, indexReader.numDocuments(50));
+
+        long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
+        assertArrayEquals(new long[] { 51 }, meta);
+        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
+    }
+
+    @Test
+    public void test2x2() throws IOException {
+
+        var indexReader = createIndex(
+                new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
+                new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
+        );
+
+        assertEquals(1, indexReader.numDocuments(50));
+        assertEquals(2, indexReader.numDocuments(51));
+        assertEquals(1, indexReader.numDocuments(52));
+
+        assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
+        assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
+
+        assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
+        assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
+
+        assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
+        assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
+    }
+
+    private long[] readEntries(ReverseIndexReader reader, long wordId) {
+        var es = reader.documents(wordId);
+        assertTrue(es.hasMore());
+        LongQueryBuffer buffer = new LongQueryBuffer(4);
+        es.read(buffer);
+        assertFalse(es.hasMore());
+        return buffer.copyData();
+    }
+
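Reviewer's note: readEntries() above assumes a term's full posting list arrives in a single read of a 4-slot buffer, which holds for these tiny fixtures but not in general. A minimal sketch of a more general drain loop, using only the EntrySource and LongQueryBuffer calls that appear elsewhere in this patch (the helper name readAll is made up for illustration):

```java
// Editorial sketch, not part of the patch: drain an EntrySource that may
// span several reads. Assumes java.util imports and that buffer.reset()
// restores capacity between reads, as in the tests this patch deletes.
static long[] readAll(EntrySource es) {
    List<Long> acc = new ArrayList<>();
    LongQueryBuffer buffer = new LongQueryBuffer(4);
    while (es.hasMore()) {
        buffer.reset();                      // make room for the next batch
        es.read(buffer);
        for (long v : buffer.copyData())
            acc.add(v);
    }
    return acc.stream().mapToLong(Long::longValue).toArray();
}
```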
+    private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
+        var reader = journalFactory.createReader(scenario);
+        var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
+
+        Path docsFile = tempDir.resolve("docs.dat");
+        Path wordsFile = tempDir.resolve("words.dat");
+
+        preindex.finalizeIndex(docsFile, wordsFile);
+        preindex.delete();
+
+        return new ReverseIndexReader(wordsFile, docsFile);
+    }
+}
\ No newline at end of file
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java
new file mode 100644
index 00000000..6d3b7bf4
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java
@@ -0,0 +1,171 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.algo.SortingContext;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class ReversePreindexDocsTest {
+    Path countsFile;
+    Path wordsIdFile;
+    Path docsFile;
+    Path tempDir;
+
+    TestJournalFactory journalFactory;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        countsFile = Files.createTempFile("counts", ".dat");
+        wordsIdFile = Files.createTempFile("words", ".dat");
+        docsFile = Files.createTempFile("docs", ".dat");
+        tempDir = Files.createTempDirectory("sort");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        Files.deleteIfExists(countsFile);
+        Files.deleteIfExists(wordsIdFile);
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    @Test
+    public void testDocs() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
+                new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
+                new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
+                new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        var iter = segments.iterator(2);
+        while (iter.next()) {
+            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
+            docs.slice(iter.startOffset, iter.endOffset).get(0, data);
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
+                    data));
+        }
+
+        assertEquals(expected, actual);
+    }
+
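Reviewer's note: the expected data arrays above interleave document id and term metadata, two longs per posting, which is why the segments are iterated with iterator(2) and all offsets are even. A minimal sketch of unpacking one segment under that assumption (the Posting record and decode helper are made up for illustration):

```java
// Editorial sketch, not part of the patch: a segment's backing array
// stores postings as (docId, termMeta) pairs -- two longs per entry.
// Assumes java.util imports.
record Posting(long docId, long termMeta) {}

static List<Posting> decode(long[] segmentData) {
    List<Posting> postings = new ArrayList<>();
    for (int i = 0; i + 1 < segmentData.length; i += 2) {
        postings.add(new Posting(segmentData[i], segmentData[i + 1]));
    }
    return postings;
}
```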
+    @Test
+    public void testDocsRepeatedWord() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 4, 4)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        var iter = segments.iterator(2);
+        while (iter.next()) {
+            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
+            docs.slice(iter.startOffset, iter.endOffset).get(0, data);
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
+                    data));
+        }
+
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testDocs2() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
+                new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
+                new TestSegmentData(10, 4, 6, new long[] { -0xF00BA3L, 0 }),
+                new TestSegmentData(15, 6, 8, new long[] { 0xF00BA4L, 0 }),
+                new TestSegmentData(30, 8, 10, new long[] { 0xF00BA4L, 0 }),
+                new TestSegmentData(33, 10, 14, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
+                new TestSegmentData(40, 14, 16, new long[] { -0xF00BA3L, 0 })
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        var iter = segments.iterator(2);
+        while (iter.next()) {
+            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
+            docs.slice(iter.startOffset, iter.endOffset).get(0, data);
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
+                    data));
+        }
+
+        assertEquals(expected, actual);
+    }
+
+    record TestSegmentData(long wordId, long start, long end, long[] data) {
+        public TestSegmentData(long wordId, long start, long end) {
+            this(wordId, start, end, null);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            TestSegmentData that = (TestSegmentData) o;
+
+            if (wordId != that.wordId) return false;
+            if (start != that.start) return false;
+            if (end != that.end) return false;
+            return Arrays.equals(data, that.data);
+        }
+
+        @Override
+        public int hashCode() {
+            int result = (int) (wordId ^ (wordId >>> 32));
+            result = 31 * result + (int) (start ^ (start >>> 32));
+            result = 31 * result + (int) (end ^ (end >>> 32));
+            result = 31 * result + Arrays.hashCode(data);
+            return result;
+        }
+
+        @Override
+        public String toString() {
+            return "TestSegmentData{" +
+                    "wordId=" + wordId +
+                    ", start=" + start +
+                    ", end=" + end +
+                    ", data=" + Arrays.toString(data) +
+                    '}';
+        }
+    }
+}
\ No newline at end of file
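Reviewer's note: TestSegmentData overrides equals/hashCode because record components of array type otherwise compare by reference identity, which would make every expected/actual comparison above fail even when the contents match. A minimal illustration of the default behaviour (the record name Bad is made up):

```java
import java.util.Arrays;

// Editorial sketch, not part of the patch: records compare array
// components with ==, so value-based tests need an override.
record Bad(long[] data) {}

class Demo {
    public static void main(String[] args) {
        // false: same contents, but different array instances
        System.out.println(new Bad(new long[] {1}).equals(new Bad(new long[] {1})));
        // true: Arrays.equals compares element by element
        System.out.println(Arrays.equals(new long[] {1}, new long[] {1}));
    }
}
```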
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
new file mode 100644
index 00000000..cc79ebac
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
@@ -0,0 +1,143 @@
+
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.SortingContext;
+import nu.marginalia.btree.BTreeReader;
+import nu.marginalia.btree.model.BTreeHeader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class ReversePreindexFinalizeTest {
+    TestJournalFactory journalFactory;
+    Path countsFile;
+    Path wordsIdFile;
+    Path docsFile;
+    Path tempDir;
+    SortingContext sortingContext;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        countsFile = Files.createTempFile("counts", ".dat");
+        wordsIdFile = Files.createTempFile("words", ".dat");
+        docsFile = Files.createTempFile("docs", ".dat");
+        tempDir = Files.createTempDirectory("sort");
+        sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        Files.deleteIfExists(countsFile);
+        Files.deleteIfExists(wordsIdFile);
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    @Test
+    public void testFinalizeSimple() throws IOException {
+        var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
+        var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
+
+        preindex.finalizeIndex(tempDir.resolve("docs.dat"), tempDir.resolve("words.dat"));
+        preindex.delete();
+
+        Path wordsFile = tempDir.resolve("words.dat");
+        Path docsFile = tempDir.resolve("docs.dat");
+
+        assertTrue(Files.exists(wordsFile));
+        assertTrue(Files.exists(docsFile));
+
+        System.out.println(Files.size(wordsFile));
+        System.out.println(Files.size(docsFile));
+
+        var docsArray = LongArray.mmapRead(docsFile);
+        var wordsArray = LongArray.mmapRead(wordsFile);
+
+        var docsHeader = BTreeReader.readHeader(docsArray, 0);
+        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+
+        assertEquals(1, docsHeader.numEntries());
+        assertEquals(1, wordsHeader.numEntries());
+
+        assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
+        assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
+        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
+        assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
+    }
+
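Reviewer's note: the offset arithmetic in these assertions reflects the finalized words-file layout as this test exercises it: a BTree whose data region interleaves (wordId, docsOffset) pairs, each docsOffset locating a per-word BTree in the docs file. A minimal sketch of walking that structure, assuming only the BTreeReader and LongArray calls already used here, with the two-longs-per-entry layout inferred from the +1/+3 offsets below (the helper name walk is made up):

```java
// Editorial sketch, not part of the patch: enumerate (wordId, docsOffset)
// pairs from a finalized words file and peek at each word's docs BTree.
static void walk(LongArray wordsArray, LongArray docsArray) {
    var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
    long base = wordsHeader.dataOffsetLongs();
    for (long i = 0; i < wordsHeader.numEntries(); i++) {
        long wordId     = wordsArray.get(base + 2 * i);     // entry size 2
        long docsOffset = wordsArray.get(base + 2 * i + 1);
        var docsHeader = BTreeReader.readHeader(docsArray, docsOffset);
        System.out.println(wordId + " -> " + docsHeader.numEntries() + " postings");
    }
}
```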
+    @Test
+    public void testFinalizeSimple2x2() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryDataWithWordMeta(100, 101, wm(50, 51)),
+                new EntryDataWithWordMeta(101, 101, wm(51, 52))
+        );
+
+        var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
+
+        preindex.finalizeIndex(tempDir.resolve("docs.dat"), tempDir.resolve("words.dat"));
+        preindex.delete();
+
+        Path wordsFile = tempDir.resolve("words.dat");
+        Path docsFile = tempDir.resolve("docs.dat");
+
+        assertTrue(Files.exists(wordsFile));
+        assertTrue(Files.exists(docsFile));
+
+        System.out.println(Files.size(wordsFile));
+        System.out.println(Files.size(docsFile));
+
+        var docsArray = LongArray.mmapRead(docsFile);
+        var wordsArray = LongArray.mmapRead(wordsFile);
+
+        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);
+
+        System.out.println(wordsHeader);
+
+        assertEquals(2, wordsHeader.numEntries());
+
+        long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
+        long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
+
+        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
+        assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
+        assertEquals(51, wordsArray.get(wordsHeader.dataOffsetLongs() + 2));
+
+        BTreeHeader docsHeader;
+
+        docsHeader = BTreeReader.readHeader(docsArray, offset1);
+        System.out.println(docsHeader);
+        assertEquals(1, docsHeader.numEntries());
+
+        assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
+        assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
+
+        docsHeader = BTreeReader.readHeader(docsArray, offset2);
+        System.out.println(docsHeader);
+        assertEquals(1, docsHeader.numEntries());
+
+        assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
+        assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
+    }
+}
\ No newline at end of file
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java
new file mode 100644
index 00000000..5d9d42f2
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java
@@ -0,0 +1,427 @@
+
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.algo.SortingContext;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+
+import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class ReversePreindexMergeTest {
+    TestJournalFactory journalFactory;
+    Path countsFile;
+    Path wordsIdFile;
+    Path docsFile;
+    Path tempDir;
+    SortingContext sortingContext;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        countsFile = Files.createTempFile("counts", ".dat");
+        wordsIdFile = Files.createTempFile("words", ".dat");
+        docsFile = Files.createTempFile("docs", ".dat");
+        tempDir = Files.createTempDirectory("sort");
+        sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        Files.deleteIfExists(countsFile);
+        Files.deleteIfExists(wordsIdFile);
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    public ReversePreindex runMergeScenario(
+            List<EntryDataWithWordMeta> leftData,
+            List<EntryDataWithWordMeta> rightData
+    ) throws IOException {
+        var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
+        var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
+
+        var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir);
+        var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir);
+        return ReversePreindex.merge(tempDir, left, right);
+    }
+
+    private List<TestSegmentData> getData(ReversePreindex merged) {
+        var iter = merged.segments.iterator(2);
+        List<TestSegmentData> actual = new ArrayList<>();
+        while (iter.next()) {
+            long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
+            merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
+                    data));
+        }
+        return actual;
+    }
+
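Reviewer's note: ReversePreindex.merge combines two preindexes; per word, the merged segment should hold the union of both posting lists with duplicate document ids dropped, which is what the simulateMerge reference model at the bottom of this file encodes. A minimal sketch of that per-word merge rule, assuming plain sorted lists of {docId, meta} pairs rather than the real on-disk structures:

```java
// Editorial sketch, not part of the patch: merge two posting lists sorted
// by docId, keeping the left-hand entry when a docId occurs in both.
// Assumes java.util imports; each long[] is a {docId, meta} pair.
static List<long[]> mergePostings(List<long[]> left, List<long[]> right) {
    List<long[]> merged = new ArrayList<>();
    int i = 0, j = 0;
    while (i < left.size() || j < right.size()) {
        long[] next;
        if (j >= right.size() || (i < left.size() && left.get(i)[0] <= right.get(j)[0]))
            next = left.get(i++);
        else
            next = right.get(j++);
        if (merged.isEmpty() || merged.get(merged.size() - 1)[0] != next[0])
            merged.add(next);            // drop duplicate docIds
    }
    return merged;
}
```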
+    @Test
+    public void testDocsMergeSingleNoOverlap() throws IOException {
+        IdSequence docIds = new IdSequence();
+        IdSequence docMetas = new IdSequence();
+        IdSequence wordMetas = new IdSequence();
+        IdSequence wordIds = new IdSequence();
+
+        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
+        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
+
+        var merged = runMergeScenario(
+                leftSequence,
+                rightSequence
+        );
+
+        var actual = getData(merged);
+
+        var expected = simulateMerge(leftSequence, rightSequence);
+
+        System.out.println(actual);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testDocsMergeSingleOnlyOverlap() throws IOException {
+        IdSequence docIds = new IdSequence();
+        IdSequence docMetas = new IdSequence();
+        IdSequence wordMetas = new IdSequence();
+        IdSequence wordIds = new IdSequence();
+
+        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
+        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
+
+        var merged = runMergeScenario(
+                leftSequence,
+                rightSequence
+        );
+
+        var actual = getData(merged);
+
+        var expected = simulateMerge(leftSequence, rightSequence);
+
+        System.out.println(actual);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testDocsMergeSingleOnlyOverlap2() throws IOException {
+        long wid1 = 1;
+        long wid2 = 2;
+        IdSequence docIds = new IdSequence();
+        IdSequence docMetas = new IdSequence();
+        IdSequence wordMetas = new IdSequence();
+
+        var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
+                wm(wid1, wordMetas.nextUnique()),
+                wm(wid2, wordMetas.nextUnique())
+        ));
+        var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
+                wm(wid1, wordMetas.nextUnique()),
+                wm(wid2, wordMetas.nextUnique())
+        ));
+
+        var merged = runMergeScenario(
+                leftSequence,
+                rightSequence
+        );
+
+        var actual = getData(merged);
+
+        var expected = simulateMerge(leftSequence, rightSequence);
+
+        System.out.println(actual);
+        assertEquals(expected, actual);
+    }
+
rightSequence + ); + var mergedRL = runMergeScenario( + rightSequence, + leftSequence + ); + + var actualLR = getData(mergedLR); + var actualRL = getData(mergedRL); + + var expected = simulateMerge(leftSequence, rightSequence); + + assertEquals(actualLR, actualRL); + + if (!expected.equals(actualLR)) { + System.out.println("*fail*"); + System.out.println(leftSequence); + System.out.println(rightSequence); + } + else { + System.out.println("*pass*"); + } + + assertEquals(expected, actualLR); + + } + + @Test + public void testFuzz() throws IOException { + Random r = new Random(); + int maxDocs = 150; + int maxWords = 160; + int nIters = 1000; + + for (int i = 0; i < nIters; i++) { + int nLeft = 1 + r.nextInt(maxDocs); + int nRight = 1 + r.nextInt(maxDocs); + + IdSequence docIdsLeft = new IdSequence(); + IdSequence docIdsRight = new IdSequence(); + IdSequence docMetas = new IdSequence(); + IdSequence wordMetas = new IdSequence(); + IdSequence wordIds = new IdSequence(); + + List leftSequence = new ArrayList<>(nLeft); + for (int j = 0; j < nLeft; j++) { + WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)]; + Arrays.setAll(words, idx -> { + long wordId = wordIds.seenWithP(1.0); + long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId); + return wm(wordId, wordMeta); + }); + + long docId = docIdsLeft.nextUnique(); + long docMeta = docMetas.nextUniqueAssociatedWithKey(docId); + leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words)); + } + + List rightSequence = new ArrayList<>(nLeft); + for (int j = 0; j < nRight; j++) { + WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)]; + Arrays.setAll(words, idx -> { + long wordId = wordIds.seenWithP(1.0); + long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId); + return wm(wordId, wordMeta); + }); + + long docId = docIdsRight.seenWithP(docIdsLeft, 0.1); + long docMeta = docMetas.nextUniqueAssociatedWithKey(docId); + rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words)); + } + + var mergedLR = runMergeScenario( + leftSequence, + rightSequence + ); + var mergedRL = runMergeScenario( + rightSequence, + leftSequence + ); + + var actualLR = getData(mergedLR); + var actualRL = getData(mergedRL); + + var expected = simulateMerge(leftSequence, rightSequence); + + assertEquals(actualLR, actualRL); + + if (!expected.equals(actualLR)) { + System.out.println("*fail*"); + System.out.println(leftSequence); + System.out.println(rightSequence); + } + else { + System.out.println("*pass*"); + } + + assertEquals(expected, actualLR); + + } + } + + + public List simulateMerge( + Collection leftInputs, + Collection rightInputs + ) { + TreeMap> wordToDocs = new TreeMap<>(); + + for (var entry : leftInputs) { + for (var wm : entry.wordIds()) { + wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add( + new DocWithMeta(entry.docId(), wm.meta()) + ); + } + } + for (var entry : rightInputs) { + for (var wm : entry.wordIds()) { + wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add( + new DocWithMeta(entry.docId(), wm.meta()) + ); + } + } + + List ret = new ArrayList<>(); + int[] start = new int[1]; + wordToDocs.forEach((wordId, docsList) -> { + docsList.sort(Comparator.naturalOrder()); + var iter = docsList.iterator(); + DocWithMeta prevVal = null; + DocWithMeta currentVal; + while (iter.hasNext()) { + currentVal = iter.next(); + if (prevVal != null) { + if (currentVal.docId == prevVal.docId) { + iter.remove(); + } + } + prevVal = currentVal; + + } + 
+    public List<TestSegmentData> simulateMerge(
+            Collection<EntryDataWithWordMeta> leftInputs,
+            Collection<EntryDataWithWordMeta> rightInputs
+    ) {
+        TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
+
+        for (var entry : leftInputs) {
+            for (var wm : entry.wordIds()) {
+                wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
+                        new DocWithMeta(entry.docId(), wm.meta())
+                );
+            }
+        }
+        for (var entry : rightInputs) {
+            for (var wm : entry.wordIds()) {
+                wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
+                        new DocWithMeta(entry.docId(), wm.meta())
+                );
+            }
+        }
+
+        List<TestSegmentData> ret = new ArrayList<>();
+        int[] start = new int[1];
+        wordToDocs.forEach((wordId, docsList) -> {
+            docsList.sort(Comparator.naturalOrder());
+            var iter = docsList.iterator();
+            DocWithMeta prevVal = null;
+            DocWithMeta currentVal;
+            while (iter.hasNext()) {
+                currentVal = iter.next();
+                if (prevVal != null) {
+                    if (currentVal.docId == prevVal.docId) {
+                        iter.remove();
+                    }
+                }
+                prevVal = currentVal;
+            }
+
+            long[] data = new long[docsList.size()*2];
+            for (int i = 0; i < docsList.size(); i++) {
+                data[2*i] = docsList.get(i).docId;
+                data[2*i + 1] = docsList.get(i).meta;
+            }
+            ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
+
+            start[0] += data.length;
+        });
+        return ret;
+    }
+
+    record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
+
+        @Override
+        public int compareTo(DocWithMeta o) {
+            return Long.compare(docId, o.docId);
+        }
+    }
+
+    class IdSequence {
+        Set<Long> seen = new HashSet<>();
+        Map<Long, Long> associatedValues = new HashMap<>();
+        private Random random = new Random();
+
+        /** Return alreadySeen() with probability p,
+         *  else nextUnique()
+         */
+        public long seenWithP(double p) {
+            if (isEmpty() || random.nextDouble() > p)
+                return nextUnique();
+
+            return alreadySeenSameSequence();
+        }
+
+        public long seenWithP(IdSequence other, double p) {
+            if (isEmpty() || random.nextDouble() > p)
+                return nextUnique();
+
+            return alreadySeenOtherSequence(other);
+        }
+
+        public long nextUnique() {
+            for (;;) {
+                long val = random.nextLong();
+                if (seen.add(val)) {
+                    return val;
+                }
+            }
+        }
+
+        public long nextUniqueAssociatedWithKey(long key) {
+            return associatedValues.computeIfAbsent(key, k -> nextUnique());
+        }
+
+        public long alreadySeenSameSequence() {
+            long[] values = seen.stream().mapToLong(Long::longValue).toArray();
+            int idx = random.nextInt(0, values.length);
+            return values[idx];
+        }
+
+        public long alreadySeenOtherSequence(IdSequence other) {
+            List<Long> values = new ArrayList<>(other.seen);
+            Collections.shuffle(values);
+            for (Long maybe : values) {
+                if (seen.add(maybe))
+                    return maybe;
+            }
+            return nextUnique();
+        }
+
+        public boolean isEmpty() {
+            return seen.isEmpty();
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java
new file mode 100644
index 00000000..1f5556ac
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java
@@ -0,0 +1,232 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.SortingContext;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static org.junit.jupiter.api.Assertions.*;
+
+class ReversePreindexWordSegmentsTest {
+    Path countsFile;
+    Path wordsIdFile;
+    Path docsFile;
+    Path tempDir;
+
+    TestJournalFactory journalFactory;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        countsFile = Files.createTempFile("counts", ".dat");
+        wordsIdFile = Files.createTempFile("words", ".dat");
+        docsFile = Files.createTempFile("docs", ".dat");
+        tempDir = Files.createTempDirectory("sort");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        Files.deleteIfExists(countsFile);
+        Files.deleteIfExists(wordsIdFile);
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
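Reviewer's note: the tests below treat ReversePreindexWordSegments as a mapping from each distinct wordId in the journal to a contiguous [start, end) range of slots, with ranges laid out in sorted wordId order. A minimal model of that bookkeeping, using nothing beyond plain arrays (the helper name segmentRanges is made up for illustration):

```java
import java.util.ArrayList;
import java.util.List;

// Editorial sketch, not part of the patch: the segment model asserted by
// the tests below. Each distinct word gets a [start, end) range sized by
// its posting count times the entry size (1 or 2 in these tests).
class SegmentModel {
    static List<long[]> segmentRanges(long[] sortedWordIds, int entrySize) {
        List<long[]> ranges = new ArrayList<>();   // each is {wordId, start, end}
        long offset = 0;
        for (int i = 0; i < sortedWordIds.length; ) {
            int j = i;
            while (j < sortedWordIds.length && sortedWordIds[j] == sortedWordIds[i])
                j++;                               // count repeats of this word
            long end = offset + (long) entrySize * (j - i);
            ranges.add(new long[] { sortedWordIds[i], offset, end });
            offset = end;
            i = j;
        }
        return ranges;
    }
}
```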
+    @Test
+    public void testWordSegmentsLongWordId() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 1L<<33)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var iter = segments.iterator(1);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(1L<<33, 0, 1)
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        while (iter.next()) {
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
+        }
+
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testWordSegmentsRepeatedWordId() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 5, 5)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var iter = segments.iterator(1);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(5, 0, 2)
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        while (iter.next()) {
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
+        }
+
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testWordSegments1() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var iter = segments.iterator(1);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(-100, 0, 1),
+                new TestSegmentData(10, 1, 2),
+                new TestSegmentData(33, 2, 3),
+                new TestSegmentData(40, 3, 4)
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        while (iter.next()) {
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
+        }
+
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testWordSegments2() throws IOException {
+        var reader = journalFactory.createReader(
+                new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
+                new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
+        );
+
+        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var iter = segments.iterator(1);
+
+        List<TestSegmentData> expected = List.of(
+                new TestSegmentData(-100, 0, 2),
+                new TestSegmentData(10, 2, 3),
+                new TestSegmentData(15, 3, 4),
+                new TestSegmentData(30, 4, 5),
+                new TestSegmentData(33, 5, 7),
+                new TestSegmentData(40, 7, 8)
+        );
+
+        List<TestSegmentData> actual = new ArrayList<>();
+
+        while (iter.next()) {
+            actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
+        }
+
+        assertEquals(expected, actual);
+    }
+
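Reviewer's note: in the read-iterator test below, countsArray {2, 1, 3, 5} is consumed as cumulative extents: the four words get the ranges [0,2), [2,3), [3,6) and [6,11), which is where the asserted start and end offsets come from. The prefix-sum relationship as a small sketch (the helper name rangesFromCounts is made up):

```java
// Editorial sketch, not part of the patch: the offsets asserted in
// testWordSegments_ReadIterator are prefix sums over the counts.
static long[][] rangesFromCounts(long[] counts) {
    long[][] ranges = new long[counts.length][2];
    long offset = 0;
    for (int i = 0; i < counts.length; i++) {
        ranges[i][0] = offset;               // startOffset
        offset += counts[i];
        ranges[i][1] = offset;               // endOffset
    }
    return ranges;                           // {2,1,3,5} -> [0,2) [2,3) [3,6) [6,11)
}
```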
+    @Test
+    public void testWordSegments_ReadIterator() {
+        LongArray wordsArray = LongArray.allocate(4);
+        LongArray countsArray = LongArray.allocate(4);
+        wordsArray.set(0, -1, -2, -3, -4);
+        countsArray.set(0, 2, 1, 3, 5);
+        var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
+
+        var ritr = segments.iterator(1);
+        assertTrue(ritr.hasMorePositions());
+        assertTrue(ritr.next());
+        assertTrue(ritr.isPositionBeforeEnd());
+        assertEquals(-1, ritr.wordId);
+        assertEquals(0, ritr.idx());
+        assertEquals(0, ritr.startOffset);
+        assertEquals(2, ritr.endOffset);
+
+        assertTrue(ritr.hasMorePositions());
+        assertTrue(ritr.next());
+        assertTrue(ritr.isPositionBeforeEnd());
+        assertEquals(-2, ritr.wordId);
+        assertEquals(1, ritr.idx());
+        assertEquals(2, ritr.startOffset);
+        assertEquals(3, ritr.endOffset);
+
+        assertTrue(ritr.hasMorePositions());
+        assertTrue(ritr.next());
+        assertTrue(ritr.isPositionBeforeEnd());
+        assertEquals(-3, ritr.wordId);
+        assertEquals(2, ritr.idx());
+        assertEquals(3, ritr.startOffset);
+        assertEquals(6, ritr.endOffset);
+
+        assertTrue(ritr.hasMorePositions());
+        assertTrue(ritr.next());
+        assertTrue(ritr.isPositionBeforeEnd());
+        assertEquals(-4, ritr.wordId);
+        assertEquals(3, ritr.idx());
+        assertEquals(6, ritr.startOffset);
+        assertEquals(11, ritr.endOffset);
+
+        assertFalse(ritr.hasMorePositions());
+        assertFalse(ritr.next());
+        assertFalse(ritr.isPositionBeforeEnd());
+
+        assertEquals(Long.MIN_VALUE, ritr.wordId);
+    }
+
+    @Test
+    public void testWordSegments_ConstructionIterator() {
+        LongArray wordsArray = LongArray.allocate(4);
+        LongArray countsArray = LongArray.allocate(4);
+        wordsArray.set(0, -1, -2, -3, -4);
+        var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
+
+        var citr = segments.constructionIterator(1);
+        assertEquals(-1, citr.wordId);
+        assertEquals(0, citr.idx());
+        assertTrue(citr.canPutMore());
+        assertTrue(citr.putNext(1));
+        assertEquals(1, countsArray.get(0));
+
+        assertEquals(-2, citr.wordId);
+        assertEquals(1, citr.idx());
+        assertTrue(citr.canPutMore());
+        assertTrue(citr.putNext(2));
+        assertEquals(2, countsArray.get(1));
+
+        assertEquals(-3, citr.wordId);
+        assertEquals(2, citr.idx());
+        assertTrue(citr.canPutMore());
+        assertTrue(citr.putNext(3));
+        assertEquals(3, countsArray.get(2));
+
+        assertEquals(-4, citr.wordId);
+        assertEquals(3, citr.idx());
+        assertTrue(citr.canPutMore());
+        assertFalse(citr.putNext(4));
+        assertEquals(4, countsArray.get(3));
+
+        assertEquals(4, citr.idx());
+        assertFalse(citr.canPutMore());
+        assertEquals(Long.MIN_VALUE, citr.wordId);
+    }
+
+}
\ No newline at end of file
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestJournalFactory.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestJournalFactory.java
new file mode 100644
index 00000000..5fdb0ac1
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestJournalFactory.java
@@ -0,0 +1,93 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
+import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class TestJournalFactory {
+    Path tempDir = Files.createTempDirectory("journal");
+
+    public TestJournalFactory() throws IOException {}
+
+    public void clear() throws IOException {
+        List<Path> toDelete = new ArrayList<>();
+        try (var dirStream = Files.list(tempDir)) {
+            dirStream.forEach(toDelete::add);
+        }
+        for (var tempFile : toDelete) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    public record EntryData(long docId, long docMeta, long... wordIds) {
+        @Override
+        public String toString() {
+            return "EntryData{" +
+                    "docId=" + docId +
+                    ", docMeta=" + docMeta +
+                    ", wordIds=" + Arrays.toString(wordIds) +
+                    '}';
+        }
+    }
+    public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta...
wordIds) { + @Override + public String toString() { + return "EntryDataWithWordMeta{" + + "docId=" + docId + + ", docMeta=" + docMeta + + ", wordIds=" + Arrays.toString(wordIds) + + '}'; + } + } + public record WordWithMeta(long wordId, long meta) {} + + public static WordWithMeta wm(long wordId, long meta) { + return new WordWithMeta(wordId, meta); + } + + IndexJournalReader createReader(EntryData... entries) throws IOException { + Path jf = Files.createTempFile(tempDir, "journal", ".dat"); + + var writer = new IndexJournalWriterSingleFileImpl(jf); + for (var entry : entries) { + long[] data = new long[entry.wordIds.length * 2]; + for (int i = 0; i < entry.wordIds.length; i++) + data[i*2] = entry.wordIds[i]; + + writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), + new IndexJournalEntryData(data)); + } + writer.close(); + var ret = new IndexJournalReaderSingleCompressedFile(jf); + return ret; + } + + public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException { + Path jf = Files.createTempFile(tempDir, "journal", ".dat"); + + var writer = new IndexJournalWriterSingleFileImpl(jf); + for (var entry : entries) { + long[] data = new long[entry.wordIds.length * 2]; + for (int i = 0; i < entry.wordIds.length; i++) { + data[i * 2] = entry.wordIds[i].wordId; + data[i * 2 + 1] = entry.wordIds[i].meta; + } + + writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), + new IndexJournalEntryData(data)); + } + writer.close(); + var ret = new IndexJournalReaderSingleCompressedFile(jf); + return ret; + } +} diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestSegmentData.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestSegmentData.java new file mode 100644 index 00000000..574bb61a --- /dev/null +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/TestSegmentData.java @@ -0,0 +1,41 @@ +package nu.marginalia.index.construction; + +import java.util.Arrays; + +record TestSegmentData(long wordId, long start, long end, long[] data) { + public TestSegmentData(long wordId, long start, long end) { + this(wordId, start, end, null); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + TestSegmentData that = (TestSegmentData) o; + + if (wordId != that.wordId) return false; + if (start != that.start) return false; + if (end != that.end) return false; + return Arrays.equals(data, that.data); + } + + @Override + public int hashCode() { + int result = (int) (wordId ^ (wordId >>> 32)); + result = 31 * result + (int) (start ^ (start >>> 32)); + result = 31 * result + (int) (end ^ (end >>> 32)); + result = 31 * result + Arrays.hashCode(data); + return result; + } + + @Override + public String toString() { + return "TestSegmentData{" + + "wordId=" + wordId + + ", start=" + start + + ", end=" + end + + ", data=" + Arrays.toString(data) + + '}'; + } +} diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java deleted file mode 100644 index 7644d019..00000000 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ /dev/null @@ -1,140 +0,0 @@ -package 
nu.marginalia.index.reverse; - -import lombok.SneakyThrows; -import nu.marginalia.array.buffer.LongQueryBuffer; -import nu.marginalia.index.full.ReverseIndexFullConverter; -import nu.marginalia.index.full.ReverseIndexFullReader; -import nu.marginalia.index.journal.model.IndexJournalEntry; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; -import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.service.control.ServiceHeartbeat; -import nu.marginalia.service.control.ServiceTaskHeartbeat; -import nu.marginalia.test.TestUtil; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.stream.IntStream; -import java.util.stream.LongStream; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.mockito.Mockito.when; - -class ReverseIndexFullConverterTest { - KeywordLexicon keywordLexicon; - - Path indexFile; - Path wordsFile1; - Path urlsFile1; - Path dictionaryFile; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @BeforeEach - @SneakyThrows - void setUp() { - dictionaryFile = Files.createTempFile("tmp", ".dict"); - dictionaryFile.toFile().deleteOnExit(); - - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); - keywordLexicon.getOrInsert("0"); - - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - - - wordsFile1 = Files.createTempFile("words1", ".idx"); - urlsFile1 = Files.createTempFile("urls1", ".idx"); - } - - public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { - int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - - var entryBuilder = IndexJournalEntry.builder(id, DocumentMetadata.defaultValue()); - - for (int i = 0; i < factors.length; i++) { - entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]); - } - - writer.put(entryBuilder.build()); - } - - @Test - void testReverseIndex() throws IOException { - var writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); - - for (int i = 1; i < 512; i++) { - createEntry(writer, keywordLexicon, i); - } - - writer.close(); - - - Path tmpDir = Path.of("/tmp"); - Path dataDir = Files.createTempDirectory(getClass().getSimpleName()); - - var wordsFile = dataDir.resolve("urls.dat"); - var docsFile = dataDir.resolve("docs.dat"); - var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile); - - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ReverseIndexFullConverter( - serviceHeartbeat, - tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) - .convert(); - - var reverseIndexReader = new ReverseIndexFullReader(wordsFile, docsFile); - - 
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("1"))); - System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("2"))); - System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("3"))); - - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 1)); - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 1)); - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 2)); - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 2)); - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 3)); - System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 3)); - - var buffer = new LongQueryBuffer(32); - reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); - System.out.println(buffer); - - buffer.reset(); - reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); - System.out.println(buffer); - - buffer.reset(); - reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); - System.out.println(buffer); - - buffer.reset(); - var es = reverseIndexReader.documents(keywordLexicon.getReadOnly("7")); - do { - buffer.reset(); - es.read(buffer); - System.out.println(buffer); - } while (es.hasMore()); - - - TestUtil.clearTempDir(dataDir); - } -} \ No newline at end of file diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java deleted file mode 100644 index 4da283a0..00000000 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ /dev/null @@ -1,179 +0,0 @@ -package nu.marginalia.index.reverse; - -import lombok.SneakyThrows; -import nu.marginalia.array.buffer.LongQueryBuffer; -import nu.marginalia.index.full.ReverseIndexFullConverter; -import nu.marginalia.index.full.ReverseIndexFullReader; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.priority.ReverseIndexPriorityParameters; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; -import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.service.control.ServiceHeartbeat; -import nu.marginalia.service.control.ServiceTaskHeartbeat; -import nu.marginalia.test.TestUtil; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import 
java.nio.file.Path; -import java.util.Arrays; -import java.util.stream.IntStream; -import java.util.stream.LongStream; - -import static org.mockito.Mockito.when; - -class ReverseIndexFullConverterTest2 { - - KeywordLexicon keywordLexicon; - IndexJournalWriter writer; - - Path indexFile; - Path wordsFile1; - Path urlsFile1; - Path dictionaryFile; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - Path dataDir; - private Path wordsFile; - private Path docsFile; - - int workSetSize = 8192; - int workSetStart = 8000; - - @BeforeEach - @SneakyThrows - void setUp() { - dictionaryFile = Files.createTempFile("tmp", ".dict"); - dictionaryFile.toFile().deleteOnExit(); - - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); - keywordLexicon.getOrInsert("0"); - - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); - - wordsFile1 = Files.createTempFile("words1", ".idx"); - urlsFile1 = Files.createTempFile("urls1", ".idx"); - - dataDir = Files.createTempDirectory(getClass().getSimpleName()); - - for (int i = 1; i < workSetSize; i++) { - if (i < workSetStart) { - keywordLexicon.getOrInsert(Integer.toString(i)); - } - else { - createEntry(writer, keywordLexicon, i); - } - } - - keywordLexicon.commitToDisk(); - Thread.sleep(1000); - writer.close(); - - var reader = new IndexJournalReaderSingleCompressedFile(indexFile); - - wordsFile = dataDir.resolve("words.dat"); - docsFile = dataDir.resolve("docs.dat"); - } - - @AfterEach - public void tearDown() { - TestUtil.clearTempDir(dataDir); - } - - public int[] getFactorsI(int id) { - return IntStream.rangeClosed(1, id-1).toArray(); - } - public long[] getFactorsL(int id) { - return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - } - - long createId(long url, long domain) { - return (domain << 32) | url; - } - public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { - int[] factors = getFactorsI(id); - var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5); - - long[] data = new long[factors.length*2]; - for (int i = 0; i < factors.length; i++) { - data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = (i % 21 != 0) ? 
0 : -factors[i]; - } - - writer.put(header, new IndexJournalEntryData(data)); - } - - @Test - void testRev2() throws IOException { - - Path tmpDir = Path.of("/tmp"); - - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); - - var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); - - for (int i = workSetStart; i < workSetSize; i++) { - - var es = reverseReader.documents(i); - LongQueryBuffer lqb = new LongQueryBuffer(100); - while (es.hasMore()) { - lqb.reset(); - es.read(lqb); - System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); - } - System.out.println("--"); - } - - TestUtil.clearTempDir(dataDir); - } - - - @Test - void testRevP() throws IOException { - - Path tmpDir = Path.of("/tmp"); - - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ReverseIndexFullConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); - - var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile); - - for (int i = workSetStart; i < workSetSize; i++) { - - var es = reverseReader.documents(i); - LongQueryBuffer lqb = new LongQueryBuffer(100); - while (es.hasMore()) { - lqb.reset(); - es.read(lqb); - System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); - } - System.out.println("--"); - } - - TestUtil.clearTempDir(dataDir); - } - -} \ No newline at end of file diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java deleted file mode 100644 index a5ad6940..00000000 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ /dev/null @@ -1,179 +0,0 @@ -package nu.marginalia.index.reverse; - -import lombok.SneakyThrows; -import nu.marginalia.array.buffer.LongQueryBuffer; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; -import nu.marginalia.index.priority.ReverseIndexPriorityReader; -import nu.marginalia.index.priority.ReverseIndexPriorityConverter; -import nu.marginalia.index.priority.ReverseIndexPriorityParameters; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; -import nu.marginalia.ranking.DomainRankings; -import nu.marginalia.service.control.ServiceHeartbeat; -import nu.marginalia.service.control.ServiceTaskHeartbeat; -import nu.marginalia.test.TestUtil; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import 
org.junit.jupiter.api.Test; -import org.mockito.Mockito; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.stream.IntStream; -import java.util.stream.LongStream; - -import static org.mockito.Mockito.when; - -class ReverseIndexPriorityConverterTest2 { - - KeywordLexicon keywordLexicon; - IndexJournalWriter writer; - - Path indexFile; - Path wordsFile1; - Path urlsFile1; - Path dictionaryFile; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - Path dataDir; - private Path wordsFile; - private Path docsFile; - - int workSetSize = 8192; - int workSetStart = 8000; - - @BeforeEach - @SneakyThrows - void setUp() { - dictionaryFile = Files.createTempFile("tmp", ".dict"); - dictionaryFile.toFile().deleteOnExit(); - - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE)); - keywordLexicon.getOrInsert("0"); - - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - writer = new IndexJournalWriterImpl(keywordLexicon, indexFile); - - wordsFile1 = Files.createTempFile("words1", ".idx"); - urlsFile1 = Files.createTempFile("urls1", ".idx"); - - dataDir = Files.createTempDirectory(getClass().getSimpleName()); - - for (int i = 1; i < workSetSize; i++) { - if (i < workSetStart) { - keywordLexicon.getOrInsert(Integer.toString(i)); - } - else { - createEntry(writer, keywordLexicon, i); - } - } - - keywordLexicon.commitToDisk(); - Thread.sleep(1000); - writer.close(); - - var reader = new IndexJournalReaderSingleCompressedFile(indexFile); - - wordsFile = dataDir.resolve("words.dat"); - docsFile = dataDir.resolve("docs.dat"); - } - - @AfterEach - public void tearDown() { - TestUtil.clearTempDir(dataDir); - } - - public int[] getFactorsI(int id) { - return IntStream.rangeClosed(1, id-1).toArray(); - } - public long[] getFactorsL(int id) { - return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - } - - long createId(long url, long domain) { - return (domain << 32) | url; - } - public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { - int[] factors = getFactorsI(id); - var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5); - - long[] data = new long[factors.length*2]; - for (int i = 0; i < factors.length; i++) { - data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); - data[2*i + 1] = (i % 21 != 0) ? 
0 : -factors[i]; - } - - writer.put(header, new IndexJournalEntryData(data)); - } - - @Test - void testRev2() throws IOException { - - Path tmpDir = Path.of("/tmp"); - - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert(); - - var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); - - for (int i = workSetStart; i < workSetSize; i++) { - - var es = reverseReader.priorityDocuments(i); - LongQueryBuffer lqb = new LongQueryBuffer(100); - while (es.hasMore()) { - lqb.reset(); - es.read(lqb); - System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); - } - System.out.println("--"); - } - - TestUtil.clearTempDir(dataDir); - } - - - @Test - void testRevP() throws IOException { - - Path tmpDir = Path.of("/tmp"); - - // RIP fairies - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - - new ReverseIndexPriorityConverter(serviceHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); - - var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile); - - for (int i = workSetStart; i < workSetSize; i++) { - - var es = reverseReader.priorityDocuments(i); - LongQueryBuffer lqb = new LongQueryBuffer(100); - while (es.hasMore()) { - lqb.reset(); - es.read(lqb); - System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end))); - } - System.out.println("--"); - } - - TestUtil.clearTempDir(dataDir); - } - -} \ No newline at end of file diff --git a/code/features-index/lexicon/build.gradle b/code/features-index/lexicon/build.gradle deleted file mode 100644 index dfe3ef66..00000000 --- a/code/features-index/lexicon/build.gradle +++ /dev/null @@ -1,40 +0,0 @@ -plugins { - id 'java' - id "io.freefair.lombok" version "8.2.2" - - id 'jvm-test-suite' -} - - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(20)) - } -} - -dependencies { - - implementation project(':code:libraries:next-prime') - implementation libs.lombok - annotationProcessor libs.lombok - implementation libs.bundles.slf4j - - implementation libs.prometheus - implementation libs.guava - implementation libs.fastutil - implementation project(':third-party:commons-codec') - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - -test { - useJUnitPlatform() -} - -task fastTests(type: Test) { - useJUnitPlatform { - excludeTags "slow" - } -} diff --git a/code/features-index/lexicon/readme.md b/code/features-index/lexicon/readme.md deleted file mode 100644 index c4f0225c..00000000 --- a/code/features-index/lexicon/readme.md +++ /dev/null @@ -1,19 +0,0 @@ -# Lexicon - -The lexicon contains a mapping for words to identifiers. - -To ease index construction, it makes calculations easier if the domain of word identifiers is dense, that is, there is no gaps between ids; if there are 100 words, they're indexed 0-99 and not 5, 23, 107, 9999, 819235 etc. The lexicon exists to create such a mapping. 
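To make the dense-id idea concrete, here is a minimal sketch of such a mapping, assuming fastutil's Long2IntOpenHashMap. It is an illustration only, not the KeywordLexicon implementation this patch removes: each previously unseen 64-bit word hash is assigned the next consecutive integer, so the id space never develops gaps.

```java
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

// Minimal sketch of a dense word-id mapping: ids are handed out in insertion
// order (0, 1, 2, ...), so the id space is gapless by construction.
class DenseIdSketch {
    private final Long2IntOpenHashMap ids = new Long2IntOpenHashMap();

    DenseIdSketch() {
        ids.defaultReturnValue(-1); // sentinel meaning "hash not seen yet"
    }

    int getOrInsert(long wordHash) {
        int id = ids.get(wordHash);
        if (id < 0) {
            id = ids.size();       // the next unused dense id
            ids.put(wordHash, id);
        }
        return id;
    }
}
```

Replaying the same journal in the same order reproduces the same ids, which is the property the journal-based reload in the deleted sources below depends on.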
- -This lexicon is populated from a journal. The actual word data isn't mapped, but rather a 64 bit hash. As a result of the birthday paradox, collisions will be rare up until about 2^32 words. - - -The lexicon is constructed by [processes/loading-process](../../processes/loading-process) and read when -[services-core/index-service](../../services-core/index-service) interprets queries. - -## Central Classes - -* [KeywordLexicon](src/main/java/nu/marginalia/lexicon/KeywordLexicon.java) -* [KeywordLexiconJournal](src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java) -* [DictionaryMap](src/main/java/nu/marginalia/dict/DictionaryMap.java) comes in two versions -* * [OnHeapDictionaryMap](src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java) - basically just a fastutil Long2IntOpenHashMap -* * [OffHeapDictionaryHashMap](src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java) - a heavily modified trove TLongIntHashMap that uses off heap memory diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java deleted file mode 100644 index ea291052..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryData.java +++ /dev/null @@ -1,42 +0,0 @@ -package nu.marginalia.dict; - -import java.util.ArrayList; - -public class DictionaryData { - private final int bankSize; - - private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100); - - public DictionaryData(int bankSize) { - this.bankSize = bankSize; - banks.add(new DictionaryDataBank(0, bankSize)); - } - - public int add(long key) { - var activeBank = banks.get(banks.size()-1); - int rb = activeBank.add(key); - - if (rb == -1) { - int end = activeBank.getEnd(); - var newBank = new DictionaryDataBank(end, bankSize); - rb = newBank.add(key); - - banks.add(newBank); - } - - return rb; - } - - - public long getKey(int offset) { - return banks.get(offset/ bankSize).getKey(offset); - } - public boolean keyEquals(int offset, long otherKey) { - return banks.get(offset/ bankSize).keyEquals(offset, otherKey); - } - - public void clear() { - banks.clear(); - banks.add(new DictionaryDataBank(0, bankSize)); - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryDataBank.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryDataBank.java deleted file mode 100644 index 75798dcb..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryDataBank.java +++ /dev/null @@ -1,63 +0,0 @@ -package nu.marginalia.dict; - -import java.nio.ByteBuffer; -import java.nio.LongBuffer; - -class DictionaryDataBank { - - private final int start_idx; - - // Humongous long-lived arrays seem to sometimes yield considerable memory overhead and - // can make the GC behave poorly.
Using off-heap memory seems preferred when their - // lifetime is "forever" - - private final LongBuffer keys; - - private int size; - private final int capacity; - - - public DictionaryDataBank(int start_idx, int sz) { - this.start_idx = start_idx; - this.capacity = sz; - - keys = ByteBuffer.allocateDirect(8 * capacity).asLongBuffer(); - size = 0; - } - - public int getStart() { - return start_idx; - } - - public int getEnd() { - return start_idx + size; - } - - public long getKey(int idx) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - return keys.get(idx - start_idx); - } - - public boolean keyEquals(int idx, long other) { - if (idx < start_idx || idx - start_idx >= size) { - throw new IndexOutOfBoundsException(idx); - } - - return keys.get(idx - start_idx) == other; - } - - public int add(long newKey) { - if (size >= capacity) - return -1; - - keys.put(size, newKey); - - return start_idx + size++; - } - - public int getSize() { - return size; - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java deleted file mode 100644 index 260015be..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/DictionaryMap.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.dict; - -/** Backing store for the KeywordLexicon, available in on and off-heap versions. - *
- * The off-heap version is necessary when loading a lexicon that is too large to fit in RAM, due - * to Java's 2GB limit on the size of a single array. It is slower and less optimized than the on-heap version. - *
- * The off-heap version is on the precipice of being deprecated and its use is discouraged. - */ -public interface DictionaryMap { - int NO_VALUE = Integer.MIN_VALUE; - - static DictionaryMap create() { - // Default to on-heap version - // TODO: Make this configurable - - return new OnHeapDictionaryMap(); - } - - void clear(); - - int size(); - - int put(long key); - - int get(long key); -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java deleted file mode 100644 index 6a7aa07f..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java +++ /dev/null @@ -1,172 +0,0 @@ -package nu.marginalia.dict; - -import nu.marginalia.util.NextPrimeUtil; - -import java.nio.ByteBuffer; -import java.nio.IntBuffer; -import java.util.concurrent.atomic.AtomicInteger; - -import static java.lang.Math.round; - -/** - * Spiritually influenced by GNU Trove's hash maps - * LGPL 2.1 - */ -public class OffHeapDictionaryHashMap implements DictionaryMap { - - private final int bufferCount; - - private final IntBuffer[] buffers; - private final DictionaryData dictionaryData; - - private final long hashTableSize; - private final int bufferSizeBytes; - private final int intsPerBuffer; - private final long maxProbeLength; - - private final AtomicInteger sz = new AtomicInteger(0); - - public OffHeapDictionaryHashMap(long sizeMemory) { - final int intSize = 4; - - bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30)); - buffers = new IntBuffer[bufferCount]; - - // Actually use a prime size for Donald Knuth reasons - hashTableSize = NextPrimeUtil.nextPrime(sizeMemory, -1); - - intsPerBuffer = 1 + (int)(sizeMemory/ bufferCount); - bufferSizeBytes = intSize*intsPerBuffer; - maxProbeLength = sizeMemory/10; - - if (((long) bufferCount * intsPerBuffer) < sizeMemory) { - throw new Error("Buffer memory is less than requested memory; this data structure is not safe to use"); - } - - dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4))); - - initializeBuffers(); - } - - private void initializeBuffers() { - for (int b = 0; b < bufferCount; b++) { - buffers[b] = ByteBuffer.allocateDirect(bufferSizeBytes).asIntBuffer(); - - for (int i = 0; i < intsPerBuffer; i++) { - buffers[b].put(i, NO_VALUE); - } - } - } - - @Override - public void clear() { - dictionaryData.clear(); - initializeBuffers(); - sz.set(0); - } - - @Override - public int size() { - return sz.get(); - } - - private int getCell(long idx) { - int buffer = (int)(idx / intsPerBuffer); - int bufferIdx = (int)(idx % intsPerBuffer); - return buffers[buffer].get(bufferIdx); - } - private void setCell(long idx, int val) { - int buffer = (int)(idx / intsPerBuffer); - int bufferIdx = (int)(idx % intsPerBuffer); - - buffers[buffer].put(bufferIdx, val); - } - - @Override - public int put(long key) { - - long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; - - long idx = hash % hashTableSize; - - if (getCell(idx) == NO_VALUE) { - return setValue(key, idx); - } - - return putRehash(key, idx, hash); - } - - private int putRehash(long key, long idx, long hash) { - final long pStride = 1 + (hash % (hashTableSize - 2)); - - for (long j = 1; j < maxProbeLength; j++) { - idx = idx - pStride; - - if (idx < 0) { - idx += hashTableSize; - } - - final int val = getCell(idx); - - if (val == NO_VALUE) { - return setValue(key, idx); - } - else if (dictionaryData.keyEquals(val, key)) { 
- return val; - } - } - - throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%"); - } - - private int setValue(long key, long cell) { - sz.incrementAndGet(); - - int di = dictionaryData.add(key); - setCell(cell, di); - return di; - } - - @Override - public int get(long key) { - final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL; - final long cell = hash % hashTableSize; - - if (getCell(cell) == NO_VALUE) { - return NO_VALUE; - } - else { - int val = getCell(cell); - - if (dictionaryData.keyEquals(val, key)) { - return val; - } - } - - return getRehash(key, cell, hash); - } - - private int getRehash(long key, long idx, long hash) { - final long pStride = 1 + (hash % (hashTableSize - 2)); - - for (long j = 1; j < maxProbeLength; j++) { - idx = idx - pStride; - - if (idx < 0) { - idx += hashTableSize; - } - - final var val = getCell(idx); - - if (val == NO_VALUE) { - return NO_VALUE; - } - else if (dictionaryData.keyEquals(val, key)) { - return val; - } - } - - throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%"); - } - -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java b/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java deleted file mode 100644 index 4662cd5c..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java +++ /dev/null @@ -1,61 +0,0 @@ -package nu.marginalia.dict; - -import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; - -public class OnHeapDictionaryMap implements DictionaryMap { - /* Use three different hash tables to get around the limitations of Java's array sizes. - * - * Each map fits 0.75 * 2^30 entries (~800mn); the three maps together fit a bit over 2^31 entries. - * We're happy with 2^31. - * - * We'll assign each term to one of the three maps based on their modulo of 3. We'll pray each - * night that Long2IntOpenHashMap hash function is good enough to cope with this. The keys we are - * inserting are 64 bit hashes already, so odds are the rest of the bits have very good entropy. 
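The modulo-3 sharding described in the comment above can be sketched as follows. This is a simplified illustration under the same assumption (keys are already well-mixed 64-bit hashes), not the deleted class itself, whose full source follows.

```java
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

// Simplified sketch of the three-shard scheme described above: three maps,
// each kept well below Java's per-array entry limit, selected by unsigned modulo.
class ThreeShardSketch {
    private static final int NO_VALUE = Integer.MIN_VALUE;
    private final Long2IntOpenHashMap[] shards = new Long2IntOpenHashMap[3];

    ThreeShardSketch() {
        for (int i = 0; i < shards.length; i++) {
            shards[i] = new Long2IntOpenHashMap();
        }
    }

    // remainderUnsigned keeps a negative hash from producing a negative index
    private Long2IntOpenHashMap shardFor(long key) {
        return shards[(int) Long.remainderUnsigned(key, 3)];
    }

    int get(long key) {
        return shardFor(key).getOrDefault(key, NO_VALUE);
    }

    int put(long key, int id) {
        shardFor(key).putIfAbsent(key, id);
        return get(key);
    }
}
```

The unsigned remainder matters here: a plain % on a negative hash value would yield a negative shard index.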
- */ - private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000)/3; - private final Long2IntOpenHashMap[] entries = new Long2IntOpenHashMap[3]; - - public OnHeapDictionaryMap() { - for (int i = 0; i < entries.length; i++) { - entries[i] = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f); - } - } - - @Override - public void clear() { - for (var map : entries) { - map.clear(); - } - } - - @Override - public int size() { - int totalSize = 0; - for (var map : entries) { - totalSize += map.size(); - } - return totalSize; - } - - @Override - public int put(long key) { - int shardIdx = (int) Long.remainderUnsigned(key, 3); - var shard = entries[shardIdx]; - int size = size(); - - if (size == Integer.MAX_VALUE) - throw new IllegalStateException("DictionaryMap is full"); - - shard.putIfAbsent(key, size); - - return get(key); - } - - @Override - public int get(long key) { - int shardIdx = (int) Long.remainderUnsigned(key, 3); - var shard = entries[shardIdx]; - - return shard.getOrDefault(key, NO_VALUE); - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java deleted file mode 100644 index 9132f151..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexicon.java +++ /dev/null @@ -1,170 +0,0 @@ -package nu.marginalia.lexicon; - -import io.prometheus.client.Gauge; -import lombok.SneakyThrows; -import nu.marginalia.dict.DictionaryMap; -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalFingerprint; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -/** The keyword lexicon is used to map keywords to unique numeric IDs. - * This class is used to both construct the lexicon, and to read from it. - *
- * Readers will want to use the KeywordLexiconReadOnlyView wrapper, as it - * only exposes read-only methods and hides the mutating methods. - *
- * Between instances, the lexicon is stored in a journal file, exactly in the - * order they were received by the writer. The journal file is then replayed - * on startup to reconstruct the lexicon, giving each term an ID according to - * the order they are loaded. It is therefore important that the journal file - * is not tampered with, as this will cause the lexicon to be corrupted. - * */ - -public class KeywordLexicon implements AutoCloseable { - private final DictionaryMap reverseIndex; - - private final ReadWriteLock memoryLock = new ReentrantReadWriteLock(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private static final AtomicInteger instances = new AtomicInteger(); - - private static final Gauge request_time_metrics - = Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size") - .register(); - private final KeywordLexiconJournal journal; - - private volatile KeywordLexiconJournalFingerprint fingerprint = null; - - private final MurmurHash3_128 hasher = new MurmurHash3_128(); - - @SneakyThrows - public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) { - - journal = keywordLexiconJournal; - reverseIndex = DictionaryMap.create(); - - logger.info("Creating dictionary writer"); - - if (!instances.compareAndSet(0, 1)) { - logger.error("MULTIPLE LEXICON INSTANCES!"); - } - - reload(); - - logger.info("Done creating dictionary writer"); - } - - public boolean needsReload() throws IOException { - var newFingerprint = journal.journalFingerprint(); - return !newFingerprint.equals(fingerprint); - } - - /** Reload the lexicon from the journal */ - public void reload() throws IOException { - var lock = memoryLock.writeLock(); - lock.lock(); - try { - reverseIndex.clear(); - journal.loadFile(bytes -> reverseIndex.put(hasher.hash(bytes))); - fingerprint = journal.journalFingerprint(); - } - finally { - lock.unlock(); - } - } - - /** Get method that inserts the word into the lexicon if it is not present */ - public int getOrInsert(String macroWord) { - return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8)); - } - - /** Get method that inserts the word into the lexicon if it is not present */ - @SneakyThrows - private int getOrInsert(byte[] bytes) { - if (bytes.length >= Byte.MAX_VALUE) { - logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length); - return DictionaryMap.NO_VALUE; - } - - final long key = hasher.hash(bytes); - - int idx = getReadOnly(key); - - if (idx < 0) { - idx = insertNew(key, bytes); - } - - return idx; - } - - private int insertNew(long key, byte[] bytes) throws InterruptedException { - Lock lock = memoryLock.writeLock(); - int idx; - try { - lock.lock(); - - // Check again to prevent race condition - if ((idx = reverseIndex.get(key)) >= 0) - return idx; - - journal.enqueue(bytes); - idx = reverseIndex.put(key); - request_time_metrics.set(reverseIndex.size()); - - return idx; - } - finally { - lock.unlock(); - } - } - - /** Get method that does not modify the lexicon if the word is not present */ - public int getReadOnly(String word) { - final byte[] bytes = word.getBytes(StandardCharsets.UTF_8); - return getReadOnly(hasher.hash(bytes)); - } - - /** Get method that does not modify the lexicon if the word is not present */ - public int getReadOnly(long hashedKey) { - Lock lock = memoryLock.readLock(); - try { - lock.lock(); - return reverseIndex.get(hashedKey); - } - finally { - lock.unlock(); - } - } - - public long size() { - Lock lock = memoryLock.readLock(); - try { - lock.lock(); - return 
reverseIndex.size(); - } - finally { - lock.unlock(); - } - } - - @Override - public void close() throws Exception { - logger.warn("Closing Lexicon"); - - journal.close(); - } - - public void commitToDisk() { - journal.commitToDisk(); - } -} - diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java deleted file mode 100644 index 076cc84d..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/KeywordLexiconReadOnlyView.java +++ /dev/null @@ -1,42 +0,0 @@ -package nu.marginalia.lexicon; - -import com.google.common.cache.Cache; -import com.google.common.cache.CacheBuilder; -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -/** A read-only view of a keyword lexicon. - * - * @see KeywordLexicon - * */ -public class KeywordLexiconReadOnlyView { - private final KeywordLexicon writer; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build(); - - @SneakyThrows - public KeywordLexiconReadOnlyView(KeywordLexicon writer) { - this.writer = writer; - } - - @SneakyThrows - public int get(String word) { - return cache.get(word, () -> writer.getReadOnly(word)); - } - - public boolean suggestReload() throws IOException { - if (writer.needsReload()) { - logger.info("Reloading lexicon"); - writer.reload(); - cache.invalidateAll(); - } - else { - logger.info("Foregoing lexicon reload"); - } - return true; - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java deleted file mode 100644 index 01ba412b..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java +++ /dev/null @@ -1,114 +0,0 @@ -package nu.marginalia.lexicon.journal; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.attribute.BasicFileAttributes; -import java.util.List; -import java.util.function.Consumer; - -/** The journal for the keyword lexicon. - * It's used both for writing the lexicon, but also for reconstructing it for reading later. - */ -public class KeywordLexiconJournal { - - private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit"); - - private final KeywordLexiconJournalCommitQueue commitQueue; - private KeywordLexiconJournalFile journalFile; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final Thread commitToDiskThread; - - private volatile boolean running = true; - private final Path journalFilePath; - - /** Create a new journal. - * - * @param file The file to use for the journal. - * @param mode The mode to use for the journal. If READ_ONLY, the journal will be read-only and refuse - * to accept new entries. 
- */ - public KeywordLexiconJournal(File file, KeywordLexiconJournalMode mode) throws IOException { - journalFilePath = file.toPath(); - - if (mode == KeywordLexiconJournalMode.READ_WRITE) { - commitQueue = new KeywordLexiconJournalCommitQueue(); - journalFile = new KeywordLexiconJournalFile(file); - - commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread"); - commitToDiskThread.start(); - - Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk)); - } - else { - journalFile = new KeywordLexiconJournalFile(file); - - commitQueue = null; - commitToDiskThread = null; - } - } - - public void enqueue(byte[] word) throws InterruptedException { - if (null == commitQueue) - throw new UnsupportedOperationException("Lexicon journal is read-only"); - - commitQueue.enqueue(word); - } - - public KeywordLexiconJournalFingerprint journalFingerprint() throws IOException { - var attributes = Files.readAttributes(journalFilePath, BasicFileAttributes.class); - - long cTime = attributes.creationTime().toMillis(); - long mTime = attributes.lastModifiedTime().toMillis(); - long size = attributes.size(); - - return new KeywordLexiconJournalFingerprint(cTime, mTime, size); - } - - public void commitToDiskRunner() { - if (noCommit) return; - - while (running) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - commitToDisk(); - } - } - - public void commitToDisk() { - List entries = commitQueue.getQueuedEntries(); - - journalFile.writeEntriesToJournal(entries); - } - - public void close() throws Exception { - logger.info("Closing Journal"); - running = false; - - if (commitToDiskThread != null) { - commitToDiskThread.join(); - commitToDisk(); - } - - if (journalFile != null) { - journalFile.close(); - } - } - - public void loadFile(Consumer loadJournalEntry) throws IOException { - if (journalFile != null) { - journalFile.close(); - } - - journalFile = new KeywordLexiconJournalFile(journalFilePath.toFile()); - journalFile.loadFile(loadJournalEntry); - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java deleted file mode 100644 index 8ff12d6d..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalCommitQueue.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.lexicon.journal; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** An in-memory queue for lexicon journal entries used to improve the performance of - * large bursts of insert-operations. 
- */ -class KeywordLexiconJournalCommitQueue { - private final ArrayList commitQueue = new ArrayList<>(10_000); - private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final long BACK_PRESSURE_LIMIT = 25_000; - - public synchronized void enqueue(byte[] word) throws InterruptedException { - for (int queueSize = commitQueue.size(); - queueSize >= BACK_PRESSURE_LIMIT; - queueSize = commitQueue.size()) - { - wait(); - } - - commitQueue.add(word); - } - - - public synchronized List getQueuedEntries() { - List data; - if (commitQueue.isEmpty()) { - return Collections.emptyList(); - } - else { - data = new ArrayList<>(commitQueue); - commitQueue.clear(); - } - - notifyAll(); - - if (data.size() > BACK_PRESSURE_LIMIT) { - logger.warn("Lexicon Journal Backpressure: {}", data.size()); - } - - return data; - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java deleted file mode 100644 index 81789891..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFile.java +++ /dev/null @@ -1,162 +0,0 @@ -package nu.marginalia.lexicon.journal; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.util.List; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.function.Consumer; - -public class KeywordLexiconJournalFile implements AutoCloseable { - private final RandomAccessFile journalFileRAF; - private final File journalFile; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final ReadWriteLock diskLock = new ReentrantReadWriteLock(); - - public KeywordLexiconJournalFile(File journalFile) throws IOException { - this.journalFileRAF = new RandomAccessFile(journalFile, "rw"); - this.journalFile = journalFile; - } - - public void rewind() throws IOException { - journalFileRAF.seek(0); - } - - public void loadFile(Consumer acceptEntry) throws IOException { - if (!journalFile.exists()) { - logger.info("File {} does not exist, can't load", journalFile); - return; - } - - logger.info("Reading {}", journalFile); - - long pos; - if (journalFileRAF.length() < 8) { - pos = 8; - journalFileRAF.writeLong(pos); - } - else { - pos = journalFileRAF.readLong(); - } - - logger.info("Length {} ({})", pos, journalFileRAF.length()); - if (pos == 8) { - logger.info("Empty DB"); - } - - ByteBuffer buffer = ByteBuffer.allocateDirect(8192); - - var channel = journalFileRAF.getChannel(); - - long cp = channel.position(); - try { - buffer.limit(0); - long loaded = 0; - - while (cp < pos || buffer.hasRemaining()) { - if (buffer.limit() - buffer.position() < 4) { - buffer.compact(); - - long rb = channel.read(buffer); - if (rb <= 0) { - break; - } - cp += rb; - buffer.flip(); - } - - int len = buffer.get() & 0xFF; - if (len > Byte.MAX_VALUE) { - logger.warn("Found keyword with impossible length {} near {}, likely corruption", len, cp); - } - while (buffer.limit() - buffer.position() < len) { - buffer.compact(); - int rb = channel.read(buffer); - if (rb <= 0) break; - cp += rb; - buffer.flip(); - } - - if (buffer.limit() < len) { - logger.warn("Partial write at 
end-of-file!"); - - if (cp >= pos) { - logger.info("... but it's ok"); - } - break; - } - - byte[] data = new byte[len]; - buffer.get(data); - if ((++loaded % 10_000_000) == 0L) { - logger.info("Loaded {} million items", loaded/1_000_000); - } - - acceptEntry.accept(data); - } - } - catch (Exception ex) { - logger.error("IO Exception", ex); - } - - journalFileRAF.seek(pos); - } - - private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096); - - public void writeEntriesToJournal(List data) { - if (data.isEmpty()) - return; - - final FileChannel channel = journalFileRAF.getChannel(); - - if (!channel.isOpen()) { - throw new IllegalStateException("commitToDisk() with closed channel! Cannot commit!"); - } - - Lock writeLock = diskLock.writeLock(); - try { - writeLock.lock(); - - long start = System.currentTimeMillis(); - int ct = data.size(); - - for (byte[] itemBytes : data) { - writeBuffer.clear(); - writeBuffer.put((byte) itemBytes.length); - writeBuffer.put(itemBytes); - writeBuffer.flip(); - - while (writeBuffer.position() < writeBuffer.limit()) - channel.write(writeBuffer, channel.size()); - } - - writeBuffer.clear(); - writeBuffer.putLong(channel.size()); - writeBuffer.flip(); - channel.write(writeBuffer, 0); - - channel.force(false); - - logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start); - } - catch (Exception ex) { - logger.error("Error during dictionary commit!!!", ex); - } - finally { - writeLock.unlock(); - } - } - - public void close() throws IOException { - journalFileRAF.close(); - } -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java deleted file mode 100644 index a08d7124..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalFingerprint.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.lexicon.journal; - -/** Contains values used to assess whether the lexicon is in sync with the journal - * or if it has been replaced with a newer version and should be reloaded - * */ -public record KeywordLexiconJournalFingerprint(long createdTime, - long mTime, - long sizeBytes) -{ -} diff --git a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java b/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java deleted file mode 100644 index 6208fc47..00000000 --- a/code/features-index/lexicon/src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournalMode.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.lexicon.journal; - -public enum KeywordLexiconJournalMode { - READ_ONLY, - READ_WRITE -} diff --git a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java b/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java deleted file mode 100644 index 98249c27..00000000 --- a/code/features-index/lexicon/src/test/java/nu/marginalia/lexicon/KeywordLexiconTest.java +++ /dev/null @@ -1,78 +0,0 @@ -package nu.marginalia.lexicon; - -import nu.marginalia.dict.OnHeapDictionaryMap; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeEach; -import 
org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; - -public class KeywordLexiconTest { - - private Path journalFile; - private KeywordLexicon lexicon; - - @BeforeEach - public void setUp() throws IOException { - journalFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); - - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); - lexicon = new KeywordLexicon(lexiconJournal); - } - - @AfterEach - public void tearDown() throws Exception { - lexicon.close(); - Files.delete(journalFile); - } - - @Test - public void testConsistentInserts() { - int a = lexicon.getOrInsert("aaa"); - int b = lexicon.getOrInsert("bbb"); - int a2 = lexicon.getOrInsert("aaa"); - int c = lexicon.getOrInsert("ccc"); - - assertEquals(a, a2); - assertNotEquals(a, b); - assertNotEquals(a, c); - assertNotEquals(b, c); - } - - @Test - public void testInsertReplay() { - int a = lexicon.getOrInsert("aaa"); - int b = lexicon.getOrInsert("bbb"); - int c = lexicon.getOrInsert("ccc"); - - assertEquals(a, lexicon.getReadOnly("aaa")); - assertEquals(b, lexicon.getReadOnly("bbb")); - assertEquals(c, lexicon.getReadOnly("ccc")); - } - - @Test - public void testReload() throws IOException { - int a = lexicon.getOrInsert("aaa"); - int b = lexicon.getOrInsert("bbb"); - int c = lexicon.getOrInsert("ccc"); - lexicon.commitToDisk(); - - var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE); - try (var anotherLexicon = new KeywordLexicon(lexiconJournal)) { - assertEquals(a, anotherLexicon.getReadOnly("aaa")); - assertEquals(b, anotherLexicon.getReadOnly("bbb")); - assertEquals(c, anotherLexicon.getReadOnly("ccc")); - } - catch (Exception ex) { - Assertions.fail("???", ex); - } - } -} diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java deleted file mode 100644 index 27be5967..00000000 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsFromUrlId.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.browse; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeIdCollection; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.*; - -@Singleton -public class DbBrowseDomainsFromUrlId { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final HikariDataSource dataSource; - - @Inject - public DbBrowseDomainsFromUrlId(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - private String idList(EdgeIdCollection ids) { - StringJoiner j = new StringJoiner(",", "(", ")"); - for (var id : ids.values()) { - j.add(Integer.toString(id)); - } - return j.toString(); - } - - public List getBrowseResultFromUrlIds(EdgeIdCollection urlIds) { - if (urlIds.isEmpty()) - return Collections.emptyList(); - - List ret = new ArrayList<>(urlIds.size()); - - try (var conn = dataSource.getConnection()) { - try (var stmt = 
conn.createStatement()) { - - String inStmt = idList(urlIds); - - var rsp = stmt.executeQuery(""" - SELECT DOMAIN_ID, DOMAIN_NAME - FROM EC_URL_VIEW - INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID - WHERE - KNOWN_URLS<5000 - AND QUALITY>-10 - AND EC_URL_VIEW.ID IN - """ + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); - } - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - - return ret; - } - - -} diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java index 6928e329..36651cd0 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -6,7 +6,6 @@ import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklist; -import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,7 +23,7 @@ public class DbBrowseDomainsSimilarCosine { this.dataSource = dataSource; } - public List getDomainNeighborsAdjacentCosine(EdgeId domainId, DomainBlacklist blacklist, int count) { + public List getDomainNeighborsAdjacentCosine(int domainId, DomainBlacklist blacklist, int count) { List domains = new ArrayList<>(count); String q = """ @@ -43,7 +42,7 @@ public class DbBrowseDomainsSimilarCosine { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement(q)) { stmt.setFetchSize(count); - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); stmt.setInt(2, count); var rsp = stmt.executeQuery(); while (rsp.next() && domains.size() < count) { diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java index a9fb6e54..923cc4fe 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/DbBrowseDomainsSimilarOldAlgo.java @@ -5,10 +5,7 @@ import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.db.DomainBlacklist; -import nu.marginalia.model.id.EdgeId; -import nu.marginalia.model.id.EdgeIdCollection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,7 +23,7 @@ public class DbBrowseDomainsSimilarOldAlgo { this.dataSource = dataSource; } - public List getDomainNeighborsAdjacent(EdgeId domainId, DomainBlacklist blacklist, int count) { + public List getDomainNeighborsAdjacent(int domainId, DomainBlacklist blacklist, int count) { final Set domains = new HashSet<>(count*3); final String q = """ @@ -49,7 +46,7 @@ public class DbBrowseDomainsSimilarOldAlgo { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement(q)) { 
stmt.setFetchSize(count); - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); stmt.setInt(2, count); var rsp = stmt.executeQuery(); while (rsp.next()) { @@ -78,7 +75,7 @@ public class DbBrowseDomainsSimilarOldAlgo { try (var stmt = connection.prepareStatement(q2)) { stmt.setFetchSize(count/2); - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); stmt.setInt(2, count/2 - domains.size()); var rsp = stmt.executeQuery(); while (rsp.next() && domains.size() < count/2) { @@ -109,7 +106,7 @@ public class DbBrowseDomainsSimilarOldAlgo { LIMIT ?"""; try (var stmt = connection.prepareStatement(q3)) { stmt.setFetchSize(count/2); - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); stmt.setInt(2, count/2 - domains.size()); var rsp = stmt.executeQuery(); @@ -165,49 +162,4 @@ public class DbBrowseDomainsSimilarOldAlgo { return domains; } - - private String idList(EdgeIdCollection ids) { - StringJoiner j = new StringJoiner(",", "(", ")"); - for (var id : ids.values()) { - j.add(Integer.toString(id)); - } - return j.toString(); - } - - public List getBrowseResultFromUrlIds(EdgeIdCollection urlIds) { - if (urlIds.isEmpty()) - return Collections.emptyList(); - - List ret = new ArrayList<>(urlIds.size()); - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - - String inStmt = idList(urlIds); - - var rsp = stmt.executeQuery(""" - SELECT DOMAIN_ID, DOMAIN_NAME - FROM EC_URL_VIEW - INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID - WHERE - KNOWN_URLS<5000 - AND QUALITY>-10 - AND EC_URL_VIEW.ID IN - """ + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers - while (rsp.next()) { - int id = rsp.getInt(1); - String domain = rsp.getString(2); - - ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0)); - } - } - } - catch (SQLException ex) { - logger.error("SQL error", ex); - } - - return ret; - } - - } diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 4602044b..b1c9462e 100644 --- a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -9,8 +9,8 @@ import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.ranking.factors.*; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.util.ArrayList; import java.util.List; diff --git a/code/features-search/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java b/code/features-search/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java index 6a4ca551..7b6b53b2 100644 --- a/code/features-search/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java +++ b/code/features-search/screenshots/src/main/java/nu/marginalia/screenshot/ScreenshotService.java @@ -4,9 +4,7 @@ import com.google.common.base.Strings; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.model.id.EdgeId; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,7 +28,7 @@ public 
class ScreenshotService { this.dataSource = dataSource; } - public boolean hasScreenshot(EdgeId domainId) { + public boolean hasScreenshot(int domainId) { try (var conn = dataSource.getConnection(); var ps = conn.prepareStatement(""" SELECT TRUE @@ -38,7 +36,7 @@ public class ScreenshotService { INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME WHERE EC_DOMAIN.ID=? """)) { - ps.setInt(1, domainId.id()); + ps.setInt(1, domainId); var rs = ps.executeQuery(); if (rs.next()) { return rs.getBoolean(1); @@ -86,7 +84,7 @@ public class ScreenshotService { private Object serveSvgPlaceholder(Response response, int id) { - var name = domainQueries.getDomain(new EdgeId<>(id)).map(Object::toString) + var name = domainQueries.getDomain(id).map(Object::toString) .orElse("[Screenshot Not Yet Captured]"); response.type("image/svg+xml"); diff --git a/code/libraries/array/src/main/java/nu/marginalia/array/algo/TwoArrayOperations.java b/code/libraries/array/src/main/java/nu/marginalia/array/algo/TwoArrayOperations.java index 9ddb20e9..94827075 100644 --- a/code/libraries/array/src/main/java/nu/marginalia/array/algo/TwoArrayOperations.java +++ b/code/libraries/array/src/main/java/nu/marginalia/array/algo/TwoArrayOperations.java @@ -369,7 +369,8 @@ public class TwoArrayOperations { } while (aPos < aEnd) { - long val = a.get(aPos+=stepSize); + long val = a.get(aPos); + aPos+=stepSize; if (distinct == 0 || val != lastValue) { distinct++; } @@ -377,7 +378,8 @@ public class TwoArrayOperations { } while (bPos < bEnd) { - long val = b.get(bPos+=stepSize); + long val = b.get(bPos); + bPos+=stepSize; if (distinct == 0 || val != lastValue) { distinct++; } diff --git a/code/libraries/array/src/test/java/nu/marginalia/array/algo/TwoArrayOperationsTest.java b/code/libraries/array/src/test/java/nu/marginalia/array/algo/TwoArrayOperationsTest.java index 8a32df1b..e7af4fbf 100644 --- a/code/libraries/array/src/test/java/nu/marginalia/array/algo/TwoArrayOperationsTest.java +++ b/code/libraries/array/src/test/java/nu/marginalia/array/algo/TwoArrayOperationsTest.java @@ -5,6 +5,7 @@ import nu.marginalia.array.LongArray; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.util.Arrays; import java.util.HashMap; import java.util.stream.Collectors; import java.util.stream.LongStream; @@ -118,4 +119,31 @@ class TwoArrayOperationsTest { assertEquals(distinctSize, mergedSize); } + + @Test + public void mergeArrays2() { + LongArray left = LongArray.allocate(4); + LongArray right = LongArray.allocate(2); + LongArray out = LongArray.allocate(4); + left.set(0, 40, 3, 41, 4); + right.set(0, 40, 5); + + System.out.println(Arrays.toString(longArrayToJavaArray(left))); + System.out.println(Arrays.toString(longArrayToJavaArray(right))); + System.out.println(Arrays.toString(longArrayToJavaArray(out))); + long numDistinct = TwoArrayOperations.countDistinctElementsN(2, left, right, 0, 4, 0, 2); + System.out.println(numDistinct); + System.out.println(numDistinct); + + TwoArrayOperations.mergeArrays2(out, left, right, 0, 4, 0, 4, 0, 2); + + System.out.println(Arrays.toString(longArrayToJavaArray(out))); + + } + + long[] longArrayToJavaArray(LongArray longArray) { + long[] vals = new long[(int) longArray.size()]; + longArray.get(0, vals); + return vals; + } } \ No newline at end of file diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java 
diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 7b2af91c..88a73148 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -16,7 +16,7 @@ import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; +import com.google.inject.Inject; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; diff --git a/code/libraries/message-queue/src/main/java/nu/marginalia/actor/ActorStateMachine.java b/code/libraries/message-queue/src/main/java/nu/marginalia/actor/ActorStateMachine.java index 657fb6e2..88e57028 100644 --- a/code/libraries/message-queue/src/main/java/nu/marginalia/actor/ActorStateMachine.java +++ b/code/libraries/message-queue/src/main/java/nu/marginalia/actor/ActorStateMachine.java @@ -279,7 +279,7 @@ public class ActorStateMachine { } if (!state.isFinal()) { - logger.info("Transitining from state {}", state.name()); + logger.info("Transitioning from state {}", state.name()); var transition = state.next(msg.payload()); if (!expectedMessage.isExpected(msg)) { diff --git a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java index bc664d38..e682de17 100644 --- a/code/libraries/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java +++ b/code/libraries/message-queue/src/main/java/nu/marginalia/mq/MessageQueueFactory.java @@ -1,5 +1,7 @@ package nu.marginalia.mq; +import com.google.inject.Inject; +import com.google.inject.Singleton; import nu.marginalia.mq.inbox.MqAsynchronousInbox; import nu.marginalia.mq.inbox.MqInboxIf; import nu.marginalia.mq.inbox.MqSingleShotInbox; @@ -7,8 +9,6 @@ import nu.marginalia.mq.inbox.MqSynchronousInbox; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.persistence.MqPersistence; -import javax.inject.Inject; -import javax.inject.Singleton; import java.util.UUID; @Singleton diff --git a/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java b/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java index 7778aa97..cc2e2fae 100644 --- a/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java +++ b/code/libraries/term-frequency-dict/src/main/java/nu/marginalia/term_frequency_dict/TermFrequencyDict.java @@ -10,8 +10,8 @@ import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java index 7527229c..23584925 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java @@ -5,7 +5,6 @@ import
nu.marginalia.converting.instruction.instructions.*; public enum InstructionTag { DOMAIN(LoadDomain.class), - URL(LoadUrl.class), LINK(LoadDomainLink.class), REDIRECT(LoadDomainRedirect.class), WORDS(LoadKeywords.class), diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index a7089b9f..624081c9 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -10,7 +10,6 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; public interface Interpreter { - default void loadUrl(EdgeUrl[] url) {} default void loadDomain(EdgeDomain[] domain) {} default void loadRssFeed(EdgeUrl[] rssFeed) {} default void loadDomainLink(DomainLink[] links) {} @@ -19,7 +18,7 @@ public interface Interpreter { default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {} + default void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) {} default void loadDomainRedirect(DomainLink link) {} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java index 779ec79c..96c78611 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java @@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.InstructionTag; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.model.EdgeUrl; -public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { +public record LoadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadKeywords(url, features, metadata, words); + interpreter.loadKeywords(url, ordinal, features, metadata, words); } @Override
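The ordinal being threaded through the instruction records above is the document's position within its domain, and it is the key to this patch's id scheme: later in the diff the loader computes a document id as UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal), and the index constructor folds a domain ranking into spare high bits with UrlIdCodec.addRank(rank, docId). UrlIdCodec itself is not part of this diff, so the following is only a sketch of a packing consistent with those calls; the field widths and positions are guesses:

    // Hypothetical sketch of the combined-id codec. Only the method names
    // and signatures appear in the patch; the layout below is assumed.
    final class UrlIdCodecSketch {
        private static final int ORDINAL_BITS = 26;  // assumed ordinal width
        private static final int RANK_SHIFT = 57;    // assumed rank position

        // domain id and per-domain ordinal packed into one positive long
        static long encodeId(int domainId, int ordinal) {
            return ((long) domainId << ORDINAL_BITS) | ordinal;
        }

        // fold a [0,1] ranking into the high bits, preserving the id below
        static long addRank(float rank, long docId) {
            long rankBits = (long) (Math.max(0f, Math.min(1f, rank)) * 63);
            return (rankBits << RANK_SHIFT) | docId;
        }
    }

The practical consequence, visible throughout the loader changes further down, is that ids become pure arithmetic: no table of url ids has to be loaded from the database or kept in sync.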
diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java index 11885f18..2a43494c 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java @@ -9,7 +9,7 @@ import org.jetbrains.annotations.Nullable; public record LoadProcessedDocument(EdgeUrl url, - UrlIndexingState state, + int ordinal, UrlIndexingState state, String title, String description, int htmlFeatures, @@ -17,7 +17,8 @@ public record LoadProcessedDocument(EdgeUrl url, int length, long hash, double quality, - @Nullable Integer pubYear) implements Instruction + @Nullable Integer pubYear +) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java index 28e42f5d..a1a42a90 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java @@ -9,7 +9,8 @@ import nu.marginalia.model.EdgeUrl; public record LoadProcessedDocumentWithError(EdgeUrl url, UrlIndexingState state, - String reason) implements Instruction + String reason, + int ordinal) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java deleted file mode 100644 index d126a515..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; - -import java.util.Arrays; - -public record LoadUrl(EdgeUrl...
url) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadUrl(url); - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ Arrays.toString(url)+"]"; - } - - @Override - public InstructionTag tag() { - return InstructionTag.URL; - } - - @Override - public boolean isNoOp() { - return url.length == 0; - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 10c11e21..865e6d6b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -2,15 +2,8 @@ package nu.marginalia.converting; import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdOutputStream; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import java.io.BufferedOutputStream; import java.io.IOException; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 0dfd816c..9a5a78af 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -14,6 +14,7 @@ import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mqapi.converting.ConvertAction; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlan; @@ -27,9 +28,6 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.Optional; import java.util.UUID; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.Semaphore; -import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -69,7 +67,7 @@ public class ConverterMain { DomainProcessor processor, InstructionsCompiler compiler, Gson gson, - ProcessHeartbeat heartbeat, + ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, SideloadSourceFactory sideloadSourceFactory diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index c3683cd0..e3b68629 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -7,9 +7,7 @@ import 
nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; @@ -130,7 +128,7 @@ public class InstructionWriterFactory { } @Override - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { keywords++; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java deleted file mode 100644 index a59c7426..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/UpdateDomainStatistics.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.converting; - -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.map.hash.TIntIntHashMap; -import nu.marginalia.service.module.DatabaseModule; - -import java.sql.SQLException; - -public class UpdateDomainStatistics { - private final HikariDataSource dataSource; - - public UpdateDomainStatistics(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - - public static void main(String... args) throws SQLException { - new UpdateDomainStatistics(new DatabaseModule().provideConnection()).run(); - } - - public void run() throws SQLException { - - // This looks weird, but it's actually much faster than doing the computations with SQL queries - // - // ... 
in part because we can assume the data is immutable and don't mind consuming egregious - // resources - - try (var conn = dataSource.getConnection(); - var stmt = conn.createStatement(); - var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL"); - var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?)") - ) { - - stmt.executeUpdate("DELETE FROM DOMAIN_METADATA"); - - TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); - - domainInfoQuery.setFetchSize(10_000); - var rsp = domainInfoQuery.executeQuery(); - while (rsp.next()) { - int domainId = rsp.getInt(1); - boolean visited = rsp.getBoolean(2); - boolean stateOk = rsp.getBoolean(3); - - knownUrls.adjustOrPutValue(domainId, 1, 1); - if (visited) { - visitedUrls.adjustOrPutValue(domainId, 1, 1); - if (stateOk) { - goodUrls.adjustOrPutValue(domainId, 1, 1); - } - } - } - - int i = 0; - for (int domainId : knownUrls.keys()) { - insertDomainInfo.setInt(1, domainId); - insertDomainInfo.setInt(2, knownUrls.get(domainId)); - insertDomainInfo.setInt(3, visitedUrls.get(domainId)); - insertDomainInfo.setInt(4, goodUrls.get(domainId)); - insertDomainInfo.addBatch(); - if ((++i % 1000) == 0) { - insertDomainInfo.executeBatch(); - } - } - if ((i % 1000) != 0) { - insertDomainInfo.executeBatch(); - } - } - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 21a610fb..b3cb2a9f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.LoadKeywords; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.crawl.HtmlFeature; @@ -11,32 +12,43 @@ import java.util.function.Consumer; public class DocumentsCompiler { - public void compile(Consumer instructionConsumer, List documents) { - - for (var doc : documents) { - compileDocumentDetails(instructionConsumer, doc); - } - - for (var doc : documents) { - compileWords(instructionConsumer, doc); - } - - } - - public void compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { + public void compileDocumentDetails(Consumer instructionConsumer, + ProcessedDocument doc, + int ordinal) { var details = doc.details; if (details != null) { - instructionConsumer.accept(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); + instructionConsumer.accept(new LoadProcessedDocument(doc.url, + ordinal, + doc.state, + details.title, + details.description, + HtmlFeature.encode(details.features), + details.standard.name(), + details.length, + details.hashCode, + 
details.quality, + details.pubYear + )); + } + else { + instructionConsumer.accept(new LoadProcessedDocumentWithError( + doc.url, + doc.state, + doc.stateReason, + ordinal + )); } } public void compileWords(Consumer<Instruction> instructionConsumer, - ProcessedDocument doc) { + ProcessedDocument doc, + int ordinal) { var words = doc.words; if (words != null) { instructionConsumer.accept(new LoadKeywords(doc.url, + ordinal, HtmlFeature.encode(doc.details.features), doc.details.metadata, words.build()) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index 87f28e3c..65d2e989 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -6,7 +6,6 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; -import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,7 +16,6 @@ import java.util.function.Consumer; import static java.util.Objects.requireNonNullElse; public class InstructionsCompiler { - private final UrlsCompiler urlsCompiler; private final DocumentsCompiler documentsCompiler; private final DomainMetadataCompiler domainMetadataCompiler; private final FeedsCompiler feedsCompiler; @@ -27,14 +25,12 @@ public class InstructionsCompiler { private final Logger logger = LoggerFactory.getLogger(InstructionsCompiler.class); @Inject - public InstructionsCompiler(UrlsCompiler urlsCompiler, - DocumentsCompiler documentsCompiler, + public InstructionsCompiler(DocumentsCompiler documentsCompiler, DomainMetadataCompiler domainMetadataCompiler, FeedsCompiler feedsCompiler, LinksCompiler linksCompiler, RedirectCompiler redirectCompiler) { - this.urlsCompiler = urlsCompiler; this.documentsCompiler = documentsCompiler; this.domainMetadataCompiler = domainMetadataCompiler; this.feedsCompiler = feedsCompiler; @@ -47,8 +43,13 @@ instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { - urlsCompiler.compile(instructionConsumer, domain.documents); - documentsCompiler.compile(instructionConsumer, domain.documents); + + int ordinal = 0; + for (var doc : domain.documents) { + documentsCompiler.compileDocumentDetails(instructionConsumer, doc, ordinal); + documentsCompiler.compileWords(instructionConsumer, doc, ordinal); + ordinal++; + } feedsCompiler.compile(instructionConsumer, domain.documents); linksCompiler.compile(instructionConsumer, domain.domain, domain.documents); @@ -63,7 +64,6 @@ public void compileStreaming(SideloadSource sideloadSource, Consumer<Instruction> instructionConsumer) { ProcessedDomain domain = sideloadSource.getDomain(); - Iterator<EdgeUrl> urlsIterator = sideloadSource.getUrlsIterator(); Iterator<ProcessedDocument> documentsIterator = sideloadSource.getDocumentsStream(); // Guaranteed to always be first @@ -72,11 +72,6 @@ int countAll = 0; int countGood = 0; - logger.info("Writing domains"); - urlsCompiler.compileJustDomain(instructionConsumer, domain.domain); - logger.info("Writing
urls"); - urlsCompiler.compileJustUrls(instructionConsumer, urlsIterator); - logger.info("Writing docs"); while (documentsIterator.hasNext()) { @@ -84,8 +79,8 @@ public class InstructionsCompiler { countAll++; if (doc.isOk()) countGood++; - documentsCompiler.compileDocumentDetails(instructionConsumer, doc); - documentsCompiler.compileWords(instructionConsumer, doc); + documentsCompiler.compileDocumentDetails(instructionConsumer, doc, countAll); + documentsCompiler.compileWords(instructionConsumer, doc, countAll); } domainMetadataCompiler.compileFake(instructionConsumer, domain.domain, countAll, countGood); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java index e100cb86..e84a7c54 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java @@ -2,26 +2,34 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadDomain; import nu.marginalia.converting.instruction.instructions.LoadDomainLink; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeDomain; +import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.function.Consumer; public class LinksCompiler { - public void compile(Consumer instructionConsumer, EdgeDomain from, List documents) { + public void compile(Consumer instructionConsumer, + EdgeDomain from, + List documents) { - DomainLink[] links = documents.stream().map(doc -> doc.details) + EdgeDomain[] domains = documents.stream() + .map(doc -> doc.details) .filter(Objects::nonNull) - .flatMap(dets -> dets.linksExternal.stream()) + .flatMap(details -> details.linksExternal.stream()) .map(link -> link.domain) .distinct() - .map(domain -> new DomainLink(from, domain)) - .toArray(DomainLink[]::new); + .toArray(EdgeDomain[]::new); + DomainLink[] links = new DomainLink[domains.length]; + Arrays.setAll(links, i -> new DomainLink(from, domains[i])); + + instructionConsumer.accept(new LoadDomain(domains)); instructionConsumer.accept(new LoadDomainLink(links)); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java deleted file mode 100644 index ee4f3cbe..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadDomain; -import nu.marginalia.converting.instruction.instructions.LoadUrl; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.function.Consumer; - -public class UrlsCompiler { - - private static final int MAX_INTERNAL_LINKS = 25; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public void compile(Consumer 
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java deleted file mode 100644 index ee4f3cbe..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadDomain; -import nu.marginalia.converting.instruction.instructions.LoadUrl; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.function.Consumer; - -public class UrlsCompiler { - - private static final int MAX_INTERNAL_LINKS = 25; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public void compile(Consumer<Instruction> instructionConsumer, List<ProcessedDocument> documents) { - Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4); - Set<EdgeDomain> seenDomains = new HashSet<>(documents.size()); - - for (var doc : documents) { - if (doc.url == null) { - logger.warn("Discovered document with null URL"); - continue; - } - - seenUrls.add(doc.url); - - if (doc.details == null) { - continue; - } - - // Add *some* external links; to avoid loading too many and gunking up the database with nonsense, - // only permit this once per external domain per crawled domain - for (var url : doc.details.linksExternal) { - if (seenDomains.add(url.domain)) { - seenUrls.add(url); - } - } - - if (doc.isOk()) { - // Don't load more than a few from linksInternal, grows too big for no reason - var linksToAdd = new ArrayList<>(doc.details.linksInternal); - if (linksToAdd.size() > MAX_INTERNAL_LINKS) { - linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); - } - seenUrls.addAll(linksToAdd); - } - } - - instructionConsumer.accept(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); - instructionConsumer.accept(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); - } - - public void compileJustUrls(Consumer<Instruction> instructionConsumer, Iterator<EdgeUrl> urlsIterator) { - var urls = new ArrayList<EdgeUrl>(1000); - - while (urlsIterator.hasNext()) { - if (urls.size() >= 1000) { - instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); - urls.clear(); - } - - urls.add(urlsIterator.next()); - } - if (!urls.isEmpty()) { - instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); - } - } - - public void compileJustDomain(Consumer<Instruction> instructionConsumer, EdgeDomain domain) { - instructionConsumer.accept(new LoadDomain(domain)); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java index be857061..524bfa1f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java @@ -8,8 +8,8 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.util.Optional; import java.util.Set; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java index 6e8e3dbc..d2ab8751 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.processor; import org.jsoup.nodes.Document; -import javax.inject.Singleton; +import com.google.inject.Singleton; @Singleton public class MetaRobotsTag { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java index 97a2f7b6..6db64321 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java +++
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/SiteWords.java @@ -9,7 +9,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor; import nu.marginalia.converting.processor.logic.links.TopKeywords; -import javax.inject.Singleton; +import com.google.inject.Singleton; import java.util.HashMap; import java.util.HashSet; import java.util.Map; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java index ae07b6c3..5a5a6855 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java @@ -64,25 +64,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC return ret; } - @Override - @SneakyThrows - public Iterator getUrlsIterator() { - EdgeUrl base = new EdgeUrl("https://encyclopedia.marginalia.nu/"); - - return new SqlQueryIterator<>(connection.prepareStatement(""" - SELECT url, html FROM articles - """)) - { - @Override - public EdgeUrl convert(ResultSet rs) throws Exception { - var path = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - - return base.withPathAndParam("/article/"+path, null); - } - }; - } - - @SneakyThrows @Override public Iterator getDocumentsStream() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java index d23a81ae..3d07d56a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java @@ -8,7 +8,6 @@ import java.util.Iterator; public interface SideloadSource { ProcessedDomain getDomain(); - Iterator getUrlsIterator(); Iterator getDocumentsStream(); String getId(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java index 97a37ac9..0f40639b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -22,7 +22,6 @@ import java.nio.file.Path; import java.util.EnumSet; import java.util.Iterator; import java.util.List; -import java.util.Optional; /** This code is broken */ @Deprecated() @@ -55,17 +54,6 @@ public class StackexchangeSideloader implements SideloadSource { return ret; } - @SneakyThrows - @Override - public Iterator getUrlsIterator() { - var ids = reader.getIds(); - return ids.stream() - .map(id -> EdgeUrl.parse("https://" + domainName + "/questions/" + id)) - .filter(Optional::isPresent) - .map(Optional::get) - .iterator(); - } - @Override public Iterator getDocumentsStream() { try { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 6f659bad..4141e263 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -14,7 +14,7 @@ import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; -import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import plan.CrawlPlan; @@ -42,7 +42,7 @@ public class CrawlerMain { private Path crawlDataDir; - private final ProcessHeartbeat heartbeat; + private final ProcessHeartbeatImpl heartbeat; private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, @@ -65,7 +65,7 @@ public class CrawlerMain { @Inject public CrawlerMain(UserAgent userAgent, - ProcessHeartbeat heartbeat, + ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, Gson gson) { diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle new file mode 100644 index 00000000..b96b245f --- /dev/null +++ b/code/processes/index-constructor-process/build.gradle @@ -0,0 +1,57 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} + +application { + mainClass = 'nu.marginalia.index.IndexConstructorMain' + applicationName = 'index-construction-process' +} + +tasks.distZip.enabled = false + +dependencies { + implementation project(':code:api:process-mqapi') + implementation project(':code:common:process') + implementation project(':code:common:service') + implementation project(':code:common:db') + implementation project(':code:common:model') + implementation project(':code:libraries:message-queue') + + implementation project(':code:features-index:index-forward') + implementation project(':code:features-index:index-reverse') + implementation project(':code:features-index:index-journal') + implementation project(':code:features-index:domain-ranking') + + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + implementation libs.guice + implementation libs.bundles.mariadb + implementation libs.bundles.gson + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation project(':code:processes:test-data') +} + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags "slow" + } +} diff --git a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java new file mode 100644 index 00000000..8e7be9d9 --- /dev/null +++ b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java @@ -0,0 +1,203 @@ +package nu.marginalia.index; + +import com.google.gson.Gson; +import com.google.inject.Guice; +import 
com.google.inject.Inject; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.index.construction.ReverseIndexConstructor; +import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.mq.MessageQueueFactory; +import nu.marginalia.mq.MqMessage; +import nu.marginalia.mq.inbox.MqInboxResponse; +import nu.marginalia.mq.inbox.MqSingleShotInbox; +import nu.marginalia.mqapi.index.CreateIndexRequest; +import nu.marginalia.mqapi.index.IndexName; +import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.ranking.DomainRankings; +import nu.marginalia.service.module.DatabaseModule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX; + +public class IndexConstructorMain { + private final FileStorageService fileStorageService; + private final ProcessHeartbeatImpl heartbeat; + private final MessageQueueFactory messageQueueFactory; + private final DomainRankings domainRankings; + private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class); + private final Gson gson = GsonFactory.get(); + public static void main(String[] args) throws Exception { + new org.mariadb.jdbc.Driver(); + + var main = Guice.createInjector( + new IndexConstructorModule(), + new DatabaseModule()) + .getInstance(IndexConstructorMain.class); + + var instructions = main.fetchInstructions(); + + try { + main.run(instructions); + instructions.ok(); + } + catch (Exception ex) { + logger.error("Constructor failed", ex); + instructions.err(); + } + + TimeUnit.SECONDS.sleep(5); + + System.exit(0); + } + + @Inject + public IndexConstructorMain(FileStorageService fileStorageService, + ProcessHeartbeatImpl heartbeat, + MessageQueueFactory messageQueueFactory, + DomainRankings domainRankings) { + + this.fileStorageService = fileStorageService; + this.heartbeat = heartbeat; + this.messageQueueFactory = messageQueueFactory; + this.domainRankings = domainRankings; + } + + private void run(CreateIndexInstructions instructions) throws SQLException, IOException { + heartbeat.start(); + + switch (instructions.name) { + case FORWARD -> createForwardIndex(); + case REVERSE_FULL -> createFullReverseIndex(); + case REVERSE_PRIO -> createPrioReverseIndex(); + } + + heartbeat.shutDown(); + } + + private void createFullReverseIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT); + + Path tmpDir = indexStaging.asPath().resolve("tmp"); + if 
(!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + + ReverseIndexConstructor. + createReverseIndex( + heartbeat, + IndexJournalReader::singleFile, + indexStaging.asPath(), + this::addRank, + tmpDir, + outputFileDocs, + outputFileWords); + } + + private void createPrioReverseIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT); + + Path tmpDir = indexStaging.asPath().resolve("tmp"); + if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + ReverseIndexConstructor. + createReverseIndex(heartbeat, + IndexJournalReader::singleFileWithPriorityFilters, + indexStaging.asPath(), this::addRank, tmpDir, outputFileDocs, outputFileWords); + } + + private void createForwardIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); + + ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat, + IndexJournalReader.paging(indexStaging.asPath()), + outputFileDocsId, + outputFileDocsData, + domainRankings + ); + + converter.convert(); + } + + private long addRank(long docId) { + float rank = domainRankings.getSortRanking(docId); + return UrlIdCodec.addRank(rank, docId); + } + + private class CreateIndexInstructions { + + public final IndexName name; + private final MqSingleShotInbox inbox; + private final MqMessage message; + + private CreateIndexInstructions(IndexName name, MqSingleShotInbox inbox, MqMessage message) { + this.name = name; + this.inbox = inbox; + this.message = message; + } + + public void ok() { + inbox.sendResponse(message, MqInboxResponse.ok()); + } + public void err() { + inbox.sendResponse(message, MqInboxResponse.err()); + } + } + + private CreateIndexInstructions fetchInstructions() throws Exception { + + var inbox = messageQueueFactory.createSingleShotInbox(INDEX_CONSTRUCTOR_INBOX, UUID.randomUUID()); + + logger.info("Waiting for instructions"); + var msgOpt = getMessage(inbox, CreateIndexRequest.class.getSimpleName()); + var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); + + var payload = gson.fromJson(msg.payload(), CreateIndexRequest.class); + var name = payload.indexName(); + + return new CreateIndexInstructions(name, inbox, msg); + } + + private Optional getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException { + var opt = inbox.waitForMessage(30, TimeUnit.SECONDS); + if (opt.isPresent()) { + if (!opt.get().function().equals(expectedFunction)) { + throw new RuntimeException("Unexpected function: " + opt.get().function()); + } + return opt; + } + else { + 
var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction)); + stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage)); + return stolenMessage; + } + } +}
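The wait-then-steal sequence that closes IndexConstructorMain above is a small but important pattern for single-shot message-queue processes: getMessage first blocks up to 30 seconds for a fresh instruction, and if none arrives it falls back to stealMessage, which by the look of it adopts a matching message that an earlier instance of the process claimed but never answered, for instance because it crashed mid-build. Extracted into standalone form (method names from the patch; the exact stealing semantics are an inference):

    // Sketch of the fetch-or-adopt pattern used by fetchInstructions() above.
    Optional<MqMessage> fetch(MqSingleShotInbox inbox, String expectedFunction)
            throws SQLException, InterruptedException {
        var fresh = inbox.waitForMessage(30, TimeUnit.SECONDS);
        if (fresh.isPresent()) {
            if (!fresh.get().function().equals(expectedFunction))
                throw new RuntimeException("Unexpected function: " + fresh.get().function());
            return fresh;
        }
        // No new message: adopt one that matches but was never responded to.
        return inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
    }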
diff --git a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorModule.java b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorModule.java new file mode 100644 index 00000000..c4847e71 --- /dev/null +++ b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorModule.java @@ -0,0 +1,14 @@ +package nu.marginalia.index; + +import com.google.inject.AbstractModule; +import nu.marginalia.ProcessConfiguration; + +import java.util.UUID; + +public class IndexConstructorModule extends AbstractModule { + @Override + public void configure() { + bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("index-constructor", 0, UUID.randomUUID())); + + } +} diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index eb705bbf..fd23a7d0 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -27,7 +27,7 @@ dependencies { implementation project(':code:common:service') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') - implementation project(':code:features-index:lexicon') + implementation project(':code:common:linkdb') implementation project(':code:features-index:index-journal') implementation project(':code:libraries:message-queue') implementation project(':code:libraries:language-processing') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 99266e13..97b2bf0d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -1,15 +1,14 @@ package nu.marginalia.loading; -import com.google.common.collect.Sets; import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import lombok.SneakyThrows; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.linkdb.LinkdbWriter; import nu.marginalia.loading.loader.IndexLoadKeywords; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.DocumentMetadata; @@ -17,7 +16,7 @@ import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; -import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import plan.CrawlPlan; import nu.marginalia.loading.loader.LoaderFactory; @@ -27,9 +26,7 @@ import org.slf4j.LoggerFactory; import java.nio.file.Path; import java.sql.SQLException; -import java.util.HashSet; import java.util.Optional; -import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; @@ -41,10 +38,11 @@ public class LoaderMain { private final ConvertedDomainReader instructionsReader; private final LoaderFactory loaderFactory; - private final ProcessHeartbeat heartbeat; + private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final IndexLoadKeywords indexLoadKeywords; + private final LinkdbWriter writer; private final Gson gson; public static void main(String... args) throws Exception { @@ -69,10 +67,11 @@ public class LoaderMain { @Inject public LoaderMain(ConvertedDomainReader instructionsReader, LoaderFactory loaderFactory, - ProcessHeartbeat heartbeat, + ProcessHeartbeatImpl heartbeat, MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, IndexLoadKeywords indexLoadKeywords, + LinkdbWriter writer, Gson gson ) { @@ -82,6 +81,7 @@ public class LoaderMain { this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.indexLoadKeywords = indexLoadKeywords; + this.writer = writer; this.gson = gson; heartbeat.start(); @@ -136,6 +136,7 @@ public class LoaderMain { // This needs to be done in order to have a readable index journal indexLoadKeywords.close(); + writer.close(); logger.info("Loading finished"); } catch (Exception ex) { @@ -215,7 +216,7 @@ public class LoaderMain { public class InstructionCounter implements Interpreter { private int count = 0; - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { count++; } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index a2df0ea9..920abf19 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -2,21 +2,29 @@ package nu.marginalia.loading; import com.google.gson.Gson; import com.google.inject.AbstractModule; +import com.google.inject.Inject; +import com.google.inject.Provides; +import com.google.inject.Singleton; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; -import plan.CrawlPlan; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.linkdb.LinkdbStatusWriter; +import nu.marginalia.linkdb.LinkdbWriter; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; import java.util.UUID; public class LoaderModule extends AbstractModule { - public LoaderModule() { } @@ -25,11 +33,32 @@ public class LoaderModule { bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); bind(Gson.class).toProvider(this::createGson); - bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } + @Inject @Provides @Singleton + private LinkdbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException { + var storage =
service.getStorageByType(FileStorageType.LINKDB_STAGING); + Path dbPath = storage.asPath().resolve("links.db"); + + if (Files.exists(dbPath)) { + Files.delete(dbPath); + } + return new LinkdbWriter(dbPath); + } + + @Inject @Provides @Singleton + private LinkdbStatusWriter createLinkdbStatusWriter(FileStorageService service) throws SQLException, IOException { + var storage = service.getStorageByType(FileStorageType.LINKDB_STAGING); + Path dbPath = storage.asPath().resolve("urlstatus.db"); + + if (Files.exists(dbPath)) { + Files.delete(dbPath); + } + return new LinkdbStatusWriter(dbPath); + } + private Gson createGson() { return GsonFactory.get(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index ae914d3d..2e24b843 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -1,74 +1,47 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; -import lombok.SneakyThrows; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeUnit; - -public class IndexLoadKeywords implements Runnable { +public class IndexLoadKeywords { private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); - - private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32); private final LoaderIndexJournalWriter journalWriter; - private record InsertTask(int urlId, - int domainId, - int features, - DocumentMetadata metadata, - DocumentKeywords wordSet) {} - - private final Thread runThread; - private volatile boolean canceled = false; @Inject public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) { this.journalWriter = journalWriter; - runThread = new Thread(this, getClass().getSimpleName()); - runThread.start(); } - @SneakyThrows - public void run() { - while (!canceled) { - var data = insertQueue.poll(1, TimeUnit.SECONDS); - if (data != null) { - journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), - data.features, - data.metadata(), - data.wordSet); - } - } - } public void close() throws Exception { - if (!canceled) { - canceled = true; - runThread.join(); journalWriter.close(); - } } public void load(LoaderData loaderData, + int ordinal, EdgeUrl url, int features, DocumentMetadata metadata, - DocumentKeywords words) throws InterruptedException { - int domainId = loaderData.getDomainId(url.domain); - int urlId = loaderData.getUrlId(url); + DocumentKeywords words) { + long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal); - if (urlId <= 0 || domainId <= 0) { - logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId); + if (combinedId <= 0) { + logger.warn("Failed to get IDs for {} -- c={}", url, combinedId); return; } - insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words)); + journalWriter.putWords(combinedId, + features, + metadata, + words); } }
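IndexLoadKeywords above loses its queue and worker thread: previously, url and domain ids had to be looked up through LoaderData before a journal write could happen, which presumably motivated staging writes through a bounded LinkedBlockingQueue drained by a background thread. Now that the id is computed arithmetically from the target domain id and the document's ordinal, the write can happen synchronously on the caller's thread:

    // Shape of the change (condensed from the diff above):
    // before: insertQueue.put(new InsertTask(urlId, domainId, features, metadata, words));
    //         with a background thread draining the queue into the journal
    // after:
    long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal);
    journalWriter.putWords(combinedId, features, metadata, words);

Note also that the retained combinedId <= 0 guard no longer has a database failure mode to catch; under a packing like the one sketched earlier it can only trigger when the target domain id is unset or negative.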
diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java new file mode 100644 index 00000000..acb3b26b --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java @@ -0,0 +1,83 @@ +package nu.marginalia.loading.loader; + +import com.google.inject.Inject; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import nu.marginalia.linkdb.LinkdbStatusWriter; +import nu.marginalia.linkdb.LinkdbWriter; +import nu.marginalia.linkdb.model.LdbUrlDetail; +import nu.marginalia.linkdb.model.UrlStatus; +import nu.marginalia.model.id.UrlIdCodec; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class LdbLoadProcessedDocument { + private static final Logger logger = LoggerFactory.getLogger(LdbLoadProcessedDocument.class); + private final LinkdbWriter linkdbWriter; + private final LinkdbStatusWriter linkdbStatusWriter; + + @Inject + public LdbLoadProcessedDocument(LinkdbWriter linkdbWriter, + LinkdbStatusWriter linkdbStatusWriter + ) { + this.linkdbWriter = linkdbWriter; + this.linkdbStatusWriter = linkdbStatusWriter; + } + + public void load(LoaderData data, List<LoadProcessedDocument> documents) { + var details = new ArrayList<LdbUrlDetail>(); + + int domainId = data.getTargetDomainId(); + var statusList = new ArrayList<UrlStatus>(); + + for (var document : documents) { + long id = UrlIdCodec.encodeId(domainId, document.ordinal()); + details.add(new LdbUrlDetail( + id, + document.url(), + document.title(), + document.description(), + document.quality(), + document.standard(), + document.htmlFeatures(), + document.pubYear(), + document.hash(), + document.length() + )); + statusList.add(new UrlStatus(id, document.url(), document.state().toString(), null)); + } + + try { + linkdbWriter.add(details); + } + catch (SQLException ex) { + logger.warn("Failed to add processed documents to linkdb", ex); + } + + try { + linkdbStatusWriter.add(statusList); + } + catch (SQLException ex) { + logger.warn("Failed to add processed document statuses to linkdb", ex); + } + } + + public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) { + var statusList = new ArrayList<UrlStatus>(); + int domainId = data.getTargetDomainId(); + + for (var document : documents) { + statusList.add(new UrlStatus( + UrlIdCodec.encodeId(domainId, document.ordinal()), + document.url(), + document.state().toString(), + document.reason() + )); + } + + try { + linkdbStatusWriter.add(statusList); + } + catch (SQLException ex) { + logger.warn("Failed to add processed documents to linkdb", ex); + } + } + +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 80b6afec..8af672c5 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -12,15 +12,15 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.sql.SQLException; import java.util.ArrayList; import java.util.List; public class Loader implements Interpreter, AutoCloseable { - private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks
sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final LdbLoadProcessedDocument loadProcessedDocument; private final SqlLoadDomainMetadata sqlLoadDomainMetadata; private final IndexLoadKeywords indexLoadKeywords; @@ -34,21 +34,18 @@ public class Loader implements Interpreter, AutoCloseable { public final LoaderData data; public Loader(int sizeHint, - SqlLoadUrls sqlLoadUrls, SqlLoadDomains sqlLoadDomains, SqlLoadDomainLinks sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, - SqlLoadProcessedDocument sqlLoadProcessedDocument, + LdbLoadProcessedDocument loadProcessedDocument, SqlLoadDomainMetadata sqlLoadDomainMetadata, - IndexLoadKeywords indexLoadKeywords) - { + IndexLoadKeywords indexLoadKeywords) { data = new LoaderData(sizeHint); - this.sqlLoadUrls = sqlLoadUrls; this.sqlLoadDomains = sqlLoadDomains; this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; - this.sqlLoadProcessedDocument = sqlLoadProcessedDocument; + this.loadProcessedDocument = loadProcessedDocument; this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; this.indexLoadKeywords = indexLoadKeywords; @@ -56,12 +53,6 @@ public class Loader implements Interpreter, AutoCloseable { processedDocumentWithErrorList = new ArrayList<>(sizeHint); } - - @Override - public void loadUrl(EdgeUrl[] urls) { - sqlLoadUrls.load(data, urls); - } - @Override public void loadDomain(EdgeDomain[] domains) { sqlLoadDomains.load(data, domains); @@ -86,29 +77,23 @@ public class Loader implements Interpreter, AutoCloseable { public void loadProcessedDocument(LoadProcessedDocument document) { processedDocumentList.add(document); - if (processedDocumentList.size() > 100) { - sqlLoadProcessedDocument.load(data, processedDocumentList); + if (processedDocumentList.size() > 1000) { + loadProcessedDocument.load(data, processedDocumentList); processedDocumentList.clear(); } } - @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { processedDocumentWithErrorList.add(document); - if (processedDocumentWithErrorList.size() > 100) { - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + if (processedDocumentWithErrorList.size() > 1000) { + loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); processedDocumentWithErrorList.clear(); } } - @Override - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { - try { - indexLoadKeywords.load(data, url, features, metadata, words); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { + indexLoadKeywords.load(data, ordinal, url, features, metadata, words); } @Override @@ -123,10 +108,10 @@ public class Loader implements Interpreter, AutoCloseable { public void close() { if (processedDocumentList.size() > 0) { - sqlLoadProcessedDocument.load(data, processedDocumentList); + loadProcessedDocument.load(data, processedDocumentList); } if (processedDocumentWithErrorList.size() > 0) { - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java index 570cb579..613b880d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java @@ -2,17 +2,15 @@ package nu.marginalia.loading.loader; import gnu.trove.map.hash.TObjectIntHashMap; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; public class LoaderData { - private final TObjectIntHashMap urlIds; private final TObjectIntHashMap domainIds; private EdgeDomain targetDomain; public final int sizeHint; + private int targetDomainId = -1; public LoaderData(int sizeHint) { - urlIds = new TObjectIntHashMap<>(sizeHint+1); domainIds = new TObjectIntHashMap<>(10); this.sizeHint = sizeHint; } @@ -24,19 +22,16 @@ public class LoaderData { return targetDomain; } + public int getTargetDomainId() { + if (targetDomainId < 0) + targetDomainId = domainIds.get(targetDomain); + return targetDomainId; + } public void addDomain(EdgeDomain domain, int id) { domainIds.put(domain, id); } - public void addUrl(EdgeUrl url, int id) { - urlIds.put(url, id); - } - - public int getUrlId(EdgeUrl url) { - return urlIds.get(url); - } - public int getDomainId(EdgeDomain domain) { return domainIds.get(domain); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java index 21435ac0..f5984b51 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java @@ -3,24 +3,20 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; public class LoaderFactory { - private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final LdbLoadProcessedDocument sqlLoadProcessedDocument; private final SqlLoadDomainMetadata sqlLoadDomainMetadata; private final IndexLoadKeywords indexLoadKeywords; @Inject - public LoaderFactory(SqlLoadUrls sqlLoadUrls, - SqlLoadDomains sqlLoadDomains, + public LoaderFactory(SqlLoadDomains sqlLoadDomains, SqlLoadDomainLinks sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, - SqlLoadProcessedDocument sqlLoadProcessedDocument, + LdbLoadProcessedDocument sqlLoadProcessedDocument, SqlLoadDomainMetadata sqlLoadDomainMetadata, IndexLoadKeywords indexLoadKeywords) { - - this.sqlLoadUrls = sqlLoadUrls; this.sqlLoadDomains = sqlLoadDomains; this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; @@ -30,6 +26,6 @@ public class LoaderFactory { } public Loader create(int sizeHint) { - return new Loader(sizeHint, sqlLoadUrls, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); + return new Loader(sizeHint, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 4aabdcea..05f02798 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -5,129 +5,82 @@ import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.dict.OffHeapDictionaryHashMap; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; +import nu.marginallia.index.journal.IndexJournalFileNames; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; -import java.nio.file.attribute.PosixFilePermissions; import java.sql.SQLException; import java.util.Arrays; -import java.util.concurrent.*; + +import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH; @Singleton public class LoaderIndexJournalWriter { - private final KeywordLexicon lexicon; private final IndexJournalWriter indexWriter; private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); @Inject public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException { - var lexiconArea = fileStorageService.getStorageByType(FileStorageType.LEXICON_STAGING); var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); - var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat"); - var indexPath = indexArea.asPath().resolve("page-index.dat"); + var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea.asPath()); + for (var existingFile : existingIndexFiles) { + Files.delete(existingFile); + } - Files.deleteIfExists(indexPath); - Files.deleteIfExists(lexiconPath); - - Files.createFile(indexPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); - Files.createFile(lexiconPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); - - lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile(), KeywordLexiconJournalMode.READ_WRITE)); - indexWriter = new IndexJournalWriterImpl(lexicon, indexPath); + indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath()); } - private final LinkedBlockingQueue keywordInsertTaskQueue = - new LinkedBlockingQueue<>(65536); - private final ExecutorService keywordInsertionExecutor = - new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, keywordInsertTaskQueue); - + MurmurHash3_128 hasher = new MurmurHash3_128(); + long[] buffer = new long[MAX_LENGTH * 
2]; @SneakyThrows - public void putWords(EdgeId domain, EdgeId url, + public void putWords(long combinedId, int features, DocumentMetadata metadata, DocumentKeywords wordSet) { if (wordSet.keywords().length == 0) { - logger.info("Skipping zero-length word set for {}:{}", domain, url); + logger.info("Skipping zero-length word set for {}", combinedId); return; } - if (domain.id() <= 0 || url.id() <= 0) { - logger.warn("Bad ID: {}:{}", domain, url); + if (combinedId <= 0) { + logger.warn("Bad ID: {}", combinedId); return; } - // Due to the very bursty access patterns of this method, doing the actual insertions in separate threads - // with a chonky work queue is a fairly decent improvement - for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { - try { - keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk)); - } - catch (RejectedExecutionException ex) { - loadWords(domain, url, features, metadata, chunk); + String[] words = wordSet.keywords(); + long[] meta = wordSet.metadata(); + + for (int start = 0; start < words.length; ) { + int end = Math.min(start + MAX_LENGTH, words.length); + + for (int i = 0; i < end - start; i++) { + buffer[2*i] = hasher.hashNearlyASCII(words[start+i]); + buffer[2*i + 1] = meta[start+i]; } + + var entry = new IndexJournalEntryData(end-start, buffer); + var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode()); + + indexWriter.put(header, entry); + + start = end; } } - private void loadWords(EdgeId domain, - EdgeId url, - int features, - DocumentMetadata metadata, - DocumentKeywords wordSet) { - if (null == metadata) { - logger.warn("Null metadata for {}:{}", domain, url); - return; - } - - var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); - var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode()); - - indexWriter.put(header, entry); - } - - private long[] getOrInsertWordIds(String[] words, long[] meta) { - long[] ids = new long[words.length*2]; - int putIdx = 0; - - for (int i = 0; i < words.length; i++) { - String word = words[i]; - - long id = lexicon.getOrInsert(word); - if (id != OffHeapDictionaryHashMap.NO_VALUE) { - ids[putIdx++] = id; - ids[putIdx++] = meta[i]; - } - } - - if (putIdx != words.length*2) { - ids = Arrays.copyOf(ids, putIdx); - } - return ids; - } - public void close() throws Exception { - keywordInsertionExecutor.shutdown(); - while (!keywordInsertionExecutor.awaitTermination(1, TimeUnit.DAYS)) { - // ...? 
- } indexWriter.close(); - lexicon.close(); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java index 3c435f87..bf1dbcdc 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java @@ -5,7 +5,7 @@ import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; +import com.google.inject.Inject; import java.sql.SQLException; public class SqlLoadDomainMetadata { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java index eb4713ce..3ecd2411 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomains.java @@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.sql.Connection; import java.sql.SQLException; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java deleted file mode 100644 index 909ec986..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ /dev/null @@ -1,187 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.sql.Types; -import java.util.List; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadProcessedDocument { - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDocument.class); - - @Inject - public SqlLoadProcessedDocument(HikariDataSource dataSource) { - this.dataSource = dataSource; - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_PAGE_VISIT"); - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_PAGE_VISIT_BAD"); - stmt.execute(""" - CREATE PROCEDURE INSERT_PAGE_VISIT ( - IN URL_ID INT, - IN STATE VARCHAR(32), - IN TITLE VARCHAR(255), - IN DESCRIPTION VARCHAR(255), - IN LENGTH INT, - IN FEATURES INT, - IN STANDARD VARCHAR(32), - IN QUALITY DOUBLE, - IN HASH BIGINT, - IN PUB_YEAR SMALLINT) - BEGIN - SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR); - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - SET FOREIGN_KEY_CHECKS=1; - END - """); - stmt.execute(""" - CREATE PROCEDURE INSERT_PAGE_VISIT_BAD ( - IN URL_ID INT, - IN 
STATE VARCHAR(32)) - BEGIN - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; - END - """); - - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } - } - - public void load(LoaderData data, List documents) { - - try (var conn = dataSource.getConnection()) { - try (var insertCall = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") - ) { - conn.setAutoCommit(false); - - int cnt = 0; - int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId <= 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - continue; - } - - insertCall.setInt(1, urlId); - insertCall.setString(2, doc.state().name()); - insertCall.setString(3, doc.title()); - insertCall.setString(4, StringUtils.truncate(doc.description(), 255)); - insertCall.setInt(5, doc.length()); - insertCall.setInt(6, doc.htmlFeatures()); - insertCall.setString(7, doc.standard()); - insertCall.setDouble(8, doc.quality()); - insertCall.setLong(9, doc.hash()); - if (doc.pubYear() != null) { - insertCall.setShort(10, (short) doc.pubYear().intValue()); - } else { - insertCall.setInt(10, Types.SMALLINT); - } - insertCall.addBatch(); - - if (++cnt == 100) { - var ret = insertCall.executeBatch(); - conn.commit(); - - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - - cnt = 0; - batchOffset += 100; - } - } - if (cnt > 0) { - var ret = insertCall.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); - } - catch (SQLException ex) { - conn.rollback(); - throw ex; - } - } catch (SQLException ex) { - logger.warn("SQL error inserting document", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } - - public void loadWithError(LoaderData data, List documents) { - - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) { - - conn.setAutoCommit(false); - - int cnt = 0; int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId < 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - return; - } - - stmt.setInt(1, urlId); - stmt.setString(2, doc.state().name()); - stmt.addBatch(); - - if (++cnt == 100) { - var ret = stmt.executeBatch(); - conn.commit(); - - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - - cnt = 0; - batchOffset += 100; - } - } - if (cnt > 0) { - var ret = stmt.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); - } catch (SQLException ex) { - logger.warn("SQL error inserting failed document", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - - } -} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java 
b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 9ac576af..9bf94816 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -14,23 +14,18 @@ import java.sql.SQLException; public class SqlLoadProcessedDomain { private final HikariDataSource dataSource; private final SqlLoadDomains loadDomains; - private final SqlLoadUrls loadUrls; private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class); @Inject - public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains, SqlLoadUrls loadUrls) { + public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) { this.dataSource = dataSource; this.loadDomains = loadDomains; - this.loadUrls = loadUrls; try (var conn = dataSource.getConnection()) { try (var stmt = conn.createStatement()) { stmt.execute("DROP PROCEDURE IF EXISTS INITIALIZE_DOMAIN"); - // Note that there should be no need to delete from EC_PAGE_DATA here as it's done via their - // CASCADE DELETE constraint on EC_URL. - stmt.execute(""" CREATE PROCEDURE INITIALIZE_DOMAIN ( IN ST ENUM('ACTIVE', 'EXHAUSTED', 'SPECIAL', 'SOCIAL_MEDIA', 'BLOCKED', 'REDIR', 'ERROR', 'UNKNOWN'), @@ -40,7 +35,6 @@ public class SqlLoadProcessedDomain { BEGIN DELETE FROM DOMAIN_METADATA WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; - DELETE FROM EC_URL WHERE DOMAIN_ID=DID; UPDATE EC_DOMAIN SET INDEX_DATE=NOW(), STATE=ST, DOMAIN_ALIAS=NULL, INDEXED=GREATEST(INDEXED,IDX), IP=IP WHERE ID=DID; DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=DID; END @@ -69,8 +63,6 @@ public class SqlLoadProcessedDomain { if (rc < 1) { logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); } - - loadUrls.loadUrlsForDomain(data, domain, 0); } catch (SQLException ex) { conn.rollback(); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java deleted file mode 100644 index 4ef1509e..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ /dev/null @@ -1,151 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.sql.Types; -import java.util.HashSet; -import java.util.Set; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadUrls { - - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadUrls.class); - - @Inject - public SqlLoadUrls(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - private final MurmurHash3_128 murmurHash = new MurmurHash3_128(); - - public void load(LoaderData data, EdgeUrl[] urls) { - Set affectedDomains = new HashSet<>(); - - if (urls.length == 0) - return; - - int maxOldId = 0; - try (var conn = dataSource.getConnection()) { - - try (var insertStmt = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); - var queryMaxId = 
conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) { - - conn.setAutoCommit(false); - - var rs = queryMaxId.executeQuery(); - if (rs.next()) { - maxOldId = rs.getInt(1); - } - - int cnt = 0; - int batchOffset = 0; - - for (var url : urls) { - if (data.getUrlId(url) != 0) - continue; - - if (url.path.length() >= 255) { - logger.info("Skipping bad URL {}", url); - continue; - } - var domainId = data.getDomainId(url.domain); - - affectedDomains.add(url.domain); - - insertStmt.setString(1, url.proto); - insertStmt.setInt(2, domainId); - if (url.port != null) { - insertStmt.setInt(3, url.port); - } else { - insertStmt.setNull(3, Types.INTEGER); - } - insertStmt.setString(4, url.path); - insertStmt.setString(5, url.param); - insertStmt.setLong(6, hashPath(url.path, url.param)); - insertStmt.addBatch(); - - if (++cnt == 1000) { - var ret = insertStmt.executeBatch(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } - } - - batchOffset += cnt; - cnt = 0; - } - } - - if (cnt > 0) { - var ret = insertStmt.executeBatch(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } - } - } - - conn.commit(); - conn.setAutoCommit(true); - - for (var domain : affectedDomains) { - loadUrlsForDomain(data, domain, maxOldId); - } - } - catch (SQLException ex) { - conn.rollback(); - throw ex; - } - } - catch (SQLException ex) { - logger.warn("SQL error inserting URLs", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } - - /* We use a uniqueness constraint on DOMAIN_ID and this hash instead of on the PATH and PARAM - * fields as the uniqueness index grows absurdly large for some reason, possibly due to the prevalent - * shared leading substrings in paths? - */ - private long hashPath(String path, String queryParam) { - long hash = murmurHash.hashNearlyASCII(path); - if (queryParam != null) { - hash ^= murmurHash.hashNearlyASCII(queryParam); - } - return hash; - } - - /** Loads urlIDs for the domain into `data` from the database, starting at URL ID minId. */ - public void loadUrlsForDomain(LoaderData data, EdgeDomain domain, int minId) throws SQLException { - try (var conn = dataSource.getConnection(); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=? 
AND ID > ?")) { - - queryCall.setFetchSize(1000); - queryCall.setInt(1, data.getDomainId(domain)); - queryCall.setInt(2, minId); - - var rsp = queryCall.executeQuery(); - - while (rsp.next()) { - int urlId = rsp.getInt(1); - String proto = rsp.getString(2); - String path = rsp.getString(3); - String param = rsp.getString(4); - - data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); - } - } - - } -} diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java deleted file mode 100644 index e9dd92b6..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java +++ /dev/null @@ -1,96 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.search.db.DbUrlDetailsQuery; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.loading.loader.SqlLoadProcessedDocument; -import nu.marginalia.loading.loader.SqlLoadUrls; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.model.HtmlStandard; -import nu.marginalia.model.crawl.UrlIndexingState; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.id.EdgeIdArray; -import org.junit.jupiter.api.*; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.net.URISyntaxException; -import java.util.List; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@Tag("slow") -@Testcontainers -class SqlLoadProcessedDocumentTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - - DbUrlDetailsQuery dbUrlDetailsQuery; - @BeforeEach - public void setUp() throws URISyntaxException { - dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - dbUrlDetailsQuery = new DbUrlDetailsQuery(dataSource); - - var loadDomains = new SqlLoadDomains(dataSource); - var loadUrls = new SqlLoadUrls(dataSource); - - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - - loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")}); - - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadProcessedDocument() throws URISyntaxException { - var loader = new SqlLoadProcessedDocument(dataSource); - var url = new EdgeUrl("https://www.marginalia.nu/"); - - loader.load(loaderData, List.of(new LoadProcessedDocument( - url, - UrlIndexingState.OK, - "TITLE", - "DESCR", - HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), - HtmlStandard.HTML5.name(), - 100, - 12345, - -3.14, - null - ))); - - var details = dbUrlDetailsQuery.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); 
- Assertions.assertEquals(1, details.size()); - - var urlDetails = details.get(0); - - assertEquals("TITLE", urlDetails.getTitle()); - assertEquals("DESCR", urlDetails.getDescription()); - assertTrue(urlDetails.isAffiliate()); - assertEquals(100, urlDetails.words); - assertEquals(12345, urlDetails.dataHash); - assertEquals(-3.14, urlDetails.getUrlQuality()); - } - -} \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index 0ef662eb..e1fa8223 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -5,7 +5,6 @@ import nu.marginalia.loading.loader.LoaderData; import nu.marginalia.loading.loader.SqlLoadDomains; import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import org.junit.jupiter.api.AfterEach; @@ -51,18 +50,18 @@ class SqlLoadProcessedDomainTest { @Test public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomainTwice() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomaiWithExtremelyLongIP() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); String ip = Stream.generate(() -> "127.").limit(1024).collect(Collectors.joining()); @@ -71,7 +70,7 @@ class SqlLoadProcessedDomainTest { @Test public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); } } \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java deleted file mode 100644 index 7fece308..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.loading.loader.SqlLoadUrls; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.junit.jupiter.api.AfterEach; -import 
org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Tag;
-import org.junit.jupiter.api.Test;
-import org.testcontainers.containers.MariaDBContainer;
-import org.testcontainers.junit.jupiter.Container;
-import org.testcontainers.junit.jupiter.Testcontainers;
-
-import java.net.URISyntaxException;
-
-@Tag("slow")
-@Testcontainers
-class SqlLoadUrlsTest {
-    @Container
-    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
-            .withDatabaseName("WMSA_prod")
-            .withUsername("wmsa")
-            .withPassword("wmsa")
-            .withInitScript("db/migration/V23_06_0_000__base.sql")
-            .withNetworkAliases("mariadb");
-
-    HikariDataSource dataSource;
-    LoaderData loaderData;
-    @BeforeEach
-    public void setUp() {
-        dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl());
-
-        var loadDomains = new SqlLoadDomains(dataSource);
-        loaderData = new LoaderData(10);
-
-        loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu"));
-        loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu"));
-    }
-
-    @AfterEach
-    public void tearDown() {
-        dataSource.close();
-    }
-
-    @Test
-    public void loadUrl() throws URISyntaxException {
-        var loadUrls = new SqlLoadUrls(dataSource);
-        loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") });
-    }
-
-}
\ No newline at end of file
diff --git a/code/processes/readme.md b/code/processes/readme.md
index 44c29a1e..6b1ccede 100644
--- a/code/processes/readme.md
+++ b/code/processes/readme.md
@@ -19,6 +19,11 @@ described in [converting-model](../process-models/converting-model/).
 
-The [loading-process](loading-process/) reads the processed data and creates an index journal and lexicon,
-and loads domains and addresses into the MariaDB-database.
+The [loading-process](loading-process/) reads the processed data and creates an index journal and a link
+database, and loads domains and domain links into the MariaDB database.
 
+## 4. Index Construction Process
+
+The [index-construction-process](index-constructor-process/) constructs indices from
+the data generated by the loader.
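+
+Throughout these steps a document is identified by a single 64 bit combined ID,
+which packs the domain id and a per-domain document ordinal into one long (see
+`UrlIdCodec`). A minimal sketch of the idea; the bit widths below are
+illustrative assumptions, and the real codec also reserves high bits for a
+ranking component:
+
+```java
+// Illustrative sketch only; not the actual UrlIdCodec field layout.
+class CombinedDocIdSketch {
+    static final int ORDINAL_BITS = 26; // assumed width of the ordinal field
+
+    static long encodeId(int domainId, int ordinal) {
+        return ((long) domainId << ORDINAL_BITS) | ordinal;
+    }
+
+    static int getDomainId(long combinedId) {
+        return (int) (combinedId >>> ORDINAL_BITS);
+    }
+
+    static int getOrdinal(long combinedId) {
+        return (int) (combinedId & ((1L << ORDINAL_BITS) - 1));
+    }
+}
+```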
+ ## Overview Schematically the crawling and loading process looks like this: @@ -65,8 +70,12 @@ Schematically the crawling and loading process looks like this: \\==================// | +------------+ - | LOADING | Insert URLs in DB + | LOADING | Insert URLs in link DB | STEP | Insert keywords in Index +------------+ - + | + +------------+ + | CONSTRUCT | Make the data searchable + | INDEX | + +------------+ ``` \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java index 247963c8..1ec4bb0a 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/MathParser.java @@ -4,7 +4,7 @@ import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.ToString; -import javax.inject.Singleton; +import com.google.inject.Singleton; import java.math.RoundingMode; import java.text.DecimalFormat; import java.text.NumberFormat; diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java index b73eb7ac..4cbb141a 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/eval/Units.java @@ -3,8 +3,8 @@ package nu.marginalia.assistant.eval; import com.opencsv.CSVReader; import lombok.SneakyThrows; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.text.DecimalFormat; diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index c2cbcb24..9c1336aa 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -36,6 +36,7 @@ dependencies { implementation project(':code:api:index-api') implementation project(':code:api:process-mqapi') implementation project(':code:features-search:screenshots') + implementation project(':code:features-index:index-journal') implementation libs.lombok annotationProcessor libs.lombok @@ -44,6 +45,7 @@ dependencies { implementation libs.prometheus implementation libs.notnull implementation libs.guice + implementation libs.zstd implementation libs.trove implementation libs.spark diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java index 0fc1a178..43e9d985 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/ControlService.java @@ -11,7 +11,6 @@ import nu.marginalia.db.storage.model.FileStorageId; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.renderer.RendererFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.service.server.*; @@ -81,6 +80,7 @@ public class ControlService extends 
Service { var storageRenderer = rendererFactory.renderer("control/storage-overview"); var storageSpecsRenderer = rendererFactory.renderer("control/storage-specs"); var storageCrawlsRenderer = rendererFactory.renderer("control/storage-crawls"); + var storageBackupsRenderer = rendererFactory.renderer("control/storage-backups"); var storageProcessedRenderer = rendererFactory.renderer("control/storage-processed"); var reviewRandomDomainsRenderer = rendererFactory.renderer("control/review-random-domains"); @@ -147,6 +147,7 @@ public class ControlService extends Service { Spark.get("/public/storage", this::storageModel, storageRenderer::render); Spark.get("/public/storage/specs", this::storageModelSpecs, storageSpecsRenderer::render); Spark.get("/public/storage/crawls", this::storageModelCrawls, storageCrawlsRenderer::render); + Spark.get("/public/storage/backups", this::storageModelBackups, storageBackupsRenderer::render); Spark.get("/public/storage/processed", this::storageModelProcessed, storageProcessedRenderer::render); Spark.get("/public/storage/:id", this::storageDetailsModel, storageDetailsRenderer::render); Spark.get("/public/storage/:id/file", controlFileStorageService::downloadFileFromStorage); @@ -158,6 +159,7 @@ public class ControlService extends Service { Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToActors); Spark.post("/public/storage/:fid/process-and-load", controlActorService::triggerProcessingWithLoad, redirectToActors); Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToActors); + Spark.post("/public/storage/:fid/restore-backup", controlActorService::restoreBackup, redirectToActors); Spark.post("/public/storage/specs", controlActorService::createCrawlSpecification, redirectToStorage); Spark.post("/public/storage/:fid/delete", controlFileStorageService::flagFileForDeletionRequest, redirectToStorage); @@ -183,9 +185,7 @@ public class ControlService extends Service { Spark.post("/public/actions/calculate-adjacencies", controlActionsService::calculateAdjacencies, redirectToActors); Spark.post("/public/actions/reload-blogs-list", controlActionsService::reloadBlogsList, redirectToActors); Spark.post("/public/actions/repartition-index", controlActionsService::triggerRepartition, redirectToActors); - Spark.post("/public/actions/reconstruct-index", controlActionsService::triggerIndexReconstruction, redirectToActors); Spark.post("/public/actions/trigger-data-exports", controlActionsService::triggerDataExports, redirectToActors); - Spark.post("/public/actions/flush-search-caches", controlActionsService::flushSearchCaches, redirectToActors); Spark.post("/public/actions/flush-api-caches", controlActionsService::flushApiCaches, redirectToActors); Spark.post("/public/actions/truncate-links-database", controlActionsService::truncateLinkDatabase, redirectToActors); @@ -224,7 +224,7 @@ public class ControlService extends Service { } }); - randomExplorationService.removeRandomDomains(new EdgeIdList<>(idList.toArray())); + randomExplorationService.removeRandomDomains(idList.toArray()); String after = request.queryParams("after"); @@ -361,6 +361,9 @@ public class ControlService extends Service { private Object storageModelCrawls(Request request, Response response) { return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.CRAWL_DATA)); } + private Object storageModelBackups(Request request, Response response) { + return Map.of("storage", 
controlFileStorageService.getStorageList(FileStorageType.BACKUP));
+    }
     private Object storageModelProcessed(Request request, Response response) {
         return Map.of("storage", controlFileStorageService.getStorageList(FileStorageType.PROCESSED_DATA));
     }
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java
index 75c96a22..9eb625ce 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/Actor.java
@@ -14,8 +14,9 @@ public enum Actor {
     CRAWL_JOB_EXTRACTOR,
     EXPORT_DATA,
     TRUNCATE_LINK_DATABASE,
-    CONVERT;
-
+    INDEX_CONSTRUCTOR_MONITOR,
+    CONVERT,
+    RESTORE_BACKUP;
 
     public String id() {
         return "fsm:" + name().toLowerCase();
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java
index 7be91df3..3aea2bf9 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/ControlActors.java
@@ -38,12 +38,14 @@ public class ControlActors {
                          ConvertAndLoadActor convertAndLoadActor,
                          CrawlActor crawlActor,
                          RecrawlActor recrawlActor,
+                         RestoreBackupActor restoreBackupActor,
                          ConverterMonitorActor converterMonitorFSM,
                          CrawlerMonitorActor crawlerMonitorActor,
                          LoaderMonitorActor loaderMonitor,
                          MessageQueueMonitorActor messageQueueMonitor,
                          ProcessLivenessMonitorActor processMonitorFSM,
                          FileStorageMonitorActor fileStorageMonitorActor,
+                         IndexConstructorMonitorActor indexConstructorMonitorActor,
                          TriggerAdjacencyCalculationActor triggerAdjacencyCalculationActor,
                          CrawlJobExtractorActor crawlJobExtractorActor,
                          ExportDataActor exportDataActor,
@@ -56,8 +58,10 @@ public class ControlActors {
         register(Actor.CRAWL, crawlActor);
         register(Actor.RECRAWL, recrawlActor);
         register(Actor.CONVERT, convertActor);
+        register(Actor.RESTORE_BACKUP, restoreBackupActor);
         register(Actor.CONVERT_AND_LOAD, convertAndLoadActor);
+        register(Actor.INDEX_CONSTRUCTOR_MONITOR, indexConstructorMonitorActor);
         register(Actor.CONVERTER_MONITOR, converterMonitorFSM);
         register(Actor.LOADER_MONITOR, loaderMonitor);
         register(Actor.CRAWLER_MONITOR, crawlerMonitorActor);
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java
index b3b3473f..d95c9475 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/AbstractProcessSpawnerActor.java
@@ -119,14 +119,14 @@ public class AbstractProcessSpawnerActor extends AbstractActorPrototype {
                     if (attempts < MAX_ATTEMPTS) transition(RUN, attempts + 1);
                     else error();
                 }
-                else if (endTime - startTime < TimeUnit.SECONDS.toMillis(10)) {
+                else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
                     // To avoid boot loops, we transition to error if the process
-                    // didn't run for longer than 10 seconds. This might happen if
+                    // didn't run for longer than 1 second. This might happen if
                     // the process crashes before it can reach the heartbeat and inbox
                     // stages of execution. In this case it would not report having acted
                     // on its message, and the process would be restarted forever without
                     // the attempts counter incrementing.
-                    error("Process terminated within 10 seconds of starting");
+                    error("Process terminated within 1 second of starting");
                 }
             }
             catch (InterruptedException ex) {
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/IndexConstructorMonitorActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/IndexConstructorMonitorActor.java
new file mode 100644
index 00000000..abc44d6b
--- /dev/null
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/monitor/IndexConstructorMonitorActor.java
@@ -0,0 +1,22 @@
+package nu.marginalia.control.actor.monitor;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.actor.ActorStateFactory;
+import nu.marginalia.control.process.ProcessService;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.ProcessInboxNames;
+
+@Singleton
+public class IndexConstructorMonitorActor extends AbstractProcessSpawnerActor {
+
+
+    @Inject
+    public IndexConstructorMonitorActor(ActorStateFactory stateFactory,
+                                        MqPersistence persistence,
+                                        ProcessService processService) {
+        super(stateFactory, persistence, processService, ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX, ProcessService.ProcessId.INDEX_CONSTRUCTOR);
+    }
+
+
+}
diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java
index 38966a7f..3d57e677 100644
--- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java
+++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/ConvertAndLoadActor.java
@@ -9,10 +9,13 @@ import lombok.With;
 import nu.marginalia.actor.ActorStateFactory;
 import nu.marginalia.control.process.ProcessOutboxes;
 import nu.marginalia.control.process.ProcessService;
+import nu.marginalia.control.svc.BackupService;
 import nu.marginalia.index.client.IndexClient;
 import nu.marginalia.index.client.IndexMqEndpoints;
 import nu.marginalia.mqapi.converting.ConvertAction;
 import nu.marginalia.mqapi.converting.ConvertRequest;
+import nu.marginalia.mqapi.index.CreateIndexRequest;
+import nu.marginalia.mqapi.index.IndexName;
 import nu.marginalia.mqapi.loading.LoadRequest;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorageBaseType;
@@ -23,11 +26,13 @@ import nu.marginalia.mq.outbox.MqOutbox;
 import nu.marginalia.actor.prototype.AbstractActorPrototype;
 import nu.marginalia.actor.state.ActorState;
 import nu.marginalia.actor.state.ActorResumeBehavior;
+import nu.marginalia.search.client.SearchClient;
+import nu.marginalia.search.client.SearchMqEndpoints;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.nio.file.Files;
-import java.nio.file.StandardCopyOption;
+import java.io.IOException;
+import java.sql.SQLException;
 
 @Singleton
 public class ConvertAndLoadActor extends AbstractActorPrototype {
@@ -38,19 +43,22 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
     public static final String RECONVERT = "RECONVERT";
     public static final String RECONVERT_WAIT
= "RECONVERT-WAIT"; public static final String LOAD = "LOAD"; - public static final String LOAD_WAIT = "LOAD-WAIT"; - public static final String SWAP_LEXICON = "SWAP-LEXICON"; - + public static final String BACKUP = "BACKUP"; public static final String REPARTITION = "REPARTITION"; - public static final String REPARTITION_WAIT = "REPARTITION-WAIT"; - public static final String REINDEX = "REINDEX"; - public static final String REINDEX_WAIT = "REINDEX-WAIT"; + public static final String REINDEX_FWD = "REINDEX_FWD"; + public static final String REINDEX_FULL = "REINDEX_FULL"; + public static final String REINDEX_PRIO = "REINDEX_PRIO"; + public static final String SWITCH_OVER = "SWITCH-OVER"; + public static final String END = "END"; private final ActorProcessWatcher processWatcher; private final MqOutbox mqConverterOutbox; private final MqOutbox mqLoaderOutbox; + private final MqOutbox mqIndexConstructorOutbox; private final MqOutbox indexOutbox; + private final MqOutbox searchOutbox; private final FileStorageService storageService; + private final BackupService backupService; private final Gson gson; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -74,15 +82,20 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { ProcessOutboxes processOutboxes, FileStorageService storageService, IndexClient indexClient, + SearchClient searchClient, + BackupService backupService, Gson gson ) { super(stateFactory); this.processWatcher = processWatcher; this.indexOutbox = indexClient.outbox(); + this.searchOutbox = searchClient.outbox(); this.mqConverterOutbox = processOutboxes.getConverterOutbox(); this.mqLoaderOutbox = processOutboxes.getLoaderOutbox(); + this.mqIndexConstructorOutbox = processOutboxes.getIndexConstructorOutbox(); this.storageService = storageService; + this.backupService = backupService; this.gson = gson; } @@ -155,80 +168,71 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { @ActorState( name = LOAD, - next = LOAD_WAIT, - resume = ActorResumeBehavior.ERROR, - description = """ - Send a load request to the loader and transition to LOAD_WAIT. - """) - public Message load(Message message) throws Exception { - - var request = new LoadRequest(message.processedStorageId); - long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); - - return message.withLoaderMsgId(id); - - } - - @ActorState( - name = LOAD_WAIT, - next = SWAP_LEXICON, + next = BACKUP, resume = ActorResumeBehavior.RETRY, description = """ - Wait for the loader to finish loading the data. - """ - ) - public void loadWait(Message message) throws Exception { + Instruct the loader to process the data + """) + public Message load(Message message) throws Exception { + if (message.loaderMsgId <= 0) { + var request = new LoadRequest(message.processedStorageId); + long id = mqLoaderOutbox.sendAsync(LoadRequest.class.getSimpleName(), gson.toJson(request)); + + transition(LOAD, message.withLoaderMsgId(id)); + } var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, message.loaderMsgId); if (rsp.state() != MqMessageState.OK) error("Loader failed"); + + return message; } - - @ActorState( - name = SWAP_LEXICON, + name = BACKUP, next = REPARTITION, resume = ActorResumeBehavior.RETRY, description = """ - Move the lexicon from the LEXICON_STAGING area to the LEXICON_LIVE area, - then instruct the index-service to reload the lexicon. 
- """ - ) - public void swapLexicon(Message message) throws Exception { - var live = storageService.getStorageByType(FileStorageType.LEXICON_LIVE); - - var staging = storageService.getStorageByType(FileStorageType.LEXICON_STAGING); - var fromSource = staging.asPath().resolve("dictionary.dat"); - var liveDest = live.asPath().resolve("dictionary.dat"); - - // Swap in new lexicon - logger.info("Moving " + fromSource + " to " + liveDest); - Files.move(fromSource, liveDest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + Create a backup snapshot of the new data + """) + public void createBackup(Message message) throws SQLException, IOException { + backupService.createBackupFromStaging(message.processedStorageId); } - @ActorState( name = REPARTITION, - next = REPARTITION_WAIT, + next = REINDEX_FWD, + resume = ActorResumeBehavior.RETRY, description = """ - Instruct the index-service to repartition the index then transition to REPARTITION_WAIT. + Instruct the index-service to repartition. """ ) - public Long repartition() throws Exception { - return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, ""); + public void repartition(Long id) throws Exception { + if (id == null) { + transition(REPARTITION, indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REPARTITION, "")); + } + + var rsp = indexOutbox.waitResponse(id); + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } } @ActorState( - name = REPARTITION_WAIT, - next = REINDEX, + name = REINDEX_FWD, + next = REINDEX_FULL, resume = ActorResumeBehavior.RETRY, description = """ - Wait for the index-service to finish repartitioning the index. + Reconstruct the fwd index """ ) - public void repartitionReply(Long id) throws Exception { - var rsp = indexOutbox.waitResponse(id); + public void reindexFwd(Long id) throws Exception { + if (id == null) { + var request = new CreateIndexRequest(IndexName.FORWARD); + transition(REINDEX_FWD, mqIndexConstructorOutbox.sendAsync(CreateIndexRequest.class.getSimpleName(), gson.toJson(request))); + } + + var rsp = mqIndexConstructorOutbox.waitResponse(id); if (rsp.state() != MqMessageState.OK) { error("Repartition failed"); @@ -236,31 +240,60 @@ public class ConvertAndLoadActor extends AbstractActorPrototype { } @ActorState( - name = REINDEX, - next = REINDEX_WAIT, + name = REINDEX_FULL, + next = REINDEX_PRIO, + resume = ActorResumeBehavior.RETRY, description = """ - Instruct the index-service to reindex the data then transition to REINDEX_WAIT. 
+ Reconstruct the reverse full index """ ) - public Long reindex() throws Exception { - return indexOutbox.sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); + public void reindexFull(Long id) throws Exception { + if (id == null) { + var request = new CreateIndexRequest(IndexName.REVERSE_FULL); + transition(REINDEX_FULL, mqIndexConstructorOutbox.sendAsync(CreateIndexRequest.class.getSimpleName(), gson.toJson(request))); + } + + var rsp = mqIndexConstructorOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } } @ActorState( - name = REINDEX_WAIT, + name = REINDEX_PRIO, + next = SWITCH_OVER, + resume = ActorResumeBehavior.RETRY, + description = """ + Reconstruct the reverse prio index + """ + ) + public void reindexPrio(Long id) throws Exception { + if (id == null) { + var request = new CreateIndexRequest(IndexName.REVERSE_PRIO); + transition(REINDEX_PRIO, mqIndexConstructorOutbox.sendAsync(CreateIndexRequest.class.getSimpleName(), gson.toJson(request))); + } + + var rsp = mqIndexConstructorOutbox.waitResponse(id); + + if (rsp.state() != MqMessageState.OK) { + error("Repartition failed"); + } + } + + @ActorState( + name = SWITCH_OVER, next = END, resume = ActorResumeBehavior.RETRY, description = """ - Wait for the index-service to finish reindexing the data. + Move the new lexicon into place, instruct the search service to + switch to the new linkdb, and the index service to switch over to the new index. """ ) - public void reindexReply(Long id) throws Exception { - var rsp = indexOutbox.waitResponse(id); - - if (rsp.state() != MqMessageState.OK) { - error("Repartition failed"); - } + public void switchOver(Long id) throws Exception { + // Notify services to switch over + searchOutbox.sendNotice(SearchMqEndpoints.SWITCH_LINKDB, ":-)"); + indexOutbox.sendNotice(IndexMqEndpoints.SWITCH_INDEX, ":^D"); } - } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java new file mode 100644 index 00000000..96629208 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/RestoreBackupActor.java @@ -0,0 +1,49 @@ +package nu.marginalia.control.actor.task; + +import com.google.inject.Inject; +import nu.marginalia.actor.ActorStateFactory; +import nu.marginalia.actor.prototype.AbstractActorPrototype; +import nu.marginalia.actor.state.ActorResumeBehavior; +import nu.marginalia.actor.state.ActorState; +import nu.marginalia.control.actor.Actor; +import nu.marginalia.control.svc.BackupService; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.mq.persistence.MqPersistence; + + +public class RestoreBackupActor extends AbstractActorPrototype { + // States + + public static final String RESTORE = "RESTORE"; + public static final String END = "END"; + + private final BackupService backupService; + private final MqPersistence mqPersistence; + + @Override + public String describe() { + return "Restores a backed up set of index data"; + } + @Inject + public RestoreBackupActor(ActorStateFactory stateFactory, + MqPersistence mqPersistence, + BackupService backupService + ) { + super(stateFactory); + this.mqPersistence = mqPersistence; + this.backupService = backupService; + } + + @ActorState(name=RESTORE, next = END, resume = ActorResumeBehavior.ERROR) + public void restoreBackup(FileStorageId id) throws Exception { + 
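+        // Restore the backed-up data, then rebuild the indices by re-entering
+        // the CONVERT_AND_LOAD actor at its REPARTITION state (see the message below).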
backupService.restoreBackup(id); + + mqPersistence.sendNewMessage( + Actor.CONVERT_AND_LOAD.id(), + null, + null, + ConvertAndLoadActor.REPARTITION, + "", + null); + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java index f44545b9..70dd06a3 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/actor/task/TruncateLinkDatabase.java @@ -62,13 +62,11 @@ public class TruncateLinkDatabase extends AbstractActorPrototype { Truncate the domain and link tables. """ ) - public void exportBlacklist() throws Exception { + public void flushDatabase() throws Exception { try (var conn = dataSource.getConnection(); var stmt = conn.createStatement()) { stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 0"); - stmt.executeUpdate("TRUNCATE TABLE EC_PAGE_DATA"); - stmt.executeUpdate("TRUNCATE TABLE EC_URL"); stmt.executeUpdate("TRUNCATE TABLE EC_DOMAIN_LINK"); stmt.executeUpdate("TRUNCATE TABLE DOMAIN_METADATA"); stmt.executeUpdate("SET FOREIGN_KEY_CHECKS = 1"); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java index 803ad21a..d77d28f2 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/FileStorageWithActions.java @@ -15,10 +15,16 @@ public record FileStorageWithActions(FileStorage storage) { public boolean isLoadable() { return storage.type() == FileStorageType.PROCESSED_DATA; } + public boolean isRestorable() { + return storage.type() == FileStorageType.BACKUP; + } public boolean isConvertible() { return storage.type() == FileStorageType.CRAWL_DATA; } public boolean isDeletable() { - return storage.base().type() == FileStorageBaseType.SLOW; + var baseType = storage.base().type(); + + return baseType == FileStorageBaseType.SLOW + || baseType == FileStorageBaseType.BACKUP; } } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java index accb3351..5d8554bb 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/model/ProcessHeartbeat.java @@ -44,6 +44,7 @@ public record ProcessHeartbeat( case "loader" -> ProcessService.ProcessId.LOADER; case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR; case "crawl-job-extractor" -> ProcessService.ProcessId.CRAWL_JOB_EXTRACTOR; + case "index-constructor" -> ProcessService.ProcessId.INDEX_CONSTRUCTOR; default -> null; }; } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java index b5b74406..cb45b6f5 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java +++ 
b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessOutboxes.java @@ -12,6 +12,7 @@ public class ProcessOutboxes { private final MqOutbox converterOutbox; private final MqOutbox loaderOutbox; private final MqOutbox crawlerOutbox; + private final MqOutbox indexConstructorOutbox; @Inject public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) { @@ -30,6 +31,11 @@ public class ProcessOutboxes { params.configuration.serviceName(), params.configuration.instanceUuid() ); + indexConstructorOutbox = new MqOutbox(persistence, + ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX, + params.configuration.serviceName(), + params.configuration.instanceUuid() + ); } @@ -44,4 +50,6 @@ public class ProcessOutboxes { public MqOutbox getCrawlerOutbox() { return crawlerOutbox; } + + public MqOutbox getIndexConstructorOutbox() { return indexConstructorOutbox; } } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java index 14403dfb..089e42e6 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/process/ProcessService.java @@ -8,8 +8,8 @@ import org.slf4j.LoggerFactory; import org.slf4j.Marker; import org.slf4j.MarkerFactory; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -32,6 +32,7 @@ public class ProcessService { CRAWLER("crawler-process/bin/crawler-process"), CONVERTER("converter-process/bin/converter-process"), LOADER("loader-process/bin/loader-process"), + INDEX_CONSTRUCTOR("index-construction-process/bin/index-construction-process"), ADJACENCIES_CALCULATOR("website-adjacencies-calculator/bin/website-adjacencies-calculator"), CRAWL_JOB_EXTRACTOR("crawl-job-extractor-process/bin/crawl-job-extractor-process") ; @@ -118,7 +119,7 @@ public class ProcessService { private final List propagatedEnvironmentVariables = List.of( "JAVA_HOME", "CONVERTER_PROCESS_OPTS", -// "LOADER_PROCESS_OPTS", + "LOADER_PROCESS_OPTS", "CRAWLER_PROCESS_OPTS"); private String[] createEnvironmentVariables() { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java new file mode 100644 index 00000000..9aef1b90 --- /dev/null +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/BackupService.java @@ -0,0 +1,110 @@ +package nu.marginalia.control.svc; + +import com.github.luben.zstd.ZstdInputStream; +import com.github.luben.zstd.ZstdOutputStream; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageBaseType; +import nu.marginalia.db.storage.model.FileStorageId; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.index.journal.IndexJournalFileNames; +import org.apache.commons.io.IOUtils; + +import com.google.inject.Inject; +import java.io.IOException; +import java.nio.file.Files; +import java.sql.SQLException; +import java.time.LocalDateTime; + +public class BackupService { + + private final FileStorageService storageService; +
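+    // A backup snapshot pairs the pre-load document database (links.db, zstd-compressed
+    // on the way out) with the index journal files, which are copied verbatim since the
+    // journal format is already compressed.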
@Inject + public BackupService(FileStorageService storageService) { + this.storageService = storageService; + } + + /** Create a new backup of the contents in the _STAGING storage areas. + * This backup can later be rehydrated and quickly loaded into _LIVE. + * */ + public void createBackupFromStaging(FileStorageId associatedId) throws SQLException, IOException { + var backupBase = storageService.getStorageBase(FileStorageBaseType.BACKUP); + + String desc = "Pre-load backup snapshot " + LocalDateTime.now(); + + var backupStorage = storageService.allocateTemporaryStorage(backupBase, FileStorageType.BACKUP, "snapshot", desc); + + storageService.relateFileStorages(associatedId, backupStorage.id()); + + var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING); + var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING); + + backupFileCompressed("links.db", linkdbStagingStorage, backupStorage); + // This file format is already compressed + backupJournal(indexStagingStorage, backupStorage); + } + + + /** Read back a backup into _STAGING */ + public void restoreBackup(FileStorageId backupId) throws SQLException, IOException { + var backupStorage = storageService.getStorage(backupId); + + var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING); + var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING); + + restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage); + restoreJournal(indexStagingStorage, backupStorage); + } + + + private void backupJournal(FileStorage inputStorage, FileStorage backupStorage) throws IOException + { + for (var source : IndexJournalFileNames.findJournalFiles(inputStorage.asPath())) { + var dest = backupStorage.asPath().resolve(source.toFile().getName()); + + try (var is = Files.newInputStream(source); + var os = Files.newOutputStream(dest) + ) { + IOUtils.copyLarge(is, os); + } + } + + } + + private void restoreJournal(FileStorage destStorage, FileStorage backupStorage) throws IOException { + + // Remove any old journal files first to avoid them getting loaded + for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage.asPath())) { + Files.delete(garbage); + } + + for (var source : IndexJournalFileNames.findJournalFiles(backupStorage.asPath())) { + var dest = destStorage.asPath().resolve(source.toFile().getName()); + + try (var is = Files.newInputStream(source); + var os = Files.newOutputStream(dest) + ) { + IOUtils.copyLarge(is, os); + } + } + + } + + private void backupFileCompressed(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException + { + try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName)); + var os = new ZstdOutputStream(Files.newOutputStream(backupStorage.asPath().resolve(fileName))) + ) { + IOUtils.copyLarge(is, os); + } + } + private void restoreBackupCompressed(String fileName, FileStorage destStorage, FileStorage backupStorage) throws IOException + { + try (var is = new ZstdInputStream(Files.newInputStream(backupStorage.asPath().resolve(fileName))); + var os = Files.newOutputStream(destStorage.asPath().resolve(fileName)) + ) { + IOUtils.copyLarge(is, os); + } + } +} diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java index ff8c5744..a8267d40 100644 ---
a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActionsService.java @@ -4,14 +4,12 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; import nu.marginalia.control.actor.Actor; -import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexMqEndpoints; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.search.client.SearchClient; -import nu.marginalia.search.client.SearchMqEndpoints; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.id.ServiceId; import spark.Request; @@ -24,7 +22,6 @@ import java.util.UUID; public class ControlActionsService { private final ControlActors actors; - private final SearchClient searchClient; private final IndexClient indexClient; private final MqOutbox apiOutbox; private final ServiceEventLog eventLog; @@ -32,14 +29,12 @@ public class ControlActionsService { @Inject public ControlActionsService(ControlActors actors, - SearchClient searchClient, IndexClient indexClient, MessageQueueFactory mqFactory, ServiceEventLog eventLog, DomainTypes domainTypes) { this.actors = actors; - this.searchClient = searchClient; this.indexClient = indexClient; this.apiOutbox = createApiOutbox(mqFactory); this.eventLog = eventLog; @@ -71,13 +66,6 @@ public class ControlActionsService { return ""; } - public Object flushSearchCaches(Request request, Response response) throws Exception { - eventLog.logEvent("USER-ACTION", "FLUSH-SEARCH-CACHES"); - searchClient.outbox().sendNotice(SearchMqEndpoints.FLUSH_CACHES, ""); - - return ""; - } - public Object reloadBlogsList(Request request, Response response) throws Exception { eventLog.logEvent("USER-ACTION", "RELOAD-BLOGS-LIST"); @@ -114,10 +102,4 @@ public class ControlActionsService { return null; } - - public Object triggerIndexReconstruction(Request request, Response response) throws Exception { - indexClient.outbox().sendAsync(IndexMqEndpoints.INDEX_REINDEX, ""); - - return null; - } } diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java index 7507e3d1..fbb2b818 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/ControlActorService.java @@ -3,10 +3,7 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.control.actor.ControlActors; -import nu.marginalia.control.actor.task.ConvertActor; -import nu.marginalia.control.actor.task.CrawlJobExtractorActor; -import nu.marginalia.control.actor.task.ConvertAndLoadActor; -import nu.marginalia.control.actor.task.RecrawlActor; +import nu.marginalia.control.actor.task.*; import nu.marginalia.control.actor.Actor; import nu.marginalia.control.model.ActorRunState; import nu.marginalia.control.model.ActorStateGraph; @@ -158,4 +155,12 @@ public class ControlActorService { return ""; } + + public Object restoreBackup(Request request, Response response) throws Exception { + var fid = FileStorageId.parse(request.params("fid")); + 
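+        // fid identifies a BACKUP-type file storage; the control GUI only offers the
+        // restore action for storages where isRestorable is true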
controlActors.startFrom(Actor.RESTORE_BACKUP, RestoreBackupActor.RESTORE, fid); + return ""; + } + + } \ No newline at end of file diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java index 1379924a..920cdf69 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/HeartbeatService.java @@ -58,7 +58,6 @@ public class HeartbeatService { var stmt = conn.prepareStatement(""" SELECT TASK_NAME, TASK_BASE, SERVICE_INSTANCE, STATUS, STAGE_NAME, PROGRESS, TIMESTAMPDIFF(MICROSECOND, TASK_HEARTBEAT.HEARTBEAT_TIME, CURRENT_TIMESTAMP(6)) AS TSDIFF FROM TASK_HEARTBEAT - INNER JOIN SERVICE_HEARTBEAT ON SERVICE_HEARTBEAT.`INSTANCE` = SERVICE_INSTANCE """)) { var rs = stmt.executeQuery(); while (rs.next()) { diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/RandomExplorationService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/RandomExplorationService.java index c861e961..7a414761 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/RandomExplorationService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/svc/RandomExplorationService.java @@ -2,8 +2,6 @@ package nu.marginalia.control.svc; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.id.EdgeIdList; import java.sql.SQLException; import java.util.ArrayList; @@ -18,7 +16,7 @@ public class RandomExplorationService { this.dataSource = dataSource; } - public void removeRandomDomains(EdgeIdList ids) throws SQLException { + public void removeRandomDomains(int[] ids) throws SQLException { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(""" DELETE FROM EC_RANDOM_DOMAINS @@ -27,7 +25,7 @@ public class RandomExplorationService { """)) { for (var id : ids) { - stmt.setInt(1, id.id()); + stmt.setInt(1, id); stmt.addBatch(); } stmt.executeBatch(); diff --git a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb index 351fbe5d..9ec528d4 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/actions.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/actions.hdb @@ -45,28 +45,6 @@ - - Reconstruct Index

-            This will reconstruct the index from the index journal.
-
-            Flush search-service Caches
-            This will instruct the search-service to flush its caches,
-            getting rid of any stale data. This may rarely be necessary after
-            reloading the index.
-
             Flush api-service Caches
             This will instruct the api-service to flush its caches,
@@ -74,7 +52,7 @@
             changes to the API licenses directly through the database.
-
+
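The two actions removed above are superseded by the CONVERT_AND_LOAD actor, whose REPARTITION, REINDEX_FWD, REINDEX_FULL and REINDEX_PRIO states all use the same resumable request idiom before SWITCH_OVER notifies the services: on first entry the message id parameter is null, so the state sends an MQ request and transitions back to itself with the id; after a service restart, ActorResumeBehavior.RETRY re-enters the state with the persisted id and resumes waiting. A minimal sketch of the idiom, with STEP, NEXT_STEP, ENDPOINT and outbox as placeholder names, and assuming (as in AbstractActorPrototype above) that transition() aborts the current invocation:

    @ActorState(name = STEP, next = NEXT_STEP, resume = ActorResumeBehavior.RETRY,
                description = """
                        Send one MQ request, then await its completion; survives restarts.
                        """)
    public void step(Long msgId) throws Exception {
        if (msgId == null) {
            // First entry: fire the request, then re-enter this state with the
            // message id as parameter so it is persisted for a later resume
            transition(STEP, outbox.sendAsync(ENDPOINT, ""));
        }

        // Re-entered with a non-null id: the request is in flight, await the reply
        var rsp = outbox.waitResponse(msgId);
        if (rsp.state() != MqMessageState.OK) {
            error(ENDPOINT + " failed");
        }
    }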
diff --git a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb index 575797f9..85c39898 100644 --- a/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb +++ b/code/services-core/control-service/src/main/resources/templates/control/partials/storage-types.hdb @@ -3,4 +3,5 @@ Specifications Crawl Data Processed Data + Backups \ No newline at end of file diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb new file mode 100644 index 00000000..b5450dd1 --- /dev/null +++ b/code/services-core/control-service/src/main/resources/templates/control/storage-backups.hdb @@ -0,0 +1,27 @@ + + + + Control Service + + + + + {{> control/partials/nav}} +
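+    {{!-- storage-types renders the storage tab bar, which gains a Backups tab in this patch --}}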
+ {{> control/partials/storage-types}} +

+    Backups
+    {{> control/partials/storage-table}}
+
+    About
+    Backups are compressed snapshots of index data and the document database.
+    Assuming no changes have been made to the binary format of these files, they are recoverable.
+
\ No newline at end of file
diff --git a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb
index 65cbd144..ebb8f033 100644
--- a/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb
+++ b/code/services-core/control-service/src/main/resources/templates/control/storage-details.hdb
@@ -59,6 +59,14 @@
     {{/if}}
+    {{#if isRestorable}}
+        Restore into live index
+    {{/if}}
     {{#if isLoadable}}
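The index-service changes below replace the old numbered-partition data files with explicit NEXT/CURRENT file versions, so switching the index over reduces to atomically renaming each NEXT file onto its CURRENT counterpart. A sketch of that primitive, assuming the same Files.move flags as the removed swapLexicon step used earlier in this patch:

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.StandardCopyOption;

    class FileSwitcher {
        /** Promote a freshly constructed NEXT file to CURRENT, replacing the live file. */
        static void switchFile(Path from, Path to) throws IOException {
            if (Files.exists(from)) {
                // ATOMIC_MOVE so a concurrent reader never sees a half-moved file
                Files.move(from, to, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
            }
        }
    }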
diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index a84628f7..7d90cdfa 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -35,13 +35,14 @@ dependencies { implementation project(':code:features-index:index-query') implementation project(':code:features-index:index-forward') implementation project(':code:features-index:index-reverse') - implementation project(':code:features-index:lexicon') - implementation project(':code:features-index:domain-ranking') implementation project(':code:features-search:result-ranking') + implementation project(':third-party:commons-codec') implementation libs.lombok + testImplementation project(path: ':code:services-core:control-service') + testImplementation project(':code:common:process') annotationProcessor libs.lombok implementation libs.bundles.slf4j diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java index e0a3b2de..9bc4c6fb 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexModule.java @@ -8,10 +8,6 @@ import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.config.RankingSettings; import nu.marginalia.WmsaHome; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; import nu.marginalia.service.control.ServiceEventLog; import java.nio.file.Path; @@ -23,23 +19,6 @@ public class IndexModule extends AbstractModule { public void configure() { } - @Provides - @SneakyThrows - @Singleton - private KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog, FileStorageService fileStorageService) { - try { - eventLog.logEvent("INDEX-LEXICON-LOAD-BEGIN", ""); - - var area = fileStorageService.getStorageByType(FileStorageType.LEXICON_LIVE); - var path = area.asPath().resolve("dictionary.dat"); - - return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile(), KeywordLexiconJournalMode.READ_ONLY))); - } - finally { - eventLog.logEvent("INDEX-LEXICON-LOAD-OK", ""); - } - } - @Provides public RankingSettings rankingSettings() { Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml"); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java index a0ff5582..7d3b0f85 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexService.java @@ -11,6 +11,7 @@ import nu.marginalia.index.svc.IndexSearchSetsService; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; +import nu.marginalia.service.server.mq.MqNotification; import nu.marginalia.service.server.mq.MqRequest; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -19,7 +20,6 @@ import spark.Request; import spark.Response; import spark.Spark; -import java.io.IOException; import 
java.util.concurrent.TimeUnit; import static spark.Spark.get; @@ -74,17 +74,6 @@ public class IndexService extends Service { volatile boolean initialized = false; - @MqRequest(endpoint = IndexMqEndpoints.INDEX_RELOAD_LEXICON) - public String reloadLexicon(String message) throws Exception { - - if (!opsService.reloadLexicon()) { - throw new IllegalStateException("Ops lock busy"); - } - - return "ok"; - } - - @MqRequest(endpoint = IndexMqEndpoints.INDEX_REPARTITION) public String repartition(String message) { if (!opsService.repartition()) { @@ -93,9 +82,9 @@ public class IndexService extends Service { return "ok"; } - @MqRequest(endpoint = IndexMqEndpoints.INDEX_REINDEX) - public String reindex(String message) throws Exception { - if (!opsService.reindex()) { + @MqNotification(endpoint = IndexMqEndpoints.SWITCH_INDEX) + public String switchIndex(String message) throws Exception { + if (!opsService.switchIndex()) { throw new IllegalStateException("Ops lock busy"); } @@ -112,34 +101,8 @@ public class IndexService extends Service { searchIndex.init(); initialized = true; } - - if (!opsService.run(this::autoConvert)) { - logger.warn("Auto-convert could not be performed, ops lock busy"); - } } - private void autoConvert() { - if (!servicesFactory.isConvertedIndexMissing() - || !servicesFactory.isPreconvertedIndexPresent() - || Boolean.getBoolean("no-auto-convert") - ) { - return; - } - - try { - eventLog.logEvent("INDEX-AUTO-CONVERT-BEGIN", ""); - logger.info("Auto-converting"); - searchSetsService.recalculateAll(); - searchIndex.switchIndex(); - eventLog.logEvent("INDEX-AUTO-CONVERT-END", ""); - logger.info("Auto-conversion finished!"); - } - catch (IOException ex) { - logger.error("Auto convert failed", ex); - } - } - - } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java index 9e0c2a04..f0417e53 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java @@ -4,217 +4,82 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorageType; -import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexReader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; -import nu.marginalia.index.priority.ReverseIndexPriorityConverter; -import nu.marginalia.index.full.ReverseIndexFullConverter; -import nu.marginalia.index.priority.ReverseIndexPriorityReader; -import nu.marginalia.index.priority.ReverseIndexPriorityParameters; -import nu.marginalia.index.full.ReverseIndexFullReader; -import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.index.SearchIndexReader; import nu.marginalia.service.control.ServiceHeartbeat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.sql.SQLException; -import java.util.concurrent.Callable; -import java.util.stream.Stream; @Singleton public class IndexServicesFactory { - private final Path tmpFileDir; - private final ServiceHeartbeat heartbeat; private final Path 
liveStorage; - private final Path stagingStorage; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final Path writerIndexFile; - - private final PartitionedDataFile fwdIndexDocId; - private final PartitionedDataFile fwdIndexDocData; - private final PartitionedDataFile revIndexDoc; - private final PartitionedDataFile revIndexWords; - - private final PartitionedDataFile revPrioIndexDoc; - private final PartitionedDataFile revPrioIndexWords; - private final Path searchSetsBase; - final int LIVE_PART = 0; - final int NEXT_PART = 1; - @Inject public IndexServicesFactory( ServiceHeartbeat heartbeat, FileStorageService fileStorageService - ) throws IOException, SQLException { - this.heartbeat = heartbeat; + ) throws SQLException { liveStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE).asPath(); - stagingStorage = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath(); - tmpFileDir = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING).asPath().resolve("tmp"); searchSetsBase = fileStorageService.getStorageByType(FileStorageType.SEARCH_SETS).asPath(); - if (!Files.exists(tmpFileDir)) { - Files.createDirectories(tmpFileDir); - } - - writerIndexFile = stagingStorage.resolve("page-index.dat"); - - fwdIndexDocId = new PartitionedDataFile(liveStorage, "fwd-doc-id.dat"); - fwdIndexDocData = new PartitionedDataFile(liveStorage, "fwd-doc-data.dat"); - - revIndexDoc = new PartitionedDataFile(liveStorage, "rev-doc.dat"); - revIndexWords = new PartitionedDataFile(liveStorage, "rev-words.dat"); - - revPrioIndexDoc = new PartitionedDataFile(liveStorage, "rev-prio-doc.dat"); - revPrioIndexWords = new PartitionedDataFile(liveStorage, "rev-prio-words.dat"); } public Path getSearchSetsBase() { return searchSetsBase; } - public boolean isPreconvertedIndexPresent() { - return Stream.of( - writerIndexFile - ).allMatch(Files::exists); - } + public ReverseIndexReader getReverseIndexReader() throws IOException { - public boolean isConvertedIndexMissing() { - return Stream.of( - revIndexWords.get(LIVE_PART).toPath(), - revIndexDoc.get(LIVE_PART).toPath(), - revPrioIndexWords.get(LIVE_PART).toPath(), - revPrioIndexDoc.get(LIVE_PART).toPath(), - fwdIndexDocData.get(LIVE_PART).toPath(), - fwdIndexDocId.get(LIVE_PART).toPath() - ).noneMatch(Files::exists); - } - - enum ConvertSteps { - FORWARD_INDEX, - FULL_REVERSE_INDEX, - PRIORITY_REVERSE_INDEX, - FINISHED - } - public void convertIndex(DomainRankings domainRankings) throws IOException { - try (var hb = heartbeat.createServiceTaskHeartbeat(ConvertSteps.class, "index-conversion")) { - hb.progress(ConvertSteps.FORWARD_INDEX); - convertForwardIndex(domainRankings); - - hb.progress(ConvertSteps.FULL_REVERSE_INDEX); - convertFullReverseIndex(domainRankings); - - hb.progress(ConvertSteps.PRIORITY_REVERSE_INDEX); - convertPriorityReverseIndex(domainRankings); - - hb.progress(ConvertSteps.FINISHED); - } - } - - private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { - logger.info("Converting full reverse index {}", writerIndexFile); - - var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile); - var converter = new ReverseIndexFullConverter( - heartbeat, - tmpFileDir, - journalReader, - domainRankings, - revIndexWords.get(NEXT_PART).toPath(), - revIndexDoc.get(NEXT_PART).toPath()); - - converter.convert(); - - tryGc(); - } - - private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException { - - 
logger.info("Converting priority reverse index {}", writerIndexFile); - - var journalReader = new IndexJournalReaderSingleCompressedFile(writerIndexFile, null, - ReverseIndexPriorityParameters::filterPriorityRecord); - - var converter = new ReverseIndexPriorityConverter(heartbeat, - tmpFileDir, - journalReader, - domainRankings, - revPrioIndexWords.get(NEXT_PART).toPath(), - revPrioIndexDoc.get(NEXT_PART).toPath()); - - converter.convert(); - - tryGc(); - } - - private void convertForwardIndex(DomainRankings domainRankings) throws IOException { - - - logger.info("Converting forward index data {}", writerIndexFile); - - new ForwardIndexConverter(heartbeat, - writerIndexFile.toFile(), - fwdIndexDocId.get(NEXT_PART).toPath(), - fwdIndexDocData.get(NEXT_PART).toPath(), - domainRankings) - .convert(); - - tryGc(); - } - - - public void tryGc() { - - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - - System.gc(); - } - - public ReverseIndexFullReader getReverseIndexReader() throws IOException { - return new ReverseIndexFullReader( - revIndexWords.get(LIVE_PART).toPath(), - revIndexDoc.get(LIVE_PART).toPath()); - } - public ReverseIndexPriorityReader getReverseIndexPrioReader() throws IOException { - return new ReverseIndexPriorityReader( - revPrioIndexWords.get(LIVE_PART).toPath(), - revPrioIndexDoc.get(LIVE_PART).toPath()); - } - public ForwardIndexReader getForwardIndexReader() throws IOException { - return new ForwardIndexReader( - fwdIndexDocId.get(LIVE_PART).toPath(), - fwdIndexDocData.get(LIVE_PART).toPath() + return new ReverseIndexReader( + ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), + ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT) ); } - public Callable switchFilesJob() { - return () -> { + public ReverseIndexReader getReverseIndexPrioReader() throws IOException { + return new ReverseIndexReader( + ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), + ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) + ); + } - switchFile(revIndexDoc.get(NEXT_PART).toPath(), revIndexDoc.get(LIVE_PART).toPath()); - switchFile(revIndexWords.get(NEXT_PART).toPath(), revIndexWords.get(LIVE_PART).toPath()); + public ForwardIndexReader getForwardIndexReader() throws IOException { + return new ForwardIndexReader( + ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT), + ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT) + ); + } - switchFile(revPrioIndexDoc.get(NEXT_PART).toPath(), revPrioIndexDoc.get(LIVE_PART).toPath()); - switchFile(revPrioIndexWords.get(NEXT_PART).toPath(), revPrioIndexWords.get(LIVE_PART).toPath()); + public void switchFiles() throws IOException { - switchFile(fwdIndexDocId.get(NEXT_PART).toPath(), fwdIndexDocId.get(LIVE_PART).toPath()); - switchFile(fwdIndexDocData.get(NEXT_PART).toPath(), fwdIndexDocData.get(LIVE_PART).toPath()); - - return true; - }; + for (var file : ReverseIndexFullFileNames.FileIdentifier.values()) { + switchFile( + ReverseIndexFullFileNames.resolve(liveStorage, file, 
ReverseIndexFullFileNames.FileVersion.NEXT), + ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.CURRENT) + ); + } + for (var file : ReverseIndexPrioFileNames.FileIdentifier.values()) { + switchFile( + ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.NEXT), + ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.CURRENT) + ); + } + for (var file : ForwardIndexFileNames.FileIdentifier.values()) { + switchFile( + ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.NEXT), + ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.CURRENT) + ); + } } public void switchFile(Path from, Path to) throws IOException { @@ -231,37 +96,3 @@ public class IndexServicesFactory { ); } } - -class RootDataFile { - private final Path partition; - private final String pattern; - - RootDataFile(Path partition, String pattern) { - this.partition = partition; - this.pattern = pattern; - } - - public File get() { - return partition.resolve(pattern).toFile(); - } -} - - -class PartitionedDataFile { - private final Path partition; - private final String pattern; - - PartitionedDataFile(Path partition, String pattern) { - this.partition = partition; - this.pattern = pattern; - } - - public File get(Object id) { - Path partitionDir = partition.resolve(id.toString()); - if (!partitionDir.toFile().exists()) { - partitionDir.toFile().mkdir(); - } - return partitionDir.resolve(pattern).toFile(); - } - -} \ No newline at end of file diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java index 21c56884..d6696f59 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java @@ -5,8 +5,8 @@ import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.sql.SQLException; @Singleton diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index d4664531..f471596a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -62,8 +62,6 @@ public class SearchIndex { else { eventLog.logEvent("INDEX-INIT", "No index loaded"); } - - } catch (Exception ex) { logger.error("Uncaught exception", ex); @@ -74,19 +72,12 @@ public class SearchIndex { } public boolean switchIndex() throws IOException { - - eventLog.logEvent("CONVERT-INDEX-BEGIN", ""); - servicesFactory.convertIndex(searchSetsService.getDomainRankings()); - eventLog.logEvent("CONVERT-INDEX-END", ""); - System.gc(); - eventLog.logEvent("INDEX-SWITCH-BEGIN", ""); Lock lock = indexReplacementLock.writeLock(); try { lock.lock(); - servicesFactory.switchFilesJob().call(); - + servicesFactory.switchFiles(); indexReader = servicesFactory.getSearchIndexReader(); eventLog.logEvent("INDEX-SWITCH-OK", ""); @@ -116,8 +107,8 @@ public class SearchIndex { return 
Collections.emptyList(); } - final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); - final int[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); + final long[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); + final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); List queryHeads = new ArrayList<>(10); List queries = new ArrayList<>(10); @@ -155,11 +146,11 @@ public class SearchIndex { return Collections.emptyList(); } - for (int orderedInclude : orderedIncludes) { + for (long orderedInclude : orderedIncludes) { query = query.alsoFull(orderedInclude); } - for (int term : terms.excludes()) { + for (long term : terms.excludes()) { query = query.notFull(term); } @@ -175,21 +166,25 @@ public class SearchIndex { return queries; } - private int compareKeywords(int a, int b) { + private int compareKeywords(long a, long b) { return Long.compare( indexReader.numHits(a), indexReader.numHits(b) ); } - private int compareKeywordsPrio(int a, int b) { + private int compareKeywordsPrio(long a, long b) { return Long.compare( indexReader.numHitsPrio(a), indexReader.numHitsPrio(b) ); } - /** Replaces the values of ids with their associated metadata, or 0L if absent */ - public long[] getTermMetadata(int termId, long[] docs) { + + /** Return an array of encoded term metadata longs, one for each of the + * document identifiers provided, holding the metadata for termId, or 0L + * where absent. The input array docs[] *must* be sorted. + */ + public long[] getTermMetadata(long termId, long[] docs) { return indexReader.getMetadata(termId, docs); } @@ -200,18 +195,14 @@ public class SearchIndex { return indexReader.getHtmlFeatures(docId); } - public int getDomainId(long docId) { - return indexReader.getDomainId(docId); - } - public int getTotalDocCount() { return indexReader.totalDocCount(); } - public int getTermFrequency(int id) { + public int getTermFrequency(long id) { return (int) indexReader.numHits(id); } - public int getTermFrequencyPrio(int id) { + public int getTermFrequencyPrio(long id) { return (int) indexReader.numHitsPrio(id); } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java index 7cb6f34f..1d1396f9 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java @@ -1,19 +1,15 @@ package nu.marginalia.index.index; -import gnu.trove.set.hash.TIntHashSet; -import nu.marginalia.index.priority.ReverseIndexPriorityReader; +import gnu.trove.set.hash.TLongHashSet; +import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterStepIf; -import nu.marginalia.index.full.ReverseIndexFullReader; - -import java.util.stream.Collectors; -import java.util.stream.IntStream; public class SearchIndexQueryBuilder implements IndexQueryBuilder { private final IndexQuery query; - private final ReverseIndexFullReader reverseIndexFullReader; - private final ReverseIndexPriorityReader reverseIndexPrioReader; + private final ReverseIndexReader reverseIndexFullReader; + private final ReverseIndexReader reverseIndexPrioReader; /* Keep track of already added include terms to
avoid redundant checks. * @@ -21,11 +17,11 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder { * first check one index and then another for the same term. At the moment, that * makes no sense, but in the future, that might be a thing one might want to do. * */ - private final TIntHashSet alreadyConsideredTerms = new TIntHashSet(); + private final TLongHashSet alreadyConsideredTerms = new TLongHashSet(); - SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, - ReverseIndexPriorityReader reverseIndexPrioReader, - IndexQuery query, int... sourceTerms) + SearchIndexQueryBuilder(ReverseIndexReader reverseIndexFullReader, + ReverseIndexReader reverseIndexPrioReader, + IndexQuery query, long... sourceTerms) { this.query = query; this.reverseIndexFullReader = reverseIndexFullReader; @@ -34,7 +30,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder { alreadyConsideredTerms.addAll(sourceTerms); } - public IndexQueryBuilder alsoFull(int termId) { + public IndexQueryBuilder alsoFull(long termId) { if (alreadyConsideredTerms.add(termId)) { query.addInclusionFilter(reverseIndexFullReader.also(termId)); @@ -43,7 +39,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoPrio(int termId) { + public IndexQueryBuilder alsoPrio(long termId) { if (alreadyConsideredTerms.add(termId)) { query.addInclusionFilter(reverseIndexPrioReader.also(termId)); @@ -52,7 +48,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder { return this; } - public IndexQueryBuilder notFull(int termId) { + public IndexQueryBuilder notFull(long termId) { query.addInclusionFilter(reverseIndexFullReader.not(termId)); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index 8a3e3e6e..00c089c6 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -1,11 +1,10 @@ package nu.marginalia.index.index; +import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.forward.ParamMatchingQueryFilter; import nu.marginalia.index.query.*; import nu.marginalia.index.query.filter.QueryFilterStepIf; -import nu.marginalia.index.priority.ReverseIndexPriorityReader; -import nu.marginalia.index.full.ReverseIndexFullReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,25 +15,25 @@ public class SearchIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); private final ForwardIndexReader forwardIndexReader; - private final ReverseIndexFullReader reverseIndexFullReader; - private final ReverseIndexPriorityReader reverseIndexPriorityReader; + private final ReverseIndexReader reverseIndexFullReader; + private final ReverseIndexReader reverseIndexPriorityReader; public SearchIndexReader(ForwardIndexReader forwardIndexReader, - ReverseIndexFullReader reverseIndexFullReader, - ReverseIndexPriorityReader reverseIndexPriorityReader) { + ReverseIndexReader reverseIndexFullReader, + ReverseIndexReader reverseIndexPriorityReader) { this.forwardIndexReader = forwardIndexReader; this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexPriorityReader = reverseIndexPriorityReader; } - public IndexQueryBuilder 
findPriorityWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) { - var sources = List.of(reverseIndexPriorityReader.priorityDocuments(wordId)); + public IndexQueryBuilder findPriorityWord(IndexQueryPriority priority, long wordId, int fetchSizeMultiplier) { + var sources = List.of(reverseIndexPriorityReader.documents(wordId)); return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, new IndexQuery(sources, priority, fetchSizeMultiplier), wordId); } - public IndexQueryBuilder findFullWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) { + public IndexQueryBuilder findFullWord(IndexQueryPriority priority, long wordId, int fetchSizeMultiplier) { var sources = List.of(reverseIndexFullReader.documents(wordId)); return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader, @@ -45,14 +44,14 @@ public class SearchIndexReader { return new ParamMatchingQueryFilter(params, forwardIndexReader); } - public long numHits(int word) { + public long numHits(long word) { return reverseIndexFullReader.numDocuments(word); } - public long numHitsPrio(int word) { + public long numHitsPrio(long word) { return reverseIndexPriorityReader.numDocuments(word); } - public long[] getMetadata(int wordId, long[] docIds) { + public long[] getMetadata(long wordId, long[] docIds) { return reverseIndexFullReader.getTermMeta(wordId, docIds); } @@ -60,10 +59,6 @@ public class SearchIndexReader { return forwardIndexReader.getDocMeta(docId); } - public int getDomainId(long docId) { - return forwardIndexReader.getDomainId(docId); - } - public int totalDocCount() { return forwardIndexReader.totalDocCount(); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java index aedd7da3..833778df 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java @@ -1,35 +1,35 @@ package nu.marginalia.index.index; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.ints.IntComparator; -import it.unimi.dsi.fastutil.ints.IntList; -import it.unimi.dsi.fastutil.ints.IntOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongComparator; +import it.unimi.dsi.fastutil.longs.LongList; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import java.util.Collections; import java.util.List; public record SearchIndexSearchTerms( - IntList includes, - IntList excludes, - IntList priority, - List coherences + LongList includes, + LongList excludes, + LongList priority, + List coherences ) { public SearchIndexSearchTerms() { - this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList()); + this(LongList.of(), LongList.of(), LongList.of(), Collections.emptyList()); } public boolean isEmpty() { return includes.isEmpty(); } - public int[] sortedDistinctIncludes(IntComparator comparator) { + public long[] sortedDistinctIncludes(LongComparator comparator) { if (includes.isEmpty()) - return includes.toIntArray(); + return includes.toLongArray(); - IntList list = new IntArrayList(new IntOpenHashSet(includes)); + LongList list = new LongArrayList(new LongOpenHashSet(includes)); list.sort(comparator); - return list.toIntArray(); + return list.toLongArray(); } public int size() 
{ diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index 996afafa..9c6ca197 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -2,9 +2,12 @@ package nu.marginalia.index.results; import com.google.inject.Inject; import gnu.trove.map.hash.TObjectIntHashMap; +import gnu.trove.map.hash.TObjectLongHashMap; import gnu.trove.set.hash.TLongHashSet; -import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; +import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; +import it.unimi.dsi.fastutil.longs.LongArrayList; import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.SearchTermsService; @@ -18,7 +21,6 @@ import java.util.OptionalInt; public class IndexMetadataService { private final SearchIndex index; private final SearchTermsService searchTermsService; - private final ResultValuator searchResultValuator; @Inject @@ -30,41 +32,23 @@ public class IndexMetadataService { this.searchResultValuator = searchResultValuator; } - public long getDocumentMetadata(long urlId) { - return index.getDocumentMetadata(urlId); + public long getDocumentMetadata(long docId) { + return index.getDocumentMetadata(docId); } public int getHtmlFeatures(long urlId) { return index.getHtmlFeatures(urlId); } - public int getDomainId(long urlId) { - return index.getDomainId(urlId); - } - - public long[] getTermMetadata(int termId, long[] docIdsAll) { - return index.getTermMetadata(termId, docIdsAll); - } - - public TermMetadata getTermMetadata(long[] docIdsAll, int[] termIdsList) { - var termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsList.length, 0.5f); - - for (int term : termIdsList) { - var metadata = getTermMetadata(term, docIdsAll); - - for (int i = 0; i < docIdsAll.length; i++) { - termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]); - } - } - - return new TermMetadata(termdocToMeta); + public TermMetadataForDocuments getTermMetadataForDocuments(long[] docIdsAll, long[] termIdsList) { + return new TermMetadataForDocuments(docIdsAll, termIdsList); } public QuerySearchTerms getSearchTerms(List searchTermVariants) { - IntArrayList termIdsList = new IntArrayList(); + LongArrayList termIdsList = new LongArrayList(); - TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); + TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); for (var subquery : searchTermVariants) { for (var term : subquery.searchTermsInclude) { @@ -72,27 +56,24 @@ public class IndexMetadataService { continue; } - var id = searchTermsService.lookUpWord(term); - if (id.isPresent()) { - termIdsList.add(id.getAsInt()); - termToId.put(term, id.getAsInt()); - } + long id = searchTermsService.getWordId(term); + termIdsList.add(id); + termToId.put(term, id); } } - return new QuerySearchTerms(termToId, - termIdsList.toIntArray(), + termIdsList.toLongArray(), getTermCoherences(searchTermVariants)); } private TermCoherences getTermCoherences(List searchTermVariants) { - List coherences = new ArrayList<>(); + List coherences = new ArrayList<>(); for (var subquery : searchTermVariants) { for 
(var coh : subquery.searchTermCoherences) { - int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray(); + long[] ids = coh.stream().mapToLong(searchTermsService::getWordId).toArray(); coherences.add(ids); } @@ -104,42 +85,53 @@ public class IndexMetadataService { } public TLongHashSet getResultsWithPriorityTerms(List subqueries, long[] resultsArray) { - int[] priorityTermIds = + long[] priorityTermIds = subqueries.stream() .flatMap(sq -> sq.searchTermsPriority.stream()) .distinct() - .map(searchTermsService::lookUpWord) - .filter(OptionalInt::isPresent) - .mapToInt(OptionalInt::getAsInt) + .mapToLong(searchTermsService::getWordId) .toArray(); var ret = new TLongHashSet(resultsArray.length); - for (int priorityTerm : priorityTermIds) { - long[] metadata = getTermMetadata(priorityTerm, resultsArray); + for (long priorityTerm : priorityTermIds) { + long[] metadata = index.getTermMetadata(priorityTerm, resultsArray); for (int i = 0; i < metadata.length; i++) { if (metadata[i] != 0) ret.add(resultsArray[i]); } } return ret; - - } public ResultValuator getSearchResultValuator() { return searchResultValuator; } - public static class TermMetadata { - private final Long2LongOpenHashMap termdocToMeta; + public class TermMetadataForDocuments { + private final Long2ObjectArrayMap termdocToMeta; - public TermMetadata(Long2LongOpenHashMap termdocToMeta) { - this.termdocToMeta = termdocToMeta; + public TermMetadataForDocuments(long[] docIdsAll, long[] termIdsList) { + termdocToMeta = new Long2ObjectArrayMap<>(termIdsList.length); + + for (long termId : termIdsList) { + var mapForTerm = new Long2LongOpenHashMap(docIdsAll.length); + + var metadata = index.getTermMetadata(termId, docIdsAll); + for (int i = 0; i < docIdsAll.length; i++) { + mapForTerm.put(docIdsAll[i], metadata[i]); + } + + termdocToMeta.put(termId, mapForTerm); + } } - public long getTermMetadata(int termId, long docId) { - return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); + public long getTermMetadata(long termId, long docId) { + var docsForTerm = termdocToMeta.get(termId); + if (docsForTerm == null) { + return 0; + } + return docsForTerm.getOrDefault(docId, 0); } public boolean testCoherence(long docId, TermCoherences coherences) { @@ -159,25 +151,24 @@ public class IndexMetadataService { } public static class QuerySearchTerms { - private final TObjectIntHashMap termToId; - public final int[] termIdsAll; + private final TObjectLongHashMap termToId; + public final long[] termIdsAll; public final TermCoherences coherences; - public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll, TermCoherences coherences) { + public QuerySearchTerms(TObjectLongHashMap termToId, + long[] termIdsAll, + TermCoherences coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; this.coherences = coherences; } - public int get(String searchTerm) { + public long getIdForTerm(String searchTerm) { return termToId.get(searchTerm); } } - public record TermCoherences(List words) {} - - private static long termdocKey(int termId, long docId) { - return (docId << 32) | Integer.toUnsignedLong(termId); - } + /** wordIds that we require to be in the same sentence */ + public record TermCoherences(List words) {} } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java index 
8dbf740f..4c29886a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java @@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.index.client.model.results.SearchResultItem; public class IndexResultDomainDeduplicator { - final TLongIntMap resultsByRankingId = CachedObjects.getMap(); + final TLongIntMap resultsByDomainId = CachedObjects.getMap(); final int limitByDomain; public IndexResultDomainDeduplicator(int limitByDomain) { @@ -13,19 +13,15 @@ public class IndexResultDomainDeduplicator { } public boolean test(SearchResultItem item) { - final long key = item.deduplicationKey(); - if (key == 0) - return true; + final long key = item.getDomainId(); - return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain; + return resultsByDomainId.adjustOrPutValue(key, 1, 1) <= limitByDomain; } public int getCount(SearchResultItem item) { - final long key = item.deduplicationKey(); - if (key == 0) - return 1; + final long key = item.getDomainId(); - return resultsByRankingId.get(key); + return resultsByDomainId.get(key); } private static class CachedObjects { @@ -38,7 +34,14 @@ public class IndexResultDomainDeduplicator { ret.clear(); return ret; } + + public static void clear() { + mapCache.remove(); + } } + static void clearCachedObjects() { + CachedObjects.clear(); + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 34ea1826..05f94779 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -4,6 +4,7 @@ import gnu.trove.list.TLongList; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.index.client.model.results.SearchResultPreliminaryScore; import nu.marginalia.index.client.model.results.ResultRankingContext; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.index.query.limit.QueryStrategy; @@ -13,6 +14,8 @@ import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.query.IndexQueryParams; import nu.marginalia.ranking.ResultValuator; +import javax.annotation.Nullable; +import java.util.Arrays; import java.util.List; public class IndexResultValuator { @@ -21,7 +24,7 @@ public class IndexResultValuator { private final IndexQueryParams queryParams; private final TLongHashSet resultsWithPriorityTerms; - private final IndexMetadataService.TermMetadata termMetadata; + private final IndexMetadataService.TermMetadataForDocuments termMetadataForDocuments; private final IndexMetadataService.QuerySearchTerms searchTerms; private final ResultRankingContext rankingContext; @@ -36,37 +39,43 @@ public class IndexResultValuator { this.rankingContext = rankingContext; this.searchResultValuator = metadataService.getSearchResultValuator(); - final long[] resultsArray = results.toArray(); + final long[] ids = results.toArray(); + Arrays.sort(ids); this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.queryParams = queryParams; this.metadataService = metadataService; 
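+        // The ids array was sorted above because SearchIndex.getTermMetadata(termId, docs[])
+        // requires a sorted docs[] argument; both metadata lookups below go through it.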
this.searchTerms = metadataService.getSearchTerms(subqueries); - this.termMetadata = metadataService.getTermMetadata(results.toArray(), searchTerms.termIdsAll); + this.termMetadataForDocuments = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); - resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, resultsArray); + resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, ids); } private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit(); + @Nullable public SearchResultItem calculatePreliminaryScore(long id) { - SearchResultItem searchResult = new SearchResultItem(id); - final long urlIdInt = searchResult.getUrlIdInt(); + final long docId = UrlIdCodec.removeRank(id); - searchResult.setDomainId(metadataService.getDomainId(urlIdInt)); + if (!termMetadataForDocuments.testCoherence(docId, searchTerms.coherences)) + return null; - long docMetadata = metadataService.getDocumentMetadata(urlIdInt); - int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt); + long docMetadata = metadataService.getDocumentMetadata(docId); + int htmlFeatures = metadataService.getHtmlFeatures(docId); int maxFlagsCount = 0; boolean anyAllSynthetic = false; int maxPositionsSet = 0; - for (int querySetId = 0; querySetId < searchTermVariants.size(); querySetId++) { + SearchResultItem searchResult = new SearchResultItem(id); + for (int querySetId = 0; + querySetId < searchTermVariants.size(); + querySetId++) + { var termList = searchTermVariants.get(querySetId); SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()]; @@ -76,21 +85,21 @@ public class IndexResultValuator { for (int termIdx = 0; termIdx < termList.size(); termIdx++) { String searchTerm = termList.get(termIdx); - long metadata = termMetadata.getTermMetadata( - searchTerms.get(searchTerm), - searchResult.getUrlIdInt() + long termMetadata = termMetadataForDocuments.getTermMetadata( + searchTerms.getIdForTerm(searchTerm), + searchResult.combinedId ); var score = new SearchResultKeywordScore( querySetId, searchTerm, - metadata, + termMetadata, docMetadata, htmlFeatures, resultsWithPriorityTerms.contains(searchResult.combinedId) ); - synthetic &= WordFlags.Synthetic.isPresent(metadata); + synthetic &= WordFlags.Synthetic.isPresent(termMetadata); searchResult.keywordScores.add(score); @@ -115,20 +124,15 @@ public class IndexResultValuator { anyAllSynthetic |= synthetic; } - final boolean hasPriorityTerm = resultsWithPriorityTerms.contains(id); + if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + return null; - double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext); - - boolean disqualified = false; - - if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences)) - disqualified = true; - else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) - disqualified = true; + double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, + 5000, // use a dummy value here as it's not present in the index + rankingContext); searchResult.setScore(new SearchResultPreliminaryScore( - disqualified, - hasPriorityTerm, + resultsWithPriorityTerms.contains(id), score )); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java 
b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java index 22e514d8..397e124e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexOpsService.java @@ -3,7 +3,6 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.index.index.SearchIndex; -import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import spark.Request; import spark.Response; import spark.Spark; @@ -19,15 +18,12 @@ public class IndexOpsService { private final SearchIndex index; private final IndexSearchSetsService searchSetService; - private final KeywordLexiconReadOnlyView lexicon; @Inject public IndexOpsService(SearchIndex index, - IndexSearchSetsService searchSetService, - KeywordLexiconReadOnlyView lexicon) { + IndexSearchSetsService searchSetService) { this.index = index; this.searchSetService = searchSetService; - this.lexicon = lexicon; } public boolean isBusy() { @@ -37,15 +33,10 @@ public class IndexOpsService { public boolean repartition() { return run(searchSetService::recalculateAll); } - public boolean reindex() throws Exception { - return run(() -> { - return index.switchIndex() && lexicon.suggestReload(); - }).isPresent(); + public boolean switchIndex() throws Exception { + return run(index::switchIndex).isPresent(); } - public boolean reloadLexicon() throws Exception { - return run(lexicon::suggestReload).isPresent(); - } public Object repartitionEndpoint(Request request, Response response) throws Exception { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index c100388e..e8e0c76b 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -142,8 +142,8 @@ public class IndexQueryService { * accurately */ private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) { final var termToId = searchTermsSvc.getAllIncludeTerms(subqueries); - final var termFrequencies = new HashMap<>(termToId); - final var prioFrequencies = new HashMap<>(termToId); + final Map termFrequencies = new HashMap<>(termToId.size()); + final Map prioFrequencies = new HashMap<>(termToId.size()); termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id))); @@ -230,16 +230,16 @@ public class IndexQueryService { var priority = subquery.searchTermsPriority; for (int i = 0; i < includes.size(); i++) { - logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i)); + logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getLong(i)); } for (int i = 0; i < advice.size(); i++) { - logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i)); + logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getLong(includes.size() + i)); } for (int i = 0; i < excludes.size(); i++) { - logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i)); + logger.info(queryMarker, "{} -> {} E", excludes.get(i), 
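        // Note the getInt -> getLong switch in these four trace lines: term
        // ids are 64-bit hashes now, not 32-bit lexicon offsets.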
searchTerms.excludes().getLong(i)); } for (int i = 0; i < priority.size(); i++) { - logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i)); + logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getLong(i)); } } @@ -258,7 +258,7 @@ public class IndexQueryService { return Arrays.stream(resultIds.toArray()) .parallel() .mapToObj(evaluator::calculatePreliminaryScore) - .filter(score -> !score.getScore().isDisqualified()) + .filter(Objects::nonNull) .collect(Collectors.toList()); } @@ -266,9 +266,7 @@ public class IndexQueryService { var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); - results.sort(Comparator.comparing(SearchResultItem::getScore).reversed() - .thenComparingInt(SearchResultItem::getRanking) - .thenComparingInt(SearchResultItem::getUrlIdInt)); + results.sort(Comparator.naturalOrder()); List resultsList = new ArrayList<>(results.size()); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index 3d886158..4c06bf2f 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -2,13 +2,12 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; +import gnu.trove.list.TIntList; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import lombok.SneakyThrows; import nu.marginalia.db.DomainTypes; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.ranking.ReversePageRank; import nu.marginalia.ranking.StandardPageRank; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; @@ -168,7 +167,7 @@ public class IndexSearchSetsService { @SneakyThrows public void updateBlogsSet() { - EdgeIdList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); + TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG); if (knownDomains.isEmpty()) { // FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe? 
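Around this hunk the EdgeIdList wrapper is retired: known-domain ids now travel as a plain Trove TIntList and are bridged into a fastutil set via toArray(), as the following hunk shows. A minimal, self-contained illustration of that bridge (the domain id values are invented):

    import gnu.trove.list.TIntList;
    import gnu.trove.list.array.TIntArrayList;
    import it.unimi.dsi.fastutil.ints.IntOpenHashSet;

    class TroveToFastutil {
        public static void main(String[] args) {
            // Stand-in for domainTypes.getKnownDomainsByType(...): fake domain ids
            TIntList knownDomains = new TIntArrayList(new int[] { 101, 205, 307 });

            // Same bridge the RankingSearchSet construction uses:
            // Trove list -> int[] -> fastutil set
            IntOpenHashSet set = new IntOpenHashSet(knownDomains.toArray());

            System.out.println(set.contains(205));   // true
            System.out.println(set.contains(999));   // false
        }
    }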
@@ -177,7 +176,7 @@ public class IndexSearchSetsService { } synchronized (this) { - blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.values())); + blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.toArray())); blogsSet.write(); } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index 944517d6..306a88d8 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -2,32 +2,28 @@ package nu.marginalia.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.ints.IntList; -import nu.marginalia.dict.OffHeapDictionaryHashMap; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.index.SearchIndexSearchTerms; -import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; @Singleton public class SearchTermsService { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final KeywordLexiconReadOnlyView lexicon; - - @Inject - public SearchTermsService(KeywordLexiconReadOnlyView lexicon) { - this.lexicon = lexicon; - } public SearchIndexSearchTerms getSearchTerms(SearchSubquery request) { - final IntList excludes = new IntArrayList(); - final IntList includes = new IntArrayList(); - final IntList priority = new IntArrayList(); - final List coherences = new ArrayList<>(); + final LongList excludes = new LongArrayList(); + final LongList includes = new LongArrayList(); + final LongList priority = new LongArrayList(); + final List coherences = new ArrayList<>(); if (!addEachTerm(includes, request.searchTermsInclude)) { return new SearchIndexSearchTerms(); @@ -40,7 +36,7 @@ public class SearchTermsService { } for (var coherence : request.searchTermCoherences) { - IntList parts = new IntArrayList(coherence.size()); + LongList parts = new LongArrayList(coherence.size()); if (!addEachTerm(parts, coherence)) { return new SearchIndexSearchTerms(); @@ -56,46 +52,37 @@ public class SearchTermsService { return new SearchIndexSearchTerms(includes, excludes, priority, coherences); } - private boolean addEachTerm(IntList ret, List words) { + private boolean addEachTerm(LongList ret, List words) { boolean success = true; for (var word : words) { - var termId = lookUpWord(word); - - if (termId.isPresent()) { - lookUpWord(word).ifPresent(ret::add); - } - else { - success = false; - } + ret.add(getWordId(word)); } + return success; } - private void addEachNonMandatoryTerm(IntList ret, List words) { + private void addEachNonMandatoryTerm(LongList ret, List words) { for (var word : words) { - ret.add(lexicon.get(word)); + ret.add(getWordId(word)); } } - public OptionalInt lookUpWord(String s) { - int ret = lexicon.get(s); - if (ret == OffHeapDictionaryHashMap.NO_VALUE) { - return OptionalInt.empty(); - } - return OptionalInt.of(ret); - 
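The OptionalInt plumbing in the removed lookUpWord() exists because a lexicon lookup can miss. The hash-based getWordId() that replaces it further down is total: every string maps to some 64-bit id. A sketch of the scheme with a stand-in hash, since MurmurHash3_128 itself isn't shown in this patch:

    import java.nio.charset.StandardCharsets;

    class WordIdSketch {
        // Stand-in for MurmurHash3_128.hashNearlyASCII(): any stable 64-bit
        // string hash illustrates the scheme (FNV-1a here); rare collisions
        // are simply tolerated, as with the real hash.
        static long getWordId(String s) {
            long hash = 0xcbf29ce484222325L;
            for (byte b : s.getBytes(StandardCharsets.UTF_8)) {
                hash ^= (b & 0xFF);
                hash *= 0x100000001b3L;
            }
            return hash;
        }

        public static void main(String[] args) {
            // Unlike lexicon.get(), this cannot "miss": the OptionalInt
            // failure path and the shared dictionary file both disappear.
            System.out.println(getWordId("marginalia"));
        }
    }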
} - - public Map getAllIncludeTerms(List subqueries) { - Map ret = new HashMap<>(); + public Map getAllIncludeTerms(List subqueries) { + Map ret = new HashMap<>(); for (var subquery : subqueries) { for (var include : subquery.searchTermsInclude) { - ret.computeIfAbsent(include, term -> lookUpWord(term).orElse(-1)); + ret.computeIfAbsent(include, i -> getWordId(include)); } } return ret; } + + static MurmurHash3_128 hasher = new MurmurHash3_128(); + public long getWordId(String s) { + return hasher.hashNearlyASCII(s); + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index 64507955..d8dd9ca1 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -63,12 +63,13 @@ public class RankingSearchSet implements SearchSet { } @Override - public boolean contains(int urlId, long documentMetadata) { + public boolean contains(int domainId, long documentMetadata) { // This is the main check - if (set.contains(urlId) || set.isEmpty()) { + if (set.contains(domainId) || set.isEmpty()) { return true; } + // TODO return false; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java index 2f457974..b0ee4e39 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java @@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet; public class SearchSetAny implements SearchSet { @Override - public boolean contains(int urlId, long meta) { + public boolean contains(int domainId, long meta) { return true; } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java new file mode 100644 index 00000000..a2960a67 --- /dev/null +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -0,0 +1,35 @@ +package nu.marginalia.index.results; + +import nu.marginalia.index.client.model.results.SearchResultItem; +import nu.marginalia.model.id.UrlIdCodec; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class IndexResultDomainDeduplicatorTest { + + @AfterEach + public void clear() { + IndexResultDomainDeduplicator.clearCachedObjects(); + } + + @Test + public void testDeduplicator() { + + IndexResultDomainDeduplicator deduplicator = new IndexResultDomainDeduplicator(3); + + assertTrue(deduplicator.test(forId(3, 0))); + assertTrue(deduplicator.test(forId(3, 1))); + assertTrue(deduplicator.test(forId(3, 2))); + assertFalse(deduplicator.test(forId(3, 3))); + assertFalse(deduplicator.test(forId(3, 4))); + + assertEquals(5, deduplicator.getCount(forId(3, 3))); + } + + SearchResultItem forId(int domain, int ordinal) { + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal)); + } + +} \ No newline at end of file diff --git 
a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 01be347b..7efa08a2 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -2,22 +2,36 @@ package nu.marginalia.index.svc; import com.google.inject.Guice; import com.google.inject.Inject; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorage; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.ReverseIndexFullFileNames; +import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.client.model.query.SearchSpecification; import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.results.ResultRankingParameters; import nu.marginalia.index.client.model.results.SearchResultItem; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.ReverseIndexConstructor; +import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.ranking.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; import org.junit.jupiter.api.AfterEach; @@ -28,6 +42,9 @@ import org.junit.jupiter.api.parallel.Execution; import spark.Spark; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; import java.util.*; import java.util.stream.IntStream; @@ -47,15 +64,21 @@ public class IndexQueryServiceIntegrationTest { @Inject SearchIndex searchIndex; - @Inject - KeywordLexicon keywordLexicon; - @Inject ServiceHeartbeat heartbeat; @Inject IndexJournalWriter indexJournalWriter; + @Inject + FileStorageService fileStorageService; + + @Inject + DomainRankings domainRankings; + + @Inject + ProcessHeartbeat processHeartbeat; + @BeforeEach public void setUp() throws IOException { @@ -79,6 +102,7 @@ public class IndexQueryServiceIntegrationTest { } indexJournalWriter.close(); + constructIndex(); searchIndex.switchIndex(); var rsp = queryService.justQuery( @@ -96,14 +120,71 @@ public class IndexQueryServiceIntegrationTest { List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()))).build()); - 
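The assertions in the updated test compare full 64-bit document ids built with UrlIdCodec.encodeId(domain, ordinal) rather than raw url ints. The real bit layout lives in UrlIdCodec and is not part of this patch; a purely hypothetical packing, consistent with the operations the tests rely on (encodeId, getDomainId, getDocumentOrdinal, getRank, removeRank), could look like:

    // All field widths are invented for illustration; see UrlIdCodec
    // for the real ones.
    final class UrlIdCodecSketch {
        static final int ORDINAL_BITS = 26;
        static final int DOMAIN_BITS  = 22;
        static final long ORDINAL_MASK = (1L << ORDINAL_BITS) - 1;
        static final long DOMAIN_MASK  = (1L << DOMAIN_BITS) - 1;

        static long encodeId(int domainId, int ordinal) {
            return ((domainId & DOMAIN_MASK) << ORDINAL_BITS)
                 | (ordinal & ORDINAL_MASK);
        }
        static int getDomainId(long id) {
            return (int) ((id >>> ORDINAL_BITS) & DOMAIN_MASK);
        }
        static int getDocumentOrdinal(long id) {
            return (int) (id & ORDINAL_MASK);
        }
        static int getRank(long id) {
            return (int) (id >>> (ORDINAL_BITS + DOMAIN_BITS));
        }
        // Ranking information lives above the domain field; clearing it
        // yields the stable document id the assertions compare against.
        static long removeRank(long id) {
            return id & ((1L << (ORDINAL_BITS + DOMAIN_BITS)) - 1);
        }

        public static void main(String[] args) {
            long id = encodeId(5, 12);
            System.out.println(getDomainId(id) + "." + getDocumentOrdinal(id)); // 5.12
        }
    }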
Assertions.assertArrayEquals( - new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, - rsp.results - .stream() - .mapToInt(SearchResultItem::getUrlIdInt) - .toArray()); + int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 }; + long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray(); + long[] actual = rsp.results + .stream() + .mapToLong(SearchResultItem::getDocumentId) + .toArray(); + + Assertions.assertArrayEquals(ids, actual); } + private void constructIndex() throws SQLException, IOException { + createForwardIndex(); + createFullReverseIndex(); + createPrioReverseIndex(); + } + + + private void createFullReverseIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT); + + Path tmpDir = indexStaging.asPath().resolve("tmp"); + if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + + ReverseIndexConstructor. + createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords); + } + + private void createPrioReverseIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT); + + Path tmpDir = indexStaging.asPath().resolve("tmp"); + if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + ReverseIndexConstructor. 
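            // From here the test drives the real construction pipeline: the
            // journal under INDEX_STAGING is read via IndexJournalReader::singleFile,
            // and the docs/words files land under INDEX_LIVE with NEXT-version
            // names, presumably so that searchIndex.switchIndex() can promote
            // them, mirroring what the index-construction process does in
            // production.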
+ createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords); + } + + private void createForwardIndex() throws SQLException, IOException { + + FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE); + FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING); + + Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); + + ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, + IndexJournalReader.paging(indexStaging.asPath()), + outputFileDocsId, + outputFileDocsData, + domainRankings + ); + + converter.convert(); + } @Test public void testDomainQuery() throws Exception { @@ -111,7 +192,9 @@ public class IndexQueryServiceIntegrationTest { loadDataWithDomain(i/100, i); } + indexJournalWriter.close(); + constructIndex(); searchIndex.switchIndex(); var rsp = queryService.justQuery( @@ -127,9 +210,11 @@ public class IndexQueryServiceIntegrationTest { .subqueries(List.of(new SearchSubquery( List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()))).build()); - Assertions.assertArrayEquals( - new int[] { 210, 270 }, - rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray()); + int[] idxes = new int[] { 210, 270 }; + long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray(); + long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray(); + + Assertions.assertArrayEquals(ids, actual); } @Test @@ -137,7 +222,9 @@ public class IndexQueryServiceIntegrationTest { for (int i = 1; i < 512; i++) { loadData(i); } + indexJournalWriter.close(); + constructIndex(); searchIndex.switchIndex(); var rsp = queryService.justQuery( @@ -169,19 +256,24 @@ public class IndexQueryServiceIntegrationTest { } + private long fullId(int id) { + return UrlIdCodec.encodeId((32 - (id % 32)), id); + } + + MurmurHash3_128 hasher = new MurmurHash3_128(); public void loadData(int id) { int[] factors = IntStream .rangeClosed(1, id) .filter(v -> (id % v) == 0) .toArray(); - long fullId = id | ((long) (32 - (id % 32)) << 32); + long fullId = fullId(id); var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { - data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i])); + data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); } @@ -190,11 +282,11 @@ public class IndexQueryServiceIntegrationTest { public void loadDataWithDomain(int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue()); + var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { - data[2*i] = 
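            // The journal now stores hashed term ids directly; this
            // hashNearlyASCII() call is the same function that
            // SearchTermsService.getWordId() applies at query time, so the
            // writer and the query side agree on ids without any shared
            // lexicon file.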
keywordLexicon.getOrInsert(Integer.toString(factors[i])); + data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java index 0801bc77..24fbff96 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java @@ -6,18 +6,14 @@ import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; -import nu.marginalia.lexicon.KeywordLexicon; -import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; -import nu.marginalia.lexicon.journal.KeywordLexiconJournal; -import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.util.TestUtil; import nu.marginalia.index.client.model.query.SearchSetIdentifier; -import nu.marginalia.service.control.ServiceEventLog; -import nu.marginalia.service.control.ServiceHeartbeat; -import nu.marginalia.service.control.ServiceTaskHeartbeat; +import nu.marginalia.service.control.*; import nu.marginalia.service.id.ServiceId; import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; @@ -58,40 +54,22 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { var fileStorageServiceMock = Mockito.mock(FileStorageService.class); when(fileStorageServiceMock.getStorageByType(FileStorageType.SEARCH_SETS)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); - when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); - when(fileStorageServiceMock.getStorageByType(FileStorageType.LEXICON_STAGING)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_LIVE)).thenReturn(new FileStorage(null, null, null, fastDir.toString(), null)); when(fileStorageServiceMock.getStorageByType(FileStorageType.INDEX_STAGING)).thenReturn(new FileStorage(null, null, null, slowDir.toString(), null)); - var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class); - // RIP fairies - when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any())) - .thenReturn(Mockito.mock(ServiceTaskHeartbeat.class)); - bind(ServiceHeartbeat.class).toInstance(serviceHeartbeat); + bind(FileStorageService.class).toInstance(fileStorageServiceMock); - var servicesFactory = new IndexServicesFactory( - serviceHeartbeat, - fileStorageServiceMock - ); - bind(IndexServicesFactory.class).toInstance(servicesFactory); + bind(ServiceHeartbeat.class).toInstance(new FakeServiceHeartbeat()); + bind(ProcessHeartbeat.class).toInstance(new 
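        // Hand-rolled fakes (FakeServiceHeartbeat, FakeProcessHeartbeat)
        // replace the earlier Mockito stubs here; a guess at the motivation:
        // index construction drives heartbeats through whole task loops, and
        // a no-op implementation is less brittle than stubbing each call.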
FakeProcessHeartbeat()); IndexSearchSetsService setsServiceMock = Mockito.mock(IndexSearchSetsService.class); when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny()); when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); bind(IndexSearchSetsService.class).toInstance(setsServiceMock); - var keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal( - slowDir.resolve("dictionary.dat").toFile(), - KeywordLexiconJournalMode.READ_WRITE) - ); - bind(KeywordLexicon.class).toInstance(keywordLexicon); - bind(KeywordLexiconReadOnlyView.class).toInstance(new KeywordLexiconReadOnlyView(keywordLexicon)); - bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); - - bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterImpl(keywordLexicon, - slowDir.resolve("page-index.dat"))); + bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl(slowDir)); bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( ServiceId.Index, diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java index ef80181e..76908567 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/util/TestUtil.java @@ -12,22 +12,23 @@ public class TestUtil { return dir.startsWith("/tmp") || dir.toString().contains("tmp"); } - public static void clearTempDir(Path dir) { - if (!isTempDir(dir)) { + public static void clearTempDir(Path path) { + if (!isTempDir(path)) { throw new IllegalArgumentException("Refusing to recursively delete directory with that name"); } - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { + if (Files.isDirectory(path)) { + for (File f : path.toFile().listFiles()) { File[] files = f.listFiles(); if (files != null) { Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); + System.out.println("Deleting " + path); f.delete(); } } - System.out.println("Deleting " + dir); - dir.toFile().delete(); + + System.out.println("Deleting " + path + " (" + fileSize(path) + ")"); + path.toFile().delete(); } private static String fileSize(Path path) { diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle index 44c85e6f..ae0288be 100644 --- a/code/services-core/search-service/build.gradle +++ b/code/services-core/search-service/build.gradle @@ -25,6 +25,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:common:config') + implementation project(':code:common:linkdb') implementation project(':code:features-index:index-query') implementation project(':code:libraries:easy-lsh') diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java index 1492c99f..5cb61977 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchModule.java @@ -1,9 +1,17 @@ package nu.marginalia.search; import com.google.inject.AbstractModule; +import com.google.inject.Provides; +import 
com.google.inject.Singleton; +import com.google.inject.name.Named; import nu.marginalia.LanguageModels; import nu.marginalia.WebsiteUrl; import nu.marginalia.WmsaHome; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; + +import java.nio.file.Path; +import java.sql.SQLException; public class SearchModule extends AbstractModule { @@ -12,4 +20,14 @@ public class SearchModule extends AbstractModule { bind(WebsiteUrl.class).toInstance(new WebsiteUrl(System.getProperty("website-url", "https://search.marginalia.nu/"))); } + @Provides + @Singleton + @Named("linkdb-file") + public Path linkdbPath(FileStorageService storageService) throws SQLException { + return storageService + .getStorageByType(FileStorageType.LINKDB_LIVE) + .asPath() + .resolve("links.db"); + } + } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java index 5657eb40..5a328234 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchOperator.java @@ -106,16 +106,11 @@ public class SearchOperator { } private int getDomainId(String domain) { - int domainId = -1; - try { - if (domain != null) { - return domainQueries.getDomainId(new EdgeDomain(domain)).id(); - } + if (domain == null) { + return -1; } - catch (NoSuchElementException ex) { - } - return domainId; + return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1); } private List getProblems(Context ctx, String evalResult, List queryResults, SearchQuery processedQuery) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index 30d978f6..e0dff012 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -5,11 +5,14 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.WebsiteUrl; import nu.marginalia.client.Context; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.linkdb.LinkdbReader; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.search.client.SearchMqEndpoints; -import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.search.svc.SearchFrontPageService; import nu.marginalia.search.svc.*; +import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.server.*; import nu.marginalia.service.server.mq.MqNotification; import org.slf4j.Logger; @@ -20,33 +23,40 @@ import spark.Spark; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; public class SearchService extends Service { private final WebsiteUrl websiteUrl; - private final DbUrlDetailsQuery dbUrlDetailsQuery; private final StaticResources staticResources; + private final FileStorageService fileStorageService; + private final LinkdbReader linkdbReader; private static final Logger logger = LoggerFactory.getLogger(SearchService.class); + private final ServiceEventLog eventLog; @SneakyThrows @Inject public SearchService(BaseServiceParams params, WebsiteUrl websiteUrl, - DbUrlDetailsQuery 
dbUrlDetailsQuery, StaticResources staticResources, SearchFrontPageService frontPageService, SearchErrorPageService errorPageService, SearchAddToCrawlQueueService addToCrawlQueueService, SearchFlagSiteService flagSiteService, SearchQueryService searchQueryService, - SearchApiQueryService apiQueryService + SearchApiQueryService apiQueryService, + FileStorageService fileStorageService, + LinkdbReader linkdbReader ) { super(params); + this.eventLog = params.eventLog; this.websiteUrl = websiteUrl; - this.dbUrlDetailsQuery = dbUrlDetailsQuery; this.staticResources = staticResources; + this.fileStorageService = fileStorageService; + this.linkdbReader = linkdbReader; Spark.staticFiles.expireTime(600); @@ -77,10 +87,19 @@ public class SearchService extends Service { Spark.awaitInitialization(); } - @MqNotification(endpoint = SearchMqEndpoints.FLUSH_CACHES) - public void flushCaches(String unusedArg) { - logger.info("Flushing caches"); - dbUrlDetailsQuery.clearCaches(); + @SneakyThrows + @MqNotification(endpoint = SearchMqEndpoints.SWITCH_LINKDB) + public void switchLinkdb(String unusedArg) { + logger.info("Switching link database"); + + Path newPath = fileStorageService.getStorageByType(FileStorageType.LINKDB_STAGING) + .asPath() + .resolve("links.db"); + + if (Files.exists(newPath)) { + eventLog.logEvent("SEARCH-SWITCH-LINKDB", ""); + linkdbReader.switchInput(newPath); + } } private Object serveStatic(Request request, Response response) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java index eac37841..e6300b1c 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/command/commands/SiteListCommand.java @@ -61,13 +61,13 @@ public class SiteListCommand implements SearchCommandInterface { List resultSet; Path screenshotPath = null; - Integer domainId = -1; + int domainId = -1; if (null != domain) { var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain); resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery); var maybeId = domainQueries.tryGetDomainId(domain); if (maybeId.isPresent()) { - domainId = maybeId.get().id(); + domainId = maybeId.getAsInt(); screenshotPath = Path.of("/screenshot/" + domainId); } else { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java deleted file mode 100644 index b775696e..00000000 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/db/DbUrlDetailsQuery.java +++ /dev/null @@ -1,112 +0,0 @@ -package nu.marginalia.search.db; - -import com.google.common.base.Strings; -import com.google.common.cache.Cache; -import com.google.common.cache.CacheBuilder; -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.id.EdgeId; -import nu.marginalia.model.id.EdgeIdCollection; -import nu.marginalia.search.model.PageScoreAdjustment; -import nu.marginalia.search.model.UrlDetails; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; - - -public class 
DbUrlDetailsQuery { - private final HikariDataSource dataSource; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final Cache> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); - - public static double QUALITY_LOWER_BOUND_CUTOFF = -15.; - @Inject - public DbUrlDetailsQuery(HikariDataSource dataSource) - { - this.dataSource = dataSource; - } - - - public synchronized void clearCaches() - { - urlIdCache.invalidateAll(); - } - - private String idList(EdgeIdCollection ids) { - StringJoiner j = new StringJoiner(",", "(", ")"); - for (var id : ids.values()) { - j.add(Integer.toString(id)); - } - return j.toString(); - } - - @SneakyThrows - public List getUrlDetailsMulti(EdgeIdCollection ids) { - if (ids.isEmpty()) { - return Collections.emptyList(); - } - List result = new ArrayList<>(ids.size()); - - try (var connection = dataSource.getConnection()) { - - String idString = idList(ids); - - try (var stmt = connection.prepareStatement( - """ - SELECT ID, DOMAIN_ID, URL, - TITLE, DESCRIPTION, - QUALITY, - WORDS_TOTAL, FORMAT, FEATURES, - IP, DOMAIN_STATE, - DATA_HASH - FROM EC_URL_VIEW - WHERE TITLE IS NOT NULL - AND ID IN - """ + idString)) { - stmt.setFetchSize(ids.size()); - - var rsp = stmt.executeQuery(); - while (rsp.next()) { - var val = new UrlDetails(rsp.getInt(1), - rsp.getInt(2), - new EdgeUrl(rsp.getString(3)), - rsp.getString(4), // title - rsp.getString(5), // description - rsp.getDouble(6), // quality - rsp.getInt(7), // wordsTotal - rsp.getString(8), // format - rsp.getInt(9), // features - rsp.getString(10), // ip - DomainIndexingState.valueOf(rsp.getString(11)), // domainState - rsp.getLong(12), // dataHash - PageScoreAdjustment.zero(), // urlQualityAdjustment - Integer.MAX_VALUE, // rankingId - Double.MAX_VALUE, // termScore - 1, // resultsFromSameDomain - "", // positions - null, // result item - null // keyword scores - ); - if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF - && Strings.isNullOrEmpty(val.description) - && val.url.path.length() > 1) { - continue; - } - result.add(val); - - } - } - } - - return result; - } - - - - -} diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index c1cda232..2f74046e 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -14,7 +14,7 @@ import java.util.StringJoiner; @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString public class UrlDetails { - public int id; + public long id; public int domainId; public EdgeUrl url; public String title; @@ -66,7 +66,7 @@ public class UrlDetails { } public int hashCode() { - return Integer.hashCode(id); + return Long.hashCode(id); } public boolean equals(Object other) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java index d56094d8..71a6ad43 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/BrowseResultCleaner.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import 
nu.marginalia.browse.model.BrowseResult; import nu.marginalia.screenshot.ScreenshotService; -import nu.marginalia.model.id.EdgeId; import java.util.HashSet; import java.util.Set; @@ -22,7 +21,7 @@ public class BrowseResultCleaner { public Predicate shouldRemoveResultPredicate() { Set domainHashes = new HashSet<>(100); - return (res) -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId())) + return (res) -> !screenshotService.hasScreenshot(res.domainId()) || !domainHashes.add(res.domainHash()); } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java index 9d5709e6..850c4f6a 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java @@ -1,45 +1,72 @@ package nu.marginalia.search.results; import com.google.inject.Inject; -import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.hash.TIntObjectHashMap; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.map.hash.TLongObjectHashMap; import it.unimi.dsi.fastutil.ints.Int2LongArrayMap; +import lombok.SneakyThrows; import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.index.client.model.results.ResultRankingContext; -import nu.marginalia.index.client.model.results.SearchResultSet; -import nu.marginalia.ranking.ResultValuator; -import nu.marginalia.search.db.DbUrlDetailsQuery; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.index.client.model.results.SearchResultItem; +import nu.marginalia.index.client.model.results.SearchResultSet; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.ranking.ResultValuator; +import nu.marginalia.search.model.PageScoreAdjustment; import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.linkdb.LinkdbReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; +import java.util.StringJoiner; public class SearchResultDecorator { - private final DbUrlDetailsQuery dbUrlDetailsQuery; + private final LinkdbReader linkDbReader; private final ResultValuator valuator; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public SearchResultDecorator(DbUrlDetailsQuery dbUrlDetailsQuery, + public SearchResultDecorator(LinkdbReader linkDbReader, ResultValuator valuator) { - this.dbUrlDetailsQuery = dbUrlDetailsQuery; + this.linkDbReader = linkDbReader; this.valuator = valuator; } + @SneakyThrows public List getAllUrlDetails(SearchResultSet resultSet) { - TIntObjectHashMap detailsById = new TIntObjectHashMap<>(resultSet.size()); + TLongObjectHashMap detailsById = new TLongObjectHashMap<>(resultSet.size()); - EdgeIdList idList = resultSet.results.stream() - .mapToInt(SearchResultItem::getUrlIdInt) - .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); + TLongArrayList idsList = new TLongArrayList(resultSet.results.size()); + for (var result : resultSet.results) { + idsList.add(result.getDocumentId()); + } - List ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList); + List ret = new ArrayList<>(idsList.size()); + for (var rawDetail : linkDbReader.getUrlDetails(idsList)) { + ret.add(new 
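                // The linkdb row doesn't carry everything UrlDetails wants;
                // fields with no linkdb counterpart (ip, domain state,
                // ranking id, term score) receive the placeholder values
                // visible just below.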
UrlDetails( + rawDetail.urlId(), + UrlIdCodec.getDomainId(rawDetail.urlId()), + rawDetail.url(), + rawDetail.title(), + rawDetail.description(), + rawDetail.urlQuality(), + rawDetail.wordsTotal(), + rawDetail.format(), + rawDetail.features(), + "", + DomainIndexingState.ACTIVE, + rawDetail.dataHash(), + PageScoreAdjustment.zero(), // urlQualityAdjustment + Integer.MAX_VALUE, // rankingId + Double.MAX_VALUE, // termScore + 1, // resultsFromSameDomain + "", // positions + null, // result item + null // keyword scores + )); + } for (var val : ret) { detailsById.put(val.id, val); @@ -47,11 +74,11 @@ public class SearchResultDecorator { List retList = new ArrayList<>(resultSet.size()); - TIntArrayList missedIds = new TIntArrayList(); + TLongArrayList missedIds = new TLongArrayList(); for (var resultItem : resultSet.results) { var rankingId = resultItem.getRanking(); - var uid = resultItem.getUrlId().id(); + var uid = resultItem.getDocumentId(); var details = detailsById.get(uid); if (details == null) { @@ -72,7 +99,11 @@ public class SearchResultDecorator { retList.add(details); } if (!missedIds.isEmpty()) { - logger.info("Could not look up documents: {}", missedIds.toArray()); + StringJoiner missingDocs = new StringJoiner(","); + for (var id : missedIds.toArray()) { + missingDocs.add(Long.toHexString(id) + "/" + UrlIdCodec.getDomainId(id) + "." + UrlIdCodec.getDocumentOrdinal(id)); + } + logger.info("Could not look up documents: {}", missingDocs); } return retList; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java index 7863c17b..bd15c497 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/siteinfo/DomainInformationService.java @@ -5,18 +5,14 @@ import lombok.SneakyThrows; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.model.id.EdgeId; import nu.marginalia.search.model.DomainInformation; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Inject; -import javax.inject.Singleton; +import com.google.inject.Inject; +import com.google.inject.Singleton; import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Optional; +import java.util.*; /* TODO: This class needs to be refactored, a lot of @@ -42,10 +38,11 @@ public class DomainInformationService { public Optional domainInfo(String site) { - EdgeId domainId = getDomainFromPartial(site); - if (domainId == null) { + OptionalInt maybeDomainId = getDomainFromPartial(site); + if (maybeDomainId.isEmpty()) { return Optional.empty(); } + int domainId = maybeDomainId.getAsInt(); Optional domain = dbDomainQueries.getDomain(domainId); if (domain.isEmpty()) { @@ -85,7 +82,7 @@ public class DomainInformationService { } @SneakyThrows - private boolean inCrawlQueue(EdgeId domainId) { + private boolean inCrawlQueue(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement( """ @@ -94,21 +91,15 @@ public class DomainInformationService { WHERE EC_DOMAIN.ID=? 
""")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); return rsp.next(); } } } - private EdgeId getDomainFromPartial(String site) { - try { - return dbDomainQueries.getDomainId(new EdgeDomain(site)); - } - catch (Exception ex) { - return null; - } - + private OptionalInt getDomainFromPartial(String site) { + return dbDomainQueries.tryGetDomainId(new EdgeDomain(site)); } @SneakyThrows @@ -125,11 +116,11 @@ public class DomainInformationService { } @SneakyThrows - public int getPagesKnown(EdgeId domainId) { + public int getPagesKnown(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -142,11 +133,11 @@ public class DomainInformationService { } @SneakyThrows - public int getPagesVisited(EdgeId domainId) { + public int getPagesVisited(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -160,11 +151,11 @@ public class DomainInformationService { @SneakyThrows - public int getPagesIndexed(EdgeId domainId) { + public int getPagesIndexed(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -177,11 +168,11 @@ public class DomainInformationService { } @SneakyThrows - public int getIncomingLinks(EdgeId domainId) { + public int getIncomingLinks(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -193,11 +184,11 @@ public class DomainInformationService { } } @SneakyThrows - public int getOutboundLinks(EdgeId domainId) { + public int getOutboundLinks(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getInt(1); @@ -210,11 +201,11 @@ public class DomainInformationService { } @SneakyThrows - public double getDomainQuality(EdgeId domainId) { + public double getDomainQuality(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getDouble(1); @@ -226,11 +217,11 @@ public class DomainInformationService { } } - public DomainIndexingState getDomainState(EdgeId domainId) { + public DomainIndexingState getDomainState(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + 
stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return DomainIndexingState.valueOf(rsp.getString(1)); @@ -244,11 +235,11 @@ public class DomainInformationService { return DomainIndexingState.ERROR; } - public List getLinkingDomains(EdgeId domainId) { + public List getLinkingDomains(int domainId) { try (var connection = dataSource.getConnection()) { List results = new ArrayList<>(25); try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); while (rsp.next()) { results.add(new EdgeDomain(rsp.getString(1))); @@ -264,11 +255,11 @@ public class DomainInformationService { return Collections.emptyList(); } - public double getRank(EdgeId domainId) { + public double getRank(int domainId) { try (var connection = dataSource.getConnection()) { try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) { - stmt.setInt(1, domainId.id()); + stmt.setInt(1, domainId); var rsp = stmt.executeQuery(); if (rsp.next()) { return rsp.getDouble(1); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java index ba104340..a5f080bf 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.WebsiteUrl; import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -61,7 +60,7 @@ public class SearchAddToCrawlQueueService { } private String getDomainName(int id) { - var domain = domainQueries.getDomain(new EdgeId<>(id)); + var domain = domainQueries.getDomain(id); if (domain.isEmpty()) Spark.halt(404); return domain.get().toString(); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryCountService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryCountService.java index 3a3dc5ee..77afba8a 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryCountService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryCountService.java @@ -3,7 +3,7 @@ package nu.marginalia.search.svc; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.inject.Singleton; +import com.google.inject.Singleton; import java.time.temporal.ChronoUnit; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java index d2d75bcf..95e92782 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchUnitConversionService.java @@ -7,8 +7,8 @@ import org.slf4j.Logger; import 
diff --git a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java
index c5c8a3cd..450267dc 100644
--- a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java
+++ b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingService.java
@@ -9,7 +9,6 @@ import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.renderer.MustacheRenderer;
 import nu.marginalia.renderer.RendererFactory;
 import nu.marginalia.screenshot.ScreenshotService;
-import nu.marginalia.model.id.EdgeId;
 import nu.marginalia.service.server.*;
 import org.jetbrains.annotations.NotNull;
 import spark.Request;
@@ -156,7 +155,7 @@ public class DatingService extends Service {
         var session = sessionObjectOpt.get();

         int id = Integer.parseInt(request.params("id"));
-        BrowseResult res = session.nextSimilar(new EdgeId<>(id), browseSimilarCosine, blacklist);
+        BrowseResult res = session.nextSimilar(id, browseSimilarCosine, blacklist);

         res = findViableDomain(session, res);

@@ -168,7 +167,7 @@ public class DatingService extends Service {

     @NotNull
     private BrowseResult findViableDomain(DatingSessionObject session, BrowseResult res) {
-        while (!screenshotService.hasScreenshot(new EdgeId<>(res.domainId())) || session.isRecent(res)) {
+        while (!screenshotService.hasScreenshot(res.domainId()) || session.isRecent(res)) {
             res = session.next(browseRandom, blacklist);
         }
         return res;
diff --git a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java
index 60ec6e3e..ff87e235 100644
--- a/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java
+++ b/code/services-satellite/dating-service/src/main/java/nu/marginalia/dating/DatingSessionObject.java
@@ -3,9 +3,7 @@ package nu.marginalia.dating;
 import nu.marginalia.browse.DbBrowseDomainsRandom;
 import nu.marginalia.browse.DbBrowseDomainsSimilarCosine;
 import nu.marginalia.browse.model.BrowseResult;
-import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.db.DomainBlacklist;
-import nu.marginalia.model.id.EdgeId;

 import java.util.LinkedList;

@@ -29,8 +27,8 @@ public class DatingSessionObject {
         return queue.pollFirst();
     }

-    public BrowseResult nextSimilar(EdgeId<EdgeDomain> id, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
-        adjacent.getDomainNeighborsAdjacentCosine(id, blacklist, 25).forEach(queue::addFirst);
+    public BrowseResult nextSimilar(int domainId, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
+        adjacent.getDomainNeighborsAdjacentCosine(domainId, blacklist, 25).forEach(queue::addFirst);

         while (queue.size() > MAX_QUEUE_SIZE) {
             queue.removeLast();
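
`DatingSessionObject.nextSimilar` keeps its bounded-queue behaviour; only the id type changes. Cosine-similar neighbours are pushed onto the front of the queue and the tail is trimmed to a fixed cap. A toy version of that pattern, where the class name, cap value, and element type are assumptions rather than the real code:

    import java.util.LinkedList;
    import java.util.List;

    /** Toy version (assumed shape, not the real class) of the session queue logic. */
    class BoundedBrowseQueue {
        private static final int MAX_QUEUE_SIZE = 100; // cap is assumed
        private final LinkedList<Integer> queue = new LinkedList<>();

        void offerNeighbors(List<Integer> neighborIds) {
            neighborIds.forEach(queue::addFirst);  // freshest suggestions go first
            while (queue.size() > MAX_QUEUE_SIZE) {
                queue.removeLast();                // stale entries fall off the tail
            }
        }

        Integer next() {
            return queue.pollFirst();              // null when the queue is exhausted
        }
    }
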
diff --git a/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java
index 12348543..e3421787 100644
--- a/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java
+++ b/code/tools/website-adjacencies-calculator/src/main/java/nu/marginalia/adjacencies/WebsiteAdjacenciesCalculator.java
@@ -5,8 +5,8 @@ import lombok.SneakyThrows;
 import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.id.EdgeId;
 import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.service.module.DatabaseModule;

 import java.sql.SQLException;
@@ -40,8 +40,7 @@ public class WebsiteAdjacenciesCalculator {
         System.out.println(Arrays.toString(domainName));

         int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
-                .map(dataStoreDao::getDomainId)
-                .mapToInt(EdgeId::id)
+                .mapToInt(dataStoreDao::getDomainId)
                 .map(domainAliases::deAlias)
                 .toArray();

@@ -49,7 +48,7 @@ public class WebsiteAdjacenciesCalculator {
             findAdjacentDtoS(domainId, similarities -> {
                 for (var similarity : similarities.similarities()) {
                     if (adjacenciesData.isIndexedDomain(similarity.domainId)) System.out.print("*");
-                    System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
+                    System.out.println(dataStoreDao.getDomain(similarity.domainId).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
                 }
             });
         }
@@ -191,7 +190,7 @@ public class WebsiteAdjacenciesCalculator {
         var main = new WebsiteAdjacenciesCalculator(dataSource);

         if (args.length == 1 && "load".equals(args[0])) {
-            var processHeartbeat = new ProcessHeartbeat(
+            var processHeartbeat = new ProcessHeartbeatImpl(
                     new ProcessConfiguration("website-adjacencies-calculator", 0, UUID.randomUUID()),
                     dataSource
             );
diff --git a/docker-compose.yml b/docker-compose.yml
index 0c5c3fd1..fd9c2a1a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,6 +3,7 @@ x-svc: &service
     - "run/env/service.env"
   volumes:
     - vol:/vol
+    - backup:/backup
     - conf:/wmsa/conf:ro
     - model:/wmsa/model
     - data:/wmsa/data
@@ -123,6 +124,12 @@ volumes:
       type: none
      o: bind
      device: run/vol
+  backup:
+    driver: local
+    driver_opts:
+      type: none
+      o: bind
+      device: run/backup
   logs:
     driver: local
     driver_opts:
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 15de9024..db9a6b82 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.3-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
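The `ProcessHeartbeat` to `ProcessHeartbeatImpl` change in the tool's `main` above suggests the heartbeat type has been split into an interface and a concrete implementation: injected consumers can keep depending on the interface, while hand-wired entry points must name the impl. A sketch of that split, with a made-up `beat()` method for illustration:

    // Sketch of the interface/implementation split implied by the
    // ProcessHeartbeat -> ProcessHeartbeatImpl rename; beat() is invented here.
    interface Heartbeat {
        void beat();
    }

    class HeartbeatImpl implements Heartbeat {
        private final String processName;

        HeartbeatImpl(String processName) {
            this.processName = processName;
        }

        @Override
        public void beat() {
            System.out.println(processName + " is alive");
        }
    }

    class HeartbeatExample {
        public static void main(String[] args) {
            // Consumers see only the interface; manual wiring at the entry
            // point (as in WebsiteAdjacenciesCalculator.main) names the impl.
            Heartbeat heartbeat = new HeartbeatImpl("website-adjacencies-calculator");
            heartbeat.beat();
        }
    }
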
diff --git a/run/env/service.env b/run/env/service.env
index a1646b18..c5b49a10 100644
--- a/run/env/service.env
+++ b/run/env/service.env
@@ -1,5 +1,5 @@
 WMSA_HOME=run/
 CONTROL_SERVICE_OPTS="-DdistPath=/dist"
-CONVERTER_PROCESS_OPTS="-ea -Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false"
-CRAWLER_PROCESS_OPTS="-Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false"
-LOADER_PROCESS_OPTS="-Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false"
\ No newline at end of file
+CONVERTER_PROCESS_OPTS="-ea -Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -XX:StartFlightRecording:dumponexit=true,filename=/samples/converter.jfr"
+CRAWLER_PROCESS_OPTS="-Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -XX:StartFlightRecording:dumponexit=true,filename=/samples/crawler.jfr"
+LOADER_PROCESS_OPTS="-Dservice-host=0.0.0.0 -ea -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.port=4001 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -XX:StartFlightRecording:dumponexit=true,filename=/samples/loader.jfr"
\ No newline at end of file
diff --git a/run/setup.sh b/run/setup.sh
index 4739176b..71d736b7 100755
--- a/run/setup.sh
+++ b/run/setup.sh
@@ -18,7 +18,7 @@ function download_model {

 pushd $(dirname $0)

-mkdir -p model logs db samples install vol/ir/{0,1}/ vol/{lr,lw} vol/iw/{0,1}/search-sets vol/{tmpf,tmps} vol/ss data samples/export
+mkdir -p model logs db samples backup install vol/{ir,iw} vol/{lr,lw} vol/ss vol/{ldbw,ldbr} data samples/export

 download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
diff --git a/settings.gradle b/settings.gradle
index 0e00abc2..3e82b3a7 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -36,7 +36,6 @@ include 'code:features-convert:topic-detection'

 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
-include 'code:features-index:lexicon'
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'
 include 'code:features-index:index-forward'
@@ -51,6 +50,7 @@ include 'code:api:process-mqapi'
 include 'code:common:service-discovery'
 include 'code:common:service-client'
 include 'code:common:db'
+include 'code:common:linkdb'
 include 'code:common:service'
 include 'code:common:config'
 include 'code:common:model'
@@ -60,6 +60,7 @@ include 'code:common:process'
 include 'code:processes:converting-process'
 include 'code:processes:crawling-process'
 include 'code:processes:loading-process'
+include 'code:processes:index-constructor-process'
 include 'code:processes:test-data'

 include 'code:process-models:converting-model'
@@ -112,7 +113,7 @@ dependencyResolutionManagement {
         library('hikaricp', 'com.zaxxer:HikariCP:5.0.1')

         library('spark', 'com.sparkjava', 'spark-core').version('2.9.4')
-        library('guice', 'com.google.inject', 'guice').version('5.1.0')
+        library('guice', 'com.google.inject', 'guice').version('7.0.0')
         library('guava', 'com.google.guava', 'guava').version('32.0.1-jre')
         library('rxjava', 'io.reactivex.rxjava3', 'rxjava').version('3.1.6')
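
Finally, the new `-XX:StartFlightRecording:dumponexit=true,filename=...` options in `service.env` arm JDK Flight Recorder when each process starts and dump a recording when it exits. For reference, a roughly equivalent programmatic form using the standard `jdk.jfr` API; the output path here is illustrative:

    import jdk.jfr.Configuration;
    import jdk.jfr.Recording;

    import java.nio.file.Path;

    class JfrExample {
        public static void main(String[] args) throws Exception {
            // Roughly what -XX:StartFlightRecording:dumponexit=true,filename=... sets up:
            Recording recording = new Recording(Configuration.getConfiguration("default"));
            recording.setDumpOnExit(true);                          // dump when the JVM exits
            recording.setDestination(Path.of("/tmp/example.jfr"));  // where the dump lands
            recording.start();

            // ... the process does its work here while JFR records events ...
        }
    }
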