Merge pull request #42 from MarginaliaSearch/no-downtime-upgrades

Zero downtime upgrades, merge-based index construction
Viktor 2023-08-29 17:05:48 +02:00 committed by GitHub
commit bdcbfb11a8
243 changed files with 5601 additions and 5213 deletions

View File

@ -13,6 +13,10 @@ tasks.register('dist', Copy) {
from subprojects.collect { it.tasks.withType(Tar) }
into "$buildDir/dist"
// For local development, each process that is to be triggerable
// from the control-service needs to go here so it ends up somewhere
// the control-service can find it
doLast {
copy {
from tarTree("$buildDir/dist/converter-process.tar")
@ -34,10 +38,18 @@ tasks.register('dist', Copy) {
from tarTree("$buildDir/dist/crawl-job-extractor-process.tar")
into "$projectDir/run/dist/"
}
copy {
from tarTree("$buildDir/dist/index-construction-process.tar")
into "$projectDir/run/dist/"
}
}
}
idea {
module {
// Exclude these directories from being indexed by IntelliJ
// as they tend to bring the IDE to its knees and exhaust
// the available inotify watches in a hurry
excludeDirs.add(file("$projectDir/run/backup"))
excludeDirs.add(file("$projectDir/run/model"))
excludeDirs.add(file("$projectDir/run/dist"))
excludeDirs.add(file("$projectDir/run/samples"))

View File

@ -3,8 +3,6 @@ package nu.marginalia.index.client;
public class IndexMqEndpoints {
public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
public static final String INDEX_REINDEX = "INDEX-REINDEX";
public static final String SWITCH_INDEX = "SWITCH-INDEX";
}

View File

@ -2,16 +2,17 @@ package nu.marginalia.index.client.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem {
/** Encoded ID that contains both the URL id and its ranking */
public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking. This is
* probably not what you want, use getDocumentId() instead */
public final long combinedId;
/** How did the subqueries match against the document ? */
@ -20,20 +21,18 @@ public class SearchResultItem {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long val) {
this.combinedId = val;
public SearchResultItem(long combinedId) {
this.combinedId = combinedId;
this.keywordScores = new ArrayList<>(16);
}
public EdgeId<EdgeUrl> getUrlId() {
return new EdgeId<>(getUrlIdInt());
public long getDocumentId() {
return UrlIdCodec.removeRank(combinedId);
}
public int getUrlIdInt() {
return (int)(combinedId & 0xFFFF_FFFFL);
}
public int getRanking() {
return (int)(combinedId >>> 32);
return UrlIdCodec.getRank(combinedId);
}
/* Used for evaluation */
@ -45,20 +44,16 @@ public class SearchResultItem {
return scoreValue;
}
private transient int domainId = Integer.MIN_VALUE;
public void setDomainId(int domainId) {
this.domainId = domainId;
}
public int getDomainId() {
return this.domainId;
return UrlIdCodec.getDomainId(this.combinedId);
}
public int hashCode() {
return getUrlIdInt();
return Long.hashCode(combinedId);
}
public String toString() {
return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]";
return getClass().getSimpleName() + "[ url= " + getDocumentId() + ", rank=" + getRanking() + "]";
}
public boolean equals(Object other) {
@ -67,18 +62,18 @@ public class SearchResultItem {
if (other == this)
return true;
if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt();
return o.getDocumentId() == getDocumentId();
}
return false;
}
public long deduplicationKey() {
final int domainId = getDomainId();
@Override
public int compareTo(@NotNull SearchResultItem o) {
// this looks like a bug, but we actually want this in a reversed order
int diff = o.getScore().compareTo(getScore());
if (diff != 0)
return diff;
if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) {
return 0;
}
return domainId;
return Long.compare(this.combinedId, o.combinedId);
}
}
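With getUrlId()/getUrlIdInt() gone, callers read the encoded id through the UrlIdCodec-backed accessors instead. A minimal sketch of the new call pattern (everything other than the accessors shown in the diff is illustrative):
void printResult(SearchResultItem item) {
    long documentId = item.getDocumentId();  // combinedId with the rank bits masked out
    int domainId    = item.getDomainId();    // decoded from combinedId via UrlIdCodec
    int rank        = item.getRanking();     // decoded from combinedId via UrlIdCodec
    System.out.println(documentId + " (domain " + domainId + ", rank " + rank + ")");
}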

View File

@ -6,7 +6,6 @@ import static java.lang.Boolean.compare;
import static java.lang.Double.compare;
public record SearchResultPreliminaryScore(
boolean disqualified,
boolean hasPriorityTerm,
double searchRankingScore)
implements Comparable<SearchResultPreliminaryScore>
@ -25,7 +24,4 @@ public record SearchResultPreliminaryScore(
return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
}
public boolean isDisqualified() {
return disqualified;
}
}

View File

@ -4,4 +4,6 @@ public class ProcessInboxNames {
public static final String CONVERTER_INBOX = "converter";
public static final String LOADER_INBOX = "loader";
public static final String CRAWLER_INBOX = "crawler";
public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor";
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.mqapi.index;
public record CreateIndexRequest(IndexName indexName)
{
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.mqapi.index;
public enum IndexName {
FORWARD,
REVERSE_FULL,
REVERSE_PRIO
}

View File

@ -3,4 +3,5 @@ package nu.marginalia.search.client;
public class SearchMqEndpoints {
/** Flushes the URL caches, run if significant changes have occurred in the URLs database */
public static final String FLUSH_CACHES = "FLUSH_CACHES";
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}

View File

@ -9,16 +9,16 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;
@Singleton
public class DbDomainQueries {
private final HikariDataSource dataSource;
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
@Inject
public DbDomainQueries(HikariDataSource dataSource)
@ -28,7 +28,7 @@ public class DbDomainQueries {
@SneakyThrows
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
public Integer getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
@ -36,7 +36,7 @@ public class DbDomainQueries {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
return rsp.getInt(1);
}
}
throw new NoSuchElementException();
@ -48,12 +48,12 @@ public class DbDomainQueries {
}
@SneakyThrows
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
public OptionalInt tryGetDomainId(EdgeDomain domain) {
var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));
if (maybe.isPresent())
return maybe;
Integer maybeId = domainIdCache.getIfPresent(domain);
if (maybeId != null) {
return OptionalInt.of(maybeId);
}
try (var connection = dataSource.getConnection()) {
@ -61,25 +61,25 @@ public class DbDomainQueries {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
var id = rsp.getInt(1);
domainIdCache.put(domain, id);
return Optional.of(id);
return OptionalInt.of(id);
}
}
return Optional.empty();
return OptionalInt.empty();
}
catch (UncheckedExecutionException ex) {
return Optional.empty();
return OptionalInt.empty();
}
}
@SneakyThrows
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.id());
stmt.setInt(1, id);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeDomain(rsp.getString(1)));

View File

@ -2,15 +2,10 @@ package nu.marginalia.db;
import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;
@ImplementedBy(DomainBlacklistImpl.class)
public interface DomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();
}

View File

@ -1,13 +1,14 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -58,10 +59,10 @@ public class DomainTypes {
return ret;
}
/** Retrieve the EdgeId of all domains of a certain type,
/** Retrieve the domain id of all domains of a certain type,
* ignoring entries that are not in the EC_DOMAIN table */
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
public TIntList getKnownDomainsByType(Type type) {
TIntList ret = new TIntArrayList();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""

View File

@ -5,8 +5,8 @@ import nu.marginalia.db.storage.model.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

View File

@ -5,9 +5,9 @@ public enum FileStorageType {
CRAWL_DATA,
PROCESSED_DATA,
INDEX_STAGING,
LEXICON_STAGING,
LINKDB_STAGING,
LINKDB_LIVE,
INDEX_LIVE,
LEXICON_LIVE,
BACKUP,
EXPORT,
SEARCH_SETS

View File

@ -0,0 +1,9 @@
ALTER TABLE FILE_STORAGE MODIFY COLUMN TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT', 'LINKDB_LIVE', 'LINKDB_STAGING') NOT NULL;
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbr', "Linkdb Current", 'LINKDB_LIVE'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';
INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbw', "Linkdb Staging Area", 'LINKDB_STAGING'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';

View File

@ -0,0 +1,3 @@
DROP VIEW EC_URL_VIEW;
DROP TABLE EC_PAGE_DATA;
DROP TABLE EC_URL;

View File

@ -0,0 +1,3 @@
INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP)
VALUES
('Backup Storage', '/backup', 'BACKUP', true);

View File

@ -0,0 +1 @@
DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE');

View File

@ -0,0 +1,56 @@
plugins {
id 'java'
id "io.freefair.lombok" version "8.2.2"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(20))
}
}
configurations {
flywayMigration.extendsFrom(implementation)
}
dependencies {
implementation project(':code:common:model')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.bundles.gson
implementation libs.notnull
implementation libs.sqlite
implementation libs.commons.lang3
implementation libs.trove
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
}
test {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform()
}
task fastTests(type: Test) {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,102 @@
package nu.marginalia.linkdb;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
@Singleton
public class LinkdbReader {
private Path dbFile;
private volatile Connection connection;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
this.dbFile = dbFile;
if (Files.exists(dbFile)) {
try {
connection = createConnection();
}
catch (SQLException ex) {
connection = null;
logger.error("Failed to load linkdb file", ex);
}
}
else {
logger.warn("No linkdb file {}", dbFile);
}
}
private Connection createConnection() throws SQLException {
String connStr = "jdbc:sqlite:" + dbFile.toString();
return DriverManager.getConnection(connStr);
}
public void switchInput(Path newDbFile) throws IOException, SQLException {
if (connection != null) {
connection.close();
}
Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING);
connection = createConnection();
}
public List<LdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
List<LdbUrlDetail> ret = new ArrayList<>(ids.size());
if (connection == null ||
connection.isClosed())
{
throw new RuntimeException("URL query temporarily unavailable due to database switch");
}
try (var stmt = connection.prepareStatement("""
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
FROM DOCUMENT WHERE ID = ?
""")) {
for (int i = 0; i < ids.size(); i++) {
long id = ids.get(i);
stmt.setLong(1, id);
var rs = stmt.executeQuery();
if (rs.next()) {
var url = new EdgeUrl(rs.getString("URL"));
ret.add(new LdbUrlDetail(
rs.getLong("ID"),
url,
rs.getString("TITLE"),
rs.getString("DESCRIPTION"),
rs.getDouble("QUALITY"),
rs.getString("FORMAT"),
rs.getInt("FEATURES"),
rs.getInt("PUB_YEAR"),
rs.getLong("DATA_HASH"),
rs.getInt("WORDS_TOTAL")
));
}
}
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
return ret;
}
}
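The switchInput() call is what enables the zero-downtime swap: a staging database is built elsewhere, then moved over the live file while the reader briefly reports itself unavailable. A minimal sketch, assuming hypothetical file locations (the 'ldbr'/'ldbw' directory names follow the migration above; the file name and paths are assumptions, and exception handling is omitted):
// Sketch only: swapping in a freshly built linkdb without restarting the service.
// Queries issued mid-swap see the "temporarily unavailable" RuntimeException.
Path liveDb   = Path.of("/index/ldbr/links.db");   // hypothetical live path
Path stagedDb = Path.of("/index/ldbw/links.db");   // hypothetical staging path
LinkdbReader reader = new LinkdbReader(liveDb);
reader.switchInput(stagedDb);  // closes the old connection, moves the file over the live db, reconnects
var details = reader.getUrlDetails(new TLongArrayList(new long[] { 1L }));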

View File

@ -0,0 +1,64 @@
package nu.marginalia.linkdb;
import nu.marginalia.linkdb.model.UrlStatus;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
public class LinkdbStatusWriter {
private final Connection connection;
public LinkdbStatusWriter(Path outputFile) throws SQLException {
String connStr = "jdbc:sqlite:" + outputFile.toString();
connection = DriverManager.getConnection(connStr);
try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-status.sql");
var stmt = connection.createStatement()
) {
var sql = new String(stream.readAllBytes());
stmt.executeUpdate(sql);
// Disable synchronous writing as this is a one-off operation with no recovery
stmt.execute("PRAGMA synchronous = OFF");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public void add(List<UrlStatus> statuses) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT INTO STATUS(ID, URL, STATUS, DESCRIPTION)
VALUES (?, ?, ?, ?)
""")) {
int count = 0;
for (var status : statuses) {
stmt.setLong(1, status.id());
stmt.setString(2, status.url().toString());
stmt.setString(3, status.status());
if (status.description() == null) {
stmt.setNull(4, Types.VARCHAR);
} else {
stmt.setString(4, status.description());
}
stmt.addBatch();
if (++count > 1000) {
count = 0;
stmt.executeBatch();
}
}
if (count != 0) {
stmt.executeBatch();
}
}
}
public void close() throws SQLException {
connection.close();
}
}

View File

@ -0,0 +1,80 @@
package nu.marginalia.linkdb;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
public class LinkdbWriter {
private final Connection connection;
public LinkdbWriter(Path outputFile) throws SQLException {
String connStr = "jdbc:sqlite:" + outputFile.toString();
connection = DriverManager.getConnection(connStr);
try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql");
var stmt = connection.createStatement()
) {
var sql = new String(stream.readAllBytes());
stmt.executeUpdate(sql);
// Disable synchronous writing as this is a one-off operation with no recovery
stmt.execute("PRAGMA synchronous = OFF");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public void add(LdbUrlDetail ldbUrlDetail) throws SQLException {
add(List.of(ldbUrlDetail));
}
public void add(List<LdbUrlDetail> ldbUrlDetail) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""")) {
int i = 0;
for (var document : ldbUrlDetail) {
var url = document.url();
stmt.setLong(1, document.urlId());
stmt.setString(2, url.toString());
stmt.setString(3, document.title());
stmt.setString(4, document.description());
stmt.setInt(5, document.wordsTotal());
stmt.setString(6, document.format());
stmt.setInt(7, document.features());
stmt.setLong(8, document.dataHash());
stmt.setDouble(9, document.urlQuality());
if (document.pubYear() == null) {
stmt.setNull(10, Types.INTEGER);
} else {
stmt.setInt(10, document.pubYear());
}
stmt.addBatch();
if (++i > 1000) {
stmt.executeBatch();
i = 0;
}
}
if (i != 0) stmt.executeBatch();
}
}
public void close() throws SQLException {
connection.close();
}
}

View File

@ -0,0 +1,18 @@
package nu.marginalia.linkdb.model;
import nu.marginalia.model.EdgeUrl;
public record LdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
double urlQuality,
String format,
int features,
Integer pubYear,
long dataHash,
int wordsTotal
)
{
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.linkdb.model;
public enum UrlProtocol {
HTTP,
HTTPS;
public static int encode(String str) {
if ("http".equalsIgnoreCase(str)) {
return HTTP.ordinal();
}
else if ("https".equalsIgnoreCase(str)) {
return HTTPS.ordinal();
}
throw new IllegalArgumentException(str);
}
public static String decode(int ordinal) {
return switch (values()[ordinal]) {
case HTTP -> "http";
case HTTPS -> "https";
};
};
}

View File

@ -0,0 +1,8 @@
package nu.marginalia.linkdb.model;
import nu.marginalia.model.EdgeUrl;
import javax.annotation.Nullable;
public record UrlStatus(long id, EdgeUrl url, String status, @Nullable String description) {
}

View File

@ -0,0 +1,17 @@
CREATE TABLE DOCUMENT (
ID INT8 PRIMARY KEY,
URL TEXT,
STATE INT,
TITLE TEXT NOT NULL,
DESCRIPTION TEXT NOT NULL,
WORDS_TOTAL INTEGER NOT NULL,
FORMAT TEXT NOT NULL,
FEATURES INTEGER NOT NULL,
DATA_HASH INTEGER NOT NULL,
QUALITY REAL NOT NULL,
PUB_YEAR INTEGER NOT NULL
);

View File

@ -0,0 +1,6 @@
CREATE TABLE STATUS (
ID INT8 PRIMARY KEY,
URL TEXT,
STATUS TEXT NOT NULL,
DESCRIPTION TEXT
);

View File

@ -0,0 +1,33 @@
package nu.marginalia.linkdb;
import nu.marginalia.linkdb.model.UrlStatus;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;
public class LinkdbStatusWriterTest {
@Test
public void testCreate() throws IOException {
Path tempPath = Files.createTempFile("linkdb-status", ".db");
try {
var writer = new LinkdbStatusWriter(tempPath);
writer.add(List.of(
new UrlStatus(5, new EdgeUrl("https://www.marginalia.nu/x"), "y", null),
new UrlStatus(6, new EdgeUrl("https://www.marginalia.nu/y"), "y", "z")
));
writer.close();
} catch (SQLException e) {
throw new RuntimeException(e);
} catch (URISyntaxException e) {
throw new RuntimeException(e);
} finally {
Files.deleteIfExists(tempPath);
}
}
}

View File

@ -0,0 +1,42 @@
package nu.marginalia.linkdb;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.model.EdgeDomain;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
public class LinkdbWriterTest {
@Test
public void testCreate() throws IOException {
Path tempPath = Files.createTempFile("linkdb", ".db");
try {
var writer = new LinkdbWriter(tempPath);
writer.add(new LdbUrlDetail(
1,
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
"This is a test",
-4.,
"XHTML",
5,
2020,
0xF00BA3,
444
));
writer.close();
var reader = new LinkdbReader(tempPath);
var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
System.out.println(deets);
} catch (SQLException e) {
throw new RuntimeException(e);
} finally {
Files.deleteIfExists(tempPath);
}
}
}

View File

@ -6,7 +6,6 @@ import nu.marginalia.bigstring.BigString;
import nu.marginalia.bigstring.CompressedBigString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import java.net.URISyntaxException;
@ -24,8 +23,6 @@ public class GsonFactory {
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.registerTypeAdapter(CompressedBigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))

View File

@ -1,10 +0,0 @@
package nu.marginalia.model.id;
/**
* This exists entirely for strengthening the typing of IDs
*
* @param <T>
*/
public record EdgeId<T>(int id) {
}

View File

@ -1,34 +0,0 @@
package nu.marginalia.model.id;
import java.util.Arrays;
import java.util.stream.IntStream;
public record EdgeIdArray<T> (int... values) implements EdgeIdCollection<T> {
public static <T> EdgeIdArray<T> gather(IntStream stream) {
return new EdgeIdArray<>(stream.toArray());
}
@Override
public int[] values() {
return values;
}
@Override
public boolean isEmpty() {
return values.length == 0;
}
@Override
public int size() {
return values.length;
}
public int get(int idx) {
return values[idx];
}
public void sort() {
Arrays.sort(values);
}
}

View File

@ -1,28 +0,0 @@
package nu.marginalia.model.id;
import java.util.Arrays;
import java.util.Iterator;
import java.util.stream.IntStream;
public interface EdgeIdCollection<T> extends Iterable<EdgeId<T>> {
int size();
boolean isEmpty();
int[] values();
default IntStream stream() {
return Arrays.stream(values());
}
default Iterator<EdgeId<T>> iterator() {
return Arrays.stream(values()).mapToObj(EdgeId<T>::new).iterator();
}
default EdgeIdArray<T> asArray() {
return new EdgeIdArray<>(values());
}
default EdgeIdList<T> asList() {
return new EdgeIdList<>(values());
}
default EdgeIdSet<T> asSet() {
return new EdgeIdSet<>(values());
}
}

View File

@ -1,12 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
public interface EdgeIdCollectionMutable<T> {
TIntCollection underlyingCollection();
default void addAll(EdgeIdArray<T> other) { underlyingCollection().addAll(other.values()); }
default void addAll(EdgeIdList<T> other) { underlyingCollection().addAll(other.list()); }
default void addAll(EdgeIdCollection<T> other) { underlyingCollection().addAll(other.values()); }
}

View File

@ -1,48 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
import gnu.trove.list.array.TIntArrayList;
import java.util.stream.IntStream;
public record EdgeIdList<T> (TIntArrayList list) implements
EdgeIdCollection<T>,
EdgeIdCollectionMutable<T> {
public EdgeIdList(int... values) { this(new TIntArrayList(values)); }
public static <T> EdgeIdList<T> gather(IntStream stream) {
return stream.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
}
@Override
public int[] values() {
return list.toArray();
}
@Override
public boolean isEmpty() {
return list.isEmpty();
}
@Override
public int size() {
return list.size();
}
public int get(int idx) {
return list.get(idx);
}
public void add(int id) {
list.add(id);
}
public void sort() {
list.sort();
}
@Override
public TIntCollection underlyingCollection() {
return list;
}
}

View File

@ -1,52 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
import gnu.trove.set.hash.TIntHashSet;
import java.util.stream.IntStream;
public record EdgeIdSet<T> (TIntHashSet set) implements EdgeIdCollection<T>, EdgeIdCollectionMutable<T> {
public EdgeIdSet(int... values) {
this(new TIntHashSet(values.length, 0.5f, -1));
set.addAll(values);
}
public EdgeIdSet(int initialCapacity, float loadFactor) {
this(new TIntHashSet(initialCapacity, loadFactor, -1));
}
@Override
public TIntCollection underlyingCollection() {
return set;
}
public static <T> EdgeIdSet<T> gather(IntStream stream) {
return new EdgeIdSet<>(stream.toArray());
}
@Override
public int[] values() {
return set.toArray();
}
@Override
public boolean isEmpty() {
return set.isEmpty();
}
@Override
public int size() {
return set.size();
}
public boolean contains(int id) {
return set.contains(id);
}
public boolean add(int id) {
return set.add(id);
}
public boolean remove(int id) { return set.remove(id); }
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.model.id;
/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away
* outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're
* actually sorted by rank. Next is the domain id part, which keeps documents from the same domain clustered.
* Finally is the document ordinal part, which is a non-unique sequence number for within the current set of
* documents loaded. The same ID may be re-used over time as a new index is loaded.
* <p></p>
* <table>
* <tr><th>Part</th><th>Bits</th><th>Cardinality</th></tr>
* <tr>
* <td>rank</td><td>6 bits</td><td>64</td>
* </tr>
* <tr>
* <td>domain</td><td>31 bits</td><td>2 billion</td>
* </tr>
* <tr>
* <td>document</td><td>26 bits</td><td>67 million</td>
* </tr>
* </table>
* <p></p>
* Most significant bit is unused for now because I'm not routing Long.compareUnsigned() all over the codebase.
* <i>If</i> we end up needing more domains, we'll cross that bridge when we come to it.
*
* <h2>Coding Scheme</h2>
* <code><pre>
* [ | rank | domain | url ]
* 0 1 7 38 64
* </pre></code>
*/
public class UrlIdCodec {
private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
private static final int DOCORD_MASK = 0x03FF_FFFF;
/** Encode a URL id without a ranking element */
public static long encodeId(int domainId, int documentOrdinal) {
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
return ((long) domainId << 26) | documentOrdinal;
}
/** Add a ranking element to an existing combined URL id.
*
* @param rank [0,1] the importance of the domain, low is good
* @param urlId
*/
public static long addRank(float rank, long urlId) {
long rankPart = (int)(rank * (1<<6));
if (rankPart >= 64) rankPart = 63;
if (rankPart < 0) rankPart = 0;
return (urlId&(~RANK_MASK)) | (rankPart << 57);
}
/** Extract the domain component from this URL id */
public static int getDomainId(long combinedId) {
return (int) ((combinedId >>> 26) & 0x7FFF_FFFFL);
}
/** Extract the document ordinal component from this URL id */
public static int getDocumentOrdinal(long combinedId) {
return (int) (combinedId & DOCORD_MASK);
}
/** Extract the rank component from this URL id */
public static int getRank(long combinedId) {
return (int) (combinedId >>> 57);
}
/** Mask out the ranking element from this URL id */
public static long removeRank(long combinedId) {
return combinedId & ~RANK_MASK;
}
}
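A worked example of the layout above, with illustrative values (not from the patch):
long id     = UrlIdCodec.encodeId(5, 3);       // (5L << 26) | 3        = 0x0000_0000_1400_0003
long ranked = UrlIdCodec.addRank(0.5f, id);    // rank 32 in bits 57-62 = 0x4000_0000_1400_0003
UrlIdCodec.getDomainId(ranked);        // 5
UrlIdCodec.getDocumentOrdinal(ranked); // 3
UrlIdCodec.getRank(ranked);            // 32
UrlIdCodec.removeRank(ranked) == id;   // true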

View File

@ -0,0 +1,41 @@
package nu.marginalia.model.id;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class UrlIdCodecTest {
@Test
public void testDocumentBounds() {
long encoded = UrlIdCodec.encodeId(0, ~0);
assertEquals(0, UrlIdCodec.getDomainId(encoded));
}
@Test
public void testDomainBounds() {
long encoded = UrlIdCodec.encodeId(~0, 0);
assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded));
assertEquals(0, UrlIdCodec.getRank(encoded));
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
}
@Test
public void testRankBoundsAdd() {
long encoded = UrlIdCodec.encodeId(0, 0);
encoded = UrlIdCodec.addRank(1.f, encoded);
assertEquals(0, UrlIdCodec.getDomainId(encoded));
assertEquals(63, UrlIdCodec.getRank(encoded));
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
}
@Test
public void testRemoveRank() {
long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0);
encoded = UrlIdCodec.addRank(1.f, encoded);
encoded = UrlIdCodec.removeRank(encoded);
assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded));
assertEquals(0, UrlIdCodec.getRank(encoded));
assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
}
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.process.control;
/** Dummy implementation of ProcessHeartbeat that does nothing */
public class FakeProcessHeartbeat implements ProcessHeartbeat {
@Override
public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
return new ProcessTaskHeartbeat<>() {
@Override
public void progress(T step) {}
@Override
public void shutDown() {}
@Override
public void close() {}
};
}
@Override
public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
return new ProcessAdHocTaskHeartbeat() {
@Override
public void progress(String step, int progress, int total) {}
@Override
public void close() {}
};
}
@Override
public void setProgress(double progress) {}
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.process.control;
public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
void progress(String step, int progress, int total);
void close();
}
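A minimal usage sketch for the ad-hoc variant; the injected ProcessHeartbeat, the task name, and the work-item list are assumptions for illustration:
// Sketch only: reporting progress over an arbitrary collection of work items.
try (ProcessAdHocTaskHeartbeat heartbeat = processHeartbeat.createAdHocTaskHeartbeat("exampleTask")) {
    for (int i = 0; i < items.size(); i++) {
        heartbeat.progress(items.get(i).toString(), i + 1, items.size());
        // ... process items.get(i) ...
    }
}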

View File

@ -0,0 +1,187 @@
package nu.marginalia.process.control;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/** This object sends a heartbeat to the database every few seconds,
 * updating with the progress of a task within a service. Unlike the enum-based
 * ProcessTaskHeartbeatImpl, progress is reported ad hoc as a step name together
 * with the current step number and the total step count.
 */
public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
private final Logger logger = LoggerFactory.getLogger(ProcessAdHocTaskHeartbeatImpl.class);
private final String taskName;
private final String taskBase;
private final String instanceUUID;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private final String serviceInstanceUUID;
private int progress;
private volatile boolean running = false;
private volatile String step = "-";
ProcessAdHocTaskHeartbeatImpl(ProcessConfiguration configuration,
String taskName,
HikariDataSource dataSource)
{
this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node();
this.taskBase = configuration.processName() + "." + taskName;
this.dataSource = dataSource;
this.instanceUUID = UUID.randomUUID().toString();
this.serviceInstanceUUID = configuration.instanceUuid().toString();
heartbeatInit();
runnerThread = new Thread(this::run);
runnerThread.start();
}
/** Update the progress of the task. This is a fast function that doesn't block;
* the actual update is done in a separate thread.
*
* @param step The name of the current step in the task.
* @param stepProgress How many steps have been completed so far.
* @param stepCount The total number of steps in the task.
*/
@Override
public void progress(String step, int stepProgress, int stepCount) {
this.step = step;
// progress is reported as a percentage of the total step count
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ProcessHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ProcessHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException ex) {
logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
SERVICE_INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, taskName);
stmt.setString(2, taskBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, serviceInstanceUUID);
stmt.setString(5, instanceUUID);
stmt.setString(6, serviceInstanceUUID);
stmt.executeUpdate();
}
}
catch (SQLException ex) {
logger.error("ProcessHeartbeat failed to initialize", ex);
throw new RuntimeException(ex);
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'RUNNING',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString(2, step);
stmt.setString(3, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS='STOPPED',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString( 2, step);
stmt.setString( 3, instanceUUID);
stmt.executeUpdate();
}
}
}
@Override
public void close() {
shutDown();
}
}

View File

@ -1,155 +1,11 @@
package nu.marginalia.process.control;
import com.google.inject.ImplementedBy;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ImplementedBy(ProcessHeartbeatImpl.class)
public interface ProcessHeartbeat {
<T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName);
ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName);
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
/** This service sends a heartbeat to the database every 5 seconds.
*/
@Singleton
public class ProcessHeartbeat {
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeat.class);
private final String processName;
private final String processBase;
private final String instanceUUID;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private volatile boolean running = false;
private volatile int progress = -1;
@Inject
public ProcessHeartbeat(ProcessConfiguration configuration,
HikariDataSource dataSource)
{
this.processName = configuration.processName() + ":" + configuration.node();
this.processBase = configuration.processName();
this.dataSource = dataSource;
this.instanceUUID = configuration.instanceUuid().toString();
runnerThread = new Thread(this::run);
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
}
public void setProgress(double progress) {
this.progress = (int) (progress * 100);
}
public void start() {
if (!running) {
runnerThread.start();
}
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
heartbeatInit();
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException|SQLException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, processName);
stmt.setString(2, processBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE PROCESS_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString(2, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE PROCESS_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString( 2, instanceUUID);
stmt.executeUpdate();
}
}
}
void setProgress(double progress);
}

View File

@ -0,0 +1,170 @@
package nu.marginalia.process.control;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
/** This service sends a heartbeat to the database every 5 seconds.
*/
@Singleton
public class ProcessHeartbeatImpl implements ProcessHeartbeat {
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class);
private final String processName;
private final String processBase;
private final String instanceUUID;
@org.jetbrains.annotations.NotNull
private final ProcessConfiguration configuration;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private volatile boolean running = false;
private volatile int progress = -1;
@Inject
public ProcessHeartbeatImpl(ProcessConfiguration configuration,
HikariDataSource dataSource)
{
this.processName = configuration.processName() + ":" + configuration.node();
this.processBase = configuration.processName();
this.configuration = configuration;
this.dataSource = dataSource;
this.instanceUUID = configuration.instanceUuid().toString();
runnerThread = new Thread(this::run);
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
}
@Override
public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
return new ProcessTaskHeartbeatImpl<>(steps, configuration, processName, dataSource);
}
@Override
public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
return new ProcessAdHocTaskHeartbeatImpl(configuration, processName, dataSource);
}
@Override
public void setProgress(double progress) {
this.progress = (int) (progress * 100);
}
public void start() {
if (!running) {
runnerThread.start();
}
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
heartbeatInit();
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException|SQLException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, processName);
stmt.setString(2, processBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE PROCESS_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString(2, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE PROCESS_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, progress);
stmt.setString( 2, instanceUUID);
stmt.executeUpdate();
}
}
}
}

View File

@ -0,0 +1,9 @@
package nu.marginalia.process.control;
public interface ProcessTaskHeartbeat<T extends Enum<T>> extends AutoCloseable {
void progress(T step);
void shutDown();
void close();
}

View File

@ -0,0 +1,192 @@
package nu.marginalia.process.control;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/** This object sends a heartbeat to the database every few seconds,
* updating with the progress of a task within a service. Progress is tracked by providing
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
* order as the steps in the task in order to get an accurate progress tracking.
*/
public class ProcessTaskHeartbeatImpl<T extends Enum<T>> implements AutoCloseable, ProcessTaskHeartbeat<T> {
private final Logger logger = LoggerFactory.getLogger(ProcessTaskHeartbeatImpl.class);
private final String taskName;
private final String taskBase;
private final String instanceUUID;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private final String serviceInstanceUUID;
private final int stepCount;
private volatile boolean running = false;
private volatile int stepNum = 0;
private volatile String step = "-";
ProcessTaskHeartbeatImpl(Class<T> stepClass,
ProcessConfiguration configuration,
String taskName,
HikariDataSource dataSource)
{
this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node();
this.taskBase = configuration.processName() + "." + taskName;
this.dataSource = dataSource;
this.instanceUUID = UUID.randomUUID().toString();
this.serviceInstanceUUID = configuration.instanceUuid().toString();
this.stepCount = stepClass.getEnumConstants().length;
heartbeatInit();
runnerThread = new Thread(this::run);
runnerThread.start();
}
/** Update the progress of the task. This is a fast function that doesn't block;
* the actual update is done in a separate thread.
*
* @param step The current step in the task.
*/
@Override
public void progress(T step) {
this.step = step.name();
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.stepNum = 1 + step.ordinal();
logger.info("ProcessTask {} progress: {}", taskBase, step.name());
}
@Override
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ProcessHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ProcessHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException ex) {
logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
SERVICE_INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, taskName);
stmt.setString(2, taskBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, serviceInstanceUUID);
stmt.setString(5, instanceUUID);
stmt.setString(6, serviceInstanceUUID);
stmt.executeUpdate();
}
}
catch (SQLException ex) {
logger.error("ProcessHeartbeat failed to initialize", ex);
throw new RuntimeException(ex);
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'RUNNING',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString(2, step);
stmt.setString(3, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS='STOPPED',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString( 2, step);
stmt.setString( 3, instanceUUID);
stmt.executeUpdate();
}
}
}
@Override
public void close() {
shutDown();
}
}
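A minimal usage sketch for the enum-based variant, assuming a hypothetical ConvertSteps enum; each step's ordinal is translated into a percentage as described in the progress() comment above:
// Sketch only: the enum constants define both the step order and the total step count.
enum ConvertSteps { READ, PROCESS, WRITE }   // hypothetical steps
void runTask(ProcessHeartbeat processHeartbeat) {
    try (ProcessTaskHeartbeat<ConvertSteps> heartbeat =
             processHeartbeat.createProcessTaskHeartbeat(ConvertSteps.class, "exampleTask")) {
        heartbeat.progress(ConvertSteps.READ);     // reported as 33%
        heartbeat.progress(ConvertSteps.PROCESS);  // reported as 67%
        heartbeat.progress(ConvertSteps.WRITE);    // reported as 100%
    }
}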

View File

@ -0,0 +1,14 @@
package nu.marginalia.service.control;
/** Dummy implementation of ServiceHeartbeat that does nothing */
public class FakeServiceHeartbeat implements ServiceHeartbeat {
@Override
public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
return new ServiceTaskHeartbeat<T>() {
@Override
public void progress(T step) {}
@Override
public void close() {}
};
}
}

View File

@ -1,157 +1,8 @@
package nu.marginalia.service.control;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
/** This service sends a heartbeat to the database every 5 seconds,
* updating the control service with the liveness information for the service.
*/
@Singleton
public class ServiceHeartbeat {
private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeat.class);
private final String serviceName;
private final String serviceBase;
private final String instanceUUID;
private final ServiceConfiguration configuration;
private final ServiceEventLog eventLog;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5);
private volatile boolean running = false;
@Inject
public ServiceHeartbeat(ServiceConfiguration configuration,
ServiceEventLog eventLog,
HikariDataSource dataSource)
{
this.serviceName = configuration.serviceName() + ":" + configuration.node();
this.serviceBase = configuration.serviceName();
this.configuration = configuration;
this.eventLog = eventLog;
this.dataSource = dataSource;
this.instanceUUID = configuration.instanceUuid().toString();
runnerThread = new Thread(this::run);
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
}
public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
return new ServiceTaskHeartbeat<>(steps, configuration, processName, eventLog, dataSource);
}
public void start() {
if (!running) {
runnerThread.start();
}
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
heartbeatInit();
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException|SQLException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE)
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1)
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
ALIVE = 1
"""
))
{
stmt.setString(1, serviceName);
stmt.setString(2, serviceBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE SERVICE_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6)
WHERE INSTANCE = ? AND ALIVE = 1
""")
)
{
stmt.setString(1, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE SERVICE_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0
WHERE INSTANCE = ?
""")
)
{
stmt.setString(1, instanceUUID);
stmt.executeUpdate();
}
}
}
import com.google.inject.ImplementedBy;
@ImplementedBy(ServiceHeartbeatImpl.class)
public interface ServiceHeartbeat {
<T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName);
}

View File

@ -0,0 +1,158 @@
package nu.marginalia.service.control;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
/** This service sends a heartbeat to the database every 5 seconds,
* updating the control service with the liveness information for the service.
*/
@Singleton
public class ServiceHeartbeatImpl implements ServiceHeartbeat {
private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeatImpl.class);
private final String serviceName;
private final String serviceBase;
private final String instanceUUID;
private final ServiceConfiguration configuration;
private final ServiceEventLog eventLog;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5);
private volatile boolean running = false;
@Inject
public ServiceHeartbeatImpl(ServiceConfiguration configuration,
ServiceEventLog eventLog,
HikariDataSource dataSource)
{
this.serviceName = configuration.serviceName() + ":" + configuration.node();
this.serviceBase = configuration.serviceName();
this.configuration = configuration;
this.eventLog = eventLog;
this.dataSource = dataSource;
this.instanceUUID = configuration.instanceUuid().toString();
runnerThread = new Thread(this::run);
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
}
@Override
public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
return new ServiceTaskHeartbeatImpl<>(steps, configuration, processName, eventLog, dataSource);
}
public void start() {
if (!running) {
runnerThread.start();
}
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
heartbeatInit();
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException|SQLException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE)
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1)
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
ALIVE = 1
"""
))
{
stmt.setString(1, serviceName);
stmt.setString(2, serviceBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE SERVICE_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6)
WHERE INSTANCE = ? AND ALIVE = 1
""")
)
{
stmt.setString(1, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE SERVICE_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0
WHERE INSTANCE = ?
""")
)
{
stmt.setString(1, instanceUUID);
stmt.executeUpdate();
}
}
}
}

View File

@ -1,196 +1,8 @@
package nu.marginalia.service.control;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/** This object sends a heartbeat to the database every few seconds,
* updating with the progress of a task within a service. Progress is tracked by providing
* enumerations corresponding to the steps in the task. It's important that they are arranged in the
* same order as the steps in the task, in order to get accurate progress tracking.
*/
public class ServiceTaskHeartbeat<T extends Enum<T>> implements AutoCloseable {
private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeat.class);
private final String taskName;
private final String taskBase;
private final String instanceUUID;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private final String serviceInstanceUUID;
private final int stepCount;
private final ServiceEventLog eventLog;
private volatile boolean running = false;
private volatile int stepNum = 0;
private volatile String step = "-";
ServiceTaskHeartbeat(Class<T> stepClass,
ServiceConfiguration configuration,
String taskName,
ServiceEventLog eventLog,
HikariDataSource dataSource)
{
this.eventLog = eventLog;
this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
this.taskBase = configuration.serviceName() + "." + taskName;
this.dataSource = dataSource;
this.instanceUUID = UUID.randomUUID().toString();
this.serviceInstanceUUID = configuration.instanceUuid().toString();
this.stepCount = stepClass.getEnumConstants().length;
heartbeatInit();
runnerThread = new Thread(this::run);
runnerThread.start();
}
/** Update the progress of the task. This is a fast function that doesn't block;
* the actual update is done in a separate thread.
*
* @param step The current step in the task.
*/
public void progress(T step) {
this.step = step.name();
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.stepNum = 1 + step.ordinal();
logger.info("ServiceTask {} progress: {}", taskBase, step.name());
eventLog.logEvent("TASK-STEP", taskName + " = " + step.name());
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
SERVICE_INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, taskName);
stmt.setString(2, taskBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, serviceInstanceUUID);
stmt.setString(5, instanceUUID);
stmt.setString(6, serviceInstanceUUID);
stmt.executeUpdate();
}
}
catch (SQLException ex) {
logger.error("ServiceHeartbeat failed to initialize", ex);
throw new RuntimeException(ex);
}
eventLog.logEvent("TASK-STARTED", taskName);
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'RUNNING',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString(2, step);
stmt.setString(3, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS='STOPPED',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString( 2, step);
stmt.setString( 3, instanceUUID);
stmt.executeUpdate();
}
}
eventLog.logEvent("TASK-TERMINATED", taskName);
}
public interface ServiceTaskHeartbeat<T extends Enum<T>> extends AutoCloseable {
void progress(T step);
@Override
public void close() {
shutDown();
}
void close();
}

View File

@ -0,0 +1,197 @@
package nu.marginalia.service.control;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/** This object sends a heartbeat to the database every few seconds,
* updating with the progress of a task within a service. Progress is tracked by providing
* enumerations corresponding to the steps in the task. It's important that they are arranged in the
* same order as the steps in the task, in order to get accurate progress tracking.
*/
public class ServiceTaskHeartbeatImpl<T extends Enum<T>> implements ServiceTaskHeartbeat<T> {
private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeatImpl.class);
private final String taskName;
private final String taskBase;
private final String instanceUUID;
private final HikariDataSource dataSource;
private final Thread runnerThread;
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
private final String serviceInstanceUUID;
private final int stepCount;
private final ServiceEventLog eventLog;
private volatile boolean running = false;
private volatile int stepNum = 0;
private volatile String step = "-";
ServiceTaskHeartbeatImpl(Class<T> stepClass,
ServiceConfiguration configuration,
String taskName,
ServiceEventLog eventLog,
HikariDataSource dataSource)
{
this.eventLog = eventLog;
this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
this.taskBase = configuration.serviceName() + "." + taskName;
this.dataSource = dataSource;
this.instanceUUID = UUID.randomUUID().toString();
this.serviceInstanceUUID = configuration.instanceUuid().toString();
this.stepCount = stepClass.getEnumConstants().length;
heartbeatInit();
runnerThread = new Thread(this::run);
runnerThread.start();
}
/** Update the progress of the task. This is a fast function that doesn't block;
* the actual update is done in a separate thread.
*
* @param step The current step in the task.
*/
@Override
public void progress(T step) {
this.step = step.name();
// off by one since we calculate the progress based on the number of steps,
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
// final progress being 80% and not 100%)
this.stepNum = 1 + step.ordinal();
logger.info("ServiceTask {} progress: {}", taskBase, step.name());
eventLog.logEvent("TASK-STEP", taskName + " = " + step.name());
}
public void shutDown() {
if (!running)
return;
running = false;
try {
runnerThread.join();
heartbeatStop();
}
catch (InterruptedException|SQLException ex) {
logger.warn("ServiceHeartbeat shutdown failed", ex);
}
}
private void run() {
if (!running)
running = true;
else
return;
try {
while (running) {
try {
heartbeatUpdate();
}
catch (SQLException ex) {
logger.warn("ServiceHeartbeat failed to update", ex);
}
TimeUnit.SECONDS.sleep(heartbeatInterval);
}
}
catch (InterruptedException ex) {
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
System.exit(255);
}
}
private void heartbeatInit() {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
ON DUPLICATE KEY UPDATE
INSTANCE = ?,
SERVICE_INSTANCE = ?,
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'STARTING'
"""
))
{
stmt.setString(1, taskName);
stmt.setString(2, taskBase);
stmt.setString(3, instanceUUID);
stmt.setString(4, serviceInstanceUUID);
stmt.setString(5, instanceUUID);
stmt.setString(6, serviceInstanceUUID);
stmt.executeUpdate();
}
}
catch (SQLException ex) {
logger.error("ServiceHeartbeat failed to initialize", ex);
throw new RuntimeException(ex);
}
eventLog.logEvent("TASK-STARTED", taskName);
}
private void heartbeatUpdate() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS = 'RUNNING',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString(2, step);
stmt.setString(3, instanceUUID);
stmt.executeUpdate();
}
}
}
private void heartbeatStop() throws SQLException {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
UPDATE TASK_HEARTBEAT
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
STATUS='STOPPED',
PROGRESS = ?,
STAGE_NAME = ?
WHERE INSTANCE = ?
""")
)
{
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
stmt.setString( 2, step);
stmt.setString( 3, instanceUUID);
stmt.executeUpdate();
}
}
eventLog.logEvent("TASK-TERMINATED", taskName);
}
@Override
public void close() {
shutDown();
}
}
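To make the off-by-one note in progress() concrete, these are the percentages written by heartbeatUpdate() for a hypothetical four-step enum:

enum Steps { LOAD, PARSE, WRITE, FINISHED }  // stepCount == 4
// progress(Steps.LOAD)     -> stepNum == 1 -> Math.round(100 * 1 / 4.0) == 25
// progress(Steps.FINISHED) -> stepNum == 4 -> Math.round(100 * 4 / 4.0) == 100
// without the +1 in progress(), the final step would report 75 rather than 100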

View File

@ -4,7 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.control.ServiceHeartbeatImpl;
import nu.marginalia.service.module.ServiceConfiguration;
/** This class exists to reduce Service boilerplate */
@ -13,14 +13,14 @@ public class BaseServiceParams {
public final ServiceConfiguration configuration;
public final Initialization initialization;
public final MetricsServer metricsServer;
public final ServiceHeartbeat heartbeat;
public final ServiceHeartbeatImpl heartbeat;
public final ServiceEventLog eventLog;
public final MessageQueueFactory messageQueueInboxFactory;
@Inject
public BaseServiceParams(ServiceConfiguration configuration,
Initialization initialization,
MetricsServer metricsServer,
ServiceHeartbeat heartbeat,
ServiceHeartbeatImpl heartbeat,
ServiceEventLog eventLog,
MessageQueueFactory messageQueueInboxFactory) {
this.configuration = configuration;

View File

@ -7,7 +7,7 @@ import nu.marginalia.language.model.WordRep;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import javax.inject.Inject;
import com.google.inject.Inject;
import java.util.*;
import java.util.stream.Stream;

View File

@ -2,6 +2,7 @@ package nu.marginalia.ranking;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,6 +38,11 @@ public class DomainRankings {
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
}
public float getSortRanking(long docId) {
int domainId = UrlIdCodec.getDomainId(docId);
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE;
}
public int size() {
return rankings.size();
}

View File

@ -16,9 +16,8 @@ dependencies {
implementation project(':code:features-index:domain-ranking')
implementation project(':code:features-index:index-query')
implementation project(':code:features-index:index-journal')
implementation project(':code:features-index:lexicon')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:process')
implementation project(':third-party:uppend')

View File

@ -6,10 +6,10 @@ import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import org.roaringbitmap.IntConsumer;
import org.roaringbitmap.RoaringBitmap;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -20,24 +20,24 @@ import java.nio.file.Path;
public class ForwardIndexConverter {
private final ServiceHeartbeat heartbeat;
private final File inputFile;
private final ProcessHeartbeat heartbeat;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;
public ForwardIndexConverter(ServiceHeartbeat heartbeat,
File inputFile,
public ForwardIndexConverter(ProcessHeartbeat heartbeat,
IndexJournalReader journalReader,
Path outputFileDocsId,
Path outputFileDocsData,
DomainRankings domainRankings
) {
this.heartbeat = heartbeat;
this.inputFile = inputFile;
this.journalReader = journalReader;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.domainRankings = domainRankings;
@ -54,17 +54,9 @@ public class ForwardIndexConverter {
public void convert() throws IOException {
deleteOldFiles();
IndexJournalReaderSingleCompressedFile journalReader = new IndexJournalReaderSingleCompressedFile(inputFile.toPath());
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
logger.warn("Bailing: Journal is empty!");
return;
}
logger.info("Converting {} {}", inputFile, journalReader.fileHeader);
logger.info("Domain Rankings size = {}", domainRankings.size());
try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
progress.progress(TaskSteps.GET_DOC_IDS);
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
@ -83,12 +75,11 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
journalReader.forEach(entry -> {
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId());
int ranking = domainRankings.getRanking(entry.domainId());
long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
});
@ -109,17 +100,18 @@ public class ForwardIndexConverter {
}
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
RoaringBitmap rbm = new RoaringBitmap();
journalReader.forEachUrlId(rbm::add);
Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality());
rbm.forEach(new IntConsumer() {
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() {
int offset;
@Override
public void accept(int value) {
public void accept(long value) {
ret.set(offset++, value);
}
});
return ret;
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.index.forward;
import java.nio.file.Path;
public class ForwardIndexFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case DOC_ID -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-id.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-id.dat");
};
case DOC_DATA -> switch (version) {
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
};
public enum FileIdentifier {
DOC_DATA,
DOC_ID
}
}
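One plausible way the CURRENT/NEXT naming above can support a low-downtime index switch is an atomic rename of the freshly built files; the following is only a sketch under that assumption, not the switching code from this changeset:

import java.io.IOException;
import java.nio.file.*;

class PromoteForwardIndexSketch {
    // Promote files written under the ".next" names to the live names.
    // Assumes the file system supports an atomic rename onto the target path.
    static void promote(Path basePath) throws IOException {
        for (var id : ForwardIndexFileNames.FileIdentifier.values()) {
            Path next = ForwardIndexFileNames.resolve(basePath, id, ForwardIndexFileNames.FileVersion.NEXT);
            Path current = ForwardIndexFileNames.resolve(basePath, id, ForwardIndexFileNames.FileVersion.CURRENT);
            Files.move(next, current, StandardCopyOption.ATOMIC_MOVE);
        }
    }
}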

View File

@ -1,9 +1,8 @@
package nu.marginalia.index.forward;
class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3;
public static final int DOMAIN_OFFSET = 0;
public static final int METADATA_OFFSET = 1;
public static final int FEATURES_OFFSET = 2;
public static final int ENTRY_SIZE = 2;
public static final int METADATA_OFFSET = 0;
public static final int FEATURES_OFFSET = 1;
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.index.forward;
import com.upserve.uppend.blobs.NativeIO;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -71,6 +72,8 @@ public class ForwardIndexReader {
}
public long getDocMeta(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = idxForDoc(docId);
if (offset < 0) return 0;
@ -78,20 +81,17 @@ public class ForwardIndexReader {
}
public int getHtmlFeatures(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
}
public int getDomainId(long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
}
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
return idToOffset.get(docId);
}
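As the assertions above indicate, callers are expected to strip the ranking bits from a combined id before querying the forward index; a small sketch mirroring what ParamMatchingQueryFilter does further down, with forwardIndexReader and combinedId assumed to be in scope:

long docId = UrlIdCodec.removeRank(combinedId);    // combinedId as produced by the reverse index
long meta = forwardIndexReader.getDocMeta(docId);
int features = forwardIndexReader.getHtmlFeatures(docId);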

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.forward;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.query.IndexQueryParams;
@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
}
@Override
public boolean test(long docId) {
int urlId = (int) (docId & 0xFFFF_FFFFL);
int domainId = forwardIndexReader.getDomainId(urlId);
long meta = forwardIndexReader.getDocMeta(urlId);
public boolean test(long combinedId) {
long docId = UrlIdCodec.removeRank(combinedId);
int domainId = UrlIdCodec.getDomainId(docId);
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateDomain(domainId, meta)) {
return false;

View File

@ -2,14 +2,14 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.process.control.ProcessTaskHeartbeatImpl;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.control.ServiceTaskHeartbeat;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -28,7 +28,6 @@ import static org.mockito.Mockito.when;
class ForwardIndexConverterTest {
KeywordLexicon keywordLexicon;
IndexJournalWriter writer;
Path indexFile;
@ -49,12 +48,9 @@ class ForwardIndexConverterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx");
indexFile.toFile().deleteOnExit();
writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
writer = new IndexJournalWriterSingleFileImpl(indexFile);
wordsFile1 = Files.createTempFile("words1", ".idx");
urlsFile1 = Files.createTempFile("urls1", ".idx");
@ -62,11 +58,9 @@ class ForwardIndexConverterTest {
dataDir = Files.createTempDirectory(getClass().getSimpleName());
for (int i = 1; i < workSetSize; i++) {
createEntry(writer, keywordLexicon, i);
createEntry(writer, i);
}
keywordLexicon.commitToDisk();
writer.close();
@ -84,15 +78,16 @@ class ForwardIndexConverterTest {
}
long createId(long url, long domain) {
return (domain << 32) | url;
return UrlIdCodec.encodeId((int) domain, (int) url);
}
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
public void createEntry(IndexJournalWriter writer, int id) {
int[] factors = getFactorsI(id);
var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5);
for (int i = 0; i+1 < factors.length; i+=2) {
entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i+1]);
entryBuilder.add(factors[i], -factors[i+1]);
}
writer.put(entryBuilder.build());
@ -101,18 +96,14 @@ class ForwardIndexConverterTest {
@Test
void testForwardIndex() throws IOException {
// RIP fairies
var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class);
when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any()))
.thenReturn(Mockito.mock(ServiceTaskHeartbeat.class));
new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleCompressedFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert();
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
for (int i = 36; i < workSetSize; i++) {
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
assertEquals(i/20, forwardReader.getDomainId(i));
long docId = createId(i, i/20);
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
}
}

View File

@ -13,7 +13,6 @@ java {
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:common:model')
implementation project(':code:features-index:lexicon')
implementation libs.lombok
annotationProcessor libs.lombok
@ -22,6 +21,7 @@ dependencies {
implementation libs.prometheus
implementation libs.notnull
implementation libs.rxjava
implementation libs.guava
implementation libs.trove
implementation libs.zstd
implementation libs.commons.lang3

View File

@ -1,8 +1,6 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
@ -15,18 +13,7 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntr
long documentMeta) {
return builder(new EdgeId<>(domainId),
new EdgeId<>(urlId),
documentMeta);
return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta);
}
public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
EdgeId<EdgeUrl> urlId,
long documentMeta) {
return new IndexJournalEntryBuilder(0,
IndexJournalEntryHeader.combineIds(domainId, urlId),
documentMeta);
}
}

View File

@ -25,7 +25,7 @@ public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Rec
public long get(int idx) {
if (idx >= size)
throw new ArrayIndexOutOfBoundsException();
throw new ArrayIndexOutOfBoundsException(idx + " vs " + size);
return underlyingArray[idx];
}
@ -58,9 +58,9 @@ public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Rec
public Record next() {
pos+=ENTRY_SIZE;
return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
return new Record(underlyingArray[pos], underlyingArray[pos+1]);
}
}
public record Record(int wordId, long metadata) {}
public record Record(long wordId, long metadata) {}
}

View File

@ -1,29 +1,17 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
long combinedId,
long documentMeta) {
public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
public IndexJournalEntryHeader(long combinedId,
int documentFeatures,
EdgeId<EdgeUrl> urlId,
long documentMeta) {
this(-1,
documentFeatures,
combineIds(domainId, urlId),
combinedId,
documentMeta);
}
static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
long did = domainId.id();
long uid = urlId.id();
return (did << 32L) | uid;
}
}

View File

@ -2,11 +2,13 @@ package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
public class IndexJournalReadEntry {
public final IndexJournalEntryHeader header;
@ -51,11 +53,7 @@ public class IndexJournalReadEntry {
}
public int domainId() {
return (int) (docId() >>> 32L);
}
public int urlId() {
return (int) (docId() & 0xFFFF_FFFFL);
return UrlIdCodec.getDomainId(docId());
}
public IndexJournalEntryData readEntry() {

View File

@ -1,31 +1,48 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
import java.util.function.Predicate;
public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
IndexJournalFileHeader fileHeader();
static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleCompressedFile(fileName);
}
IndexJournalStatistics getStatistics();
static IndexJournalReader paging(Path baseDir) throws IOException {
return new IndexJournalReaderPagingImpl(baseDir);
}
void forEachWordId(IntConsumer consumer);
static IndexJournalReader singleFileWithPriorityFilters(Path path) throws IOException {
void forEachUrlIdWordId(BiIntConsumer consumer);
long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.TfIdfHigh.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.SiteAdjacent.asBit();
void forEachDocIdWordId(LongIntConsumer consumer);
return new IndexJournalReaderSingleCompressedFile(path, null,
r -> (r.metadata() & highPriorityFlags) != 0);
}
void forEachWordId(LongConsumer consumer);
void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer);
void forEachUrlId(IntConsumer consumer);
void forEachDocId(LongConsumer consumer);
@NotNull
@Override
@ -33,13 +50,7 @@ public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
void close() throws IOException;
interface BiIntConsumer {
void accept(int left, int right);
}
interface LongIntConsumer {
void accept(long left, int right);
}
interface LongObjectConsumer<T> {
void accept(long left, T right);

View File

@ -0,0 +1,61 @@
package nu.marginalia.index.journal.reader;
import com.google.common.collect.Iterators;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginallia.index.journal.IndexJournalFileNames;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.LongConsumer;
public class IndexJournalReaderPagingImpl implements IndexJournalReader {
private final List<IndexJournalReader> readers;
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir);
this.readers = new ArrayList<>(inputFiles.size());
for (var inputFile : inputFiles) {
readers.add(new IndexJournalReaderSingleCompressedFile(inputFile));
}
}
@Override
public void forEachWordId(LongConsumer consumer) {
for (var reader : readers) {
reader.forEachWordId(consumer);
}
}
@Override
public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
for (var reader : readers) {
reader.forEachDocIdRecord(consumer);
}
}
@Override
public void forEachDocId(LongConsumer consumer) {
for (var reader : readers) {
reader.forEachDocId(consumer);
}
}
@Override
public @NotNull Iterator<IndexJournalReadEntry> iterator() {
return Iterators.concat(readers.stream().map(IndexJournalReader::iterator).iterator());
}
@Override
public void close() throws IOException {
for (var reader : readers) {
reader.close();
}
}
}
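A sketch of how the paging reader can be driven, with a hypothetical baseDir; it visits the journal pages in file-name order and hands every document id to the consumer:

static void dumpDomainIds(Path baseDir) throws IOException {
    IndexJournalReader reader = IndexJournalReader.paging(baseDir);
    // print the domain component of each document id found in any page
    reader.forEachDocId(docId -> System.out.println(UrlIdCodec.getDomainId(docId)));
    reader.close();
}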

View File

@ -12,21 +12,30 @@ import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.Iterator;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
import java.util.function.Predicate;
public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
private static Path journalFile;
private Path journalFile;
public final IndexJournalFileHeader fileHeader;
@Override
public String toString() {
return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }";
}
private DataInputStream dataInputStream = null;
final Predicate<IndexJournalReadEntry> entryPredicate;
final Predicate<IndexJournalEntryData.Record> recordPredicate;
public IndexJournalReaderSingleCompressedFile(Path file) throws IOException {
this.journalFile = file;
fileHeader = readHeader(file);
this.recordPredicate = null;
@ -34,7 +43,8 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
public IndexJournalReaderSingleCompressedFile(Path file, Predicate<IndexJournalReadEntry> entryPredicate, Predicate<IndexJournalEntryData.Record> recordPredicate) throws IOException {
journalFile = file;
this.journalFile = file;
fileHeader = readHeader(file);
this.recordPredicate = recordPredicate;
@ -42,8 +52,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
private static IndexJournalFileHeader readHeader(Path file) throws IOException {
journalFile = file;
try (var raf = new RandomAccessFile(file.toFile(), "r")) {
long unused = raf.readLong();
long wordCount = raf.readLong();
@ -61,10 +69,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream)));
}
public IndexJournalFileHeader fileHeader() {
return fileHeader;
}
public boolean filter(IndexJournalReadEntry entry) {
return entryPredicate == null || entryPredicate.test(entry);
}
@ -80,31 +84,7 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
@Override
public IndexJournalStatistics getStatistics() {
int highestWord = 0;
// Docs cardinality is a candidate for a HyperLogLog
Roaring64Bitmap docsBitmap = new Roaring64Bitmap();
for (var entry : this) {
var entryData = entry.readEntry();
if (filter(entry)) {
docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL);
for (var item : entryData) {
if (filter(entry, item)) {
highestWord = Integer.max(item.wordId(), highestWord);
}
}
}
}
return new IndexJournalStatistics(highestWord, docsBitmap.getIntCardinality());
}
@Override
public void forEachWordId(IntConsumer consumer) {
public void forEachWordId(LongConsumer consumer) {
for (var entry : this) {
var data = entry.readEntry();
for (var post : data) {
@ -115,32 +95,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
}
@Override
public void forEachUrlIdWordId(BiIntConsumer consumer) {
for (var entry : this) {
var data = entry.readEntry();
for (var post : data) {
if (filter(entry, post)) {
consumer.accept(entry.urlId(), post.wordId());
}
}
}
}
@Override
public void forEachDocIdWordId(LongIntConsumer consumer) {
for (var entry : this) {
var data = entry.readEntry();
for (var post : data) {
if (filter(entry, post)) {
consumer.accept(entry.docId(), post.wordId());
}
}
}
}
@Override
public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
for (var entry : this) {
@ -154,10 +108,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
}
@Override
public void forEachUrlId(IntConsumer consumer) {
public void forEachDocId(LongConsumer consumer) {
for (var entry : this) {
if (filter(entry)) {
consumer.accept(entry.urlId());
consumer.accept(entry.docId());
}
}
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.index.journal.writer;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginallia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
private final Path outputDir;
private int fileNumber = 0;
private final Logger logger = LoggerFactory.getLogger(getClass());
private IndexJournalWriter currentWriter = null;
private int inputsForFile = 0;
public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
this.outputDir = outputDir;
switchToNextWriter();
logger.info("Creating Journal Writer {}", outputDir);
}
private void switchToNextWriter() throws IOException {
if (currentWriter != null)
currentWriter.close();
currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
}
@Override
@SneakyThrows
public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
if (++inputsForFile > 100_000) {
inputsForFile = 0;
switchToNextWriter();
}
currentWriter.put(header, entry);
}
public void close() throws IOException {
currentWriter.close();
}
}

View File

@ -1,12 +1,11 @@
package nu.marginalia.index.journal.writer;
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
import com.github.luben.zstd.ZstdOutputStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginallia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -16,27 +15,34 @@ import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
public class IndexJournalWriterImpl implements IndexJournalWriter{
private final KeywordLexicon lexicon;
public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
private static final int ZSTD_BUFFER_SIZE = 8192;
private static final int DATA_BUFFER_SIZE = 8192;
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
private final ZstdDirectBufferCompressingStream compressingStream;
private int numEntries = 0;
private final FileChannel fileChannel;
public IndexJournalWriterImpl(KeywordLexicon lexicon, Path outputFile) throws IOException {
this.lexicon = lexicon;
private int numEntries = 0;
private final Logger logger = LoggerFactory.getLogger(getClass());
public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
logger.info("Creating Journal Writer {}", outputFile);
Files.deleteIfExists(outputFile);
Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
writeHeaderPlaceholder(fileChannel);
compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
toFlush.flip();
@ -64,7 +70,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
@Override
@SneakyThrows
public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
@ -84,6 +90,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
dataBuffer.clear();
}
else while (remaining-- > 0 && i < entry.size()) {
dataBuffer.putLong(entry.underlyingArray[i++]);
}
}
@ -103,7 +110,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
ByteBuffer header = ByteBuffer.allocate(16);
header.putLong(numEntries);
header.putLong(lexicon.size());
header.putLong(0);
header.flip();
while (header.position() < header.limit()) {

View File

@ -0,0 +1,30 @@
package nu.marginallia.index.journal;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class IndexJournalFileNames {
public static Path allocateName(Path base, int idx) {
return base.resolve(String.format("page-index-%04d.dat", idx));
}
public static List<Path> findJournalFiles(Path baseDirectory) throws IOException {
List<Path> ret = new ArrayList<>();
try (var listStream = Files.list(baseDirectory)) {
listStream
.filter(IndexJournalFileNames::isJournalFile)
.sorted()
.forEach(ret::add);
}
return ret;
}
public static boolean isJournalFile(Path file) {
return file.toFile().getName().matches("page-index-\\d{4}\\.dat");
}
}
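An illustration of the naming scheme above, with hypothetical paths:

Path base = Path.of("/tmp/journal");                              // hypothetical directory
Path first = IndexJournalFileNames.allocateName(base, 0);         // /tmp/journal/page-index-0000.dat
List<Path> pages = IndexJournalFileNames.findJournalFiles(base);  // all page files, sorted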

View File

@ -4,13 +4,12 @@ import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.model.id.UrlIdCodec;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.nio.file.Files;
@ -22,15 +21,16 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
public class IndexJournalTest {
Path tempFile;
KeywordLexicon lexicon;
IndexJournalReader reader;
long firstDocId = UrlIdCodec.encodeId(44, 10);
long secondDocId = UrlIdCodec.encodeId(43, 15);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
lexicon = Mockito.mock(KeywordLexicon.class);
var journalWriter = new IndexJournalWriterImpl(lexicon, tempFile);
var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
.add(1, 2)
.add(2, 3)
@ -65,11 +65,11 @@ public class IndexJournalTest {
}
@Test
public void forEachUrlId() {
List<Integer> expected = List.of(10, 15);
List<Integer> actual = new ArrayList<>();
public void forEachDocId() {
List<Long> expected = List.of(firstDocId, secondDocId);
List<Long> actual = new ArrayList<>();
reader.forEachUrlId(actual::add);
reader.forEachDocId(actual::add);
assertEquals(expected, actual);
}
@ -78,50 +78,19 @@ public class IndexJournalTest {
List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
List<Integer> actual = new ArrayList<>();
reader.forEachWordId(actual::add);
assertEquals(expected, actual);
}
@Test
public void forEachUrlIdWordId() {
List<Pair<Integer, Integer>> expected = List.of(
Pair.of(10, 1),
Pair.of(10, 2),
Pair.of(10, 3),
Pair.of(10, 5),
Pair.of(15, 5),
Pair.of(15, 6));
List<Pair<Integer, Integer>> actual = new ArrayList<>();
reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word)));
assertEquals(expected, actual);
}
@Test
public void forEachDocIdWordId() {
List<Pair<Long, Integer>> expected = List.of(
Pair.of(10L | (44L << 32), 1),
Pair.of(10L | (44L << 32), 2),
Pair.of(10L | (44L << 32), 3),
Pair.of(10L | (44L << 32), 5),
Pair.of(15L | (43L << 32), 5),
Pair.of(15L | (43L << 32), 6));
List<Pair<Long, Integer>> actual = new ArrayList<>();
reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word)));
reader.forEachWordId(i -> actual.add((int) i));
assertEquals(expected, actual);
}
@Test
public void forEachDocIdRecord() {
List<Pair<Long, IndexJournalEntryData.Record>> expected = List.of(
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)),
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)),
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6))
Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)),
Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)),
Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6))
);
List<Pair<Long, IndexJournalEntryData.Record>> actual = new ArrayList<>();

View File

@ -9,16 +9,16 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
public interface IndexQueryBuilder {
/** Filters documents that also contain termId, within the full index.
*/
IndexQueryBuilder alsoFull(int termId);
IndexQueryBuilder alsoFull(long termId);
/**
* Filters documents that also contain the termId, within the priority index.
*/
IndexQueryBuilder alsoPrio(int termIds);
IndexQueryBuilder alsoPrio(long termIds);
/** Excludes documents that contain termId, within the full index
*/
IndexQueryBuilder notFull(int termId);
IndexQueryBuilder notFull(long termId);
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);

View File

@ -21,7 +21,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
}
public String describe() {
return "[NoPass]";
return "[PassThrough]";
}
}

View File

@ -3,10 +3,10 @@ package nu.marginalia.index.searchset;
public interface SearchSet {
/**
* Returns true if the given urlId is contained in the set
* Returns true if the given domainId is contained in the set
* or if the documentMetadata vibes with the set
*
*/
boolean contains(int urlId, long documentMetadata);
boolean contains(int domainId, long documentMetadata);
}

View File

@ -18,15 +18,15 @@ dependencies {
implementation project(':code:features-index:domain-ranking')
implementation project(':code:features-index:index-query')
implementation project(':code:features-index:index-journal')
implementation project(':code:features-index:lexicon')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:process')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

Three image files added (21 KiB, 21 KiB, and 29 KiB); their diffs are suppressed because one or more lines are too long.

View File

@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i
[1] See WordFlags in [common/model](../../common/model/) and
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
## Construction
The reverse index is constructed by first building a series of preindexes.
Preindexes consist of a Segment and a Documents object. The segment records which
word identifiers are present and how many documents each occurs in, and the
documents object records which documents each word can be found in.
![Memory layout illustrations](./preindex.svg)
A preindex covering the whole journal would typically not fit in RAM, so the index
journal is paged, and each page is turned into a preindex small enough to fit in
memory. The preindexes are then merged. Merging sorted arrays is a very fast
operation that does not require additional RAM.
![Illustration of successively merged preindex files](./merging.svg)
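The merge itself can be pictured as an ordinary two-pointer merge of sorted runs; the sketch below is illustrative only, while the actual implementation operates on memory-mapped LongArrays via TwoArrayOperations:

static long[] mergeSorted(long[] left, long[] right) {
    long[] out = new long[left.length + right.length];
    int i = 0, j = 0, k = 0;
    while (i < left.length && j < right.length)
        out[k++] = left[i] <= right[j] ? left[i++] : right[j++];
    while (i < left.length) out[k++] = left[i++];
    while (j < right.length) out[k++] = right[j++];
    return out;
}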
Once merged into one large preindex, indexes are added to the preindex data
to form a finalized reverse index.
![Illustration of the data layout of the finalized index](index.svg)
## Central Classes
* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index.
* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index.
* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index.
* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index.
* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
## See Also
* [index-journal](../index-journal)
* [index-forward](../index-forward)
* [libraries/btree](../../libraries/btree)
* [libraries/array](../../libraries/array)

View File

@ -1,4 +1,4 @@
package nu.marginalia.index.full;
package nu.marginalia.index;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
@ -6,18 +6,18 @@ import nu.marginalia.index.query.EntrySource;
import static java.lang.Math.min;
public class ReverseIndexFullEntrySource implements EntrySource {
public class ReverseIndexEntrySource implements EntrySource {
private final BTreeReader reader;
int pos;
int endOffset;
final int entrySize;
private final int wordId;
private final long wordId;
public ReverseIndexFullEntrySource(BTreeReader reader,
int entrySize,
int wordId) {
public ReverseIndexEntrySource(BTreeReader reader,
int entrySize,
long wordId) {
this.reader = reader;
this.entrySize = entrySize;
this.wordId = wordId;

View File

@ -0,0 +1,28 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexFullFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-words.dat.next");
case CURRENT -> basePath.resolve("rev-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-docs.dat.next");
case CURRENT -> basePath.resolve("rev-docs.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
};
public enum FileIdentifier {
WORDS,
DOCS
}
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.index;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters
{
public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.index;
import java.nio.file.Path;
public class ReverseIndexPrioFileNames {
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
return switch (identifier) {
case WORDS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-words.dat.next");
case CURRENT -> basePath.resolve("rev-prio-words.dat");
};
case DOCS -> switch (version) {
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
};
};
}
public enum FileVersion {
CURRENT,
NEXT
};
public enum FileIdentifier {
WORDS,
DOCS
}
}

View File

@ -1,11 +1,11 @@
package nu.marginalia.index.full;
package nu.marginalia.index;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
@ -15,18 +15,22 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
public class ReverseIndexFullReader {
public class ReverseIndexReader {
private final LongArray words;
private final LongArray documents;
private final long wordsDataOffset;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final BTreeReader wordsBTreeReader;
public ReverseIndexFullReader(Path words, Path documents) throws IOException {
public ReverseIndexReader(Path words, Path documents) throws IOException {
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
this.wordsBTreeReader = null;
this.wordsDataOffset = -1;
return;
}
@ -34,62 +38,52 @@ public class ReverseIndexFullReader {
this.words = LongArray.mmapRead(words);
this.documents = LongArray.mmapRead(documents);
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
}
public boolean isWordInDoc(int wordId, long documentId) {
if (wordId < 0) {
return false;
}
long offset = words.get(wordId);
private long wordOffset(long wordId) {
long idx = wordsBTreeReader.findEntry(wordId);
if (offset < 0) {
return false;
}
if (idx < 0)
return -1L;
return createReaderNew(offset).findEntry(documentId) >= 0;
return words.get(wordsDataOffset + idx + 1);
}
public EntrySource documents(int wordId) {
public EntrySource documents(long wordId) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
long offset = words.get(wordId);
long offset = wordOffset(wordId);
if (offset < 0) return new EmptyEntrySource();
return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, wordId);
return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId);
}
public QueryFilterStepIf also(int wordId) {
if (wordId < 0) return new QueryFilterNoPass();
long offset = words.get(wordId);
public QueryFilterStepIf also(long wordId) {
long offset = wordOffset(wordId);
if (offset < 0) return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId);
}
public QueryFilterStepIf not(int wordId) {
if (wordId < 0) return new QueryFilterLetThrough();
long offset = words.get(wordId);
public QueryFilterStepIf not(long wordId) {
long offset = wordOffset(wordId);
if (offset < 0) return new QueryFilterLetThrough();
return new ReverseIndexRejectFilter(createReaderNew(offset));
}
public int numDocuments(int wordId) {
if (wordId < 0)
return 0;
long offset = words.get(wordId);
public int numDocuments(long wordId) {
long offset = wordOffset(wordId);
if (offset < 0)
return 0;
@ -98,23 +92,33 @@ public class ReverseIndexFullReader {
}
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexFullParameters.bTreeContext, offset);
return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset);
}
public long[] getTermMeta(int wordId, long[] docIds) {
if (wordId < 0) {
return new long[docIds.length];
}
public long[] getTermMeta(long wordId, long[] docIds) {
long offset = wordOffset(wordId);
long offset = words.get(wordId);
if (offset < 0) {
return new long[docIds.length];
}
Arrays.sort(docIds);
assert isSorted(docIds) : "The input array docIds is assumed to be sorted";
var reader = createReaderNew(offset);
return reader.queryData(docIds, 1);
}
private boolean isSorted(long[] ids) {
if (ids.length == 0)
return true;
long prev = ids[0];
for (int i = 1; i < ids.length; i++) {
if (ids[i] <= prev)
return false;
prev = ids[i];
}
return true;
}
}

View File

@ -0,0 +1,9 @@
package nu.marginalia.index.construction;
public interface DocIdRewriter {
long rewriteDocId(long docId);
static DocIdRewriter identity() {
return l -> l;
}
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.file.Path;
public interface JournalReaderSource {
IndexJournalReader construct(Path sourceFile) throws IOException;
}

View File

@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray;
import nu.marginalia.array.functional.LongIOTransformer;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext;
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
import java.io.IOException;
import java.nio.channels.FileChannel;

View File

@ -0,0 +1,115 @@
package nu.marginalia.index.construction;
import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginallia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
public class ReverseIndexConstructor {
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
public enum CreateReverseIndexSteps {
CREATE_PREINDEXES,
MERGE_PREINDEXES,
FINALIZE,
FINISHED
}
public static void createReverseIndex(
ProcessHeartbeat processHeartbeat,
JournalReaderSource readerSource,
Path sourceBaseDir,
DocIdRewriter docIdRewriter,
Path tmpDir,
Path outputFileDocs,
Path outputFileWords) throws IOException
{
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
if (inputs.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir);
return;
}
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, "createReverseIndex")) {
List<ReversePreindex> preindexes = new ArrayList<>();
heartbeat.progress(CreateReverseIndexSteps.CREATE_PREINDEXES);
try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) {
for (int i = 0; i < inputs.size(); i++) {
var input = inputs.get(i);
preindexHeartbeat.progress(input.toFile().getName(), i, inputs.size());
preindexes.add(ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir));
}
preindexHeartbeat.progress("FINISHED", inputs.size(), inputs.size());
}
heartbeat.progress(CreateReverseIndexSteps.MERGE_PREINDEXES);
ReversePreindex finalPreindex;
try (var mergeHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("mergePreindexes")) {
finalPreindex = mergePreindexes(tmpDir, mergeHeartbeat, preindexes);
}
heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
finalPreindex.finalizeIndex(outputFileDocs, outputFileWords);
heartbeat.progress(CreateReverseIndexSteps.FINISHED);
finalPreindex.delete();
}
}
private static ReversePreindex mergePreindexes(Path workDir, ProcessAdHocTaskHeartbeat mergeHeartbeat, List<ReversePreindex> preindexes) throws IOException {
assert !preindexes.isEmpty();
if (preindexes.size() == 1) {
logger.info("Single preindex, no merge necessary");
return preindexes.get(0);
}
List<ReversePreindex> toMerge = new ArrayList<>(preindexes);
List<ReversePreindex> merged = new ArrayList<>();
int pass = 0;
while (toMerge.size() != 1) {
String stage = String.format("PASS[%d]: %d -> %d", ++pass,
toMerge.size(),
toMerge.size()/2 + (toMerge.size() % 2)
);
for (int i = 0; i + 1 < toMerge.size(); i+=2) {
mergeHeartbeat.progress(stage, i/2, toMerge.size()/2);
var left = toMerge.get(i);
var right = toMerge.get(i+1);
merged.add(ReversePreindex.merge(workDir, left, right));
left.delete();
right.delete();
}
if ((toMerge.size() % 2) != 0) {
merged.add(toMerge.get(toMerge.size()-1));
}
toMerge.clear();
toMerge.addAll(merged);
merged.clear();
}
mergeHeartbeat.progress("FINISHED", 1, 1);
return toMerge.get(0);
}
}
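
For illustration only (not part of this commit): mergePreindexes above reduces n preindexes to one by merging neighbouring pairs in passes, carrying any odd element over to the next pass, so roughly ceil(log2 n) passes are needed. A self-contained sketch of that control flow, using strings in place of ReversePreindex:

import java.util.ArrayList;
import java.util.List;

class PairwiseMergeSketch {
    static String mergeAll(List<String> inputs) {
        List<String> toMerge = new ArrayList<>(inputs);
        List<String> merged = new ArrayList<>();
        while (toMerge.size() != 1) {
            for (int i = 0; i + 1 < toMerge.size(); i += 2) {
                // stands in for ReversePreindex.merge(workDir, left, right)
                merged.add("(" + toMerge.get(i) + "+" + toMerge.get(i + 1) + ")");
            }
            if ((toMerge.size() % 2) != 0) {
                merged.add(toMerge.get(toMerge.size() - 1)); // odd element carried to the next pass
            }
            toMerge.clear();
            toMerge.addAll(merged);
            merged.clear();
        }
        return toMerge.get(0);
    }

    public static void main(String[] args) {
        // five inputs -> 3 -> 2 -> 1, i.e. three passes
        System.out.println(mergeAll(List.of("A", "B", "C", "D", "E")));
        // prints (((A+B)+(C+D))+E)
    }
}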

View File

@ -0,0 +1,280 @@
package nu.marginalia.index.construction;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import static nu.marginalia.array.algo.TwoArrayOperations.*;
/** Contains the data that would go into a reverse index,
* that is, a mapping from words to documents, minus the actual
* index structure that makes the data quick to access while
* searching.
* <p>
* Two preindexes can be merged into a third preindex containing
* the union of their data. This operation requires no additional
* RAM.
*/
public class ReversePreindex {
final ReversePreindexWordSegments segments;
final ReversePreindexDocuments documents;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
this.segments = segments;
this.documents = documents;
}
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static ReversePreindex constructPreindex(IndexJournalReader reader,
DocIdRewriter docIdRewriter,
Path destDir) throws IOException
{
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
logger.info("Segmenting");
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
logger.info("Mapping docs");
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments);
logger.info("Done");
return new ReversePreindex(segments, docs);
}
/** Transform the preindex into a reverse index */
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
var offsets = segments.counts;
Files.deleteIfExists(outputFileDocs);
Files.deleteIfExists(outputFileWords);
// Estimate the size of the docs index data
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
offsets.fold(0, 0, offsets.size(), sizeEstimator);
// Write the docs file
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(), new ReverseIndexBTreeTransformer(finalDocs, 2, ReverseIndexParameters.docsBTreeContext, intermediateDocChannel));
intermediateDocChannel.force(false);
}
LongArray wordIds = segments.wordIds;
assert offsets.size() == wordIds.size() : "Offsets and word-ids of different size";
// Estimate the size of the words index data
long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
// Construct the tree
LongArray wordsArray = LongArray.mmapForWriting(outputFileWords, wordsSize);
new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
.write(0, (int) offsets.size(), mapRegion -> {
for (long i = 0; i < offsets.size(); i++) {
mapRegion.set(2*i, wordIds.get(i));
mapRegion.set(2*i + 1, offsets.get(i));
}
});
wordsArray.force();
}
/** Delete all files associated with this pre-index */
public void delete() throws IOException {
segments.delete();
documents.delete();
}
public static ReversePreindex merge(Path destDir,
ReversePreindex left,
ReversePreindex right) throws IOException {
ReversePreindexWordSegments mergingSegment =
createMergedSegmentWordFile(destDir, left.segments, right.segments);
var mergingIter = mergingSegment.constructionIterator(2);
var leftIter = left.segments.iterator(2);
var rightIter = right.segments.iterator(2);
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
leftIter.next();
rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
{
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
final long currentWord = mergingIter.wordId;
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
}
}
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
// We may have overestimated the size of the merged docs file if there were
// duplicates in the data, so we shrink it to the actual size we wrote.
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
docsFile, 2 * mergingSegment.totalSize());
mergingSegment.force();
return new ReversePreindex(
mergingSegment,
new ReversePreindexDocuments(mergedDocuments, docsFile)
);
}
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
* This is an intermediate product in merging.
*/
static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
ReversePreindexWordSegments left,
ReversePreindexWordSegments right) throws IOException {
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
0, left.wordIds.size(),
0, right.wordIds.size());
LongArray wordIdsFile = LongArray.mmapForWriting(segmentWordsFile, segmentsSize);
mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
0, wordIdsFile.size(),
0, left.wordIds.size(),
0, right.wordIds.size());
LongArray counts = LongArray.mmapForWriting(segmentCountsFile, segmentsSize);
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
}
/** It's possible we overestimated the necessary size of the documents file;
* this permits us to shrink it down to the smallest necessary size.
*/
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
mergedDocuments.force();
long beforeSize = mergedDocuments.size();
try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
bc.truncate(sizeLongs * 8);
}
mergedDocuments = LongArray.mmapForWriting(docsFile, sizeLongs);
long afterSize = mergedDocuments.size();
if (beforeSize != afterSize) {
logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
}
return mergedDocuments;
}
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
* segment, and advance the construction iterator with the appropriate size.
*/
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
ReversePreindexWordSegments.SegmentIterator rightIter,
ReversePreindexDocuments left,
ReversePreindexDocuments right,
LongArray dest,
ReversePreindexWordSegments.SegmentConstructionIterator destIter)
{
long distinct = countDistinctElementsN(2,
left.documents,
right.documents,
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
mergeArrays2(dest,
left.documents,
right.documents,
destIter.startOffset,
destIter.startOffset + 2*distinct,
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
destIter.putNext(distinct);
leftIter.next();
rightIter.next();
}
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
dest.transferFrom(sourceChannel,
sourceIter.startOffset,
mergingIter.startOffset,
end);
boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next();
assert putNext || !iterNext : "Source iterator ran out before dest iterator?!";
return iterNext;
}
}
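
For illustration only (not part of this commit): createMergedSegmentWordFile above is essentially a merge of two sorted word-id lists in which duplicates are collapsed (countDistinctElements sizes the output, mergeArrays fills it). A plain-array sketch of that step, assuming ordinary long[] inputs instead of file-backed LongArrays:

import java.util.Arrays;

class SortedMergeSketch {
    /** Merge two sorted arrays of word ids into one sorted array without duplicates. */
    static long[] mergeDistinct(long[] left, long[] right) {
        long[] out = new long[left.length + right.length];
        int i = 0, j = 0, n = 0;
        while (i < left.length && j < right.length) {
            if (left[i] < right[j]) out[n++] = left[i++];
            else if (left[i] > right[j]) out[n++] = right[j++];
            else { out[n++] = left[i]; i++; j++; } // same word in both inputs: emit once
        }
        while (i < left.length) out[n++] = left[i++];
        while (j < right.length) out[n++] = right[j++];
        return Arrays.copyOf(out, n);              // shrink to the actual distinct count
    }

    public static void main(String[] args) {
        long[] merged = mergeDistinct(new long[] {10, 33, 40}, new long[] {10, 15, 33});
        System.out.println(Arrays.toString(merged)); // [10, 15, 33, 40]
    }
}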

View File

@ -0,0 +1,123 @@
package nu.marginalia.index.construction;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
private final Path file;
public final LongArray documents;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger= LoggerFactory.getLogger(ReversePreindexDocuments.class);
public ReversePreindexDocuments(LongArray documents, Path file) {
this.documents = documents;
this.file = file;
}
public static ReversePreindexDocuments construct(
Path docsFile,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
ReversePreindexWordSegments segments) throws IOException {
logger.info("Transferring data");
createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter);
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
logger.info("Sorting data");
sortDocsFile(docsFileMap, segments);
return new ReversePreindexDocuments(docsFileMap, docsFile);
}
public FileChannel createDocumentsFileChannel() throws IOException {
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
}
public LongArray slice(long start, long end) {
return documents.range(start, end);
}
public long size() {
return documents.size();
}
private static void createUnsortedDocsFile(Path docsFile,
IndexJournalReader reader,
ReversePreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException {
long fileSize = 8 * segments.totalSize();
LongArray outArray = LongArray.mmapForWriting(docsFile, fileSize);
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
for (var entry : reader) {
long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId());
var data = entry.readEntry();
for (int i = 0; i + 1 < data.size(); i+=2) {
long wordId = data.get(i);
long meta = data.get(i+1);
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
outArray.set(offset + 0, rankEncodedId);
outArray.set(offset + 1, meta);
}
}
outArray.force();
}
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
var iter = segments.iterator(RECORD_SIZE_LONGS);
ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
while (iter.next()) {
if (iter.size() < 1024) {
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
iter.startOffset,
iter.endOffset);
}
else {
sortingWorkers.execute(() ->
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
iter.startOffset,
iter.endOffset));
}
}
sortingWorkers.shutdown();
logger.info("Awaiting shutdown");
while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
sortingWorkers.close();
}
public void delete() throws IOException {
Files.delete(this.file);
}
}
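
For illustration only (not part of this commit): sortDocsFile above sorts each word's segment of interleaved (docId, meta) pairs by docId, two longs per record. A plain-array sketch of what quickSortN with record size 2 accomplishes for one segment; the real code sorts the mmapped file in place and parallelizes large segments:

import java.util.Arrays;
import java.util.Comparator;

class SegmentSortSketch {
    /** Sort interleaved (docId, meta) pairs in data[start..end) by docId. */
    static void sortPairsByDocId(long[] data, int start, int end) {
        int pairs = (end - start) / 2;
        long[][] tmp = new long[pairs][2];
        for (int i = 0; i < pairs; i++) {
            tmp[i][0] = data[start + 2 * i];      // docId
            tmp[i][1] = data[start + 2 * i + 1];  // term metadata
        }
        Arrays.sort(tmp, Comparator.comparingLong(p -> p[0]));
        for (int i = 0; i < pairs; i++) {
            data[start + 2 * i] = tmp[i][0];
            data[start + 2 * i + 1] = tmp[i][1];
        }
    }

    public static void main(String[] args) {
        long[] segment = { 300, 7, 100, 5, 200, 6 };
        sortPairsByDocId(segment, 0, segment.length);
        System.out.println(Arrays.toString(segment)); // [100, 5, 200, 6, 300, 7]
    }
}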

View File

@ -0,0 +1,197 @@
package nu.marginalia.index.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each wordId.
*/
public class ReversePreindexWordSegments {
public final LongArray wordIds;
public final LongArray counts;
private final Path wordsFile;
private final Path countsFile;
public ReversePreindexWordSegments(LongArray wordIds,
LongArray counts,
Path wordsFile,
Path countsFile)
{
assert wordIds.size() == counts.size();
this.wordIds = wordIds;
this.counts = counts;
this.wordsFile = wordsFile;
this.countsFile = countsFile;
}
/** Returns a long-long hash map where each key is a wordId,
* and each value is the start offset of the data.
*/
public Long2LongOpenHashMap asMap(int recordSize) {
Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
var iter = iterator(recordSize);
while (iter.next()) {
ret.put(iter.wordId, iter.startOffset);
}
return ret;
}
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
Path wordIdsFile,
Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
LongArray words = LongArray.mmapForWriting(wordIdsFile, countsMap.size());
LongArray counts = LongArray.mmapForWriting(countsFile, countsMap.size());
// Create the words file by iterating over the map and inserting them into
// the words file in whatever bizarro hash table order they appear in
int i = 0;
LongIterator iter = countsMap.keySet().iterator();
while (iter.hasNext()) {
words.set(i, iter.nextLong());
i++;
}
// Sort the words file
words.quickSort(0, counts.size());
// Populate the counts
for (i = 0; i < countsMap.size(); i++) {
counts.set(i, countsMap.get(words.get(i)));
}
return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
}
public SegmentIterator iterator(int recordSize) {
return new SegmentIterator(recordSize);
}
public SegmentConstructionIterator constructionIterator(int recordSize) {
return new SegmentConstructionIterator(recordSize);
}
public long totalSize() {
return counts.fold(0, 0, counts.size(), Long::sum);
}
public void delete() throws IOException {
Files.delete(countsFile);
Files.delete(wordsFile);
}
public void force() {
counts.force();
wordIds.force();
}
public class SegmentIterator {
private final int recordSize;
private final long fileSize;
long wordId;
long startOffset = 0;
long endOffset = 0;
private SegmentIterator(int recordSize) {
this.recordSize = recordSize;
this.fileSize = wordIds.size();
}
private int i = -1;
public int idx() {
return i;
}
public boolean next() {
if (++i >= fileSize) {
wordId = Long.MIN_VALUE;
return false;
}
wordId = wordIds.get(i);
startOffset = endOffset;
endOffset = startOffset + recordSize * counts.get(i);
return true;
}
public boolean hasMorePositions() {
return i + 1 < wordIds.size();
}
public boolean isPositionBeforeEnd() {
return i < wordIds.size();
}
public long size() {
return endOffset - startOffset;
}
}
class SegmentConstructionIterator {
private final int recordSize;
private final long fileSize;
long wordId;
long startOffset = 0;
long endOffset = 0;
private SegmentConstructionIterator(int recordSize) {
this.recordSize = recordSize;
this.fileSize = wordIds.size();
if (fileSize == 0) {
throw new IllegalArgumentException("Cannot construct zero-length word segment file");
}
this.wordId = wordIds.get(0);
}
private int i = 0;
public int idx() {
return i;
}
public boolean putNext(long size) {
if (i >= fileSize)
return false;
endOffset = startOffset + recordSize * size;
counts.set(i, size);
startOffset = endOffset;
endOffset = -1;
i++;
if (i == fileSize) {
// We've reached the end of the iteration and there is no
// "next" wordId to fetch
wordId = Long.MIN_VALUE;
return false;
}
else {
wordId = wordIds.get(i);
return true;
}
}
public boolean canPutMore() {
return i < wordIds.size();
}
}
}
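
For illustration only (not part of this commit): SegmentIterator above converts per-word document counts into [start, end) offsets into the documents array by accumulating recordSize * count. A minimal sketch of that arithmetic with made-up word ids and counts:

class SegmentOffsetSketch {
    public static void main(String[] args) {
        long[] wordIds = { 10, 33, 40 };
        long[] counts  = {  2,  1,  3 };   // documents per word
        int recordSize = 2;                // (docId, meta) pairs

        long start = 0;
        for (int i = 0; i < wordIds.length; i++) {
            long end = start + recordSize * counts[i];
            System.out.printf("word %d -> docs[%d..%d)%n", wordIds[i], start, end);
            start = end;
        }
        // word 10 -> docs[0..4), word 33 -> docs[4..6), word 40 -> docs[6..12)
    }
}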

View File

@ -1,218 +0,0 @@
package nu.marginalia.index.full;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.array.IntArray;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import nu.marginalia.service.control.ServiceHeartbeat;
import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext;
public class ReverseIndexFullConverter {
private static final int RWF_BIN_SIZE = 10_000_000;
private final ServiceHeartbeat heartbeat;
private final Path tmpFileDir;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final DomainRankings domainRankings;
private final Path outputFileWords;
private final Path outputFileDocs;
private final SortingContext sortingContext;
public ReverseIndexFullConverter(ServiceHeartbeat heartbeat,
Path tmpFileDir,
IndexJournalReader journalReader,
DomainRankings domainRankings,
Path outputFileWords,
Path outputFileDocs) {
this.heartbeat = heartbeat;
this.tmpFileDir = tmpFileDir;
this.journalReader = journalReader;
this.domainRankings = domainRankings;
this.outputFileWords = outputFileWords;
this.outputFileDocs = outputFileDocs;
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
}
public enum TaskSteps {
ACCUMULATE_STATISTICS,
INCREMENT_OFFSETS,
COUNT_OFFSETS,
CREATE_INTERMEDIATE_DOCS,
SORT_INTERMEDIATE_DOCS,
SIZING,
FINALIZING_DOCS,
FORCE,
FINISHED,
}
public void convert() throws IOException {
deleteOldFiles();
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
logger.warn("Bailing: Journal is empty!");
return;
}
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) {
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);
final IndexJournalStatistics statistics = journalReader.getStatistics();
final long wordsFileSize = statistics.highestWord() + 1;
progress.progress(TaskSteps.INCREMENT_OFFSETS);
logger.debug("Words file size: {}", wordsFileSize);
// Create a count of how many documents contain each word
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);
journalReader.forEachWordId(wordsOffsets::increment);
progress.progress(TaskSteps.COUNT_OFFSETS);
wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE));
progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);
// Construct an intermediate representation of the reverse documents index
try (FileChannel intermediateDocChannel =
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
{
// Construct intermediate index
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
)
{
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
}
intermediateDocChannel.force(false);
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);
// Sort each segment of the intermediate file
{
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexFullParameters.ENTRY_SIZE, s, e);
return e;
});
intermediateDocs.force();
}
progress.progress(TaskSteps.SIZING);
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
ReverseIndexFullParameters.bTreeContext,
ReverseIndexFullParameters.ENTRY_SIZE);
wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
progress.progress(TaskSteps.FINALIZING_DOCS);
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
// Construct the proper reverse index
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
wordsOffsets.write(outputFileWords);
progress.progress(TaskSteps.FORCE);
// Attempt to clean up before forcing (important disk space preservation)
Files.deleteIfExists(intermediateUrlsFile);
wordsOffsets.force();
finalDocs.force();
progress.progress(TaskSteps.FINISHED);
}
} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
} finally {
Files.deleteIfExists(intermediateUrlsFile);
}
}
private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileWords);
Files.deleteIfExists(outputFileDocs);
}
private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {
private final LongArray wordRangeEnds;
private final IntArray wordRangeOffset;
private final RandomWriteFunnel documentsFile;
private final Path tempFile;
public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
tempFile = Files.createTempFile(tempDir, "iic", "dat");
this.wordRangeEnds = wordRangeEnds;
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
this.documentsFile = documentsFile;
}
@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {
/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/
int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;
final int wordId = record.wordId();
long offset = startOfRange(wordId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
}
private long startOfRange(int wordId) {
if (wordId == 0) return 0;
return wordRangeEnds.get(wordId - 1);
}
public void close() throws IOException {
Files.delete(tempFile);
}
}
}
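
For illustration only (not part of this commit): the id encoding described in the comment inside accept() packs a 32-bit ranking and a 32-bit url id into one long so that an ascending sort considers ranking first. A standalone sketch of the packing and unpacking, with made-up values:

class RankEncodingSketch {
    static long encode(int ranking, int urlId) {
        return ((long) ranking << 32) | (urlId & 0xFFFF_FFFFL);
    }

    public static void main(String[] args) {
        long a = encode(5, 123);   // lower ranking value
        long b = encode(7, 42);    // higher ranking value
        System.out.println(a < b);                    // true: ranking dominates the sort order
        System.out.println((int) (a >>> 32));         // 5   (ranking)
        System.out.println((int) (a & 0xFFFF_FFFFL)); // 123 (url id)
    }
}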

View File

@ -1,16 +0,0 @@
package nu.marginalia.index.full;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexFullParameters {
static final int ENTRY_SIZE = 2;
// This is the byte size per index page on disk; the data pages are twice as large due to ENTRY_SIZE = 2.
//
// Given a hardware limit of 4k reads, 2k block size should be optimal.
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048;
static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
}

View File

@ -1,215 +0,0 @@
package nu.marginalia.index.priority;
import lombok.SneakyThrows;
import nu.marginalia.array.IntArray;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.service.control.ServiceHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext;
public class ReverseIndexPriorityConverter {
private static final int RWF_BIN_SIZE = 10_000_000;
private final ServiceHeartbeat heartbeat;
private final Path tmpFileDir;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final DomainRankings domainRankings;
private final Path outputFileWords;
private final Path outputFileDocs;
private final SortingContext sortingContext;
public ReverseIndexPriorityConverter(ServiceHeartbeat heartbeat,
Path tmpFileDir,
IndexJournalReader journalReader,
DomainRankings domainRankings,
Path outputFileWords,
Path outputFileDocs) {
this.heartbeat = heartbeat;
this.tmpFileDir = tmpFileDir;
this.journalReader = journalReader;
this.domainRankings = domainRankings;
this.outputFileWords = outputFileWords;
this.outputFileDocs = outputFileDocs;
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
}
public enum TaskSteps {
ACCUMULATE_STATISTICS,
INCREMENT_OFFSETS,
COUNT_OFFSETS,
CREATE_INTERMEDIATE_DOCS,
SORT_INTERMEDIATE_DOCS,
SIZING,
FINALIZING_DOCS,
FORCE,
FINISHED,
}
public void convert() throws IOException {
deleteOldFiles();
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
logger.warn("Bailing: Journal is empty!");
return;
}
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) {
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);
final IndexJournalStatistics statistics = journalReader.getStatistics();
final long wordsFileSize = statistics.highestWord() + 1;
progress.progress(TaskSteps.INCREMENT_OFFSETS);
logger.debug("Words file size: {}", wordsFileSize);
// Create a count of how many documents contain each word
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);
journalReader.forEachWordId(wordsOffsets::increment);
progress.progress(TaskSteps.COUNT_OFFSETS);
wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE));
progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);
// Construct an intermediate representation of the reverse documents index
try (FileChannel intermediateDocChannel =
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
{
// Construct intermediate index
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
)
{
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
}
intermediateDocChannel.force(false);
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);
// Sort each segment of the intermediate file
{
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
intermediateDocs.sortLargeSpan(sortingContext, s, e);
return e;
});
intermediateDocs.force();
}
progress.progress(TaskSteps.SIZING);
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
bTreeContext,
ReverseIndexPriorityParameters.ENTRY_SIZE);
wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
progress.progress(TaskSteps.FINALIZING_DOCS);
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
// Construct the proper reverse index
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
wordsOffsets.write(outputFileWords);
progress.progress(TaskSteps.FORCE);
// Attempt to clean up before forcing (important disk space preservation)
Files.deleteIfExists(intermediateUrlsFile);
wordsOffsets.force();
finalDocs.force();
progress.progress(TaskSteps.FINISHED);
}
} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
} finally {
Files.deleteIfExists(intermediateUrlsFile);
}
}
private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileWords);
Files.deleteIfExists(outputFileDocs);
}
private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {
private final LongArray wordRangeEnds;
private final IntArray wordRangeOffset;
private final RandomWriteFunnel documentsFile;
private final Path tempFile;
public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
tempFile = Files.createTempFile(tempDir, "iic", "dat");
this.wordRangeEnds = wordRangeEnds;
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
this.documentsFile = documentsFile;
}
@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {
/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/
int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;
final int wordId = record.wordId();
long offset = startOfRange(wordId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
}
private long startOfRange(int wordId) {
if (wordId == 0) return 0;
return wordRangeEnds.get(wordId - 1);
}
public void close() throws IOException {
Files.delete(tempFile);
}
}
}

View File

@ -1,48 +0,0 @@
package nu.marginalia.index.priority;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
import static java.lang.Math.min;
public class ReverseIndexPriorityEntrySource implements EntrySource {
private final BTreeReader reader;
int pos;
int endOffset;
private final int wordId;
public ReverseIndexPriorityEntrySource(BTreeReader reader, int wordId) {
this.reader = reader;
this.wordId = wordId;
pos = 0;
endOffset = pos + reader.numEntries();
}
@Override
public void skip(int n) {
pos += n;
}
@Override
public void read(LongQueryBuffer buffer) {
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;
buffer.uniq();
}
@Override
public boolean hasMore() {
return pos < endOffset;
}
@Override
public String indexName() {
return "Priority:" + wordId;
}
}

View File

@ -1,31 +0,0 @@
package nu.marginalia.index.priority;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.model.idx.WordFlags;
public class ReverseIndexPriorityParameters {
static final int ENTRY_SIZE = 1;
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_4096;
static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
private static final long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.TfIdfHigh.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.SiteAdjacent.asBit();
public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) {
long meta = record.metadata();
return (meta & highPriorityFlags) != 0;
}
}

View File

@ -1,77 +0,0 @@
package nu.marginalia.index.priority;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class ReverseIndexPriorityReader {
private final LongArray words;
private final LongArray documents;
private final Logger logger = LoggerFactory.getLogger(getClass());
public ReverseIndexPriorityReader(Path words, Path documents) throws IOException {
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
return;
}
logger.info("Switching prio reverse index");
this.words = LongArray.mmapRead(words);
this.documents = LongArray.mmapRead(documents);
}
public EntrySource priorityDocuments(int wordId) {
if (words == null) {
// index not loaded
return new EmptyEntrySource();
}
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
long offset = words.get(wordId);
if (offset < 0) return new EmptyEntrySource();
return new ReverseIndexPriorityEntrySource(createReaderNew(offset), wordId);
}
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
}
public QueryFilterStepIf also(int wordId) {
if (wordId < 0) return new QueryFilterNoPass();
long offset = words.get(wordId);
if (offset < 0) return new QueryFilterNoPass();
return new ReverseIndexRetainFilter(createReaderNew(offset), "priority", wordId);
}
public int numDocuments(int wordId) {
if (wordId < 0)
return 0;
long offset = words.get(wordId);
if (offset < 0)
return 0;
return createReaderNew(offset).numEntries();
}
}

View File

@ -4,7 +4,7 @@ import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
public record ReverseIndexRetainFilter(BTreeReader range, String name, int wordId) implements QueryFilterStepIf {
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
@Override
public void apply(LongQueryBuffer buffer) {

View File

@ -0,0 +1,109 @@
package nu.marginalia.index;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.*;
class ReverseIndexReaderTest {
TestJournalFactory journalFactory;
Path tempDir;
SortingContext sortingContext;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
tempDir = Files.createTempDirectory("sort");
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
@Test
public void testSimple() throws IOException {
var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51))
);
assertEquals(1, indexReader.numDocuments(50));
long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
assertArrayEquals(new long[] { 51 }, meta);
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
}
@Test
public void test2x2() throws IOException {
var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
);
assertEquals(1, indexReader.numDocuments(50));
assertEquals(2, indexReader.numDocuments(51));
assertEquals(1, indexReader.numDocuments(52));
assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
}
private long[] readEntries(ReverseIndexReader reader, long wordId) {
var es = reader.documents(wordId);
assertTrue(es.hasMore());
LongQueryBuffer buffer = new LongQueryBuffer(4);
es.read(buffer);
assertFalse(es.hasMore());
return buffer.copyData();
}
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
Path docsFile = tempDir.resolve("docs.dat");
Path wordsFile = tempDir.resolve("words.dat");
preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();
return new ReverseIndexReader(wordsFile, docsFile);
}
}

View File

@ -0,0 +1,171 @@
package nu.marginalia.index.construction;
import nu.marginalia.array.algo.SortingContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ReversePreindexDocsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
TestJournalFactory journalFactory;
@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}
@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}
@Test
public void testDocs() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();
var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
assertEquals(expected, actual);
}
@Test
public void testDocsRepeatedWord() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 4, 4)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();
var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
assertEquals(expected, actual);
}
@Test
public void testDocs2() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
new TestSegmentData(10, 4, 6, new long[] { -0xF00BA3L, 0}),
new TestSegmentData(15, 6, 8, new long[] { 0xF00BA4L, 0}),
new TestSegmentData(30, 8, 10, new long[] { 0xF00BA4L, 0}),
new TestSegmentData(33, 10, 14, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0}),
new TestSegmentData(40, 14, 16, new long[] { -0xF00BA3L, 0})
);
List<TestSegmentData> actual = new ArrayList<>();
var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}
assertEquals(expected, actual);
}
record TestSegmentData(long wordId, long start, long end, long[] data) {
public TestSegmentData(long wordId, long start, long end) {
this(wordId, start, end, null);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TestSegmentData that = (TestSegmentData) o;
if (wordId != that.wordId) return false;
if (start != that.start) return false;
if (end != that.end) return false;
return Arrays.equals(data, that.data);
}
@Override
public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32));
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);
return result;
}
@Override
public String toString() {
return "TestSegmentData{" +
"wordId=" + wordId +
", start=" + start +
", end=" + end +
", data=" + Arrays.toString(data) +
'}';
}
}
}

Some files were not shown because too many files have changed in this diff.