Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

Merge pull request #42 from MarginaliaSearch/no-downtime-upgrades:
Zero downtime upgrades, merge-based index construction

Commit bdcbfb11a8

build.gradle (12 lines changed)
@@ -13,6 +13,10 @@ tasks.register('dist', Copy) {
    from subprojects.collect { it.tasks.withType(Tar) }
    into "$buildDir/dist"

    // For local development, each process that is to be triggerable
    // from the control-service needs to go here to end up somewhere the
    // control-service can find it
    doLast {
        copy {
            from tarTree("$buildDir/dist/converter-process.tar")
@@ -34,10 +38,18 @@ tasks.register('dist', Copy) {
            from tarTree("$buildDir/dist/crawl-job-extractor-process.tar")
            into "$projectDir/run/dist/"
        }
        copy {
            from tarTree("$buildDir/dist/index-construction-process.tar")
            into "$projectDir/run/dist/"
        }
    }
}
idea {
    module {
        // Exclude these directories from being indexed by IntelliJ
        // as they tend to bring the IDE to its knees and use up all
        // Inotify spots in a hurry
        excludeDirs.add(file("$projectDir/run/backup"))
        excludeDirs.add(file("$projectDir/run/model"))
        excludeDirs.add(file("$projectDir/run/dist"))
        excludeDirs.add(file("$projectDir/run/samples"))
@@ -3,8 +3,6 @@ package nu.marginalia.index.client;
public class IndexMqEndpoints {
    public static final String INDEX_IS_BLOCKED = "INDEX-IS-BLOCKED";
    public static final String INDEX_REPARTITION = "INDEX-REPARTITION";

    public static final String INDEX_RELOAD_LEXICON = "INDEX-RELOAD-LEXICON";
    public static final String INDEX_REINDEX = "INDEX-REINDEX";
    public static final String SWITCH_INDEX = "SWITCH-INDEX";

}
@@ -2,16 +2,17 @@ package nu.marginalia.index.client.model.results;

import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem {
    /** Encoded ID that contains both the URL id and its ranking */
public class SearchResultItem implements Comparable<SearchResultItem> {
    /** Encoded ID that contains both the URL id and its ranking. This is
     * probably not what you want, use getDocumentId() instead */
    public final long combinedId;

    /** How did the subqueries match against the document ? */
@@ -20,20 +21,18 @@ public class SearchResultItem {
    /** How many other potential results existed in the same domain */
    public int resultsFromDomain;

    public SearchResultItem(long val) {
        this.combinedId = val;
    public SearchResultItem(long combinedId) {
        this.combinedId = combinedId;
        this.keywordScores = new ArrayList<>(16);
    }

    public EdgeId<EdgeUrl> getUrlId() {
        return new EdgeId<>(getUrlIdInt());

    public long getDocumentId() {
        return UrlIdCodec.removeRank(combinedId);
    }

    public int getUrlIdInt() {
        return (int)(combinedId & 0xFFFF_FFFFL);
    }
    public int getRanking() {
        return (int)(combinedId >>> 32);
        return UrlIdCodec.getRank(combinedId);
    }

    /* Used for evaluation */
@@ -45,20 +44,16 @@ public class SearchResultItem {
        return scoreValue;
    }

    private transient int domainId = Integer.MIN_VALUE;
    public void setDomainId(int domainId) {
        this.domainId = domainId;
    }
    public int getDomainId() {
        return this.domainId;
        return UrlIdCodec.getDomainId(this.combinedId);
    }

    public int hashCode() {
        return getUrlIdInt();
        return Long.hashCode(combinedId);
    }

    public String toString() {
        return getClass().getSimpleName() + "[ url= " + getUrlId() + ", rank=" + getRanking() + "]";
        return getClass().getSimpleName() + "[ url= " + getDocumentId() + ", rank=" + getRanking() + "]";
    }

    public boolean equals(Object other) {
@@ -67,18 +62,18 @@ public class SearchResultItem {
        if (other == this)
            return true;
        if (other instanceof SearchResultItem o) {
            return o.getUrlIdInt() == getUrlIdInt();
            return o.getDocumentId() == getDocumentId();
        }
        return false;
    }

    public long deduplicationKey() {
        final int domainId = getDomainId();
    @Override
    public int compareTo(@NotNull SearchResultItem o) {
        // this looks like a bug, but we actually want this in a reversed order
        int diff = o.getScore().compareTo(getScore());
        if (diff != 0)
            return diff;

        if (domainId == Integer.MAX_VALUE || domainId == Integer.MIN_VALUE) {
            return 0;
        }

        return domainId;
        return Long.compare(this.combinedId, o.combinedId);
    }
}
@@ -6,7 +6,6 @@ import static java.lang.Boolean.compare;
import static java.lang.Double.compare;

public record SearchResultPreliminaryScore(
        boolean disqualified,
        boolean hasPriorityTerm,
        double searchRankingScore)
    implements Comparable<SearchResultPreliminaryScore>
@@ -25,7 +24,4 @@ public record SearchResultPreliminaryScore(
        return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
    }

    public boolean isDisqualified() {
        return disqualified;
    }
}
@@ -4,4 +4,6 @@ public class ProcessInboxNames {
    public static final String CONVERTER_INBOX = "converter";
    public static final String LOADER_INBOX = "loader";
    public static final String CRAWLER_INBOX = "crawler";

    public static final String INDEX_CONSTRUCTOR_INBOX = "index_constructor";
}
@@ -0,0 +1,5 @@
package nu.marginalia.mqapi.index;

public record CreateIndexRequest(IndexName indexName)
{
}
@@ -0,0 +1,7 @@
package nu.marginalia.mqapi.index;

public enum IndexName {
    FORWARD,
    REVERSE_FULL,
    REVERSE_PRIO
}
@@ -3,4 +3,5 @@ package nu.marginalia.search.client;
public class SearchMqEndpoints {
    /** Flushes the URL caches, run if significant changes have occurred in the URLs database */
    public static final String FLUSH_CACHES = "FLUSH_CACHES";
    public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}
@@ -9,16 +9,16 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;

@Singleton
public class DbDomainQueries {
    private final HikariDataSource dataSource;

    private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
    private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

    @Inject
    public DbDomainQueries(HikariDataSource dataSource)
@@ -28,7 +28,7 @@ public class DbDomainQueries {

    @SneakyThrows
    public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
    public Integer getDomainId(EdgeDomain domain) {
        try (var connection = dataSource.getConnection()) {

            return domainIdCache.get(domain, () -> {
@@ -36,7 +36,7 @@ public class DbDomainQueries {
                stmt.setString(1, domain.toString());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return new EdgeId<>(rsp.getInt(1));
                    return rsp.getInt(1);
                }
            }
            throw new NoSuchElementException();
@@ -48,12 +48,12 @@ public class DbDomainQueries {
    }

    @SneakyThrows
    public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
    public OptionalInt tryGetDomainId(EdgeDomain domain) {

        var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));

        if (maybe.isPresent())
            return maybe;
        Integer maybeId = domainIdCache.getIfPresent(domain);
        if (maybeId != null) {
            return OptionalInt.of(maybeId);
        }

        try (var connection = dataSource.getConnection()) {

@@ -61,25 +61,25 @@ public class DbDomainQueries {
                stmt.setString(1, domain.toString());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
                    var id = rsp.getInt(1);

                    domainIdCache.put(domain, id);
                    return Optional.of(id);
                    return OptionalInt.of(id);
                }
            }
            return Optional.empty();
            return OptionalInt.empty();
        }
        catch (UncheckedExecutionException ex) {
            return Optional.empty();
            return OptionalInt.empty();
        }
    }

    @SneakyThrows
    public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
    public Optional<EdgeDomain> getDomain(int id) {
        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                stmt.setInt(1, id.id());
                stmt.setInt(1, id);
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return Optional.of(new EdgeDomain(rsp.getString(1)));
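The change above replaces the EdgeId<EdgeDomain> wrapper with plain int / OptionalInt domain ids. A brief sketch of what calling code might look like after this change; the variable names and the looked-up domain are illustrative only, not part of this commit:

// Hypothetical caller, illustrating the new primitive-id API of DbDomainQueries.
OptionalInt maybeId = dbDomainQueries.tryGetDomainId(new EdgeDomain("www.marginalia.nu"));
if (maybeId.isPresent()) {
    int domainId = maybeId.getAsInt();
    // Reverse lookup returns the domain name for the numeric id, if it exists
    Optional<EdgeDomain> roundTrip = dbDomainQueries.getDomain(domainId);
    roundTrip.ifPresent(domain -> System.out.println("Resolved " + domainId + " -> " + domain));
}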
@@ -2,15 +2,10 @@ package nu.marginalia.db;

import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;

@ImplementedBy(DomainBlacklistImpl.class)
public interface DomainBlacklist {
    boolean isBlacklisted(int domainId);
    default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
        return isBlacklisted(domainId.id());
    }
    default TIntHashSet getSpamDomains() {
        return new TIntHashSet();
    }
@@ -1,13 +1,14 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@@ -58,10 +59,10 @@ public class DomainTypes {
        return ret;
    }

    /** Retrieve the EdgeId of all domains of a certain type,
    /** Retrieve the domain id of all domains of a certain type,
     * ignoring entries that are not in the EC_DOMAIN table */
    public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
        EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
    public TIntList getKnownDomainsByType(Type type) {
        TIntList ret = new TIntArrayList();

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
@@ -5,8 +5,8 @@ import nu.marginalia.db.storage.model.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Singleton;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -5,9 +5,9 @@ public enum FileStorageType {
    CRAWL_DATA,
    PROCESSED_DATA,
    INDEX_STAGING,
    LEXICON_STAGING,
    LINKDB_STAGING,
    LINKDB_LIVE,
    INDEX_LIVE,
    LEXICON_LIVE,
    BACKUP,
    EXPORT,
    SEARCH_SETS
@@ -0,0 +1,9 @@
ALTER TABLE FILE_STORAGE MODIFY COLUMN TYPE ENUM ('CRAWL_SPEC', 'CRAWL_DATA', 'PROCESSED_DATA', 'INDEX_STAGING', 'LEXICON_STAGING', 'INDEX_LIVE', 'LEXICON_LIVE', 'SEARCH_SETS', 'BACKUP', 'EXPORT', 'LINKDB_LIVE', 'LINKDB_STAGING') NOT NULL;

INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbr', "Linkdb Current", 'LINKDB_LIVE'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';

INSERT IGNORE INTO FILE_STORAGE(BASE_ID, PATH, DESCRIPTION, TYPE)
SELECT ID, 'ldbw', "Linkdb Staging Area", 'LINKDB_STAGING'
FROM FILE_STORAGE_BASE WHERE NAME='Index Storage';
@@ -0,0 +1,3 @@
DROP VIEW EC_URL_VIEW;
DROP TABLE EC_PAGE_DATA;
DROP TABLE EC_URL;
@@ -0,0 +1,3 @@
INSERT IGNORE INTO FILE_STORAGE_BASE(NAME, PATH, TYPE, PERMIT_TEMP)
VALUES
('Backup Storage', '/backup', 'BACKUP', true);
@@ -0,0 +1 @@
DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE');
code/common/linkdb/build.gradle (new file, 56 lines)
@@ -0,0 +1,56 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "8.2.2"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(20))
    }
}

configurations {
    flywayMigration.extendsFrom(implementation)
}

dependencies {
    implementation project(':code:common:model')

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.guice
    implementation libs.bundles.gson

    implementation libs.notnull

    implementation libs.sqlite
    implementation libs.commons.lang3

    implementation libs.trove

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
}


test {
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"
    useJUnitPlatform()
}

task fastTests(type: Test) {
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"
    useJUnitPlatform {
        excludeTags "slow"
    }
}
@@ -0,0 +1,102 @@
package nu.marginalia.linkdb;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;

import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

@Singleton
public class LinkdbReader {
    private Path dbFile;
    private volatile Connection connection;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
        this.dbFile = dbFile;

        if (Files.exists(dbFile)) {
            try {
                connection = createConnection();
            }
            catch (SQLException ex) {
                connection = null;
                logger.error("Failed to load linkdb file", ex);
            }
        }
        else {
            logger.warn("No linkdb file {}", dbFile);
        }
    }

    private Connection createConnection() throws SQLException {
        String connStr = "jdbc:sqlite:" + dbFile.toString();
        return DriverManager.getConnection(connStr);
    }

    public void switchInput(Path newDbFile) throws IOException, SQLException {
        if (connection != null) {
            connection.close();
        }

        Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING);

        connection = createConnection();
    }

    public List<LdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
        List<LdbUrlDetail> ret = new ArrayList<>(ids.size());

        if (connection == null ||
            connection.isClosed())
        {
            throw new RuntimeException("URL query temporarily unavailable due to database switch");
        }

        try (var stmt = connection.prepareStatement("""
                SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
                FROM DOCUMENT WHERE ID = ?
                """)) {
            for (int i = 0; i < ids.size(); i++) {
                long id = ids.get(i);
                stmt.setLong(1, id);
                var rs = stmt.executeQuery();
                if (rs.next()) {
                    var url = new EdgeUrl(rs.getString("URL"));
                    ret.add(new LdbUrlDetail(
                            rs.getLong("ID"),
                            url,
                            rs.getString("TITLE"),
                            rs.getString("DESCRIPTION"),
                            rs.getDouble("QUALITY"),
                            rs.getString("FORMAT"),
                            rs.getInt("FEATURES"),
                            rs.getInt("PUB_YEAR"),
                            rs.getLong("DATA_HASH"),
                            rs.getInt("WORDS_TOTAL")
                    ));
                }
            }
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
        return ret;
    }
}
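The switchInput() method above is the core of the zero-downtime linkdb upgrade: queries keep hitting the old SQLite file until the new one is moved into place and a fresh connection is opened. A minimal sketch of how a service might drive the switch when a staging database arrives; the paths, file name, and handler method are hypothetical and not part of this commit:

// Hypothetical glue code: swap in a newly constructed linkdb once the
// index-construction process has produced it in a staging area.
import nu.marginalia.linkdb.LinkdbReader;

import java.nio.file.Files;
import java.nio.file.Path;

class LinkdbSwitchExample {
    private final LinkdbReader reader;

    LinkdbSwitchExample(LinkdbReader reader) {
        this.reader = reader;
    }

    /** Assumed to be invoked when a SWITCH_LINKDB message arrives. */
    void onSwitchLinkdb(Path stagingDir) throws Exception {
        Path candidate = stagingDir.resolve("links.db"); // assumed file name
        if (Files.exists(candidate)) {
            // Briefly closes the old connection, moves the new file over the
            // live one, and reconnects; readers see a short unavailability
            // window instead of a full service restart.
            reader.switchInput(candidate);
        }
    }
}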
@@ -0,0 +1,64 @@
package nu.marginalia.linkdb;

import nu.marginalia.linkdb.model.UrlStatus;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;

public class LinkdbStatusWriter {

    private final Connection connection;

    public LinkdbStatusWriter(Path outputFile) throws SQLException {
        String connStr = "jdbc:sqlite:" + outputFile.toString();
        connection = DriverManager.getConnection(connStr);

        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-status.sql");
             var stmt = connection.createStatement()
        ) {
            var sql = new String(stream.readAllBytes());
            stmt.executeUpdate(sql);

            // Disable synchronous writing as this is a one-off operation with no recovery
            stmt.execute("PRAGMA synchronous = OFF");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void add(List<UrlStatus> statuses) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT INTO STATUS(ID, URL, STATUS, DESCRIPTION)
                VALUES (?, ?, ?, ?)
                """)) {
            int count = 0;
            for (var status : statuses) {
                stmt.setLong(1, status.id());
                stmt.setString(2, status.url().toString());
                stmt.setString(3, status.status());
                if (status.description() == null) {
                    stmt.setNull(4, Types.VARCHAR);
                } else {
                    stmt.setString(4, status.description());
                }
                stmt.addBatch();
                if (++count > 1000) {
                    count = 0;
                    stmt.executeBatch();
                }
            }
            if (count != 0) {
                stmt.executeBatch();
            }
        }
    }

    public void close() throws SQLException {
        connection.close();
    }
}
@@ -0,0 +1,80 @@
package nu.marginalia.linkdb;

import nu.marginalia.linkdb.model.LdbUrlDetail;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;

public class LinkdbWriter {

    private final Connection connection;

    public LinkdbWriter(Path outputFile) throws SQLException {
        String connStr = "jdbc:sqlite:" + outputFile.toString();
        connection = DriverManager.getConnection(connStr);

        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql");
             var stmt = connection.createStatement()
        ) {
            var sql = new String(stream.readAllBytes());
            stmt.executeUpdate(sql);

            // Disable synchronous writing as this is a one-off operation with no recovery
            stmt.execute("PRAGMA synchronous = OFF");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public void add(LdbUrlDetail ldbUrlDetail) throws SQLException {
        add(List.of(ldbUrlDetail));
    }

    public void add(List<LdbUrlDetail> ldbUrlDetail) throws SQLException {

        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """)) {

            int i = 0;
            for (var document : ldbUrlDetail) {
                var url = document.url();

                stmt.setLong(1, document.urlId());
                stmt.setString(2, url.toString());

                stmt.setString(3, document.title());
                stmt.setString(4, document.description());
                stmt.setInt(5, document.wordsTotal());
                stmt.setString(6, document.format());
                stmt.setInt(7, document.features());
                stmt.setLong(8, document.dataHash());
                stmt.setDouble(9, document.urlQuality());
                if (document.pubYear() == null) {
                    stmt.setNull(10, Types.INTEGER);
                } else {
                    stmt.setInt(10, document.pubYear());
                }

                stmt.addBatch();

                if (++i > 1000) {
                    stmt.executeBatch();
                    i = 0;
                }
            }

            if (i != 0) stmt.executeBatch();
        }
    }

    public void close() throws SQLException {
        connection.close();
    }
}
@@ -0,0 +1,18 @@
package nu.marginalia.linkdb.model;

import nu.marginalia.model.EdgeUrl;

public record LdbUrlDetail(long urlId,
                           EdgeUrl url,
                           String title,
                           String description,
                           double urlQuality,
                           String format,
                           int features,
                           Integer pubYear,
                           long dataHash,
                           int wordsTotal
)

{
}
@@ -0,0 +1,24 @@
package nu.marginalia.linkdb.model;

public enum UrlProtocol {
    HTTP,
    HTTPS;

    public static int encode(String str) {
        if ("http".equalsIgnoreCase(str)) {
            return HTTP.ordinal();
        }
        else if ("https".equalsIgnoreCase(str)) {
            return HTTPS.ordinal();
        }

        throw new IllegalArgumentException(str);
    }

    public static String decode(int ordinal) {
        return switch (values()[ordinal]) {
            case HTTP -> "http";
            case HTTPS -> "https";
        };
    };
}
@@ -0,0 +1,8 @@
package nu.marginalia.linkdb.model;

import nu.marginalia.model.EdgeUrl;

import javax.annotation.Nullable;

public record UrlStatus(long id, EdgeUrl url, String status, @Nullable String description) {
}
code/common/linkdb/src/main/resources/db/linkdb-document.sql (new file, 17 lines)
@@ -0,0 +1,17 @@
CREATE TABLE DOCUMENT (
    ID INT8 PRIMARY KEY,

    URL TEXT,

    STATE INT,
    TITLE TEXT NOT NULL,
    DESCRIPTION TEXT NOT NULL,

    WORDS_TOTAL INTEGER NOT NULL,
    FORMAT TEXT NOT NULL,
    FEATURES INTEGER NOT NULL,

    DATA_HASH INTEGER NOT NULL,
    QUALITY REAL NOT NULL,
    PUB_YEAR INTEGER NOT NULL
);
@@ -0,0 +1,6 @@
CREATE TABLE STATUS (
    ID INT8 PRIMARY KEY,
    URL TEXT,
    STATUS TEXT NOT NULL,
    DESCRIPTION TEXT
);
@@ -0,0 +1,33 @@
package nu.marginalia.linkdb;

import nu.marginalia.linkdb.model.UrlStatus;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;

public class LinkdbStatusWriterTest {
    @Test
    public void testCreate() throws IOException {
        Path tempPath = Files.createTempFile("linkdb-status", ".db");
        try {
            var writer = new LinkdbStatusWriter(tempPath);
            writer.add(List.of(
                    new UrlStatus(5, new EdgeUrl("https://www.marginalia.nu/x"), "y", null),
                    new UrlStatus(6, new EdgeUrl("https://www.marginalia.nu/y"), "y", "z")
            ));
            writer.close();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        } finally {
            Files.deleteIfExists(tempPath);
        }
    }
}
@@ -0,0 +1,42 @@
package nu.marginalia.linkdb;

import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.linkdb.model.LdbUrlDetail;
import nu.marginalia.model.EdgeDomain;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;

public class LinkdbWriterTest {
    @Test
    public void testCreate() throws IOException {
        Path tempPath = Files.createTempFile("linkdb", ".db");
        try {
            var writer = new LinkdbWriter(tempPath);
            writer.add(new LdbUrlDetail(
                    1,
                    new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
                    "Test",
                    "This is a test",
                    -4.,
                    "XHTML",
                    5,
                    2020,
                    0xF00BA3,
                    444
            ));
            writer.close();

            var reader = new LinkdbReader(tempPath);
            var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
            System.out.println(deets);
        } catch (SQLException e) {
            throw new RuntimeException(e);
        } finally {
            Files.deleteIfExists(tempPath);
        }
    }
}
@@ -6,7 +6,6 @@ import nu.marginalia.bigstring.BigString;
import nu.marginalia.bigstring.CompressedBigString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;

import java.net.URISyntaxException;

@@ -24,8 +23,6 @@ public class GsonFactory {
            }
        })
        .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
        .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
        .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
        .registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
        .registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
        .registerTypeAdapter(CompressedBigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
@ -1,10 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
|
||||
/**
|
||||
* This exists entirely for strengthening the typing of IDs
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
public record EdgeId<T>(int id) {
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public record EdgeIdArray<T> (int... values) implements EdgeIdCollection<T> {
|
||||
|
||||
public static <T> EdgeIdArray<T> gather(IntStream stream) {
|
||||
return new EdgeIdArray<>(stream.toArray());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] values() {
|
||||
return values;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return values.length == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return values.length;
|
||||
}
|
||||
|
||||
public int get(int idx) {
|
||||
return values[idx];
|
||||
}
|
||||
|
||||
public void sort() {
|
||||
Arrays.sort(values);
|
||||
}
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public interface EdgeIdCollection<T> extends Iterable<EdgeId<T>> {
|
||||
int size();
|
||||
boolean isEmpty();
|
||||
int[] values();
|
||||
|
||||
default IntStream stream() {
|
||||
return Arrays.stream(values());
|
||||
}
|
||||
|
||||
default Iterator<EdgeId<T>> iterator() {
|
||||
return Arrays.stream(values()).mapToObj(EdgeId<T>::new).iterator();
|
||||
}
|
||||
default EdgeIdArray<T> asArray() {
|
||||
return new EdgeIdArray<>(values());
|
||||
}
|
||||
default EdgeIdList<T> asList() {
|
||||
return new EdgeIdList<>(values());
|
||||
}
|
||||
default EdgeIdSet<T> asSet() {
|
||||
return new EdgeIdSet<>(values());
|
||||
}
|
||||
}
|
@ -1,12 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import gnu.trove.TIntCollection;
|
||||
|
||||
public interface EdgeIdCollectionMutable<T> {
|
||||
TIntCollection underlyingCollection();
|
||||
|
||||
default void addAll(EdgeIdArray<T> other) { underlyingCollection().addAll(other.values()); }
|
||||
default void addAll(EdgeIdList<T> other) { underlyingCollection().addAll(other.list()); }
|
||||
default void addAll(EdgeIdCollection<T> other) { underlyingCollection().addAll(other.values()); }
|
||||
|
||||
}
|
@ -1,48 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import gnu.trove.TIntCollection;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public record EdgeIdList<T> (TIntArrayList list) implements
|
||||
EdgeIdCollection<T>,
|
||||
EdgeIdCollectionMutable<T> {
|
||||
|
||||
public EdgeIdList(int... values) { this(new TIntArrayList(values)); }
|
||||
public static <T> EdgeIdList<T> gather(IntStream stream) {
|
||||
return stream.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] values() {
|
||||
return list.toArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return list.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return list.size();
|
||||
}
|
||||
|
||||
public int get(int idx) {
|
||||
return list.get(idx);
|
||||
}
|
||||
|
||||
public void add(int id) {
|
||||
list.add(id);
|
||||
}
|
||||
|
||||
public void sort() {
|
||||
list.sort();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TIntCollection underlyingCollection() {
|
||||
return list;
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import gnu.trove.TIntCollection;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public record EdgeIdSet<T> (TIntHashSet set) implements EdgeIdCollection<T>, EdgeIdCollectionMutable<T> {
|
||||
|
||||
public EdgeIdSet(int... values) {
|
||||
this(new TIntHashSet(values.length, 0.5f, -1));
|
||||
|
||||
set.addAll(values);
|
||||
}
|
||||
|
||||
public EdgeIdSet(int initialCapacity, float loadFactor) {
|
||||
this(new TIntHashSet(initialCapacity, loadFactor, -1));
|
||||
}
|
||||
|
||||
@Override
|
||||
public TIntCollection underlyingCollection() {
|
||||
return set;
|
||||
}
|
||||
|
||||
public static <T> EdgeIdSet<T> gather(IntStream stream) {
|
||||
return new EdgeIdSet<>(stream.toArray());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] values() {
|
||||
return set.toArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return set.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return set.size();
|
||||
}
|
||||
|
||||
public boolean contains(int id) {
|
||||
return set.contains(id);
|
||||
}
|
||||
public boolean add(int id) {
|
||||
return set.add(id);
|
||||
}
|
||||
public boolean remove(int id) { return set.remove(id); }
|
||||
|
||||
}
|
@@ -0,0 +1,78 @@
package nu.marginalia.model.id;

/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away
 * outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're
 * actually sorted by rank. Next is the domain id part, which keeps documents from the same domain clustered.
 * Finally is the document ordinal part, which is a non-unique sequence number for within the current set of
 * documents loaded. The same ID may be re-used over time as a new index is loaded.
 * <p></p>
 * <table>
 * <tr><th>Part</th><th>Bits</th><th>Cardinality</th></tr>
 * <tr>
 *     <td>rank</td><td>6 bits</td><td>64</td>
 * </tr>
 * <tr>
 *     <td>domain</td><td>31 bits</td><td>2 billion</td>
 * </tr>
 * <tr>
 *     <td>document</td><td>26 bits</td><td>67 million</td>
 * </tr>
 * </table>
 * <p></p>
 * Most significant bit is unused for now because I'm not routing Long.compareUnsigned() all over the codebase.
 * <i>If</i> we end up needing more domains, we'll cross that bridge when we come to it.
 *
 * <h2>Coding Scheme</h2>
 * <code><pre>
 * [ | rank | domain | url ]
 * 0 1      7        38    64
 * </pre></code>
 */
public class UrlIdCodec {
    private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
    private static final int DOCORD_MASK = 0x03FF_FFFF;

    /** Encode a URL id without a ranking element */
    public static long encodeId(int domainId, int documentOrdinal) {
        domainId &= 0x7FFF_FFFF;
        documentOrdinal &= 0x03FF_FFFF;

        return ((long) domainId << 26) | documentOrdinal;
    }

    /** Add a ranking element to an existing combined URL id.
     *
     * @param rank [0,1] the importance of the domain, low is good
     * @param urlId
     */
    public static long addRank(float rank, long urlId) {
        long rankPart = (int)(rank * (1<<6));

        if (rankPart >= 64) rankPart = 63;
        if (rankPart < 0) rankPart = 0;

        return (urlId&(~RANK_MASK)) | (rankPart << 57);
    }

    /** Extract the domain component from this URL id */
    public static int getDomainId(long combinedId) {
        return (int) ((combinedId >>> 26) & 0x7FFF_FFFFL);
    }

    /** Extract the document ordinal component from this URL id */
    public static int getDocumentOrdinal(long combinedId) {
        return (int) (combinedId & DOCORD_MASK);
    }


    /** Extract the rank component from this URL id */
    public static int getRank(long combinedId) {
        return (int) (combinedId >>> 57);
    }

    /** Mask out the ranking element from this URL id */
    public static long removeRank(long combinedId) {
        return combinedId & ~RANK_MASK;
    }

}
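To make the encoding scheme above concrete, here is a short round-trip through the UrlIdCodec helpers shown in this commit; the domain id, document ordinal, and rank values are arbitrary illustrations, not taken from the codebase:

// Illustrative round-trip through UrlIdCodec (example values only).
long id = UrlIdCodec.encodeId(1234, 56);        // domain 1234, document ordinal 56
long ranked = UrlIdCodec.addRank(0.25f, id);    // stamp a domain rank into the top bits

assert UrlIdCodec.getDomainId(ranked) == 1234;
assert UrlIdCodec.getDocumentOrdinal(ranked) == 56;
assert UrlIdCodec.getRank(ranked) == 16;        // 0.25 * 64 = 16
assert UrlIdCodec.removeRank(ranked) == id;     // rank washed away again

Because the rank sits in the highest bits, sorting a batch of these longs sorts primarily by rank, then by domain, which is what the merge-based index construction relies on.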
@ -0,0 +1,41 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class UrlIdCodecTest {
|
||||
@Test
|
||||
public void testDocumentBounds() {
|
||||
long encoded = UrlIdCodec.encodeId(0, ~0);
|
||||
assertEquals(0, UrlIdCodec.getDomainId(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDomainBounds() {
|
||||
long encoded = UrlIdCodec.encodeId(~0, 0);
|
||||
assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(0, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRankBoundsAdd() {
|
||||
long encoded = UrlIdCodec.encodeId(0, 0);
|
||||
encoded = UrlIdCodec.addRank(1.f, encoded);
|
||||
assertEquals(0, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(63, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveRank() {
|
||||
long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0);
|
||||
encoded = UrlIdCodec.addRank(1.f, encoded);
|
||||
encoded = UrlIdCodec.removeRank(encoded);
|
||||
assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(0, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
/** Dummy implementation of ProcessHeartbeat that does nothing */
|
||||
public class FakeProcessHeartbeat implements ProcessHeartbeat {
|
||||
|
||||
@Override
|
||||
public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
|
||||
return new ProcessTaskHeartbeat<>() {
|
||||
@Override
|
||||
public void progress(T step) {}
|
||||
|
||||
@Override
|
||||
public void shutDown() {}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
|
||||
return new ProcessAdHocTaskHeartbeat() {
|
||||
@Override
|
||||
public void progress(String step, int progress, int total) {}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setProgress(double progress) {}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
|
||||
void progress(String step, int progress, int total);
|
||||
|
||||
void close();
|
||||
}
|
@ -0,0 +1,187 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This object sends a heartbeat to the database every few seconds,
|
||||
* updating with the progress of a task within a service. Progress is tracked by providing
|
||||
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
|
||||
* order as the steps in the task in order to get an accurate progress tracking.
|
||||
*/
|
||||
public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHocTaskHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessAdHocTaskHeartbeatImpl.class);
|
||||
private final String taskName;
|
||||
private final String taskBase;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
private final String serviceInstanceUUID;
|
||||
private int progress;
|
||||
|
||||
private volatile boolean running = false;
|
||||
private volatile String step = "-";
|
||||
|
||||
ProcessAdHocTaskHeartbeatImpl(ProcessConfiguration configuration,
|
||||
String taskName,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node();
|
||||
this.taskBase = configuration.processName() + "." + taskName;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = UUID.randomUUID().toString();
|
||||
this.serviceInstanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
heartbeatInit();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
runnerThread.start();
|
||||
}
|
||||
|
||||
/** Update the progress of the task. This is a fast function that doesn't block;
|
||||
* the actual update is done in a separate thread.
|
||||
*
|
||||
* @param step The current step in the task.
|
||||
*/
|
||||
@Override
|
||||
public void progress(String step, int stepProgress, int stepCount) {
|
||||
this.step = step;
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||
|
||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ProcessHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
while (running) {
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ProcessHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
SERVICE_INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, taskName);
|
||||
stmt.setString(2, taskBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, serviceInstanceUUID);
|
||||
stmt.setString(5, instanceUUID);
|
||||
stmt.setString(6, serviceInstanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("ProcessHeartbeat failed to initialize", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'RUNNING',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString(2, step);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS='STOPPED',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString( 2, step);
|
||||
stmt.setString( 3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,155 +1,11 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
import com.google.inject.ImplementedBy;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ImplementedBy(ProcessHeartbeatImpl.class)
|
||||
public interface ProcessHeartbeat {
|
||||
<T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName);
|
||||
ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName);
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This service sends a heartbeat to the database every 5 seconds.
|
||||
*/
|
||||
@Singleton
|
||||
public class ProcessHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeat.class);
|
||||
private final String processName;
|
||||
private final String processBase;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
|
||||
private volatile boolean running = false;
|
||||
|
||||
private volatile int progress = -1;
|
||||
|
||||
@Inject
|
||||
public ProcessHeartbeat(ProcessConfiguration configuration,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.processName = configuration.processName() + ":" + configuration.node();
|
||||
this.processBase = configuration.processName();
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
|
||||
}
|
||||
|
||||
public void setProgress(double progress) {
|
||||
this.progress = (int) (progress * 100);
|
||||
}
|
||||
|
||||
public void start() {
|
||||
if (!running) {
|
||||
runnerThread.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
heartbeatInit();
|
||||
|
||||
while (running) {
|
||||
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, processName);
|
||||
stmt.setString(2, processBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE PROCESS_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString(2, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE PROCESS_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString( 2, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
void setProgress(double progress);
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,170 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This service sends a heartbeat to the database every 5 seconds.
|
||||
*/
|
||||
@Singleton
|
||||
public class ProcessHeartbeatImpl implements ProcessHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class);
|
||||
private final String processName;
|
||||
private final String processBase;
|
||||
private final String instanceUUID;
|
||||
@org.jetbrains.annotations.NotNull
|
||||
private final ProcessConfiguration configuration;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
|
||||
private volatile boolean running = false;
|
||||
|
||||
private volatile int progress = -1;
|
||||
|
||||
@Inject
|
||||
public ProcessHeartbeatImpl(ProcessConfiguration configuration,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.processName = configuration.processName() + ":" + configuration.node();
|
||||
this.processBase = configuration.processName();
|
||||
this.configuration = configuration;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
|
||||
return new ProcessTaskHeartbeatImpl<>(steps, configuration, processName, dataSource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
|
||||
return new ProcessAdHocTaskHeartbeatImpl(configuration, processName, dataSource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setProgress(double progress) {
|
||||
this.progress = (int) (progress * 100);
|
||||
}
|
||||
|
||||
public void start() {
|
||||
if (!running) {
|
||||
runnerThread.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
heartbeatInit();
|
||||
|
||||
while (running) {
|
||||
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO PROCESS_HEARTBEAT (PROCESS_NAME, PROCESS_BASE, INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, processName);
|
||||
stmt.setString(2, processBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE PROCESS_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS = 'RUNNING', PROGRESS = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString(2, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE PROCESS_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), STATUS='STOPPED', PROGRESS=?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString( 2, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,9 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
public interface ProcessTaskHeartbeat<T extends Enum<T>> extends AutoCloseable {
|
||||
void progress(T step);
|
||||
|
||||
void shutDown();
|
||||
|
||||
void close();
|
||||
}
|
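The ProcessTaskHeartbeat interface above pairs with the heartbeat implementations in this commit, which report per-step progress to the TASK_HEARTBEAT table. A hedged usage sketch; the enum, method, and task name below are made up for illustration and are not part of this commit:

// Hypothetical caller of the heartbeat API; each progress() call advances the
// recorded progress based on the step's position in the enum.
enum ConstructionSteps { LOAD, MERGE, FINALIZE }

void constructIndex(ProcessHeartbeat heartbeat) throws Exception {
    try (ProcessTaskHeartbeat<ConstructionSteps> task =
                 heartbeat.createProcessTaskHeartbeat(ConstructionSteps.class, "indexConstruction")) {
        task.progress(ConstructionSteps.LOAD);
        // ... load data ...
        task.progress(ConstructionSteps.MERGE);
        // ... merge index segments ...
        task.progress(ConstructionSteps.FINALIZE);
        // ... write the final index ...
    }
}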
@ -0,0 +1,192 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This object sends a heartbeat to the database every few seconds,
|
||||
* updating with the progress of a task within a service. Progress is tracked by providing
|
||||
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
|
||||
* order as the steps in the task in order to get an accurate progress tracking.
|
||||
*/
|
||||
public class ProcessTaskHeartbeatImpl<T extends Enum<T>> implements AutoCloseable, ProcessTaskHeartbeat<T> {
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessTaskHeartbeatImpl.class);
|
||||
private final String taskName;
|
||||
private final String taskBase;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
private final String serviceInstanceUUID;
|
||||
private final int stepCount;
|
||||
|
||||
private volatile boolean running = false;
|
||||
private volatile int stepNum = 0;
|
||||
private volatile String step = "-";
|
||||
|
||||
ProcessTaskHeartbeatImpl(Class<T> stepClass,
|
||||
ProcessConfiguration configuration,
|
||||
String taskName,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.taskName = configuration.processName() + "." + taskName + ":" + configuration.node();
|
||||
this.taskBase = configuration.processName() + "." + taskName;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = UUID.randomUUID().toString();
|
||||
this.serviceInstanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
this.stepCount = stepClass.getEnumConstants().length;
|
||||
|
||||
heartbeatInit();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
runnerThread.start();
|
||||
}
|
||||
|
||||
/** Update the progress of the task. This is a fast function that doesn't block;
|
||||
* the actual update is done in a separate thread.
|
||||
*
|
||||
* @param step The current step in the task.
|
||||
*/
|
||||
@Override
|
||||
public void progress(T step) {
|
||||
this.step = step.name();
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.stepNum = 1 + step.ordinal();
|
||||
|
||||
logger.info("ProcessTask {} progress: {}", taskBase, step.name());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ProcessHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
while (running) {
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ProcessHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
logger.error("ProcessHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
SERVICE_INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, taskName);
|
||||
stmt.setString(2, taskBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, serviceInstanceUUID);
|
||||
stmt.setString(5, instanceUUID);
|
||||
stmt.setString(6, serviceInstanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("ProcessHeartbeat failed to initialize", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'RUNNING',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString(2, step);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS='STOPPED',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString( 2, step);
|
||||
stmt.setString( 3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
}
|
||||
|
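To make the off-by-one comment in progress() concrete, the PROGRESS value written by heartbeatUpdate() for a four-step enum works out as follows (illustrative arithmetic only):

// stepCount == 4; after progress() on the step with ordinal() == 2:
int stepNum = 1 + 2;                                              // 3
int percent = (int) Math.round(100 * stepNum / (double) 4);       // 75
// ... and the final step (ordinal 3) yields 100 rather than 75.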
@ -0,0 +1,14 @@
package nu.marginalia.service.control;

/** Dummy implementation of ServiceHeartbeat that does nothing */
public class FakeServiceHeartbeat implements ServiceHeartbeat {
    @Override
    public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
        return new ServiceTaskHeartbeat<T>() {
            @Override
            public void progress(T step) {}
            @Override
            public void close() {}
        };
    }
}
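A sketch of how the dummy might satisfy a heartbeat dependency in a unit test, with no database involved; the step enum here is hypothetical.

ServiceHeartbeat heartbeat = new FakeServiceHeartbeat();
try (var task = heartbeat.createServiceTaskHeartbeat(ExampleSteps.class, "unit-test")) {
    task.progress(ExampleSteps.LOAD);   // no-op
}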
@ -1,157 +1,8 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This service sends a heartbeat to the database every 5 seconds,
|
||||
* updating the control service with the liveness information for the service.
|
||||
*/
|
||||
@Singleton
|
||||
public class ServiceHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeat.class);
|
||||
private final String serviceName;
|
||||
private final String serviceBase;
|
||||
private final String instanceUUID;
|
||||
private final ServiceConfiguration configuration;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5);
|
||||
|
||||
private volatile boolean running = false;
|
||||
|
||||
@Inject
|
||||
public ServiceHeartbeat(ServiceConfiguration configuration,
|
||||
ServiceEventLog eventLog,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.serviceName = configuration.serviceName() + ":" + configuration.node();
|
||||
this.serviceBase = configuration.serviceName();
|
||||
this.configuration = configuration;
|
||||
this.eventLog = eventLog;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
|
||||
}
|
||||
|
||||
public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
|
||||
return new ServiceTaskHeartbeat<>(steps, configuration, processName, eventLog, dataSource);
|
||||
}
|
||||
|
||||
|
||||
public void start() {
|
||||
if (!running) {
|
||||
runnerThread.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
heartbeatInit();
|
||||
|
||||
while (running) {
|
||||
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE)
|
||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
ALIVE = 1
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, serviceName);
|
||||
stmt.setString(2, serviceBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE SERVICE_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6)
|
||||
WHERE INSTANCE = ? AND ALIVE = 1
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setString(1, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE SERVICE_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setString(1, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
import com.google.inject.ImplementedBy;

@ImplementedBy(ServiceHeartbeatImpl.class)
public interface ServiceHeartbeat {
    <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName);
}

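With @ImplementedBy, Guice binds the interface to ServiceHeartbeatImpl by default, so injection sites only depend on the interface and a test module can bind FakeServiceHeartbeat instead. A sketch of such a consumer (the class is hypothetical, not part of the patch):

class ExampleConsumer {
    private final ServiceHeartbeat heartbeat;

    @com.google.inject.Inject
    ExampleConsumer(ServiceHeartbeat heartbeat) {
        this.heartbeat = heartbeat;
    }
}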
@ -0,0 +1,158 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This service sends a heartbeat to the database every 5 seconds,
|
||||
* updating the control service with the liveness information for the service.
|
||||
*/
|
||||
@Singleton
|
||||
public class ServiceHeartbeatImpl implements ServiceHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ServiceHeartbeatImpl.class);
|
||||
private final String serviceName;
|
||||
private final String serviceBase;
|
||||
private final String instanceUUID;
|
||||
private final ServiceConfiguration configuration;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 5);
|
||||
|
||||
private volatile boolean running = false;
|
||||
|
||||
@Inject
|
||||
public ServiceHeartbeatImpl(ServiceConfiguration configuration,
|
||||
ServiceEventLog eventLog,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.serviceName = configuration.serviceName() + ":" + configuration.node();
|
||||
this.serviceBase = configuration.serviceName();
|
||||
this.configuration = configuration;
|
||||
this.eventLog = eventLog;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(this::shutDown));
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName) {
|
||||
return new ServiceTaskHeartbeatImpl<>(steps, configuration, processName, eventLog, dataSource);
|
||||
}
|
||||
|
||||
|
||||
public void start() {
|
||||
if (!running) {
|
||||
runnerThread.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
heartbeatInit();
|
||||
|
||||
while (running) {
|
||||
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO SERVICE_HEARTBEAT (SERVICE_NAME, SERVICE_BASE, INSTANCE, HEARTBEAT_TIME, ALIVE)
|
||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP(6), 1)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
ALIVE = 1
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, serviceName);
|
||||
stmt.setString(2, serviceBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE SERVICE_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6)
|
||||
WHERE INSTANCE = ? AND ALIVE = 1
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setString(1, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE SERVICE_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6), ALIVE = 0
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setString(1, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,196 +1,8 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This object sends a heartbeat to the database every few seconds,
|
||||
* updating with the progress of a task within a service. Progress is tracked by providing
|
||||
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
|
||||
* order as the steps in the task in order to get an accurate progress tracking.
|
||||
*/
|
||||
public class ServiceTaskHeartbeat<T extends Enum<T>> implements AutoCloseable {
|
||||
private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeat.class);
|
||||
private final String taskName;
|
||||
private final String taskBase;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
private final String serviceInstanceUUID;
|
||||
private final int stepCount;
|
||||
private final ServiceEventLog eventLog;
|
||||
|
||||
private volatile boolean running = false;
|
||||
private volatile int stepNum = 0;
|
||||
private volatile String step = "-";
|
||||
|
||||
ServiceTaskHeartbeat(Class<T> stepClass,
|
||||
ServiceConfiguration configuration,
|
||||
String taskName,
|
||||
ServiceEventLog eventLog,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.eventLog = eventLog;
|
||||
this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
|
||||
this.taskBase = configuration.serviceName() + "." + taskName;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = UUID.randomUUID().toString();
|
||||
this.serviceInstanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
this.stepCount = stepClass.getEnumConstants().length;
|
||||
|
||||
heartbeatInit();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
runnerThread.start();
|
||||
}
|
||||
|
||||
/** Update the progress of the task. This is a fast function that doesn't block;
|
||||
* the actual update is done in a separate thread.
|
||||
*
|
||||
* @param step The current step in the task.
|
||||
*/
|
||||
public void progress(T step) {
|
||||
this.step = step.name();
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.stepNum = 1 + step.ordinal();
|
||||
|
||||
logger.info("ServiceTask {} progress: {}", taskBase, step.name());
|
||||
eventLog.logEvent("TASK-STEP", taskName + " = " + step.name());
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
while (running) {
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
SERVICE_INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, taskName);
|
||||
stmt.setString(2, taskBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, serviceInstanceUUID);
|
||||
stmt.setString(5, instanceUUID);
|
||||
stmt.setString(6, serviceInstanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("ServiceHeartbeat failed to initialize", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
eventLog.logEvent("TASK-STARTED", taskName);
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'RUNNING',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString(2, step);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS='STOPPED',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString( 2, step);
|
||||
stmt.setString( 3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
eventLog.logEvent("TASK-TERMINATED", taskName);
|
||||
}
|
||||
public interface ServiceTaskHeartbeat<T extends Enum<T>> extends AutoCloseable {
|
||||
void progress(T step);
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
void close();
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,197 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This object sends a heartbeat to the database every few seconds,
|
||||
* updating with the progress of a task within a service. Progress is tracked by providing
|
||||
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
|
||||
* order as the steps in the task in order to get an accurate progress tracking.
|
||||
*/
|
||||
public class ServiceTaskHeartbeatImpl<T extends Enum<T>> implements ServiceTaskHeartbeat<T> {
|
||||
private final Logger logger = LoggerFactory.getLogger(ServiceTaskHeartbeatImpl.class);
|
||||
private final String taskName;
|
||||
private final String taskBase;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
private final String serviceInstanceUUID;
|
||||
private final int stepCount;
|
||||
private final ServiceEventLog eventLog;
|
||||
|
||||
private volatile boolean running = false;
|
||||
private volatile int stepNum = 0;
|
||||
private volatile String step = "-";
|
||||
|
||||
ServiceTaskHeartbeatImpl(Class<T> stepClass,
|
||||
ServiceConfiguration configuration,
|
||||
String taskName,
|
||||
ServiceEventLog eventLog,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.eventLog = eventLog;
|
||||
this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
|
||||
this.taskBase = configuration.serviceName() + "." + taskName;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = UUID.randomUUID().toString();
|
||||
this.serviceInstanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
this.stepCount = stepClass.getEnumConstants().length;
|
||||
|
||||
heartbeatInit();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
runnerThread.start();
|
||||
}
|
||||
|
||||
/** Update the progress of the task. This is a fast function that doesn't block;
|
||||
* the actual update is done in a separate thread.
|
||||
*
|
||||
* @param step The current step in the task.
|
||||
*/
|
||||
@Override
|
||||
public void progress(T step) {
|
||||
this.step = step.name();
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.stepNum = 1 + step.ordinal();
|
||||
|
||||
logger.info("ServiceTask {} progress: {}", taskBase, step.name());
|
||||
eventLog.logEvent("TASK-STEP", taskName + " = " + step.name());
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
while (running) {
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
SERVICE_INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, taskName);
|
||||
stmt.setString(2, taskBase);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.setString(4, serviceInstanceUUID);
|
||||
stmt.setString(5, instanceUUID);
|
||||
stmt.setString(6, serviceInstanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("ServiceHeartbeat failed to initialize", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
eventLog.logEvent("TASK-STARTED", taskName);
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'RUNNING',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString(2, step);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS='STOPPED',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, (int) Math.round(100 * stepNum / (double) stepCount));
|
||||
stmt.setString( 2, step);
|
||||
stmt.setString( 3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
eventLog.logEvent("TASK-TERMINATED", taskName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -4,7 +4,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.service.control.ServiceHeartbeatImpl;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
/** This class exists to reduce Service boilerplate */
|
||||
@ -13,14 +13,14 @@ public class BaseServiceParams {
|
||||
public final ServiceConfiguration configuration;
|
||||
public final Initialization initialization;
|
||||
public final MetricsServer metricsServer;
|
||||
public final ServiceHeartbeat heartbeat;
|
||||
public final ServiceHeartbeatImpl heartbeat;
|
||||
public final ServiceEventLog eventLog;
|
||||
public final MessageQueueFactory messageQueueInboxFactory;
|
||||
@Inject
|
||||
public BaseServiceParams(ServiceConfiguration configuration,
|
||||
Initialization initialization,
|
||||
MetricsServer metricsServer,
|
||||
ServiceHeartbeat heartbeat,
|
||||
ServiceHeartbeatImpl heartbeat,
|
||||
ServiceEventLog eventLog,
|
||||
MessageQueueFactory messageQueueInboxFactory) {
|
||||
this.configuration = configuration;
|
||||
|
@ -7,7 +7,7 @@ import nu.marginalia.language.model.WordRep;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;

import javax.inject.Inject;
import com.google.inject.Inject;
import java.util.*;
import java.util.stream.Stream;

@ -2,6 +2,7 @@ package nu.marginalia.ranking;

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ -37,6 +38,11 @@ public class DomainRankings {
        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
    }

    public float getSortRanking(long docId) {
        int domainId = UrlIdCodec.getDomainId(docId);
        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE;
    }

    public int size() {
        return rankings.size();
    }
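getSortRanking() just normalizes the stored rank into the unit interval, so domains without an entry fall back to MAX_RANK_VALUE and sort as 1.0. For example (the constant value 255 is assumed here purely for illustration):

// stored rank 51  -> 51 / 255f  == 0.2
// unknown domain  -> 255 / 255f == 1.0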
@ -16,9 +16,8 @@ dependencies {
    implementation project(':code:features-index:domain-ranking')
    implementation project(':code:features-index:index-query')
    implementation project(':code:features-index:index-journal')
    implementation project(':code:features-index:lexicon')
    implementation project(':code:common:model')
    implementation project(':code:common:service')
    implementation project(':code:common:process')

    implementation project(':third-party:uppend')

@ -6,10 +6,10 @@ import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import org.roaringbitmap.IntConsumer;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import org.roaringbitmap.longlong.LongConsumer;
|
||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -20,24 +20,24 @@ import java.nio.file.Path;
|
||||
|
||||
public class ForwardIndexConverter {
|
||||
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final File inputFile;
|
||||
private final ProcessHeartbeat heartbeat;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final IndexJournalReader journalReader;
|
||||
private final Path outputFileDocsId;
|
||||
private final Path outputFileDocsData;
|
||||
private final DomainRankings domainRankings;
|
||||
|
||||
|
||||
public ForwardIndexConverter(ServiceHeartbeat heartbeat,
|
||||
File inputFile,
|
||||
public ForwardIndexConverter(ProcessHeartbeat heartbeat,
|
||||
IndexJournalReader journalReader,
|
||||
Path outputFileDocsId,
|
||||
Path outputFileDocsData,
|
||||
DomainRankings domainRankings
|
||||
) {
|
||||
this.heartbeat = heartbeat;
|
||||
this.inputFile = inputFile;
|
||||
this.journalReader = journalReader;
|
||||
this.outputFileDocsId = outputFileDocsId;
|
||||
this.outputFileDocsData = outputFileDocsData;
|
||||
this.domainRankings = domainRankings;
|
||||
@ -54,17 +54,9 @@ public class ForwardIndexConverter {
|
||||
public void convert() throws IOException {
|
||||
deleteOldFiles();
|
||||
|
||||
IndexJournalReaderSingleCompressedFile journalReader = new IndexJournalReaderSingleCompressedFile(inputFile.toPath());
|
||||
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
|
||||
logger.warn("Bailing: Journal is empty!");
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Converting {} {}", inputFile, journalReader.fileHeader);
|
||||
|
||||
logger.info("Domain Rankings size = {}", domainRankings.size());
|
||||
|
||||
try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
|
||||
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
|
||||
progress.progress(TaskSteps.GET_DOC_IDS);
|
||||
|
||||
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
|
||||
@ -83,12 +75,11 @@ public class ForwardIndexConverter {
|
||||
LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
|
||||
|
||||
journalReader.forEach(entry -> {
|
||||
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());
|
||||
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId());
|
||||
|
||||
int ranking = domainRankings.getRanking(entry.domainId());
|
||||
long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
|
||||
|
||||
docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
|
||||
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
|
||||
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
|
||||
});
|
||||
@ -109,17 +100,18 @@ public class ForwardIndexConverter {
|
||||
}
|
||||
|
||||
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
|
||||
RoaringBitmap rbm = new RoaringBitmap();
|
||||
journalReader.forEachUrlId(rbm::add);
|
||||
Roaring64Bitmap rbm = new Roaring64Bitmap();
|
||||
journalReader.forEachDocId(rbm::add);
|
||||
|
||||
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality());
|
||||
rbm.forEach(new IntConsumer() {
|
||||
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality());
|
||||
rbm.forEach(new LongConsumer() {
|
||||
int offset;
|
||||
@Override
|
||||
public void accept(int value) {
|
||||
public void accept(long value) {
|
||||
ret.set(offset++, value);
|
||||
}
|
||||
});
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,28 @@
package nu.marginalia.index.forward;

import java.nio.file.Path;

public class ForwardIndexFileNames {
    public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
        return switch (identifier) {
            case DOC_ID -> switch (version) {
                case NEXT -> basePath.resolve("fwd-doc-id.dat.next");
                case CURRENT -> basePath.resolve("fwd-doc-id.dat");
            };
            case DOC_DATA -> switch (version) {
                case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
                case CURRENT -> basePath.resolve("fwd-doc-data.dat");
            };
        };
    }

    public enum FileVersion {
        CURRENT,
        NEXT
    };

    public enum FileIdentifier {
        DOC_DATA,
        DOC_ID
    }
}
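The CURRENT/NEXT split is what allows a new forward index to be built next to the live one and then swapped in. A hedged sketch of what the activation step might look like, assuming the swap is done by moving the .next files over the live ones; the method and base path are illustrative, not part of the patch:

static void activateNextIndex(java.nio.file.Path basePath) throws java.io.IOException {
    for (var id : ForwardIndexFileNames.FileIdentifier.values()) {
        var next    = ForwardIndexFileNames.resolve(basePath, id, ForwardIndexFileNames.FileVersion.NEXT);
        var current = ForwardIndexFileNames.resolve(basePath, id, ForwardIndexFileNames.FileVersion.CURRENT);

        // replace the live file atomically so readers never see a half-written index
        java.nio.file.Files.move(next, current,
                java.nio.file.StandardCopyOption.REPLACE_EXISTING,
                java.nio.file.StandardCopyOption.ATOMIC_MOVE);
    }
}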
@ -1,9 +1,8 @@
package nu.marginalia.index.forward;

class ForwardIndexParameters {
    public static final int ENTRY_SIZE = 3;
    public static final int DOMAIN_OFFSET = 0;
    public static final int METADATA_OFFSET = 1;
    public static final int FEATURES_OFFSET = 2;
    public static final int ENTRY_SIZE = 2;
    public static final int METADATA_OFFSET = 0;
    public static final int FEATURES_OFFSET = 1;

}
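The entry shrinks from three longs to two because the domain column no longer needs to be stored: with the new combined document ids, the domain is recoverable from the id itself, as the query filter further down does:

int domainId = UrlIdCodec.getDomainId(docId);   // replaces reading DOMAIN_OFFSET from the entry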
@ -3,6 +3,7 @@ package nu.marginalia.index.forward;
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -71,6 +72,8 @@ public class ForwardIndexReader {
|
||||
}
|
||||
|
||||
public long getDocMeta(long docId) {
|
||||
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
||||
|
||||
long offset = idxForDoc(docId);
|
||||
if (offset < 0) return 0;
|
||||
|
||||
@ -78,20 +81,17 @@ public class ForwardIndexReader {
|
||||
}
|
||||
|
||||
public int getHtmlFeatures(long docId) {
|
||||
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
||||
|
||||
long offset = idxForDoc(docId);
|
||||
if (offset < 0) return 0;
|
||||
|
||||
return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
|
||||
}
|
||||
|
||||
public int getDomainId(long docId) {
|
||||
long offset = idxForDoc(docId);
|
||||
if (offset < 0) return 0;
|
||||
|
||||
return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
|
||||
}
|
||||
|
||||
private int idxForDoc(long docId) {
|
||||
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
||||
|
||||
return idToOffset.get(docId);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.index.query.IndexQueryParams;
|
||||
@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean test(long docId) {
|
||||
int urlId = (int) (docId & 0xFFFF_FFFFL);
|
||||
int domainId = forwardIndexReader.getDomainId(urlId);
|
||||
long meta = forwardIndexReader.getDocMeta(urlId);
|
||||
public boolean test(long combinedId) {
|
||||
long docId = UrlIdCodec.removeRank(combinedId);
|
||||
int domainId = UrlIdCodec.getDomainId(docId);
|
||||
|
||||
long meta = forwardIndexReader.getDocMeta(docId);
|
||||
|
||||
if (!validateDomain(domainId, meta)) {
|
||||
return false;
|
||||
|
@ -2,14 +2,14 @@ package nu.marginalia.index.forward;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntry;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.control.ProcessTaskHeartbeatImpl;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.service.control.ServiceTaskHeartbeat;
|
||||
import nu.marginalia.test.TestUtil;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -28,7 +28,6 @@ import static org.mockito.Mockito.when;
|
||||
|
||||
class ForwardIndexConverterTest {
|
||||
|
||||
KeywordLexicon keywordLexicon;
|
||||
IndexJournalWriter writer;
|
||||
|
||||
Path indexFile;
|
||||
@ -49,12 +48,9 @@ class ForwardIndexConverterTest {
|
||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
||||
dictionaryFile.toFile().deleteOnExit();
|
||||
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
|
||||
keywordLexicon.getOrInsert("0");
|
||||
|
||||
indexFile = Files.createTempFile("tmp", ".idx");
|
||||
indexFile.toFile().deleteOnExit();
|
||||
writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
|
||||
writer = new IndexJournalWriterSingleFileImpl(indexFile);
|
||||
|
||||
wordsFile1 = Files.createTempFile("words1", ".idx");
|
||||
urlsFile1 = Files.createTempFile("urls1", ".idx");
|
||||
@ -62,11 +58,9 @@ class ForwardIndexConverterTest {
|
||||
dataDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
|
||||
for (int i = 1; i < workSetSize; i++) {
|
||||
createEntry(writer, keywordLexicon, i);
|
||||
createEntry(writer, i);
|
||||
}
|
||||
|
||||
|
||||
keywordLexicon.commitToDisk();
|
||||
writer.close();
|
||||
|
||||
|
||||
@ -84,15 +78,16 @@ class ForwardIndexConverterTest {
|
||||
}
|
||||
|
||||
long createId(long url, long domain) {
|
||||
return (domain << 32) | url;
|
||||
return UrlIdCodec.encodeId((int) domain, (int) url);
|
||||
}
|
||||
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
|
||||
|
||||
public void createEntry(IndexJournalWriter writer, int id) {
|
||||
int[] factors = getFactorsI(id);
|
||||
|
||||
var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5);
|
||||
|
||||
for (int i = 0; i+1 < factors.length; i+=2) {
|
||||
entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i+1]);
|
||||
entryBuilder.add(factors[i], -factors[i+1]);
|
||||
}
|
||||
|
||||
writer.put(entryBuilder.build());
|
||||
@ -101,18 +96,14 @@ class ForwardIndexConverterTest {
|
||||
@Test
|
||||
void testForwardIndex() throws IOException {
|
||||
|
||||
// RIP fairies
|
||||
var serviceHeartbeat = Mockito.mock(ServiceHeartbeat.class);
|
||||
when(serviceHeartbeat.createServiceTaskHeartbeat(Mockito.any(), Mockito.any()))
|
||||
.thenReturn(Mockito.mock(ServiceTaskHeartbeat.class));
|
||||
|
||||
new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
|
||||
new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleCompressedFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert();
|
||||
|
||||
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
|
||||
|
||||
for (int i = 36; i < workSetSize; i++) {
|
||||
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
|
||||
assertEquals(i/20, forwardReader.getDomainId(i));
|
||||
long docId = createId(i, i/20);
|
||||
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
|
||||
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -13,7 +13,6 @@ java {
dependencies {
    implementation project(':code:libraries:array')
    implementation project(':code:common:model')
    implementation project(':code:features-index:lexicon')

    implementation libs.lombok
    annotationProcessor libs.lombok
@ -22,6 +21,7 @@ dependencies {
    implementation libs.prometheus
    implementation libs.notnull
    implementation libs.rxjava
    implementation libs.guava
    implementation libs.trove
    implementation libs.zstd
    implementation libs.commons.lang3
@ -1,8 +1,6 @@
package nu.marginalia.index.journal.model;

import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;

public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {

@ -15,18 +13,7 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntr
                                                  long documentMeta) {


        return builder(new EdgeId<>(domainId),
                new EdgeId<>(urlId),
                documentMeta);
        return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta);
    }

    public static IndexJournalEntryBuilder builder(EdgeId<EdgeDomain> domainId,
                                                   EdgeId<EdgeUrl> urlId,
                                                   long documentMeta) {


        return new IndexJournalEntryBuilder(0,
                IndexJournalEntryHeader.combineIds(domainId, urlId),
                documentMeta);
    }
}
@ -25,7 +25,7 @@ public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Rec

    public long get(int idx) {
        if (idx >= size)
            throw new ArrayIndexOutOfBoundsException();
            throw new ArrayIndexOutOfBoundsException(idx + " vs " + size);
        return underlyingArray[idx];
    }

@ -58,9 +58,9 @@ public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Rec
        public Record next() {
            pos+=ENTRY_SIZE;

            return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
            return new Record(underlyingArray[pos], underlyingArray[pos+1]);
        }
    }

    public record Record(int wordId, long metadata) {}
    public record Record(long wordId, long metadata) {}
}
@ -1,29 +1,17 @@
package nu.marginalia.index.journal.model;

import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;

public record IndexJournalEntryHeader(int entrySize,
                                      int documentFeatures,
                                      long combinedId,
                                      long documentMeta) {

    public IndexJournalEntryHeader(EdgeId<EdgeDomain> domainId,
    public IndexJournalEntryHeader(long combinedId,
                                   int documentFeatures,
                                   EdgeId<EdgeUrl> urlId,
                                   long documentMeta) {
        this(-1,
                documentFeatures,
                combineIds(domainId, urlId),
                combinedId,
                documentMeta);
    }

    static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
        long did = domainId.id();
        long uid = urlId.id();

        return (did << 32L) | uid;
    }

}
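These changes drop the old (domainId << 32) | urlId packing in favour of UrlIdCodec, which also reserves room for a ranking component in the high bits (hence removeRank() elsewhere in this change set). A minimal sketch of such a codec follows; the field widths and class are assumptions for illustration and need not match the real UrlIdCodec.

final class CombinedIdCodecSketch {
    private static final int  DOC_BITS   = 26;                      // assumed width of the document ordinal
    private static final long DOC_MASK   = (1L << DOC_BITS) - 1;
    private static final int  RANK_SHIFT = 57;                      // assumed: rank lives in the top bits

    static long encodeId(int domainId, int docOrdinal) {
        return ((long) domainId << DOC_BITS) | (docOrdinal & DOC_MASK);
    }

    static int getDomainId(long combinedId) {
        return (int) (removeRank(combinedId) >>> DOC_BITS);
    }

    static long removeRank(long combinedId) {
        return combinedId & ((1L << RANK_SHIFT) - 1);
    }
}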
@ -2,11 +2,13 @@ package nu.marginalia.index.journal.reader;

import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.model.id.UrlIdCodec;

import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;

public class IndexJournalReadEntry {
    public final IndexJournalEntryHeader header;
@ -51,11 +53,7 @@ public class IndexJournalReadEntry {
    }

    public int domainId() {
        return (int) (docId() >>> 32L);
    }

    public int urlId() {
        return (int) (docId() & 0xFFFF_FFFFL);
        return UrlIdCodec.getDomainId(docId());
    }

    public IndexJournalEntryData readEntry() {
@ -1,31 +1,48 @@
|
||||
package nu.marginalia.index.journal.reader;
|
||||
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
|
||||
import nu.marginalia.index.journal.model.IndexJournalStatistics;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.function.LongConsumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
|
||||
int FILE_HEADER_SIZE_LONGS = 2;
|
||||
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
IndexJournalFileHeader fileHeader();
|
||||
static IndexJournalReader singleFile(Path fileName) throws IOException {
|
||||
return new IndexJournalReaderSingleCompressedFile(fileName);
|
||||
}
|
||||
|
||||
IndexJournalStatistics getStatistics();
|
||||
static IndexJournalReader paging(Path baseDir) throws IOException {
|
||||
return new IndexJournalReaderPagingImpl(baseDir);
|
||||
}
|
||||
|
||||
void forEachWordId(IntConsumer consumer);
|
||||
static IndexJournalReader singleFileWithPriorityFilters(Path path) throws IOException {
|
||||
|
||||
void forEachUrlIdWordId(BiIntConsumer consumer);
|
||||
long highPriorityFlags =
|
||||
WordFlags.Title.asBit()
|
||||
| WordFlags.Subjects.asBit()
|
||||
| WordFlags.TfIdfHigh.asBit()
|
||||
| WordFlags.NamesWords.asBit()
|
||||
| WordFlags.UrlDomain.asBit()
|
||||
| WordFlags.UrlPath.asBit()
|
||||
| WordFlags.Site.asBit()
|
||||
| WordFlags.SiteAdjacent.asBit();
|
||||
|
||||
void forEachDocIdWordId(LongIntConsumer consumer);
|
||||
return new IndexJournalReaderSingleCompressedFile(path, null,
|
||||
r -> (r.metadata() & highPriorityFlags) != 0);
|
||||
}
|
||||
|
||||
void forEachWordId(LongConsumer consumer);
|
||||
|
||||
void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer);
|
||||
|
||||
void forEachUrlId(IntConsumer consumer);
|
||||
void forEachDocId(LongConsumer consumer);
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
@ -33,13 +50,7 @@ public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
|
||||
|
||||
void close() throws IOException;
|
||||
|
||||
interface BiIntConsumer {
|
||||
void accept(int left, int right);
|
||||
}
|
||||
|
||||
interface LongIntConsumer {
|
||||
void accept(long left, int right);
|
||||
}
|
||||
|
||||
interface LongObjectConsumer<T> {
|
||||
void accept(long left, T right);
|
||||
|
@ -0,0 +1,61 @@
|
||||
package nu.marginalia.index.journal.reader;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalStatistics;
|
||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.function.LongConsumer;
|
||||
|
||||
public class IndexJournalReaderPagingImpl implements IndexJournalReader {
|
||||
|
||||
private final List<IndexJournalReader> readers;
|
||||
|
||||
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
|
||||
var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir);
|
||||
this.readers = new ArrayList<>(inputFiles.size());
|
||||
|
||||
for (var inputFile : inputFiles) {
|
||||
readers.add(new IndexJournalReaderSingleCompressedFile(inputFile));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachWordId(LongConsumer consumer) {
|
||||
for (var reader : readers) {
|
||||
reader.forEachWordId(consumer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
|
||||
for (var reader : readers) {
|
||||
reader.forEachDocIdRecord(consumer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachDocId(LongConsumer consumer) {
|
||||
for (var reader : readers) {
|
||||
reader.forEachDocId(consumer);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public @NotNull Iterator<IndexJournalReadEntry> iterator() {
|
||||
return Iterators.concat(readers.stream().map(IndexJournalReader::iterator).iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
for (var reader : readers) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
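A sketch of how the two reader flavours might be opened: IndexJournalReader.singleFile() for one journal file, IndexJournalReader.paging() for a directory of paged journal files produced by the paging writer below. The path and surrounding method are hypothetical.

static void exampleRead(java.nio.file.Path journalDir) throws java.io.IOException {
    IndexJournalReader reader = IndexJournalReader.paging(journalDir);

    reader.forEachDocId(docId -> {
        // visits every document id across all journal pages, in file order
    });

    reader.close();
}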
@ -12,21 +12,30 @@ import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.function.LongConsumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
|
||||
|
||||
private static Path journalFile;
|
||||
private Path journalFile;
|
||||
public final IndexJournalFileHeader fileHeader;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }";
|
||||
}
|
||||
|
||||
private DataInputStream dataInputStream = null;
|
||||
|
||||
final Predicate<IndexJournalReadEntry> entryPredicate;
|
||||
final Predicate<IndexJournalEntryData.Record> recordPredicate;
|
||||
|
||||
public IndexJournalReaderSingleCompressedFile(Path file) throws IOException {
|
||||
this.journalFile = file;
|
||||
|
||||
fileHeader = readHeader(file);
|
||||
|
||||
this.recordPredicate = null;
|
||||
@ -34,7 +43,8 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
}
|
||||
|
||||
public IndexJournalReaderSingleCompressedFile(Path file, Predicate<IndexJournalReadEntry> entryPredicate, Predicate<IndexJournalEntryData.Record> recordPredicate) throws IOException {
|
||||
journalFile = file;
|
||||
this.journalFile = file;
|
||||
|
||||
fileHeader = readHeader(file);
|
||||
|
||||
this.recordPredicate = recordPredicate;
|
||||
@ -42,8 +52,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
}
|
||||
|
||||
private static IndexJournalFileHeader readHeader(Path file) throws IOException {
|
||||
journalFile = file;
|
||||
|
||||
try (var raf = new RandomAccessFile(file.toFile(), "r")) {
|
||||
long unused = raf.readLong();
|
||||
long wordCount = raf.readLong();
|
||||
@ -61,10 +69,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream)));
|
||||
}
|
||||
|
||||
public IndexJournalFileHeader fileHeader() {
|
||||
return fileHeader;
|
||||
}
|
||||
|
||||
public boolean filter(IndexJournalReadEntry entry) {
|
||||
return entryPredicate == null || entryPredicate.test(entry);
|
||||
}
|
||||
@ -80,31 +84,7 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
|
||||
|
||||
@Override
|
||||
public IndexJournalStatistics getStatistics() {
|
||||
int highestWord = 0;
|
||||
|
||||
// Docs cardinality is a candidate for a HyperLogLog
|
||||
Roaring64Bitmap docsBitmap = new Roaring64Bitmap();
|
||||
|
||||
for (var entry : this) {
|
||||
var entryData = entry.readEntry();
|
||||
|
||||
if (filter(entry)) {
|
||||
docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL);
|
||||
|
||||
for (var item : entryData) {
|
||||
if (filter(entry, item)) {
|
||||
highestWord = Integer.max(item.wordId(), highestWord);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new IndexJournalStatistics(highestWord, docsBitmap.getIntCardinality());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachWordId(IntConsumer consumer) {
|
||||
public void forEachWordId(LongConsumer consumer) {
|
||||
for (var entry : this) {
|
||||
var data = entry.readEntry();
|
||||
for (var post : data) {
|
||||
@ -115,32 +95,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachUrlIdWordId(BiIntConsumer consumer) {
|
||||
for (var entry : this) {
|
||||
var data = entry.readEntry();
|
||||
|
||||
for (var post : data) {
|
||||
if (filter(entry, post)) {
|
||||
consumer.accept(entry.urlId(), post.wordId());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachDocIdWordId(LongIntConsumer consumer) {
|
||||
for (var entry : this) {
|
||||
var data = entry.readEntry();
|
||||
|
||||
for (var post : data) {
|
||||
if (filter(entry, post)) {
|
||||
consumer.accept(entry.docId(), post.wordId());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
|
||||
for (var entry : this) {
|
||||
@ -154,10 +108,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public void forEachUrlId(IntConsumer consumer) {
|
||||
public void forEachDocId(LongConsumer consumer) {
|
||||
for (var entry : this) {
|
||||
if (filter(entry)) {
|
||||
consumer.accept(entry.urlId());
|
||||
consumer.accept(entry.docId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,48 @@
|
||||
package nu.marginalia.index.journal.writer;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
|
||||
private final Path outputDir;
|
||||
private int fileNumber = 0;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private IndexJournalWriter currentWriter = null;
|
||||
private int inputsForFile = 0;
|
||||
|
||||
public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
|
||||
this.outputDir = outputDir;
|
||||
switchToNextWriter();
|
||||
|
||||
logger.info("Creating Journal Writer {}", outputDir);
|
||||
}
|
||||
|
||||
private void switchToNextWriter() throws IOException {
|
||||
if (currentWriter != null)
|
||||
currentWriter.close();
|
||||
|
||||
currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
|
||||
if (++inputsForFile > 100_000) {
|
||||
inputsForFile = 0;
|
||||
switchToNextWriter();
|
||||
}
|
||||
currentWriter.put(header, entry);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
currentWriter.close();
|
||||
}
|
||||
}
|
@ -1,12 +1,11 @@
|
||||
package nu.marginalia.index.journal.writer;
|
||||
|
||||
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -16,27 +15,34 @@ import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.attribute.PosixFilePermissions;
|
||||
|
||||
public class IndexJournalWriterImpl implements IndexJournalWriter{
|
||||
private final KeywordLexicon lexicon;
|
||||
public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
||||
|
||||
private static final int ZSTD_BUFFER_SIZE = 8192;
|
||||
private static final int DATA_BUFFER_SIZE = 8192;
|
||||
|
||||
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
|
||||
|
||||
|
||||
private final ZstdDirectBufferCompressingStream compressingStream;
|
||||
private int numEntries = 0;
|
||||
private final FileChannel fileChannel;
|
||||
|
||||
public IndexJournalWriterImpl(KeywordLexicon lexicon, Path outputFile) throws IOException {
|
||||
this.lexicon = lexicon;
|
||||
private int numEntries = 0;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
|
||||
|
||||
logger.info("Creating Journal Writer {}", outputFile);
|
||||
|
||||
Files.deleteIfExists(outputFile);
|
||||
Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
||||
fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
|
||||
StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
|
||||
writeHeaderPlaceholder(fileChannel);
|
||||
|
||||
compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
|
||||
protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
|
||||
toFlush.flip();
|
||||
@ -64,7 +70,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
|
||||
public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
|
||||
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
|
||||
dataBuffer.flip();
|
||||
compressingStream.compress(dataBuffer);
|
||||
@ -84,6 +90,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
|
||||
dataBuffer.clear();
|
||||
}
|
||||
else while (remaining-- > 0 && i < entry.size()) {
|
||||
|
||||
dataBuffer.putLong(entry.underlyingArray[i++]);
|
||||
}
|
||||
}
|
||||
@ -103,7 +110,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
|
||||
|
||||
ByteBuffer header = ByteBuffer.allocate(16);
|
||||
header.putLong(numEntries);
|
||||
header.putLong(lexicon.size());
|
||||
header.putLong(0);
|
||||
header.flip();
|
||||
|
||||
while (header.position() < header.limit()) {
|
@ -0,0 +1,30 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class IndexJournalFileNames {
|
||||
public static Path allocateName(Path base, int idx) {
|
||||
return base.resolve(String.format("page-index-%04d.dat", idx));
|
||||
}
|
||||
|
||||
public static List<Path> findJournalFiles(Path baseDirectory) throws IOException {
|
||||
List<Path> ret = new ArrayList<>();
|
||||
|
||||
try (var listStream = Files.list(baseDirectory)) {
|
||||
listStream
|
||||
.filter(IndexJournalFileNames::isJournalFile)
|
||||
.sorted()
|
||||
.forEach(ret::add);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static boolean isJournalFile(Path file) {
|
||||
return file.toFile().getName().matches("page-index-\\d{4}\\.dat");
|
||||
}
|
||||
}
|
@ -4,13 +4,12 @@ import nu.marginalia.index.journal.model.IndexJournalEntry;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
|
||||
import nu.marginalia.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -22,15 +21,16 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class IndexJournalTest {
|
||||
Path tempFile;
|
||||
KeywordLexicon lexicon;
|
||||
IndexJournalReader reader;
|
||||
|
||||
long firstDocId = UrlIdCodec.encodeId(44, 10);
|
||||
long secondDocId = UrlIdCodec.encodeId(43, 15);
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
lexicon = Mockito.mock(KeywordLexicon.class);
|
||||
|
||||
var journalWriter = new IndexJournalWriterImpl(lexicon, tempFile);
|
||||
var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
|
||||
journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
|
||||
.add(1, 2)
|
||||
.add(2, 3)
|
||||
@ -65,11 +65,11 @@ public class IndexJournalTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void forEachUrlId() {
|
||||
List<Integer> expected = List.of(10, 15);
|
||||
List<Integer> actual = new ArrayList<>();
|
||||
public void forEachDocId() {
|
||||
List<Long> expected = List.of(firstDocId, secondDocId);
|
||||
List<Long> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachUrlId(actual::add);
|
||||
reader.forEachDocId(actual::add);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
@ -78,50 +78,19 @@ public class IndexJournalTest {
|
||||
List<Integer> expected = List.of(1, 2, 3, 5, 5, 6);
|
||||
List<Integer> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachWordId(actual::add);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void forEachUrlIdWordId() {
|
||||
List<Pair<Integer, Integer>> expected = List.of(
|
||||
Pair.of(10, 1),
|
||||
Pair.of(10, 2),
|
||||
Pair.of(10, 3),
|
||||
Pair.of(10, 5),
|
||||
Pair.of(15, 5),
|
||||
Pair.of(15, 6));
|
||||
List<Pair<Integer, Integer>> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word)));
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void forEachDocIdWordId() {
|
||||
List<Pair<Long, Integer>> expected = List.of(
|
||||
Pair.of(10L | (44L << 32), 1),
|
||||
Pair.of(10L | (44L << 32), 2),
|
||||
Pair.of(10L | (44L << 32), 3),
|
||||
Pair.of(10L | (44L << 32), 5),
|
||||
Pair.of(15L | (43L << 32), 5),
|
||||
Pair.of(15L | (43L << 32), 6));
|
||||
List<Pair<Long, Integer>> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word)));
|
||||
reader.forEachWordId(i -> actual.add((int) i));
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void forEachDocIdRecord() {
|
||||
List<Pair<Long, IndexJournalEntryData.Record>> expected = List.of(
|
||||
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)),
|
||||
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)),
|
||||
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)),
|
||||
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)),
|
||||
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)),
|
||||
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6))
|
||||
Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)),
|
||||
Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)),
|
||||
Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)),
|
||||
Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)),
|
||||
Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)),
|
||||
Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6))
|
||||
);
|
||||
List<Pair<Long, IndexJournalEntryData.Record>> actual = new ArrayList<>();
|
||||
|
||||
|
@ -9,16 +9,16 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
public interface IndexQueryBuilder {
|
||||
/** Filters documents that also contain termId, within the full index.
|
||||
*/
|
||||
IndexQueryBuilder alsoFull(int termId);
|
||||
IndexQueryBuilder alsoFull(long termId);
|
||||
|
||||
/**
|
||||
* Filters documents that also contain the termId, within the priority index.
|
||||
*/
|
||||
IndexQueryBuilder alsoPrio(int termIds);
|
||||
IndexQueryBuilder alsoPrio(long termIds);
|
||||
|
||||
/** Excludes documents that contain termId, within the full index
|
||||
*/
|
||||
IndexQueryBuilder notFull(int termId);
|
||||
IndexQueryBuilder notFull(long termId);
|
||||
|
||||
IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
|
||||
|
||||
|
@ -21,7 +21,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
|
||||
}
|
||||
|
||||
public String describe() {
|
||||
return "[NoPass]";
|
||||
return "[PassThrough]";
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -3,10 +3,10 @@ package nu.marginalia.index.searchset;
|
||||
public interface SearchSet {
|
||||
|
||||
/**
|
||||
* Returns true if the given urlId is contained in the set
|
||||
* Returns true if the given domainId is contained in the set
|
||||
* or if the documentMetadata vibes with the set
|
||||
*
|
||||
*/
|
||||
boolean contains(int urlId, long documentMetadata);
|
||||
boolean contains(int domainId, long documentMetadata);
|
||||
|
||||
}
|
||||
|
@ -18,15 +18,15 @@ dependencies {
|
||||
implementation project(':code:features-index:domain-ranking')
|
||||
implementation project(':code:features-index:index-query')
|
||||
implementation project(':code:features-index:index-journal')
|
||||
implementation project(':code:features-index:lexicon')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:process')
|
||||
|
||||
|
||||
implementation libs.lombok
|
||||
annotationProcessor libs.lombok
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.fastutil
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
code/features-index/index-reverse/index.svg (new file, 21 KiB; diff suppressed)
code/features-index/index-reverse/merging.svg (new file, 21 KiB; diff suppressed)
code/features-index/index-reverse/preindex.svg (new file, 29 KiB; diff suppressed)
@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i
|
||||
[1] See WordFlags in [common/model](../../common/model/) and
|
||||
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
|
||||
|
||||
## Construction
|
||||
|
||||
The reverse index is constructed by first building a series of preindexes.
|
||||
Preindexes consist of a Segment and a Documents object. The segment records
|
||||
which word identifiers are present and how many documents are associated with
|
||||
each, while the documents object records which documents each word appears in.
|
||||
|
||||

|
||||
|
||||
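To make the layout concrete, here is a small, hypothetical sketch of how a preindex's segment arrays line up with its document data. The names in it are illustrative only; the actual classes introduced by this change are ReversePreindexWordSegments and ReversePreindexDocuments.

```java
// A minimal, hypothetical illustration of the preindex layout described above.
// The names are illustrative; the real classes are ReversePreindexWordSegments
// (segments) and ReversePreindexDocuments (documents).
public class PreindexLayoutSketch {
    public static void main(String[] args) {
        // Segments: sorted word ids, and how many documents each word occurs in
        long[] wordIds = { 2, 5, 9 };
        long[] counts  = { 2, 1, 3 };

        // Documents: (docId, metadata) pairs, laid out word by word in the same
        // order as the segments, so word 2 owns the first 2*2 longs, word 5 the
        // next 1*2 longs, and so on.
        long[] documents = {
            101, 0xA, 204, 0xB,          // postings for word 2
            101, 0xC,                    // postings for word 5
            55, 0xD, 101, 0xE, 204, 0xF  // postings for word 9
        };

        long totalPostings = 0;
        for (long c : counts) totalPostings += c;
        System.out.println(wordIds.length + " words, " + totalPostings
                + " postings, " + documents.length + " longs of document data");
    }
}
```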
These would typically not fit in RAM, so the index journal is paged
|
||||
and the preindexes are constructed small enough to fit in memory, and
|
||||
then merged. Merging sorted arrays is a very fast operation that does
|
||||
not require additional RAM.
|
||||
|
||||

|
||||
|
||||
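The sketch below illustrates the pairwise merge passes under the same assumption of sorted runs. It is an in-memory toy, not the file-backed merge that ReversePreindex.merge and ReverseIndexConstructor actually perform, but the control flow is the same: merge neighbours two at a time until a single run remains.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Toy illustration of pairwise merging of sorted runs; the real code streams
// file-backed arrays of (docId, metadata) pairs instead of in-memory longs.
public class PairwiseMergeSketch {
    static long[] mergeSorted(long[] a, long[] b) {
        long[] out = new long[a.length + b.length];
        int i = 0, j = 0, k = 0;
        while (i < a.length && j < b.length)
            out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
        while (i < a.length) out[k++] = a[i++];
        while (j < b.length) out[k++] = b[j++];
        return out;
    }

    static long[] mergeAll(List<long[]> runs) {
        List<long[]> toMerge = new ArrayList<>(runs);
        while (toMerge.size() > 1) {
            List<long[]> merged = new ArrayList<>();
            // merge neighbours two at a time; an odd run is carried to the next pass
            for (int i = 0; i + 1 < toMerge.size(); i += 2)
                merged.add(mergeSorted(toMerge.get(i), toMerge.get(i + 1)));
            if (toMerge.size() % 2 != 0)
                merged.add(toMerge.get(toMerge.size() - 1));
            toMerge = merged;
        }
        return toMerge.get(0);
    }

    public static void main(String[] args) {
        long[] result = mergeAll(List.of(
                new long[] {1, 4, 9}, new long[] {2, 3}, new long[] {5, 8}));
        System.out.println(Arrays.toString(result)); // [1, 2, 3, 4, 5, 8, 9]
    }
}
```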
Once merged into one large preindex, BTree index structures are constructed over
|
||||
the preindex data to form the finalized reverse index.
|
||||
|
||||

|
||||
## Central Classes
|
||||
|
||||
* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index.
|
||||
* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index.
|
||||
* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index.
|
||||
* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index.
|
||||
* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
|
||||
* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
|
||||
* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
|
||||
|
||||
## See Also
|
||||
|
||||
* [index-journal](../index-journal)
|
||||
* [index-forward](../index-forward)
|
||||
* [libraries/btree](../../libraries/btree)
|
||||
* [libraries/array](../../libraries/array)
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.index.full;
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
@ -6,18 +6,18 @@ import nu.marginalia.index.query.EntrySource;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class ReverseIndexFullEntrySource implements EntrySource {
|
||||
public class ReverseIndexEntrySource implements EntrySource {
|
||||
private final BTreeReader reader;
|
||||
|
||||
int pos;
|
||||
int endOffset;
|
||||
|
||||
final int entrySize;
|
||||
private final int wordId;
|
||||
private final long wordId;
|
||||
|
||||
public ReverseIndexFullEntrySource(BTreeReader reader,
|
||||
int entrySize,
|
||||
int wordId) {
|
||||
public ReverseIndexEntrySource(BTreeReader reader,
|
||||
int entrySize,
|
||||
long wordId) {
|
||||
this.reader = reader;
|
||||
this.entrySize = entrySize;
|
||||
this.wordId = wordId;
|
@ -0,0 +1,28 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReverseIndexFullFileNames {
|
||||
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
|
||||
return switch (identifier) {
|
||||
case WORDS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-words.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-words.dat");
|
||||
};
|
||||
case DOCS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-docs.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-docs.dat");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
public enum FileVersion {
|
||||
CURRENT,
|
||||
NEXT
|
||||
};
|
||||
|
||||
public enum FileIdentifier {
|
||||
WORDS,
|
||||
DOCS
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.btree.model.BTreeBlockSize;
|
||||
import nu.marginalia.btree.model.BTreeContext;
|
||||
|
||||
public class ReverseIndexParameters
|
||||
{
|
||||
public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
|
||||
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReverseIndexPrioFileNames {
|
||||
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
|
||||
return switch (identifier) {
|
||||
case WORDS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-prio-words.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-prio-words.dat");
|
||||
};
|
||||
case DOCS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
public enum FileVersion {
|
||||
CURRENT,
|
||||
NEXT
|
||||
};
|
||||
|
||||
public enum FileIdentifier {
|
||||
WORDS,
|
||||
DOCS
|
||||
}
|
||||
}
|
@ -1,11 +1,11 @@
|
||||
package nu.marginalia.index.full;
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.EmptyEntrySource;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
|
||||
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
@ -15,18 +15,22 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class ReverseIndexFullReader {
|
||||
public class ReverseIndexReader {
|
||||
private final LongArray words;
|
||||
private final LongArray documents;
|
||||
|
||||
private final long wordsDataOffset;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final BTreeReader wordsBTreeReader;
|
||||
|
||||
public ReverseIndexFullReader(Path words, Path documents) throws IOException {
|
||||
|
||||
|
||||
public ReverseIndexReader(Path words, Path documents) throws IOException {
|
||||
if (!Files.exists(words) || !Files.exists(documents)) {
|
||||
this.words = null;
|
||||
this.documents = null;
|
||||
this.wordsBTreeReader = null;
|
||||
this.wordsDataOffset = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -34,62 +38,52 @@ public class ReverseIndexFullReader {
|
||||
|
||||
this.words = LongArray.mmapRead(words);
|
||||
this.documents = LongArray.mmapRead(documents);
|
||||
|
||||
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
}
|
||||
|
||||
public boolean isWordInDoc(int wordId, long documentId) {
|
||||
if (wordId < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long offset = words.get(wordId);
|
||||
private long wordOffset(long wordId) {
|
||||
long idx = wordsBTreeReader.findEntry(wordId);
|
||||
|
||||
if (offset < 0) {
|
||||
return false;
|
||||
}
|
||||
if (idx < 0)
|
||||
return -1L;
|
||||
|
||||
return createReaderNew(offset).findEntry(documentId) >= 0;
|
||||
return words.get(wordsDataOffset + idx + 1);
|
||||
}
|
||||
|
||||
public EntrySource documents(int wordId) {
|
||||
public EntrySource documents(long wordId) {
|
||||
if (null == words) {
|
||||
logger.warn("Reverse index is not ready, dropping query");
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
|
||||
|
||||
long offset = words.get(wordId);
|
||||
long offset = wordOffset(wordId);
|
||||
|
||||
if (offset < 0) return new EmptyEntrySource();
|
||||
|
||||
return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, wordId);
|
||||
return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf also(int wordId) {
|
||||
if (wordId < 0) return new QueryFilterNoPass();
|
||||
|
||||
long offset = words.get(wordId);
|
||||
public QueryFilterStepIf also(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
|
||||
if (offset < 0) return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf not(int wordId) {
|
||||
if (wordId < 0) return new QueryFilterLetThrough();
|
||||
|
||||
long offset = words.get(wordId);
|
||||
public QueryFilterStepIf not(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
|
||||
if (offset < 0) return new QueryFilterLetThrough();
|
||||
|
||||
return new ReverseIndexRejectFilter(createReaderNew(offset));
|
||||
}
|
||||
|
||||
public int numDocuments(int wordId) {
|
||||
if (wordId < 0)
|
||||
return 0;
|
||||
|
||||
long offset = words.get(wordId);
|
||||
public int numDocuments(long wordId) {
|
||||
long offset = wordOffset(wordId);
|
||||
|
||||
if (offset < 0)
|
||||
return 0;
|
||||
@ -98,23 +92,33 @@ public class ReverseIndexFullReader {
|
||||
}
|
||||
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(documents, ReverseIndexFullParameters.bTreeContext, offset);
|
||||
return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset);
|
||||
}
|
||||
|
||||
public long[] getTermMeta(int wordId, long[] docIds) {
|
||||
if (wordId < 0) {
|
||||
return new long[docIds.length];
|
||||
}
|
||||
public long[] getTermMeta(long wordId, long[] docIds) {
|
||||
long offset = wordOffset(wordId);
|
||||
|
||||
long offset = words.get(wordId);
|
||||
if (offset < 0) {
|
||||
return new long[docIds.length];
|
||||
}
|
||||
|
||||
Arrays.sort(docIds);
|
||||
assert isSorted(docIds) : "The input array docIds is assumed to be sorted";
|
||||
|
||||
var reader = createReaderNew(offset);
|
||||
return reader.queryData(docIds, 1);
|
||||
}
|
||||
|
||||
private boolean isSorted(long[] ids) {
|
||||
if (ids.length == 0)
|
||||
return true;
|
||||
long prev = ids[0];
|
||||
|
||||
for (int i = 1; i < ids.length; i++) {
|
||||
if (ids[i] <= prev)
|
||||
return false;
|
||||
prev = ids[i];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,9 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
public interface DocIdRewriter {
|
||||
long rewriteDocId(long docId);
|
||||
|
||||
static DocIdRewriter identity() {
|
||||
return l -> l;
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public interface JournalReaderSource {
|
||||
IndexJournalReader construct(Path sourceFile) throws IOException;
|
||||
}
|
@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.functional.LongIOTransformer;
|
||||
import nu.marginalia.btree.BTreeWriter;
|
||||
import nu.marginalia.btree.model.BTreeContext;
|
||||
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
|
@ -0,0 +1,115 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.index.journal.IndexJournalFileNames;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class ReverseIndexConstructor {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
|
||||
|
||||
public enum CreateReverseIndexSteps {
|
||||
CREATE_PREINDEXES,
|
||||
MERGE_PREINDEXES,
|
||||
FINALIZE,
|
||||
FINISHED
|
||||
}
|
||||
public static void createReverseIndex(
|
||||
ProcessHeartbeat processHeartbeat,
|
||||
JournalReaderSource readerSource,
|
||||
Path sourceBaseDir,
|
||||
DocIdRewriter docIdRewriter,
|
||||
Path tmpDir,
|
||||
Path outputFileDocs,
|
||||
Path outputFileWords) throws IOException
|
||||
{
|
||||
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
|
||||
if (inputs.isEmpty()) {
|
||||
logger.error("No journal files in base dir {}", sourceBaseDir);
|
||||
return;
|
||||
}
|
||||
|
||||
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, "createReverseIndex")) {
|
||||
List<ReversePreindex> preindexes = new ArrayList<>();
|
||||
|
||||
heartbeat.progress(CreateReverseIndexSteps.CREATE_PREINDEXES);
|
||||
|
||||
try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) {
|
||||
for (int i = 0; i < inputs.size(); i++) {
|
||||
var input = inputs.get(i);
|
||||
|
||||
preindexHeartbeat.progress(input.toFile().getName(), i, inputs.size());
|
||||
|
||||
preindexes.add(ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir));
|
||||
}
|
||||
|
||||
preindexHeartbeat.progress("FINISHED", inputs.size(), inputs.size());
|
||||
}
|
||||
|
||||
heartbeat.progress(CreateReverseIndexSteps.MERGE_PREINDEXES);
|
||||
ReversePreindex finalPreindex;
|
||||
|
||||
try (var mergeHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("mergePreindexes")) {
|
||||
finalPreindex = mergePreindexes(tmpDir, mergeHeartbeat, preindexes);
|
||||
}
|
||||
|
||||
heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
|
||||
finalPreindex.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
|
||||
heartbeat.progress(CreateReverseIndexSteps.FINISHED);
|
||||
finalPreindex.delete();
|
||||
}
|
||||
}
|
||||
|
||||
private static ReversePreindex mergePreindexes(Path workDir, ProcessAdHocTaskHeartbeat mergeHeartbeat, List<ReversePreindex> preindexes) throws IOException {
|
||||
assert !preindexes.isEmpty();
|
||||
|
||||
if (preindexes.size() == 1) {
|
||||
logger.info("Single preindex, no merge necessary");
|
||||
return preindexes.get(0);
|
||||
}
|
||||
|
||||
List<ReversePreindex> toMerge = new ArrayList<>(preindexes);
|
||||
List<ReversePreindex> merged = new ArrayList<>();
|
||||
|
||||
int pass = 0;
|
||||
while (toMerge.size() != 1) {
|
||||
String stage = String.format("PASS[%d]: %d -> %d", ++pass,
|
||||
toMerge.size(),
|
||||
toMerge.size()/2 + (toMerge.size() % 2)
|
||||
);
|
||||
|
||||
for (int i = 0; i + 1 < toMerge.size(); i+=2) {
|
||||
mergeHeartbeat.progress(stage, i/2, toMerge.size()/2);
|
||||
|
||||
var left = toMerge.get(i);
|
||||
var right = toMerge.get(i+1);
|
||||
|
||||
merged.add(ReversePreindex.merge(workDir, left, right));
|
||||
|
||||
left.delete();
|
||||
right.delete();
|
||||
}
|
||||
|
||||
if ((toMerge.size() % 2) != 0) {
|
||||
merged.add(toMerge.get(toMerge.size()-1));
|
||||
}
|
||||
|
||||
toMerge.clear();
|
||||
toMerge.addAll(merged);
|
||||
merged.clear();
|
||||
}
|
||||
|
||||
mergeHeartbeat.progress("FINISHED", 1, 1);
|
||||
|
||||
return toMerge.get(0);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,280 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import nu.marginalia.btree.BTreeWriter;
|
||||
import nu.marginalia.index.ReverseIndexParameters;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
import static nu.marginalia.array.algo.TwoArrayOperations.*;
|
||||
|
||||
/** Contains the data that would go into a reverse index,
|
||||
* that is, a mapping from words to documents, minus the actual
|
||||
* index structure that makes the data quick to access while
|
||||
* searching.
|
||||
* <p>
|
||||
* Two preindexes can be merged into a third preindex containing
|
||||
* the union of their data. This operation requires no additional
|
||||
* RAM.
|
||||
*/
|
||||
public class ReversePreindex {
|
||||
final ReversePreindexWordSegments segments;
|
||||
final ReversePreindexDocuments documents;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
|
||||
|
||||
public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
|
||||
this.segments = segments;
|
||||
this.documents = documents;
|
||||
}
|
||||
|
||||
/** Constructs a new preindex with the data associated with reader. The backing files
|
||||
* will have randomly assigned names.
|
||||
*/
|
||||
public static ReversePreindex constructPreindex(IndexJournalReader reader,
|
||||
DocIdRewriter docIdRewriter,
|
||||
Path destDir) throws IOException
|
||||
{
|
||||
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
|
||||
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
|
||||
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
||||
|
||||
logger.info("Segmenting");
|
||||
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
|
||||
logger.info("Mapping docs");
|
||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments);
|
||||
logger.info("Done");
|
||||
return new ReversePreindex(segments, docs);
|
||||
}
|
||||
|
||||
/** Transform the preindex into a reverse index */
|
||||
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
|
||||
var offsets = segments.counts;
|
||||
|
||||
Files.deleteIfExists(outputFileDocs);
|
||||
Files.deleteIfExists(outputFileWords);
|
||||
|
||||
// Estimate the size of the docs index data
|
||||
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
|
||||
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
|
||||
offsets.fold(0, 0, offsets.size(), sizeEstimator);
|
||||
|
||||
// Write the docs file
|
||||
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
|
||||
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
|
||||
offsets.transformEachIO(0, offsets.size(), new ReverseIndexBTreeTransformer(finalDocs, 2, ReverseIndexParameters.docsBTreeContext, intermediateDocChannel));
|
||||
intermediateDocChannel.force(false);
|
||||
}
|
||||
|
||||
LongArray wordIds = segments.wordIds;
|
||||
|
||||
assert offsets.size() == wordIds.size() : "Offsets and word-ids of different size";
|
||||
|
||||
// Estimate the size of the words index data
|
||||
long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
|
||||
|
||||
// Construct the tree
|
||||
LongArray wordsArray = LongArray.mmapForWriting(outputFileWords, wordsSize);
|
||||
|
||||
new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
|
||||
.write(0, (int) offsets.size(), mapRegion -> {
|
||||
for (long i = 0; i < offsets.size(); i++) {
|
||||
mapRegion.set(2*i, wordIds.get(i));
|
||||
mapRegion.set(2*i + 1, offsets.get(i));
|
||||
}
|
||||
});
|
||||
|
||||
wordsArray.force();
|
||||
|
||||
}
|
||||
|
||||
/** Delete all files associated with this pre-index */
|
||||
public void delete() throws IOException {
|
||||
segments.delete();
|
||||
documents.delete();
|
||||
}
|
||||
|
||||
public static ReversePreindex merge(Path destDir,
|
||||
ReversePreindex left,
|
||||
ReversePreindex right) throws IOException {
|
||||
|
||||
ReversePreindexWordSegments mergingSegment =
|
||||
createMergedSegmentWordFile(destDir, left.segments, right.segments);
|
||||
|
||||
var mergingIter = mergingSegment.constructionIterator(2);
|
||||
var leftIter = left.segments.iterator(2);
|
||||
var rightIter = right.segments.iterator(2);
|
||||
|
||||
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
||||
|
||||
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
|
||||
|
||||
leftIter.next();
|
||||
rightIter.next();
|
||||
|
||||
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
|
||||
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
|
||||
{
|
||||
|
||||
while (mergingIter.canPutMore()
|
||||
&& leftIter.isPositionBeforeEnd()
|
||||
&& rightIter.isPositionBeforeEnd())
|
||||
{
|
||||
final long currentWord = mergingIter.wordId;
|
||||
|
||||
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
|
||||
{
|
||||
// both inputs have documents for the current word
|
||||
mergeSegments(leftIter, rightIter,
|
||||
left.documents, right.documents,
|
||||
mergedDocuments, mergingIter);
|
||||
}
|
||||
else if (leftIter.wordId == currentWord) {
|
||||
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
|
||||
break;
|
||||
}
|
||||
else if (rightIter.wordId == currentWord) {
|
||||
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
|
||||
break;
|
||||
}
|
||||
else assert false : "This should never happen"; // the helvetica scenario
|
||||
}
|
||||
|
||||
if (leftIter.isPositionBeforeEnd()) {
|
||||
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
|
||||
}
|
||||
|
||||
if (rightIter.isPositionBeforeEnd()) {
|
||||
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
|
||||
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
|
||||
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
|
||||
|
||||
// We may have overestimated the size of the merged docs file in case there were
|
||||
// duplicates in the data, so we need to shrink it to the actual size we wrote.
|
||||
|
||||
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
|
||||
docsFile, 2 * mergingSegment.totalSize());
|
||||
|
||||
mergingSegment.force();
|
||||
|
||||
return new ReversePreindex(
|
||||
mergingSegment,
|
||||
new ReversePreindexDocuments(mergedDocuments, docsFile)
|
||||
);
|
||||
}
|
||||
|
||||
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
|
||||
* This is an intermediate product in merging.
|
||||
*/
|
||||
static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
|
||||
ReversePreindexWordSegments left,
|
||||
ReversePreindexWordSegments right) throws IOException {
|
||||
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
|
||||
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
|
||||
|
||||
long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
|
||||
0, left.wordIds.size(),
|
||||
0, right.wordIds.size());
|
||||
|
||||
LongArray wordIdsFile = LongArray.mmapForWriting(segmentWordsFile, segmentsSize);
|
||||
|
||||
mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
|
||||
0, wordIdsFile.size(),
|
||||
0, left.wordIds.size(),
|
||||
0, right.wordIds.size());
|
||||
|
||||
LongArray counts = LongArray.mmapForWriting(segmentCountsFile, segmentsSize);
|
||||
|
||||
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
|
||||
}
|
||||
|
||||
/** It's possible we overestimated the necessary size of the documents file,
|
||||
* this will permit us to shrink it down to the smallest necessary size.
|
||||
*/
|
||||
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
|
||||
|
||||
mergedDocuments.force();
|
||||
|
||||
long beforeSize = mergedDocuments.size();
|
||||
try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
|
||||
bc.truncate(sizeLongs * 8);
|
||||
}
|
||||
mergedDocuments = LongArray.mmapForWriting(docsFile, sizeLongs);
|
||||
long afterSize = mergedDocuments.size();
|
||||
|
||||
if (beforeSize != afterSize) {
|
||||
logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
|
||||
}
|
||||
|
||||
return mergedDocuments;
|
||||
}
|
||||
|
||||
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
|
||||
* segment, and advance the construction iterator with the appropriate size.
|
||||
*/
|
||||
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
|
||||
ReversePreindexWordSegments.SegmentIterator rightIter,
|
||||
ReversePreindexDocuments left,
|
||||
ReversePreindexDocuments right,
|
||||
LongArray dest,
|
||||
ReversePreindexWordSegments.SegmentConstructionIterator destIter)
|
||||
{
|
||||
long distinct = countDistinctElementsN(2,
|
||||
left.documents,
|
||||
right.documents,
|
||||
leftIter.startOffset, leftIter.endOffset,
|
||||
rightIter.startOffset, rightIter.endOffset);
|
||||
|
||||
mergeArrays2(dest,
|
||||
left.documents,
|
||||
right.documents,
|
||||
destIter.startOffset,
|
||||
destIter.startOffset + 2*distinct,
|
||||
leftIter.startOffset, leftIter.endOffset,
|
||||
rightIter.startOffset, rightIter.endOffset);
|
||||
|
||||
destIter.putNext(distinct);
|
||||
leftIter.next();
|
||||
rightIter.next();
|
||||
}
|
||||
|
||||
/** Copy the data from the source segment at the position and length indicated by sourceIter,
|
||||
* into the destination segment, and advance the construction iterator.
|
||||
*/
|
||||
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
|
||||
LongArray dest,
|
||||
FileChannel sourceChannel,
|
||||
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
|
||||
|
||||
long size = sourceIter.endOffset - sourceIter.startOffset;
|
||||
long start = mergingIter.startOffset;
|
||||
long end = start + size;
|
||||
|
||||
dest.transferFrom(sourceChannel,
|
||||
sourceIter.startOffset,
|
||||
mergingIter.startOffset,
|
||||
end);
|
||||
|
||||
boolean putNext = mergingIter.putNext(size / 2);
|
||||
boolean iterNext = sourceIter.next();
|
||||
|
||||
assert putNext || !iterNext : "Source iterator ran out before dest iterator?!";
|
||||
|
||||
return iterNext;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,123 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** A LongArray with document data, segmented according to
|
||||
* the associated ReversePreindexWordSegments data
|
||||
*/
|
||||
public class ReversePreindexDocuments {
|
||||
private final Path file;
|
||||
public final LongArray documents;
|
||||
private static final int RECORD_SIZE_LONGS = 2;
|
||||
private static final Logger logger= LoggerFactory.getLogger(ReversePreindexDocuments.class);
|
||||
|
||||
public ReversePreindexDocuments(LongArray documents, Path file) {
|
||||
this.documents = documents;
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
public static ReversePreindexDocuments construct(
|
||||
Path docsFile,
|
||||
IndexJournalReader reader,
|
||||
DocIdRewriter docIdRewriter,
|
||||
ReversePreindexWordSegments segments) throws IOException {
|
||||
|
||||
|
||||
logger.info("Transferring data");
|
||||
createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter);
|
||||
|
||||
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
|
||||
logger.info("Sorting data");
|
||||
sortDocsFile(docsFileMap, segments);
|
||||
|
||||
return new ReversePreindexDocuments(docsFileMap, docsFile);
|
||||
}
|
||||
|
||||
public FileChannel createDocumentsFileChannel() throws IOException {
|
||||
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
|
||||
}
|
||||
|
||||
|
||||
public LongArray slice(long start, long end) {
|
||||
return documents.range(start, end);
|
||||
}
|
||||
|
||||
public long size() {
|
||||
return documents.size();
|
||||
}
|
||||
|
||||
private static void createUnsortedDocsFile(Path docsFile,
|
||||
IndexJournalReader reader,
|
||||
ReversePreindexWordSegments segments,
|
||||
DocIdRewriter docIdRewriter) throws IOException {
|
||||
long fileSize = 8 * segments.totalSize();
|
||||
LongArray outArray = LongArray.mmapForWriting(docsFile, fileSize);
|
||||
|
||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||
offsetMap.defaultReturnValue(0);
|
||||
|
||||
for (var entry : reader) {
|
||||
long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId());
|
||||
|
||||
var data = entry.readEntry();
|
||||
for (int i = 0; i + 1 < data.size(); i+=2) {
|
||||
long wordId = data.get(i);
|
||||
long meta = data.get(i+1);
|
||||
|
||||
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
|
||||
|
||||
outArray.set(offset + 0, rankEncodedId);
|
||||
outArray.set(offset + 1, meta);
|
||||
}
|
||||
}
|
||||
|
||||
outArray.force();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
|
||||
|
||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||
|
||||
ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
|
||||
|
||||
while (iter.next()) {
|
||||
if (iter.size() < 1024) {
|
||||
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
|
||||
iter.startOffset,
|
||||
iter.endOffset);
|
||||
}
|
||||
else {
|
||||
sortingWorkers.execute(() ->
|
||||
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
|
||||
iter.startOffset,
|
||||
iter.endOffset));
|
||||
}
|
||||
}
|
||||
|
||||
sortingWorkers.shutdown();
|
||||
logger.info("Awaiting shutdown");
|
||||
|
||||
while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
|
||||
|
||||
sortingWorkers.close();
|
||||
}
|
||||
|
||||
public void delete() throws IOException {
|
||||
Files.delete(this.file);
|
||||
}
|
||||
}
|
@ -0,0 +1,197 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.LongIterator;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** A pair of file-backed arrays of sorted wordIds
|
||||
* and the count of documents associated with each wordId.
|
||||
*/
|
||||
public class ReversePreindexWordSegments {
|
||||
public final LongArray wordIds;
|
||||
public final LongArray counts;
|
||||
|
||||
private final Path wordsFile;
|
||||
private final Path countsFile;
|
||||
|
||||
public ReversePreindexWordSegments(LongArray wordIds,
|
||||
LongArray counts,
|
||||
Path wordsFile,
|
||||
Path countsFile)
|
||||
{
|
||||
assert wordIds.size() == counts.size();
|
||||
|
||||
this.wordIds = wordIds;
|
||||
this.counts = counts;
|
||||
this.wordsFile = wordsFile;
|
||||
this.countsFile = countsFile;
|
||||
}
|
||||
|
||||
/** Returns a long-long hash map where each key is a wordId,
|
||||
* and each value is the start offset of the data.
|
||||
*/
|
||||
public Long2LongOpenHashMap asMap(int recordSize) {
|
||||
Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
|
||||
var iter = iterator(recordSize);
|
||||
|
||||
while (iter.next()) {
|
||||
ret.put(iter.wordId, iter.startOffset);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
|
||||
Path wordIdsFile,
|
||||
Path countsFile)
|
||||
throws IOException
|
||||
{
|
||||
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
|
||||
countsMap.defaultReturnValue(0);
|
||||
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
|
||||
|
||||
LongArray words = LongArray.mmapForWriting(wordIdsFile, countsMap.size());
|
||||
LongArray counts = LongArray.mmapForWriting(countsFile, countsMap.size());
|
||||
|
||||
// Create the words file by iterating over the map and inserting them into
|
||||
// the words file in whatever bizarro hash table order they appear in
|
||||
int i = 0;
|
||||
LongIterator iter = countsMap.keySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
words.set(i, iter.nextLong());
|
||||
i++;
|
||||
}
|
||||
|
||||
// Sort the words file
|
||||
words.quickSort(0, counts.size());
|
||||
|
||||
// Populate the counts
|
||||
for (i = 0; i < countsMap.size(); i++) {
|
||||
counts.set(i, countsMap.get(words.get(i)));
|
||||
}
|
||||
|
||||
return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
|
||||
}
|
||||
|
||||
public SegmentIterator iterator(int recordSize) {
|
||||
return new SegmentIterator(recordSize);
|
||||
}
|
||||
public SegmentConstructionIterator constructionIterator(int recordSize) {
|
||||
return new SegmentConstructionIterator(recordSize);
|
||||
}
|
||||
|
||||
public long totalSize() {
|
||||
return counts.fold(0, 0, counts.size(), Long::sum);
|
||||
}
|
||||
|
||||
public void delete() throws IOException {
|
||||
Files.delete(countsFile);
|
||||
Files.delete(wordsFile);
|
||||
}
|
||||
|
||||
public void force() {
|
||||
counts.force();
|
||||
wordIds.force();
|
||||
}
|
||||
|
||||
public class SegmentIterator {
|
||||
private final int recordSize;
|
||||
private final long fileSize;
|
||||
long wordId;
|
||||
long startOffset = 0;
|
||||
long endOffset = 0;
|
||||
|
||||
private SegmentIterator(int recordSize) {
|
||||
this.recordSize = recordSize;
|
||||
this.fileSize = wordIds.size();
|
||||
}
|
||||
|
||||
private int i = -1;
|
||||
public int idx() {
|
||||
return i;
|
||||
}
|
||||
public boolean next() {
|
||||
if (++i >= fileSize) {
|
||||
wordId = Long.MIN_VALUE;
|
||||
return false;
|
||||
}
|
||||
|
||||
wordId = wordIds.get(i);
|
||||
startOffset = endOffset;
|
||||
endOffset = startOffset + recordSize * counts.get(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean hasMorePositions() {
|
||||
return i + 1 < wordIds.size();
|
||||
}
|
||||
|
||||
public boolean isPositionBeforeEnd() {
|
||||
return i < wordIds.size();
|
||||
}
|
||||
|
||||
public long size() {
|
||||
return endOffset - startOffset;
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentConstructionIterator {
|
||||
private final int recordSize;
|
||||
private final long fileSize;
|
||||
long wordId;
|
||||
long startOffset = 0;
|
||||
long endOffset = 0;
|
||||
|
||||
private SegmentConstructionIterator(int recordSize) {
|
||||
this.recordSize = recordSize;
|
||||
this.fileSize = wordIds.size();
|
||||
if (fileSize == 0) {
|
||||
throw new IllegalArgumentException("Cannot construct zero-length word segment file");
|
||||
}
|
||||
this.wordId = wordIds.get(0);
|
||||
}
|
||||
|
||||
private int i = 0;
|
||||
public int idx() {
|
||||
return i;
|
||||
}
|
||||
|
||||
public boolean putNext(long size) {
|
||||
|
||||
if (i >= fileSize)
|
||||
return false;
|
||||
|
||||
endOffset = startOffset + recordSize * size;
|
||||
counts.set(i, size);
|
||||
startOffset = endOffset;
|
||||
endOffset = -1;
|
||||
|
||||
i++;
|
||||
|
||||
if (i == fileSize) {
|
||||
// We've reached the end of the iteration and there is no
|
||||
// "next" wordId to fetch
|
||||
wordId = Long.MIN_VALUE;
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
wordId = wordIds.get(i);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean canPutMore() {
|
||||
return i < wordIds.size();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,218 +0,0 @@
|
||||
package nu.marginalia.index.full;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.construction.CountToOffsetTransformer;
|
||||
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
|
||||
import nu.marginalia.index.construction.IndexSizeEstimator;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalStatistics;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.ranking.DomainRankings;
|
||||
import nu.marginalia.rwf.RandomWriteFunnel;
|
||||
import nu.marginalia.array.IntArray;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
|
||||
import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext;
|
||||
|
||||
public class ReverseIndexFullConverter {
|
||||
private static final int RWF_BIN_SIZE = 10_000_000;
|
||||
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final Path tmpFileDir;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final IndexJournalReader journalReader;
|
||||
private final DomainRankings domainRankings;
|
||||
private final Path outputFileWords;
|
||||
private final Path outputFileDocs;
|
||||
private final SortingContext sortingContext;
|
||||
|
||||
public ReverseIndexFullConverter(ServiceHeartbeat heartbeat,
|
||||
Path tmpFileDir,
|
||||
IndexJournalReader journalReader,
|
||||
DomainRankings domainRankings,
|
||||
Path outputFileWords,
|
||||
Path outputFileDocs) {
|
||||
this.heartbeat = heartbeat;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.journalReader = journalReader;
|
||||
this.domainRankings = domainRankings;
|
||||
this.outputFileWords = outputFileWords;
|
||||
this.outputFileDocs = outputFileDocs;
|
||||
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
}

public enum TaskSteps {
ACCUMULATE_STATISTICS,
INCREMENT_OFFSETS,
COUNT_OFFSETS,
CREATE_INTERMEDIATE_DOCS,
SORT_INTERMEDIATE_DOCS,
SIZING,
FINALIZING_DOCS,
FORCE,
FINISHED,
}

public void convert() throws IOException {
deleteOldFiles();

if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
logger.warn("Bailing: Journal is empty!");
return;
}

final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");

try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) {
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);

final IndexJournalStatistics statistics = journalReader.getStatistics();
final long wordsFileSize = statistics.highestWord() + 1;

progress.progress(TaskSteps.INCREMENT_OFFSETS);

logger.debug("Words file size: {}", wordsFileSize);
// Create a count of how many documents contain each word
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);

journalReader.forEachWordId(wordsOffsets::increment);
progress.progress(TaskSteps.COUNT_OFFSETS);

wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE));

progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);

// Construct an intermediate representation of the reverse documents index
try (FileChannel intermediateDocChannel =
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
{

// Construct intermediate index
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
)
{
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
}
intermediateDocChannel.force(false);
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);

// Sort each segment of the intermediate file
{
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexFullParameters.ENTRY_SIZE, s, e);
return e;
});
intermediateDocs.force();
}

progress.progress(TaskSteps.SIZING);

IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
ReverseIndexFullParameters.bTreeContext,
ReverseIndexFullParameters.ENTRY_SIZE);

wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
progress.progress(TaskSteps.FINALIZING_DOCS);

LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
// Construct the proper reverse index
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
wordsOffsets.write(outputFileWords);

progress.progress(TaskSteps.FORCE);

// Attempt to clean up before forcing (important for disk space preservation)
Files.deleteIfExists(intermediateUrlsFile);

wordsOffsets.force();
finalDocs.force();

progress.progress(TaskSteps.FINISHED);
}

} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
} finally {
Files.deleteIfExists(intermediateUrlsFile);
}
}

private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileWords);
Files.deleteIfExists(outputFileDocs);
}

private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {

private final LongArray wordRangeEnds;
private final IntArray wordRangeOffset;
private final RandomWriteFunnel documentsFile;

private final Path tempFile;

public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
tempFile = Files.createTempFile(tempDir, "iic", "dat");

this.wordRangeEnds = wordRangeEnds;
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
this.documentsFile = documentsFile;
}

@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {

/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/

int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;

int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;

final int wordId = record.wordId();
long offset = startOfRange(wordId);

documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
}

private long startOfRange(int wordId) {
if (wordId == 0) return 0;

return wordRangeEnds.get(wordId - 1);
}

public void close() throws IOException {
Files.delete(tempFile);
}
}

}
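Editor's note: the accept() method above packs the domain ranking into the upper 32 bits of each document identifier so that a plain numeric sort puts better-ranked (lower-numbered) documents first. A minimal sketch of that packing, assuming only that the ranking fits in 32 bits; the helper class below is illustrative and not part of this commit:

    // Illustrative sketch of the [ ranking | url-id ] layout described in the comment above.
    final class RankEncodedId {
        static long encode(int ranking, int urlId) {
            // ranking occupies bits 32..63, the url id bits 0..31
            return ((long) ranking << 32) | (urlId & 0xFFFF_FFFFL);
        }
        static int ranking(long encoded) { return (int) (encoded >>> 32); }
        static int urlId(long encoded)   { return (int) (encoded & 0xFFFF_FFFFL); }
    }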
@ -1,16 +0,0 @@
package nu.marginalia.index.full;

import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;

public class ReverseIndexFullParameters {
static final int ENTRY_SIZE = 2;

// This is the byte size per index page on disk; the data pages are twice as large due to ENTRY_SIZE = 2.
//
// Given a hardware limit of 4k reads, 2k block size should be optimal.
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048;

static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
}
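Editor's note: the comment in the removed parameters class is easier to follow with the arithmetic spelled out. Assuming BS_2048 denotes a 2048-byte index page and postings are stored as longs, the data pages land exactly on the 4K read granularity mentioned above (back-of-the-envelope sketch only, not part of the commit):

    int indexPageBytes = 2048;                        // BS_2048
    int entrySize = 2;                                // longs per posting (doc id + metadata)
    int dataPageBytes = indexPageBytes * entrySize;   // 4096 bytes, i.e. one 4K read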
@ -1,215 +0,0 @@
package nu.marginalia.index.priority;

import lombok.SneakyThrows;
import nu.marginalia.array.IntArray;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.service.control.ServiceHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext;

public class ReverseIndexPriorityConverter {
private static final int RWF_BIN_SIZE = 10_000_000;

private final ServiceHeartbeat heartbeat;
private final Path tmpFileDir;

private final Logger logger = LoggerFactory.getLogger(getClass());

private final IndexJournalReader journalReader;
private final DomainRankings domainRankings;
private final Path outputFileWords;
private final Path outputFileDocs;
private final SortingContext sortingContext;

public ReverseIndexPriorityConverter(ServiceHeartbeat heartbeat,
Path tmpFileDir,
IndexJournalReader journalReader,
DomainRankings domainRankings,
Path outputFileWords,
Path outputFileDocs) {
this.heartbeat = heartbeat;
this.tmpFileDir = tmpFileDir;
this.journalReader = journalReader;
this.domainRankings = domainRankings;
this.outputFileWords = outputFileWords;
this.outputFileDocs = outputFileDocs;
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
}

public enum TaskSteps {
ACCUMULATE_STATISTICS,
INCREMENT_OFFSETS,
COUNT_OFFSETS,
CREATE_INTERMEDIATE_DOCS,
SORT_INTERMEDIATE_DOCS,
SIZING,
FINALIZING_DOCS,
FORCE,
FINISHED,
}

public void convert() throws IOException {
deleteOldFiles();

if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
logger.warn("Bailing: Journal is empty!");
return;
}

final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");

try (var progress = heartbeat.createServiceTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) {
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);

final IndexJournalStatistics statistics = journalReader.getStatistics();
final long wordsFileSize = statistics.highestWord() + 1;

progress.progress(TaskSteps.INCREMENT_OFFSETS);

logger.debug("Words file size: {}", wordsFileSize);
// Create a count of how many documents contain each word
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);

journalReader.forEachWordId(wordsOffsets::increment);
progress.progress(TaskSteps.COUNT_OFFSETS);

wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE));

progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);

// Construct an intermediate representation of the reverse documents index
try (FileChannel intermediateDocChannel =
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
{

// Construct intermediate index
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
)
{
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
}
intermediateDocChannel.force(false);
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);

// Sort each segment of the intermediate file
{
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
intermediateDocs.sortLargeSpan(sortingContext, s, e);
return e;
});
intermediateDocs.force();
}

progress.progress(TaskSteps.SIZING);

IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
bTreeContext,
ReverseIndexPriorityParameters.ENTRY_SIZE);

wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
progress.progress(TaskSteps.FINALIZING_DOCS);

LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
// Construct the proper reverse index
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
wordsOffsets.write(outputFileWords);

progress.progress(TaskSteps.FORCE);

// Attempt to clean up before forcing (important for disk space preservation)
Files.deleteIfExists(intermediateUrlsFile);

wordsOffsets.force();
finalDocs.force();

progress.progress(TaskSteps.FINISHED);
}

} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
} finally {
Files.deleteIfExists(intermediateUrlsFile);
}
}

private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileWords);
Files.deleteIfExists(outputFileDocs);
}

private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {

private final LongArray wordRangeEnds;
private final IntArray wordRangeOffset;
private final RandomWriteFunnel documentsFile;

private final Path tempFile;

public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
tempFile = Files.createTempFile(tempDir, "iic", "dat");

this.wordRangeEnds = wordRangeEnds;
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
this.documentsFile = documentsFile;
}

@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {

/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/

int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;

int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;

final int wordId = record.wordId();
long offset = startOfRange(wordId);

documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
}

private long startOfRange(int wordId) {
if (wordId == 0) return 0;

return wordRangeEnds.get(wordId - 1);
}

public void close() throws IOException {
Files.delete(tempFile);
}
}

}
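Editor's note: both removed converters follow the same counting-sort layout: count the postings per word, turn the counts into range end offsets with CountToOffsetTransformer, then write each record at its word's next free slot (startOfRange() reads the previous word's end). A self-contained sketch of the count-to-offset step under those assumptions; this is not the actual CountToOffsetTransformer implementation:

    // counts[w] = number of postings for word w; returns the end offset of each
    // word's range, scaled by the number of longs per posting.
    static long[] countsToEndOffsets(long[] counts, int entrySize) {
        long[] ends = new long[counts.length];
        long running = 0;
        for (int w = 0; w < counts.length; w++) {
            running += counts[w] * entrySize;
            ends[w] = running; // start of word w's range is ends[w - 1], or 0 for the first word
        }
        return ends;
    }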
@ -1,48 +0,0 @@
package nu.marginalia.index.priority;

import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;

import static java.lang.Math.min;

public class ReverseIndexPriorityEntrySource implements EntrySource {
private final BTreeReader reader;

int pos;
int endOffset;

private final int wordId;

public ReverseIndexPriorityEntrySource(BTreeReader reader, int wordId) {
this.reader = reader;
this.wordId = wordId;

pos = 0;
endOffset = pos + reader.numEntries();
}

@Override
public void skip(int n) {
pos += n;
}

@Override
public void read(LongQueryBuffer buffer) {
buffer.end = min(buffer.end, endOffset - pos);
reader.readData(buffer.data, buffer.end, pos);
pos += buffer.end;

buffer.uniq();
}

@Override
public boolean hasMore() {
return pos < endOffset;
}

@Override
public String indexName() {
return "Priority:" + wordId;
}
}
@ -1,31 +0,0 @@
package nu.marginalia.index.priority;

import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.model.idx.WordFlags;

public class ReverseIndexPriorityParameters {
static final int ENTRY_SIZE = 1;
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_4096;

static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);

private static final long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
| WordFlags.TfIdfHigh.asBit()
| WordFlags.NamesWords.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.SiteAdjacent.asBit();

public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) {
long meta = record.metadata();

return (meta & highPriorityFlags) != 0;
}

}
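Editor's note: filterPriorityRecord() above keeps a posting only when its term metadata has at least one of the high-priority flag bits set. A runnable illustration of that bitmask test with invented bit positions; the real bits come from WordFlags.asBit(), and this class is not part of the commit:

    public class PriorityFlagDemo {
        public static void main(String[] args) {
            long TITLE = 1L << 0, SUBJECTS = 1L << 1, TFIDF_HIGH = 1L << 2; // invented positions
            long highPriorityFlags = TITLE | SUBJECTS | TFIDF_HIGH;

            long metaWithTitle = TITLE | (1L << 40);   // carries a priority flag
            long metaWithoutFlags = 1L << 40;          // carries none

            System.out.println((metaWithTitle & highPriorityFlags) != 0);    // true  -> record kept
            System.out.println((metaWithoutFlags & highPriorityFlags) != 0); // false -> record dropped
        }
    }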
@ -1,77 +0,0 @@
package nu.marginalia.index.priority;

import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.index.query.filter.QueryFilterNoPass;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class ReverseIndexPriorityReader {
private final LongArray words;
private final LongArray documents;

private final Logger logger = LoggerFactory.getLogger(getClass());

public ReverseIndexPriorityReader(Path words, Path documents) throws IOException {
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
return;
}

logger.info("Switching prio reverse index");

this.words = LongArray.mmapRead(words);
this.documents = LongArray.mmapRead(documents);
}

public EntrySource priorityDocuments(int wordId) {
if (words == null) {
// index not loaded
return new EmptyEntrySource();
}

if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();

long offset = words.get(wordId);

if (offset < 0) return new EmptyEntrySource();

return new ReverseIndexPriorityEntrySource(createReaderNew(offset), wordId);
}

private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
}

public QueryFilterStepIf also(int wordId) {
if (wordId < 0) return new QueryFilterNoPass();

long offset = words.get(wordId);

if (offset < 0) return new QueryFilterNoPass();

return new ReverseIndexRetainFilter(createReaderNew(offset), "priority", wordId);
}

public int numDocuments(int wordId) {
if (wordId < 0)
return 0;

long offset = words.get(wordId);

if (offset < 0)
return 0;

return createReaderNew(offset).numEntries();
}

}
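Editor's note: the removed reader degrades gracefully rather than failing when the on-disk index is absent; priorityDocuments() hands back an EmptyEntrySource and also() a QueryFilterNoPass. A hypothetical caller sketch, using only calls that appear elsewhere in this diff:

    static long[] readPriorityPostings(ReverseIndexPriorityReader reader, int wordId) {
        var es = reader.priorityDocuments(wordId);  // EmptyEntrySource when the index is missing
        if (!es.hasMore())
            return new long[0];

        var buffer = new LongQueryBuffer(512);
        es.read(buffer);
        return buffer.copyData();
    }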
@ -4,7 +4,7 @@ import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.filter.QueryFilterStepIf;

public record ReverseIndexRetainFilter(BTreeReader range, String name, int wordId) implements QueryFilterStepIf {
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {

@Override
public void apply(LongQueryBuffer buffer) {
@ -0,0 +1,109 @@
package nu.marginalia.index;

import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.index.construction.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.*;

class ReverseIndexReaderTest {
TestJournalFactory journalFactory;
Path tempDir;
SortingContext sortingContext;

@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();

tempDir = Files.createTempDirectory("sort");
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
}

@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();

List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}

@Test
public void testSimple() throws IOException {

var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51))
);

assertEquals(1, indexReader.numDocuments(50));

long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
assertArrayEquals(new long[] { 51 }, meta);
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
}

@Test
public void test2x2() throws IOException {

var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
);

assertEquals(1, indexReader.numDocuments(50));
assertEquals(2, indexReader.numDocuments(51));
assertEquals(1, indexReader.numDocuments(52));

assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));

assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));

assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));

}

private long[] readEntries(ReverseIndexReader reader, long wordId) {
var es = reader.documents(wordId);
assertTrue(es.hasMore());
LongQueryBuffer buffer = new LongQueryBuffer(4);
es.read(buffer);
assertFalse(es.hasMore());
return buffer.copyData();
}

private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);

Path docsFile = tempDir.resolve("docs.dat");
Path wordsFile = tempDir.resolve("words.dat");

preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();

return new ReverseIndexReader(wordsFile, docsFile);
}
}
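Editor's note: the createIndex() helper above doubles as a summary of the new merge-based construction path: build a ReversePreindex from a journal, finalize it into docs/words files, discard the preindex, then open the result with ReverseIndexReader. The same three steps, condensed, with placeholder paths and an unspecified journal source (sketch only, not part of the commit):

    IndexJournalReader journal = /* journal reader for the input data */ null;
    Path workDir = Files.createTempDirectory("index-construction");

    ReversePreindex preindex = ReversePreindex.constructPreindex(journal, DocIdRewriter.identity(), workDir);
    preindex.finalizeIndex(workDir.resolve("docs.dat"), workDir.resolve("words.dat"));
    preindex.delete(); // the preindex is only an intermediate artifact

    var reader = new ReverseIndexReader(workDir.resolve("words.dat"), workDir.resolve("docs.dat"));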
@ -0,0 +1,171 @@
package nu.marginalia.index.construction;

import nu.marginalia.array.algo.SortingContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
import static org.junit.jupiter.api.Assertions.assertEquals;

class ReversePreindexDocsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;

TestJournalFactory journalFactory;

@BeforeEach
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();

countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
}

@AfterEach
public void tearDown() throws IOException {
journalFactory.clear();

Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
Files.delete(tempFile);
}
Files.delete(tempDir);
}

@Test
public void testDocs() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);

List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
);

List<TestSegmentData> actual = new ArrayList<>();

var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}

assertEquals(expected, actual);
}

@Test
public void testDocsRepeatedWord() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 4, 4)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);

List<TestSegmentData> expected = List.of(
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
);

List<TestSegmentData> actual = new ArrayList<>();

var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}

assertEquals(expected, actual);
}

@Test
public void testDocs2() throws IOException {
var reader = journalFactory.createReader(
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);

List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
new TestSegmentData(10, 4, 6, new long[] { -0xF00BA3L, 0}),
new TestSegmentData(15, 6, 8, new long[] { 0xF00BA4L, 0}),
new TestSegmentData(30, 8, 10, new long[] { 0xF00BA4L, 0}),
new TestSegmentData(33, 10, 14, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0}),
new TestSegmentData(40, 14, 16, new long[] { -0xF00BA3L, 0})
);

List<TestSegmentData> actual = new ArrayList<>();

var iter = segments.iterator(2);
while (iter.next()) {
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
data));
}

assertEquals(expected, actual);
}

record TestSegmentData(long wordId, long start, long end, long[] data) {
public TestSegmentData(long wordId, long start, long end) {
this(wordId, start, end, null);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;

TestSegmentData that = (TestSegmentData) o;

if (wordId != that.wordId) return false;
if (start != that.start) return false;
if (end != that.end) return false;
return Arrays.equals(data, that.data);
}

@Override
public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32));
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);
return result;
}

@Override
public String toString() {
return "TestSegmentData{" +
"wordId=" + wordId +
", start=" + start +
", end=" + end +
", data=" + Arrays.toString(data) +
'}';
}
}
}
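Editor's note: TestSegmentData supplies its own equals(), hashCode() and toString() because Java records compare array components by reference, so two segments with identical long[] contents would otherwise fail assertEquals. A small runnable demonstration with a hypothetical record (not part of the commit):

    public class RecordArrayEqualityDemo {
        record Holder(long[] data) {}

        public static void main(String[] args) {
            var a = new Holder(new long[] {1, 2});
            var b = new Holder(new long[] {1, 2});
            // Prints false: the generated equals() compares the arrays by reference,
            // which is why TestSegmentData uses Arrays.equals() instead.
            System.out.println(a.equals(b));
        }
    }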