mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(work-log) New batching work log
This commit is contained in:
parent
a00cabe223
commit
a52d78c8ee
33
code/process-models/work-log/build.gradle
Normal file
33
code/process-models/work-log/build.gradle
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
id "io.freefair.lombok" version "8.2.2"
|
||||||
|
|
||||||
|
id 'jvm-test-suite'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(20))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dependencies {
|
||||||
|
implementation libs.lombok
|
||||||
|
annotationProcessor libs.lombok
|
||||||
|
implementation libs.bundles.slf4j
|
||||||
|
|
||||||
|
implementation libs.notnull
|
||||||
|
|
||||||
|
testImplementation libs.bundles.slf4j.test
|
||||||
|
testImplementation libs.bundles.junit
|
||||||
|
testImplementation libs.mockito
|
||||||
|
}
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
||||||
|
|
||||||
|
task fastTests(type: Test) {
|
||||||
|
useJUnitPlatform {
|
||||||
|
excludeTags "slow"
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
package nu.marginalia.worklog;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/** The BatchingWorkLog is a work log for items of work performed in batches,
|
||||||
|
* where each batch needs to be finalized before the items it consists of can be
|
||||||
|
* considered done. This is needed when the data is serialized into a format such
|
||||||
|
* as Parquet, where disparate items go into the same file, and the writer needs to be
|
||||||
|
* properly closed before the file can be read.
|
||||||
|
*/
|
||||||
|
public interface BatchingWorkLog extends AutoCloseable {
|
||||||
|
|
||||||
|
/** Returns true if logItem(id) has been run,
|
||||||
|
* and logFinishedBatch has been run after that.
|
||||||
|
*/
|
||||||
|
boolean isItemCommitted(String id);
|
||||||
|
|
||||||
|
/** Returns true if logItem(id) has been run
|
||||||
|
* but not logFinishedBatch().
|
||||||
|
* <p/>
|
||||||
|
* Unlike isItemCommitted(), this state is ephemeral and not
|
||||||
|
* retained if e.g. the process crashes and resumes.
|
||||||
|
* */
|
||||||
|
boolean isItemInCurrentBatch(String id);
|
||||||
|
|
||||||
|
/** Log additional item to the current batch */
|
||||||
|
void logItem(String id) throws IOException;
|
||||||
|
|
||||||
|
/** Mark the current batch as finished and increment
|
||||||
|
* the batch number counter
|
||||||
|
*/
|
||||||
|
void logFinishedBatch() throws IOException;
|
||||||
|
|
||||||
|
int getBatchNumber();
|
||||||
|
}
|
@ -0,0 +1,221 @@
|
|||||||
|
package nu.marginalia.worklog;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
public class BatchingWorkLogImpl implements BatchingWorkLog {
|
||||||
|
private int batchNumber = 0;
|
||||||
|
private final Set<String> currentBatchItems = new HashSet<>(1000);
|
||||||
|
private final Set<String> commitedItems = new HashSet<>(10_000);
|
||||||
|
private final OutputStream writer;
|
||||||
|
|
||||||
|
public BatchingWorkLogImpl(Path file) throws IOException {
|
||||||
|
if (Files.exists(file)) {
|
||||||
|
try (var linesStream = Files.lines(file)) {
|
||||||
|
linesStream.map(WorkLogItem::parse).forEach(
|
||||||
|
item -> item.replay(this)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
writer = Files.newOutputStream(file, StandardOpenOption.APPEND);
|
||||||
|
writeLogEntry(new CommentLine("Log resumed on " + LocalDateTime.now()));
|
||||||
|
if (getCurrentBatchSize() > 0) {
|
||||||
|
writeLogEntry(new CrashMarker());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
writer = Files.newOutputStream(file, StandardOpenOption.CREATE_NEW);
|
||||||
|
writeLogEntry(new CommentLine("Log created on " + LocalDateTime.now()));
|
||||||
|
writeLogEntry(new CommentLine(" Format: "));
|
||||||
|
writeLogEntry(new CommentLine(" " + AddItem.MARKER + " ID\tsignifies adding an item to the current batch"));
|
||||||
|
writeLogEntry(new CommentLine(" " + FinishBatch.MARKER + "\tsignifies finalizing the current batch and switching to the next"));
|
||||||
|
writeLogEntry(new CommentLine(" " + CrashMarker.MARKER + "\tdiscard contents from the current batch and start over, written after a crash"));
|
||||||
|
writeLogEntry(new CommentLine("Upon a crash, items that have re-process until their batch is finalized"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void writeLogEntry(WorkLogItem item) throws IOException {
|
||||||
|
item.write(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void writeLine(String line) throws IOException {
|
||||||
|
writer.write(line.getBytes(StandardCharsets.UTF_8));
|
||||||
|
writer.write('\n');
|
||||||
|
writer.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isItemCommitted(String id) {
|
||||||
|
return commitedItems.contains(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isItemInCurrentBatch(String id) {
|
||||||
|
return currentBatchItems.contains(id);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public void logItem(String id) throws IOException {
|
||||||
|
writeLogEntry(new AddItem(id));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void logFinishedBatch() throws IOException {
|
||||||
|
writeLogEntry(new FinishBatch());
|
||||||
|
incrementBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
void incrementBatch() {
|
||||||
|
batchNumber++;
|
||||||
|
|
||||||
|
// Transfer all items from the current batch to the committed items' batch
|
||||||
|
commitedItems.addAll(currentBatchItems);
|
||||||
|
currentBatchItems.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void restartBatch() {
|
||||||
|
currentBatchItems.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void addItemToCurrentBatch(String id) {
|
||||||
|
currentBatchItems.add(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
writer.flush();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getBatchNumber() {
|
||||||
|
return batchNumber;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getCurrentBatchSize() {
|
||||||
|
return currentBatchItems.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
interface WorkLogItem {
|
||||||
|
|
||||||
|
void replay(BatchingWorkLogImpl bwl);
|
||||||
|
void write(BatchingWorkLogImpl bwl) throws IOException;
|
||||||
|
|
||||||
|
static WorkLogItem parse(String line) {
|
||||||
|
if (line.isBlank())
|
||||||
|
return new BlankLine();
|
||||||
|
|
||||||
|
var lineParts = LogLineParts.parse(line);
|
||||||
|
|
||||||
|
return switch (lineParts.tag()) {
|
||||||
|
case CommentLine.MARKER -> new CommentLine(lineParts.arg());
|
||||||
|
case AddItem.MARKER -> new AddItem(lineParts.arg());
|
||||||
|
case FinishBatch.MARKER -> new FinishBatch();
|
||||||
|
case CrashMarker.MARKER -> new CrashMarker();
|
||||||
|
default -> throw new WorkLogParseException(line);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
record LogLineParts(char tag, String arg) {
|
||||||
|
public static LogLineParts parse(String line) {
|
||||||
|
line = line.trim();
|
||||||
|
|
||||||
|
char tag = line.charAt(0);
|
||||||
|
String arg = line.substring(1).trim();
|
||||||
|
|
||||||
|
int commentIdx = arg.indexOf('#');
|
||||||
|
if (commentIdx >= 0) arg = arg.substring(0, commentIdx).trim();
|
||||||
|
|
||||||
|
return new LogLineParts(tag, arg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
record CommentLine(String comment) implements WorkLogItem {
|
||||||
|
final static char MARKER = '#';
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void replay(BatchingWorkLogImpl bwl) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(BatchingWorkLogImpl bwl) throws IOException {
|
||||||
|
bwl.writeLine(MARKER + " " + comment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record BlankLine() implements WorkLogItem {
|
||||||
|
final static char MARKER = ' ';
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void replay(BatchingWorkLogImpl bwl) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(BatchingWorkLogImpl bwl) throws IOException {
|
||||||
|
bwl.writeLine(MARKER + "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
record FinishBatch() implements WorkLogItem {
|
||||||
|
final static char MARKER = 'F';
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void replay(BatchingWorkLogImpl bwl) {
|
||||||
|
bwl.incrementBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(BatchingWorkLogImpl bwl) throws IOException {
|
||||||
|
bwl.writeLine("# " + LocalDateTime.now());
|
||||||
|
bwl.writeLine("# finalizing batchNumber = " + bwl.getBatchNumber());
|
||||||
|
bwl.writeLine(Character.toString(MARKER));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
record CrashMarker() implements WorkLogItem {
|
||||||
|
final static char MARKER = 'X';
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void replay(BatchingWorkLogImpl bwl) {
|
||||||
|
bwl.restartBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(BatchingWorkLogImpl bwl) throws IOException {
|
||||||
|
bwl.writeLine("# " + LocalDateTime.now());
|
||||||
|
bwl.writeLine("# discarding batchNumber = " + bwl.getBatchNumber());
|
||||||
|
bwl.writeLine(Character.toString(MARKER));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
record AddItem(String id) implements WorkLogItem {
|
||||||
|
final static char MARKER = '+';
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void replay(BatchingWorkLogImpl bwl) {
|
||||||
|
bwl.addItemToCurrentBatch(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(BatchingWorkLogImpl bwl) throws IOException {
|
||||||
|
bwl.writeLine(MARKER + " " + id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class WorkLogParseException extends RuntimeException {
|
||||||
|
@Serial
|
||||||
|
private static final long serialVersionUID = -1238138989389021166L;
|
||||||
|
|
||||||
|
public WorkLogParseException(String logLine) {
|
||||||
|
super("Failed to parse work log line: '" + logLine + "'");
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,63 @@
|
|||||||
|
package nu.marginalia.worklog;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class BatchingWorkLogImplTest {
|
||||||
|
Path fileName;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
fileName = Files.createTempFile(getClass().getSimpleName(), ".test");
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
Files.deleteIfExists(fileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testResumeOnEmptyFile() throws IOException {
|
||||||
|
Files.delete(fileName);
|
||||||
|
|
||||||
|
try (var wl = new BatchingWorkLogImpl(fileName)) {
|
||||||
|
wl.logItem("1");
|
||||||
|
wl.logItem("2");
|
||||||
|
wl.logItem("3");
|
||||||
|
wl.logFinishedBatch();
|
||||||
|
wl.logItem("4");
|
||||||
|
wl.logItem("5");
|
||||||
|
wl.logFinishedBatch();
|
||||||
|
wl.logItem("6");
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var wl = new BatchingWorkLogImpl(fileName)) {
|
||||||
|
assertTrue(wl.isItemCommitted("1"));
|
||||||
|
assertTrue(wl.isItemCommitted("2"));
|
||||||
|
assertTrue(wl.isItemCommitted("3"));
|
||||||
|
assertTrue(wl.isItemCommitted("4"));
|
||||||
|
assertTrue(wl.isItemCommitted("5"));
|
||||||
|
assertFalse(wl.isItemCommitted("6"));
|
||||||
|
wl.logItem("7");
|
||||||
|
wl.logFinishedBatch();
|
||||||
|
}
|
||||||
|
try (var wl = new BatchingWorkLogImpl(fileName)) {
|
||||||
|
assertTrue(wl.isItemCommitted("1"));
|
||||||
|
assertTrue(wl.isItemCommitted("2"));
|
||||||
|
assertTrue(wl.isItemCommitted("3"));
|
||||||
|
assertTrue(wl.isItemCommitted("4"));
|
||||||
|
assertTrue(wl.isItemCommitted("5"));
|
||||||
|
assertFalse(wl.isItemCommitted("6"));
|
||||||
|
assertTrue(wl.isItemCommitted("7"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Files.readAllLines(fileName).forEach(System.out::println);
|
||||||
|
}
|
||||||
|
}
|
@ -65,6 +65,7 @@ include 'code:processes:test-data'
|
|||||||
|
|
||||||
include 'code:process-models:converting-model'
|
include 'code:process-models:converting-model'
|
||||||
include 'code:process-models:crawling-model'
|
include 'code:process-models:crawling-model'
|
||||||
|
include 'code:process-models:work-log'
|
||||||
|
|
||||||
include 'code:tools:term-frequency-extractor'
|
include 'code:tools:term-frequency-extractor'
|
||||||
include 'code:tools:crawl-job-extractor'
|
include 'code:tools:crawl-job-extractor'
|
||||||
|
Loading…
Reference in New Issue
Block a user