2023-09-22 11:14:58 +00:00
|
|
|
package nu.marginalia.loading;
|
|
|
|
|
|
|
|
import nu.marginalia.io.processed.ProcessedDataFileNames;
|
2024-07-27 09:44:13 +00:00
|
|
|
import nu.marginalia.model.processed.SlopDocumentRecord;
|
|
|
|
import nu.marginalia.model.processed.SlopDomainLinkRecord;
|
|
|
|
import nu.marginalia.model.processed.SlopDomainRecord;
|
2024-08-21 08:13:49 +00:00
|
|
|
import nu.marginalia.slop.SlopTable;
|
2023-09-22 11:14:58 +00:00
|
|
|
import nu.marginalia.worklog.BatchingWorkLogInspector;
|
2024-01-19 12:59:03 +00:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
2023-09-22 11:14:58 +00:00
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
|
|
|
|
public class LoaderInputData {
|
|
|
|
private final List<Path> sourceDirectories;
|
2024-01-19 12:59:03 +00:00
|
|
|
private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
|
2023-09-22 11:14:58 +00:00
|
|
|
private final Map<Path, Integer> lastGoodBatch = new HashMap<>();
|
|
|
|
|
|
|
|
public LoaderInputData(List<Path> sourceDirectories) throws IOException {
|
|
|
|
this.sourceDirectories = sourceDirectories;
|
|
|
|
|
|
|
|
for (var source : sourceDirectories) {
|
2024-01-19 12:59:03 +00:00
|
|
|
int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));
|
|
|
|
|
|
|
|
this.lastGoodBatch.put(source, lastGoodBatch);
|
|
|
|
|
|
|
|
if (lastGoodBatch == 0) {
|
|
|
|
// This is useful diagnostic information, so we log it as a warning
|
|
|
|
logger.warn("No valid batches found in {}", source);
|
|
|
|
}
|
2023-09-22 11:14:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/** This constructor is primarily intended for testing. It still works and is good though,
|
|
|
|
* but it skips consulting processor.log for lastGoodBatch
|
|
|
|
*/
|
|
|
|
public LoaderInputData(Path singleSource, int lastBatch) throws IOException {
|
|
|
|
sourceDirectories = List.of(singleSource);
|
|
|
|
lastGoodBatch.put(singleSource, lastBatch);
|
|
|
|
}
|
|
|
|
|
2024-08-21 08:13:49 +00:00
|
|
|
public Collection<SlopTable.Ref<SlopDomainRecord>> listDomainPages() {
|
|
|
|
List<SlopTable.Ref<SlopDomainRecord>> pathsAll = new ArrayList<>();
|
|
|
|
|
2023-09-22 11:14:58 +00:00
|
|
|
for (var source : sourceDirectories) {
|
2024-07-27 09:44:13 +00:00
|
|
|
for (int i = 0; i < lastGoodBatch.get(source); i++) {
|
2024-08-21 08:13:49 +00:00
|
|
|
pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainFileName(source), i));
|
2024-07-27 09:44:13 +00:00
|
|
|
}
|
2023-09-22 11:14:58 +00:00
|
|
|
}
|
|
|
|
return pathsAll;
|
|
|
|
}
|
|
|
|
|
2024-08-21 08:13:49 +00:00
|
|
|
public Collection<SlopTable.Ref<SlopDomainLinkRecord>> listDomainLinkPages() {
|
|
|
|
List<SlopTable.Ref<SlopDomainLinkRecord>> pathsAll = new ArrayList<>();
|
|
|
|
|
2023-09-22 11:14:58 +00:00
|
|
|
for (var source : sourceDirectories) {
|
2024-07-27 09:44:13 +00:00
|
|
|
for (int i = 0; i < lastGoodBatch.get(source); i++) {
|
2024-08-21 08:13:49 +00:00
|
|
|
pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainLinkFileName(source), i));
|
2024-07-27 09:44:13 +00:00
|
|
|
}
|
2023-09-22 11:14:58 +00:00
|
|
|
}
|
|
|
|
return pathsAll;
|
|
|
|
}
|
|
|
|
|
2024-08-21 08:13:49 +00:00
|
|
|
public Collection<SlopTable.Ref<SlopDocumentRecord>> listDocumentFiles() {
|
|
|
|
List<SlopTable.Ref<SlopDocumentRecord>> pathsAll = new ArrayList<>();
|
|
|
|
|
2023-09-22 11:14:58 +00:00
|
|
|
for (var source : sourceDirectories) {
|
2024-07-27 09:44:13 +00:00
|
|
|
for (int i = 0; i < lastGoodBatch.get(source); i++) {
|
2024-08-21 08:13:49 +00:00
|
|
|
pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.documentFileName(source), i));
|
2024-07-27 09:44:13 +00:00
|
|
|
}
|
2023-09-22 11:14:58 +00:00
|
|
|
}
|
|
|
|
return pathsAll;
|
|
|
|
}
|
|
|
|
}
|