Merge branch 'master' into converter-optimizations

Commit 5ce46a61d4
code/features-search/feedlot-client/build.gradle (new file, +22)
@ -0,0 +1,22 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.gson

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
code/features-search/feedlot-client/readme.md (new file, +20)
@ -0,0 +1,20 @@
Client for [FeedlotTheFeedBot](https://github.com/MarginaliaSearch/FeedLotTheFeedBot),
the RSS/Atom feed fetcher and cache for Marginalia Search.

This service is external to the Marginalia Search codebase,
as it is not a core part of the search engine and has other
utilities.

## Example

```java
import java.time.Duration;

var client = new FeedlotClient("localhost", 8080,
                gson,
                Duration.ofMillis(100), // connect timeout
                Duration.ofMillis(100)); // request timeout

CompletableFuture<FeedItems> items = client.getFeedItems("www.marginalia.nu");
```
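A possible way to consume the returned future, sketched here for illustration rather than taken from the commit; it assumes the `client` and the model types from the example above, and treats feed data as optional so a missing feedlot service does not fail the caller:

```java
import java.util.concurrent.ExecutionException;

import nu.marginalia.feedlot.model.FeedItem;
import nu.marginalia.feedlot.model.FeedItems;

try {
    // Block for the result; a caller could equally keep composing on the future.
    FeedItems items = client.getFeedItems("www.marginalia.nu").get();
    for (FeedItem item : items.items()) {
        System.out.println(item.pubDay() + " " + item.title());
    }
} catch (InterruptedException | ExecutionException e) {
    // Feed data is optional; log and render the page without it.
} finally {
    client.stop(); // shuts down the client's HttpClient once no more requests are expected
}
```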
FeedlotClient.java (new file, +58)
@ -0,0 +1,58 @@
package nu.marginalia.feedlot;

import com.google.gson.Gson;
import nu.marginalia.feedlot.model.FeedItems;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.concurrent.Executors;
import java.util.concurrent.CompletableFuture;

public class FeedlotClient {
    private final String feedlotHost;
    private final int feedlotPort;
    private final Gson gson;
    private final HttpClient httpClient;
    private final Duration requestTimeout;

    public FeedlotClient(String feedlotHost,
                         int feedlotPort,
                         Gson gson,
                         Duration connectTimeout,
                         Duration requestTimeout
                         )
    {
        this.feedlotHost = feedlotHost;
        this.feedlotPort = feedlotPort;
        this.gson = gson;

        httpClient = HttpClient.newBuilder()
                .executor(Executors.newVirtualThreadPerTaskExecutor())
                .connectTimeout(connectTimeout)
                .build();
        this.requestTimeout = requestTimeout;
    }

    public CompletableFuture<FeedItems> getFeedItems(String domainName) {
        return httpClient.sendAsync(
                HttpRequest.newBuilder()
                        .uri(URI.create("http://%s:%d/feed/%s".formatted(feedlotHost, feedlotPort, domainName)))
                        .GET()
                        .timeout(requestTimeout)
                        .build(),
                HttpResponse.BodyHandlers.ofString()
        ).thenApply(HttpResponse::body)
         .thenApply(this::parseFeedItems);
    }

    private FeedItems parseFeedItems(String s) {
        return gson.fromJson(s, FeedItems.class);
    }

    public void stop() {
        httpClient.close();
    }
}
FeedItem.java (new file, +17)
@ -0,0 +1,17 @@
package nu.marginalia.feedlot.model;

public record FeedItem(String title, String date, String description, String url) {

    public String pubDay() { // Extract the date from an ISO style date string
        if (date.length() > 10) {
            return date.substring(0, 10);
        }
        return date;
    }

    public String descriptionSafe() {
        return description
                .replace("<", "&lt;")
                .replace(">", "&gt;");
    }
}
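For orientation, a small illustration (not part of the diff) of what the two helpers above produce, using made-up sample values:

```java
// Hypothetical sample values; FeedItem is the record defined above.
var item = new FeedItem(
        "Debian Laptop Install Log",                 // title (made up)
        "2023-12-27T12:00:00Z",                      // ISO-style timestamp
        "A <i>short</i> description",                // raw description containing markup
        "https://www.marginalia.nu/misc/debian-laptop-install-log/");

item.pubDay();          // "2023-12-27"  -- only the leading date portion is kept
item.descriptionSafe(); // "A &lt;i&gt;short&lt;/i&gt; description" -- angle brackets escaped
```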
FeedItems.java (new file, +6)
@ -0,0 +1,6 @@
package nu.marginalia.feedlot.model;

import java.util.List;

public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
}
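A sketch of the JSON document the client deserializes into these records through Gson's default field-name mapping; the field names follow from the record components above, while the concrete values and the exact payload produced by FeedlotTheFeedBot are assumptions:

```java
// Hypothetical response payload, parsed the same way FeedlotClient.parseFeedItems does
// (assumes a Gson version recent enough to construct Java records).
String json = """
    {
      "domain":  "www.marginalia.nu",
      "feedUrl": "https://www.marginalia.nu/log/index.xml",
      "updated": "2023-12-27T12:00:00Z",
      "items": [
        { "title": "...", "date": "2023-12-27T12:00:00Z",
          "description": "...", "url": "https://www.marginalia.nu/log/a-post/" }
      ]
    }
    """;

FeedItems items = new com.google.gson.Gson().fromJson(json, FeedItems.class);
```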
CrawlerWarcResynchronizer.java
@ -40,8 +40,8 @@ public class CrawlerWarcResynchronizer {
             for (var item : reader) {
                 accept(item);
             }
-        } catch (IOException e) {
-            logger.info(STR."Failed read full warc file \{tempFile}", e);
+        } catch (Exception e) {
+            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
         }

         // Second pass, copy records to the new warc file
@ -49,8 +49,8 @@ public class CrawlerWarcResynchronizer {
             for (var item : reader) {
                 recorder.resync(item);
             }
-        } catch (IOException e) {
-            logger.info(STR."Failed read full warc file \{tempFile}", e);
+        } catch (Exception e) {
+            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
         }
     }
WarcRecorder.java
@ -37,7 +37,7 @@ public class WarcRecorder implements AutoCloseable {
     private final Path warcFile;
     private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class);

-    private final ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);
+    private final static ThreadLocal<byte[]> bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]);

     private boolean temporaryFile = false;
CrawlerRetreiverTest.java
@ -4,23 +4,24 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.DomainProber;
+import nu.marginalia.crawl.retreival.*;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
-import nu.marginalia.io.crawlspec.CrawlSpecRecordParquetFileWriter;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;

 import java.io.IOException;
+import java.io.RandomAccessFile;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
@ -37,6 +38,7 @@ class CrawlerRetreiverTest {
     Path tempFileParquet1;
     Path tempFileWarc2;
     Path tempFileParquet2;
+    Path tempFileWarc3;
     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
@ -66,7 +68,11 @@ class CrawlerRetreiverTest {
         if (tempFileParquet2 != null) {
             Files.deleteIfExists(tempFileParquet2);
         }
+        if (tempFileWarc3 != null) {
+            Files.deleteIfExists(tempFileWarc3);
+        }
     }

     @Test
     public void testWarcOutput() throws IOException {
         var specs = CrawlSpecRecord
@ -79,11 +85,7 @@ class CrawlerRetreiverTest {
         try {
             tempFile = Files.createTempFile("crawling-process", "warc");

-            try (var recorder = new WarcRecorder(tempFile)) {
-                new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-            } catch (IOException ex) {
-                Assertions.fail(ex);
-            }
+            doCrawl(tempFile, specs);

             Set<String> requests = new HashSet<>();
             Set<String> responses = new HashSet<>();
@ -112,6 +114,57 @@ class CrawlerRetreiverTest {
             Files.deleteIfExists(tempFile);
         }
     }

+    @SneakyThrows
+    @Test
+    public void testResync() throws IOException {
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
+                .build();
+        tempFileWarc1 = Files.createTempFile("crawling-process", "warc");
+        tempFileWarc2 = Files.createTempFile("crawling-process", "warc");
+
+        doCrawl(tempFileWarc1, specs);
+
+        Set<String> requests = new HashSet<>();
+        Set<String> responses = new HashSet<>();
+
+        var revisitCrawlFrontier = new DomainCrawlFrontier(
+                new EdgeDomain("www.marginalia.nu"),
+                List.of(), 100);
+        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
+                new WarcRecorder(tempFileWarc2)
+        );
+
+        // truncate the size of the file to simulate a crash
+        simulatePartialWrite(tempFileWarc1);
+
+        resync.run(tempFileWarc1);
+        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/misc/debian-laptop-install-log/")));
+
+        try (var reader = new WarcReader(tempFileWarc2)) {
+            reader.forEach(record -> {
+                if (record instanceof WarcRequest req) {
+                    requests.add(req.target());
+                    System.out.println(req.type() + ":" + req.target());
+                }
+                else if (record instanceof WarcResponse rsp) {
+                    responses.add(rsp.target());
+                    System.out.println(rsp.type() + ":" + rsp.target());
+                }
+                else {
+                    System.out.println(record.type());
+                }
+            });
+        }
+
+        assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
+        assertEquals(requests, responses);
+    }
+
     @Test
     public void testWithKnownDomains() throws IOException {
         var specs = CrawlSpecRecord
@ -125,15 +178,9 @@ class CrawlerRetreiverTest {

         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");

-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
+        doCrawl(tempFileWarc1, specs);
+        convertToParquet(tempFileWarc1, tempFileParquet1);

         try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
@ -177,16 +224,8 @@ class CrawlerRetreiverTest {

         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");

-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
-
+        doCrawl(tempFileWarc1, specs);
+        convertToParquet(tempFileWarc1, tempFileParquet1);

         try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
             while (stream.hasNext()) {
@ -232,46 +271,11 @@ class CrawlerRetreiverTest {
         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
         tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");

-        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
-
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc1, tempFileParquet1);
-
-        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
-            while (stream.hasNext()) {
-                var doc = stream.next();
-                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
-            }
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-
-        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
-
-        System.out.println("---");
-
-        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
-        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
-        try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
-                    new CrawlDataReference(stream));
-        }
-        catch (IOException ex) {
-            Assertions.fail(ex);
-        }
-
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test"), tempFileWarc2, tempFileParquet2);
-
+        doCrawl(tempFileWarc1, specs);
+        doCrawlWithReferenceStream(specs,
+                CrawledDomainReader.createDataStream(tempFileParquet1)
+        );
+        convertToParquet(tempFileWarc2, tempFileParquet2);

         try (var reader = new WarcReader(tempFileWarc2)) {
             WarcXResponseReference.register(reader);
@ -304,4 +308,120 @@ class CrawlerRetreiverTest {
             throw new RuntimeException(e);
         }
     }

+    private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
+        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
+                new UserAgent("test"), tempFileWarc2, tempFileParquet2);
+    }
+
+    @SneakyThrows
+    @Test
+    public void testRecrawlWithResync() throws IOException {
+
+        var specs = CrawlSpecRecord
+                .builder()
+                .crawlDepth(12)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/some-dead-link"))
+                .build();
+
+        tempFileWarc1 = Files.createTempFile("crawling-process", ".warc.gz");
+        tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
+        tempFileWarc3 = Files.createTempFile("crawling-process", ".warc.gz");
+
+        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
+
+        doCrawl(tempFileWarc1, specs);
+
+        convertToParquet(tempFileWarc1, tempFileParquet1);
+
+        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
+            while (stream.hasNext()) {
+                var doc = stream.next();
+                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);
+
+        System.out.println("---");
+
+        doCrawlWithReferenceStream(specs, stream);
+
+        var revisitCrawlFrontier = new DomainCrawlFrontier(
+                new EdgeDomain("www.marginalia.nu"),
+                List.of(), 100);
+
+        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
+                new WarcRecorder(tempFileWarc3)
+        );
+
+        // truncate the size of the file to simulate a crash
+        simulatePartialWrite(tempFileWarc2);
+
+        resync.run(tempFileWarc2);
+
+        assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
+        convertToParquet(tempFileWarc3, tempFileParquet2);
+
+        try (var reader = new WarcReader(tempFileWarc3)) {
+            WarcXResponseReference.register(reader);
+
+            reader.forEach(record -> {
+                if (record instanceof WarcResponse rsp) {
+                    try {
+                        System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status());
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+                if (record instanceof WarcMetadata rsp) {
+                    System.out.println("meta:" + rsp.target());
+                }
+            });
+        }
+
+        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
+            while (ds.hasNext()) {
+                var doc = ds.next();
+                if (doc instanceof CrawledDomain dr) {
+                    System.out.println(dr.domain + "/" + dr.crawlerStatus);
+                }
+                else if (doc instanceof CrawledDocument dc) {
+                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void simulatePartialWrite(Path tempFileWarc2) throws IOException {
+        try (var raf = new RandomAccessFile(tempFileWarc2.toFile(), "rw")) {
+            raf.setLength(raf.length() - 10);
+        }
+    }
+
+    private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+        try (var recorder = new WarcRecorder(tempFileWarc2)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+                    new CrawlDataReference(stream));
+        }
+        catch (IOException ex) {
+            Assertions.fail(ex);
+        }
+    }
+
+    private void doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
+        try (var recorder = new WarcRecorder(tempFileWarc1)) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
+        } catch (IOException ex) {
+            Assertions.fail(ex);
+        }
+    }
 }
build.gradle
@ -47,6 +47,7 @@ dependencies {

     implementation project(':code:features-search:screenshots')
     implementation project(':code:features-search:random-websites')
+    implementation project(':code:features-search:feedlot-client')

     implementation libs.bundles.slf4j
SearchModule.java
@ -1,10 +1,15 @@
 package nu.marginalia.search;

 import com.google.inject.AbstractModule;
+import com.google.inject.Provides;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WebsiteUrl;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.renderer.config.HandlebarsConfigurator;
+import nu.marginalia.feedlot.FeedlotClient;
+
+import java.time.Duration;

 public class SearchModule extends AbstractModule {

@ -17,4 +22,14 @@ public class SearchModule extends AbstractModule {
                 System.getProperty("website-url", "https://search.marginalia.nu/")));
     }

+    @Provides
+    public FeedlotClient provideFeedlotClient() {
+        return new FeedlotClient(
+                System.getProperty("ext-svc-feedlot-host", "feedlot"),
+                Integer.getInteger("ext-svc-feedlot-port", 80),
+                GsonFactory.get(),
+                Duration.ofMillis(250),
+                Duration.ofMillis(100)
+        );
+    }
 }
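Since the host and port above are read from JVM system properties, a test or local run could plausibly repoint the client before the injector builds the module. This is an illustrative aside rather than part of the commit; the property names are the ones read in provideFeedlotClient(), while the values are made up:

```java
// Hypothetical override for a local run; the defaults are "feedlot" and port 80.
System.setProperty("ext-svc-feedlot-host", "localhost");
System.setProperty("ext-svc-feedlot-port", "8080");
```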
SearchSiteInfoService.java
@ -5,13 +5,17 @@ import nu.marginalia.assistant.client.AssistantClient;
 import nu.marginalia.assistant.client.model.SimilarDomain;
 import nu.marginalia.client.Context;
 import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.feedlot.model.FeedItems;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.renderer.MustacheRenderer;
 import nu.marginalia.renderer.RendererFactory;
 import nu.marginalia.search.SearchOperator;
 import nu.marginalia.assistant.client.model.DomainInformation;
+import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import spark.Request;
 import spark.Response;

@ -21,19 +25,23 @@ import java.util.List;
 import java.util.Map;

 public class SearchSiteInfoService {
+    private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
+
     private final SearchOperator searchOperator;
     private final AssistantClient assistantClient;
     private final SearchFlagSiteService flagSiteService;
     private final DbDomainQueries domainQueries;
     private final MustacheRenderer<Object> renderer;
+    private final FeedlotClient feedlotClient;

     @Inject
     public SearchSiteInfoService(SearchOperator searchOperator,
                                  AssistantClient assistantClient,
                                  RendererFactory rendererFactory,
                                  SearchFlagSiteService flagSiteService,
-                                 DbDomainQueries domainQueries) throws IOException {
+                                 DbDomainQueries domainQueries,
+                                 FeedlotClient feedlotClient) throws IOException
+    {
         this.searchOperator = searchOperator;
         this.assistantClient = assistantClient;
         this.flagSiteService = flagSiteService;
@ -41,6 +49,7 @@ public class SearchSiteInfoService {

         this.renderer = rendererFactory.renderer("search/site-info/site-info");

+        this.feedlotClient = feedlotClient;
     }

     public Object handle(Request request, Response response) throws SQLException {
@ -121,6 +130,7 @@ public class SearchSiteInfoService {
         final List<SimilarDomain> linkingDomains;
         String url = "https://" + domainName + "/";;

+        var feedItemsFuture = feedlotClient.getFeedItems(domainName);
         if (domainId < 0 || !assistantClient.isAccepting()) {
             domainInfo = createDummySiteInfo(domainName);
             similarSet = List.of();
@ -134,11 +144,18 @@ public class SearchSiteInfoService {
             linkingDomains = assistantClient
                     .linkedDomains(ctx, domainId, 100)
                     .blockingFirst();
         }

-        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 1);
+        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(ctx, domainName, 5);
         if (!sampleResults.isEmpty()) {
             url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
         }

+        FeedItems feedItems = null;
+        try {
+            feedItems = feedItemsFuture.get();
+        } catch (Exception e) {
+            logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage());
+        }

         return new SiteInfoWithContext(domainName,
@ -146,7 +163,9 @@ public class SearchSiteInfoService {
                 url,
                 domainInfo,
                 similarSet,
-                linkingDomains
+                linkingDomains,
+                feedItems,
+                sampleResults
         );
     }

@ -200,13 +219,18 @@ public class SearchSiteInfoService {
                                      String siteUrl,
                                      DomainInformation domainInformation,
                                      List<SimilarDomain> similar,
-                                     List<SimilarDomain> linking) {
+                                     List<SimilarDomain> linking,
+                                     FeedItems feed,
+                                     List<UrlDetails> samples
+    ) {
         public SiteInfoWithContext(String domain,
                                    long domainId,
                                    String siteUrl,
                                    DomainInformation domainInformation,
                                    List<SimilarDomain> similar,
-                                   List<SimilarDomain> linking
+                                   List<SimilarDomain> linking,
+                                   FeedItems feedInfo,
+                                   List<UrlDetails> samples
         )
         {
             this(Map.of("info", true),
@ -216,7 +240,9 @@ public class SearchSiteInfoService {
                     siteUrl,
                     domainInformation,
                     similar,
-                    linking);
+                    linking,
+                    feedInfo,
+                    samples);
         }

         public String getLayout() {
@ -224,6 +250,12 @@ public class SearchSiteInfoService {
             if (similar.size() < 25) {
                 return "lopsided";
             }
+            else if (!feed.items().isEmpty()) {
+                return "lopsided";
+            }
+            else if (!samples.isEmpty()) {
+                return "lopsided";
+            }
             else {
                 return "balanced";
             }
rss.svg (new file, 891 B)
@ -0,0 +1,17 @@
<?xml version="1.0"?>
<!-- CC0 -->
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
     viewBox="0 0 455.731 455.731" xml:space="preserve">
<g>
    <rect x="0" y="0" style="fill:#F78422;" width="455.731" height="455.731"/>
    <g>
        <path style="fill:#FFFFFF;" d="M296.208,159.16C234.445,97.397,152.266,63.382,64.81,63.382v64.348
            c70.268,0,136.288,27.321,185.898,76.931c49.609,49.61,76.931,115.63,76.931,185.898h64.348
            C391.986,303.103,357.971,220.923,296.208,159.16z"/>
        <path style="fill:#FFFFFF;" d="M64.143,172.273v64.348c84.881,0,153.938,69.056,153.938,153.939h64.348
            C282.429,270.196,184.507,172.273,64.143,172.273z"/>
        <circle style="fill:#FFFFFF;" cx="109.833" cy="346.26" r="46.088"/>
    </g>
</g>
</svg>
site-info-feed.hdb (new file, +20)
@ -0,0 +1,20 @@
{{#with feed}}
<h2><a title="Atom/RSS feed" target="external" href="{{feedUrl}}"><img width="16" height="16" src="/rss.svg"></a> Feed (Experimental)</h2>

<dl>
{{#each items}}
    <dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
    <dd><date>{{pubDay}}</date><br>{{{descriptionSafe}}}</dd>
{{/each}}
</dl>
{{/with}}

{{#unless feed}}{{#if samples}}
<h2>Sample</h2>
<dl>
{{#each samples}}
    <dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
    <dd>{{{description}}}</dd>
{{/each}}
</dl>
{{/if}}{{/unless}}
site-info.hdb
@ -12,11 +12,58 @@
     <img class="screenshot" width="300" height="225" src="/screenshot/{{domainId}}" alt="Screenshot of {{domain}}" />
     </a>
     {{#with domainInformation}}
+    {{> search/site-info/site-info-feed}}
     {{> search/site-info/site-info-index}}
     {{> search/site-info/site-info-links}}
     {{/with}}
 </div>

+{{#if linking}}
+<div id="similar-links">
+    <h2>Linking Domains</h2>
+
+    <table class="similarity-table">
+        <tr>
+            <th colspan="3">Meta</th>
+            <th>Rank</th>
+            <th>Domain</th>
+            <th>Similarity</th>
+        </tr>
+        {{#each linking}}
+            <tr>
+                <td>
+                    {{#if indexed}}
+                        {{#if active}}
+                            <span title="Indexed">👀</span>
+                        {{/if}}
+                        {{#unless active}}
+                            <span title="Problem">🔥</span>
+                        {{/unless}}
+                    {{/if}}
+                </td>
+                <td>
+                    {{#if screenshot}}📷{{/if}}
+                </td>
+                <td>
+                    {{#if linkType.isLinked}}
+                        <span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
+                    {{/if}}
+                </td>
+                <td>
+                    <span title="{{rank}}%">{{{rankSymbols}}}</span>
+                </td>
+                <td>
+                    <a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
+                <td>
+                    <progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
+                </td>
+            </tr>
+        {{/each}}
+    </table>
+</div>
+{{/if}}
+
 {{#if similar}}
 <div id="similar-domains">
     <h2>Similar Domains</h2>
@ -67,48 +114,4 @@
     </div>
 {{/if}}

-{{#if linking}}
-<div id="similar-links">
-    <h2>Linking Domains</h2>
-
-    <table class="similarity-table">
-        <tr>
-            <th colspan="3">Meta</th>
-            <th>Rank</th>
-            <th>Domain</th>
-            <th>Similarity</th>
-        </tr>
-        {{#each linking}}
-            <tr>
-                <td>
-                    {{#if indexed}}
-                        {{#if active}}
-                            <span title="Indexed">👀</span>
-                        {{/if}}
-                        {{#unless active}}
-                            <span title="Problem">🔥</span>
-                        {{/unless}}
-                    {{/if}}
-                </td>
-                <td>
-                    {{#if screenshot}}📷{{/if}}
-                </td>
-                <td>
-                    {{#if linkType.isLinked}}
-                        <span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
-                    {{/if}}
-                </td>
-                <td>
-                    <span title="{{rank}}%">{{{rankSymbols}}}</span>
-                </td>
-                <td>
-                    <a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
-                <td>
-                    <progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
-                </td>
-            </tr>
-        {{/each}}
-    </table>
-</div>
-{{/if}}
-
 </div>
settings.gradle
@ -28,6 +28,7 @@ include 'code:libraries:message-queue'

 include 'code:features-search:screenshots'
 include 'code:features-search:random-websites'
+include 'code:features-search:feedlot-client'
 include 'code:features-qs:query-parser'
 include 'code:features-index:result-ranking'