mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(encyclopedia-sideloader) Add test suite and clean up urlencoding logic
This commit is contained in:
parent
95776e9bee
commit
b9842b57e0
@ -91,6 +91,14 @@ dependencies {
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
testImplementation project(':third-party:encyclopedia-marginalia-nu')
|
||||
testImplementation project(':code:processes:test-data')
|
||||
testImplementation project(':code:processes:crawling-process')
|
||||
|
||||
|
||||
}
|
||||
|
@ -25,7 +25,6 @@ import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
@ -93,7 +92,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
while (rs.next()) {
|
||||
var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
|
||||
String title = rs.getString("title");
|
||||
String url = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8);
|
||||
String url = rs.getString("url");
|
||||
|
||||
taskConsumer.accept(() -> convertDocument(articleParts.parts, title, url, domainLinks));
|
||||
}
|
||||
@ -147,7 +146,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
// This is a stopgap to fix them, as the URLs break if you urlencode the UTF-8.
|
||||
|
||||
return url
|
||||
.replace('\u2013', '-') // Replace en-dash with hyphen
|
||||
.replace('\u2013', '-')
|
||||
;
|
||||
}
|
||||
|
||||
|
@ -1,40 +1,166 @@
|
||||
package nu.marginalia.converting.sideload.encyclopedia;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import com.google.inject.Guice;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
|
||||
import nu.marginalia.encyclopedia.model.Article;
|
||||
import nu.marginalia.encyclopedia.model.LinkList;
|
||||
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
|
||||
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class EncyclopediaMarginaliaNuSideloaderTest {
|
||||
Path tempFile;
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
static AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
static AnchorTextKeywords anchorTextKeywords;
|
||||
static SideloaderProcessing sideloaderProcessing;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException {
|
||||
System.setProperty("db.overrideJdbc", mariaDBContainer.getJdbcUrl());
|
||||
System.setProperty("system.serviceNode", "1");
|
||||
|
||||
var injector = Guice.createInjector(
|
||||
new ConverterModule(),
|
||||
new DatabaseModule(true),
|
||||
new ProcessConfigurationModule("test"));
|
||||
|
||||
anchorTagsSourceFactory = injector.getInstance(AnchorTagsSourceFactory.class);
|
||||
anchorTextKeywords = injector.getInstance(AnchorTextKeywords.class);
|
||||
sideloaderProcessing = injector.getInstance(SideloaderProcessing.class);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
Files.deleteIfExists(tempFile);
|
||||
@AfterAll
|
||||
public static void tearDown() throws IOException {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x8fa302ffffcffebfL)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x8fa302ffffcffebfL)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(0xFAAFFFF7F75AA808L));
|
||||
public void testSunnyDay() throws Exception {
|
||||
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0xa00000L)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(0x20A00000000000L));
|
||||
Path fileName = Files.createTempFile(getClass().getSimpleName(), ".db");
|
||||
try {
|
||||
ArticleDbProvider dbProvider = new ArticleDbProvider(fileName);
|
||||
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x200000L)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(0x200000000004L));
|
||||
try (ArticleStoreWriter writer = new ArticleStoreWriter(dbProvider)) {
|
||||
|
||||
System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
|
||||
System.out.printf("%64s\n", Long.toBinaryString(0x10L));
|
||||
writer.add(new Article(
|
||||
"shoes",
|
||||
"Shoes",
|
||||
"Lorem ipsum dolor sit amet",
|
||||
new ArticleParts("""
|
||||
A shoe is an item of footwear intended to protect and comfort the human foot. Though the human foot can adapt to varied terrains and climate conditions, it is vulnerable, and shoes provide protection. Form was originally tied to function, but over time, shoes also became fashion items. Some shoes are worn as safety equipment, such as steel-toe boots, which are required footwear at industrial worksites.
|
||||
"""),
|
||||
new LinkList(),
|
||||
new LinkList()
|
||||
).asData());
|
||||
}
|
||||
|
||||
var sideloader = new EncyclopediaMarginaliaNuSideloader(fileName, "https://en.wikipedia.org/wiki/", GsonFactory.get(), anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
|
||||
var domain = sideloader.getDomain();
|
||||
|
||||
Assertions.assertEquals(new EdgeDomain("en.wikipedia.org"), domain.domain);
|
||||
|
||||
var documentsStream = sideloader.getDocumentsStream();
|
||||
Assertions.assertTrue(documentsStream.hasNext());
|
||||
var doc = documentsStream.next();
|
||||
Assertions.assertEquals(new EdgeUrl("https://en.wikipedia.org/wiki/shoes"), doc.url);
|
||||
Assertions.assertFalse(documentsStream.hasNext());
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(fileName);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDashRewriting() throws Exception {
|
||||
|
||||
Path fileName = Files.createTempFile(getClass().getSimpleName(), ".db");
|
||||
try {
|
||||
ArticleDbProvider dbProvider = new ArticleDbProvider(fileName);
|
||||
|
||||
try (ArticleStoreWriter writer = new ArticleStoreWriter(dbProvider)) {
|
||||
|
||||
writer.add(new Article(
|
||||
"tf\u2013idf",
|
||||
"TF-IDF",
|
||||
"Lorem ipsum dolor sit amet",
|
||||
new ArticleParts(""),
|
||||
new LinkList(),
|
||||
new LinkList()
|
||||
).asData());
|
||||
}
|
||||
|
||||
var sideloader = new EncyclopediaMarginaliaNuSideloader(fileName, "https://en.wikipedia.org/wiki/", GsonFactory.get(), anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
|
||||
var domain = sideloader.getDomain();
|
||||
|
||||
Assertions.assertEquals(new EdgeDomain("en.wikipedia.org"), domain.domain);
|
||||
|
||||
var documentsStream = sideloader.getDocumentsStream();
|
||||
Assertions.assertTrue(documentsStream.hasNext());
|
||||
var doc = documentsStream.next();
|
||||
Assertions.assertEquals(new EdgeUrl("https://en.wikipedia.org/wiki/tf-idf"), doc.url);
|
||||
Assertions.assertFalse(documentsStream.hasNext());
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(fileName);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUrlencoding() throws Exception {
|
||||
|
||||
Path fileName = Files.createTempFile(getClass().getSimpleName(), ".db");
|
||||
try {
|
||||
ArticleDbProvider dbProvider = new ArticleDbProvider(fileName);
|
||||
|
||||
try (ArticleStoreWriter writer = new ArticleStoreWriter(dbProvider)) {
|
||||
|
||||
writer.add(new Article(
|
||||
"any percent",
|
||||
"Any %",
|
||||
"Summoning salt go brr",
|
||||
new ArticleParts(""),
|
||||
new LinkList(),
|
||||
new LinkList()
|
||||
).asData());
|
||||
}
|
||||
|
||||
var sideloader = new EncyclopediaMarginaliaNuSideloader(fileName, "https://en.wikipedia.org/wiki/", GsonFactory.get(), anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
|
||||
var domain = sideloader.getDomain();
|
||||
|
||||
Assertions.assertEquals(new EdgeDomain("en.wikipedia.org"), domain.domain);
|
||||
|
||||
var documentsStream = sideloader.getDocumentsStream();
|
||||
Assertions.assertTrue(documentsStream.hasNext());
|
||||
var doc = documentsStream.next();
|
||||
Assertions.assertEquals(new EdgeUrl("https://en.wikipedia.org/wiki/any+percent"), doc.url);
|
||||
Assertions.assertFalse(documentsStream.hasNext());
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(fileName);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user