mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(search-service) Begin replacement of the crawl queue mechanism with node_affinity flagging
Previously, a dedicated database table held the domains slated for crawling. That table is now deprecated: instead, each domain carries a node_affinity flag that determines its indexing state. A value of -1 means the domain should not be crawled, a value of 0 means it is slated for crawling by the next index partition to crawl, and a positive value means it is assigned to that index partition. The change set also adds a test case validating the modified behavior.
This commit is contained in:
parent
dc1b6373eb
commit
ad8c97f342
@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
|
||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
|
||||
}
|
||||
|
||||
private void addToCrawlQueue(int id) throws SQLException {
|
||||
/** Mark a domain for crawling by setting node affinity to zero,
|
||||
* unless it is already marked for crawling, then node affinity should
|
||||
* be left unchanged.
|
||||
* */
|
||||
void addToCrawlQueue(int domainId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
||||
UPDATE EC_DOMAIN
|
||||
SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
|
||||
WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")) {
|
||||
stmt.setInt(1, id);
|
||||
stmt.setInt(1, domainId);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
private String getDomainName(int id) {
|
||||
String getDomainName(int id) {
|
||||
var domain = domainQueries.getDomain(id);
|
||||
if (domain.isEmpty())
|
||||
throw new IllegalArgumentException();
|
||||
|
@ -0,0 +1,85 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class SearchAddToCrawlQueueServiceTest {
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
|
||||
private DbDomainQueries domainQueries;
|
||||
private SearchAddToCrawlQueueService addToCrawlQueueService;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement()) {
|
||||
stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
|
||||
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
|
||||
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
|
||||
}
|
||||
|
||||
domainQueries = new DbDomainQueries(dataSource);
|
||||
addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() {
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
}
|
||||
|
||||
private int getNodeAffinity(String domainName) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
stmt.setString(1, domainName);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return rsp.getInt(1);
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Test
|
||||
void addToCrawlQueue() throws SQLException {
|
||||
int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
|
||||
int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
|
||||
int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));
|
||||
|
||||
addToCrawlQueueService.addToCrawlQueue(knownId);
|
||||
addToCrawlQueueService.addToCrawlQueue(addedId);
|
||||
addToCrawlQueueService.addToCrawlQueue(indexedId);
|
||||
|
||||
Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
|
||||
Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
|
||||
Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user