mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(search-service) Begin replacement of the crawl queue mechanism with node_affinity flagging
Previously, a special database table was used to hold the domains slated for crawling. That table is now deprecated; instead, each domain has a node_affinity flag that decides its indexing state: a value of -1 indicates the domain shouldn't be crawled, a value of 0 means it is slated for crawling by the next index partition to be crawled, and a positive value means it is assigned to an index partition. The change set also adds a test case validating the modified behavior.
This commit is contained in:
parent
dc1b6373eb
commit
ad8c97f342
@ -47,18 +47,23 @@ public class SearchAddToCrawlQueueService {
|
|||||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
|
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domainName));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addToCrawlQueue(int id) throws SQLException {
|
/** Mark a domain for crawling by setting its node affinity to zero.
|
||||||
|
* Only rows whose node affinity is negative are updated; a domain that is
|
||||||
|
* already slated for crawling (0) or has a positive affinity is left unchanged.
|
||||||
|
* NOTE(review): the column is qualified with the WMSA_prod schema name; confirm every deployment uses that database name.
|
||||||
|
* */
|
||||||
|
void addToCrawlQueue(int domainId) throws SQLException {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var stmt = conn.prepareStatement("""
|
var stmt = conn.prepareStatement("""
|
||||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
UPDATE EC_DOMAIN
|
||||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
SET WMSA_prod.EC_DOMAIN.NODE_AFFINITY = 0
|
||||||
|
WHERE ID=? AND WMSA_prod.EC_DOMAIN.NODE_AFFINITY < 0
|
||||||
""")) {
|
""")) {
|
||||||
stmt.setInt(1, id);
|
stmt.setInt(1, domainId);
|
||||||
stmt.executeUpdate();
|
stmt.executeUpdate();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getDomainName(int id) {
|
String getDomainName(int id) {
|
||||||
var domain = domainQueries.getDomain(id);
|
var domain = domainQueries.getDomain(id);
|
||||||
if (domain.isEmpty())
|
if (domain.isEmpty())
|
||||||
throw new IllegalArgumentException();
|
throw new IllegalArgumentException();
|
||||||
|
@ -0,0 +1,85 @@
|
|||||||
|
package nu.marginalia.search.svc;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.testcontainers.containers.MariaDBContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
@Tag("slow")
|
||||||
|
@Testcontainers
|
||||||
|
class SearchAddToCrawlQueueServiceTest {
|
||||||
|
@Container
|
||||||
|
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||||
|
.withDatabaseName("WMSA_prod")
|
||||||
|
.withUsername("wmsa")
|
||||||
|
.withPassword("wmsa")
|
||||||
|
.withNetworkAliases("mariadb");
|
||||||
|
|
||||||
|
static HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private DbDomainQueries domainQueries;
|
||||||
|
private SearchAddToCrawlQueueService addToCrawlQueueService;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws SQLException {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement()) {
|
||||||
|
stmt.executeQuery("DELETE FROM EC_DOMAIN"); // Wipe any old state from other test runs
|
||||||
|
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('known.example.com', 'example.com', -1)");
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('added.example.com', 'example.com', 0)");
|
||||||
|
stmt.executeQuery("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES ('indexed.example.com', 'example.com', 1)");
|
||||||
|
}
|
||||||
|
|
||||||
|
domainQueries = new DbDomainQueries(dataSource);
|
||||||
|
addToCrawlQueueService = new SearchAddToCrawlQueueService(domainQueries, dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUpAll() {
|
||||||
|
HikariConfig config = new HikariConfig();
|
||||||
|
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||||
|
config.setUsername("wmsa");
|
||||||
|
config.setPassword("wmsa");
|
||||||
|
|
||||||
|
dataSource = new HikariDataSource(config);
|
||||||
|
TestMigrationLoader.flywayMigration(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getNodeAffinity(String domainName) throws SQLException {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("SELECT NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||||
|
{
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return rsp.getInt(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void addToCrawlQueue() throws SQLException {
|
||||||
|
int knownId = domainQueries.getDomainId(new EdgeDomain("known.example.com"));
|
||||||
|
int addedId = domainQueries.getDomainId(new EdgeDomain("added.example.com"));
|
||||||
|
int indexedId = domainQueries.getDomainId(new EdgeDomain("indexed.example.com"));
|
||||||
|
|
||||||
|
addToCrawlQueueService.addToCrawlQueue(knownId);
|
||||||
|
addToCrawlQueueService.addToCrawlQueue(addedId);
|
||||||
|
addToCrawlQueueService.addToCrawlQueue(indexedId);
|
||||||
|
|
||||||
|
Assertions.assertEquals(0, getNodeAffinity("known.example.com"));
|
||||||
|
Assertions.assertEquals(0, getNodeAffinity("added.example.com"));
|
||||||
|
Assertions.assertEquals(1, getNodeAffinity("indexed.example.com"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user