(system) Remove EdgeId<T> and similar objects

They seemed like a good idea at the time, but in practice they waste resources and don't really provide the clarity I had hoped for.
This commit is contained in:
Viktor Lofgren 2023-08-24 17:46:02 +02:00
parent c909120ae1
commit 1e6800565a
30 changed files with 74 additions and 548 deletions

View File

@ -2,8 +2,6 @@ package nu.marginalia.index.client.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;

View File

@ -9,16 +9,16 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.OptionalInt;
@Singleton
public class DbDomainQueries {
private final HikariDataSource dataSource;
private final Cache<EdgeDomain, EdgeId<EdgeDomain>> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
@Inject
public DbDomainQueries(HikariDataSource dataSource)
@ -28,7 +28,7 @@ public class DbDomainQueries {
@SneakyThrows
public EdgeId<EdgeDomain> getDomainId(EdgeDomain domain) {
public Integer getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
@ -36,7 +36,7 @@ public class DbDomainQueries {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new EdgeId<>(rsp.getInt(1));
return rsp.getInt(1);
}
}
throw new NoSuchElementException();
@ -48,12 +48,12 @@ public class DbDomainQueries {
}
@SneakyThrows
public Optional<EdgeId<EdgeDomain>> tryGetDomainId(EdgeDomain domain) {
public OptionalInt tryGetDomainId(EdgeDomain domain) {
var maybe = Optional.ofNullable(domainIdCache.getIfPresent(domain));
if (maybe.isPresent())
return maybe;
Integer maybeId = domainIdCache.getIfPresent(domain);
if (maybeId != null) {
return OptionalInt.of(maybeId);
}
try (var connection = dataSource.getConnection()) {
@ -61,25 +61,25 @@ public class DbDomainQueries {
stmt.setString(1, domain.toString());
var rsp = stmt.executeQuery();
if (rsp.next()) {
var id = new EdgeId<EdgeDomain>(rsp.getInt(1));
var id = rsp.getInt(1);
domainIdCache.put(domain, id);
return Optional.of(id);
return OptionalInt.of(id);
}
}
return Optional.empty();
return OptionalInt.empty();
}
catch (UncheckedExecutionException ex) {
return Optional.empty();
return OptionalInt.empty();
}
}
@SneakyThrows
public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, id.id());
stmt.setInt(1, id);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return Optional.of(new EdgeDomain(rsp.getString(1)));

View File

@ -2,15 +2,10 @@ package nu.marginalia.db;
import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;
@ImplementedBy(DomainBlacklistImpl.class)
public interface DomainBlacklist {
boolean isBlacklisted(int domainId);
default boolean isBlacklisted(EdgeId<EdgeDomain> domainId) {
return isBlacklisted(domainId.id());
}
default TIntHashSet getSpamDomains() {
return new TIntHashSet();
}

View File

@ -1,8 +1,9 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
@ -58,10 +59,10 @@ public class DomainTypes {
return ret;
}
/** Retrieve the EdgeId of all domains of a certain type,
/** Retrieve the domain id of all domains of a certain type,
* ignoring entries that are not in the EC_DOMAIN table */
public EdgeIdList<EdgeDomain> getKnownDomainsByType(Type type) {
EdgeIdList<EdgeDomain> ret = new EdgeIdList<>();
public TIntList getKnownDomainsByType(Type type) {
TIntList ret = new TIntArrayList();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""

View File

@ -6,7 +6,6 @@ import nu.marginalia.bigstring.BigString;
import nu.marginalia.bigstring.CompressedBigString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import java.net.URISyntaxException;
@ -24,8 +23,6 @@ public class GsonFactory {
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.registerTypeAdapter(CompressedBigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))

View File

@ -1,11 +0,0 @@
package nu.marginalia.model.id;
/**
* Wrapper around a raw integer ID that exists entirely to strengthen the typing of IDs.
*
* @param <T> the kind of entity the ID refers to; it is not used by any component of the record
* @param id the raw integer ID
* @deprecated We don't use this anymore.
*/
@Deprecated
public record EdgeId<T>(int id) {
}

View File

@ -1,35 +0,0 @@
package nu.marginalia.model.id;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
 * Collection of integer IDs backed directly by an {@code int[]}.
 *
 * <p>The array is neither copied on construction nor on access, so callers of
 * {@link #values()} see (and can modify) the same array that {@link #sort()} reorders.
 *
 * @param <T> the kind of entity the IDs refer to
 * @param values the backing array of raw IDs
 */
@Deprecated
public record EdgeIdArray<T>(int... values) implements EdgeIdCollection<T> {

    /**
     * Drains the given stream into a new array-backed collection.
     *
     * @param stream the raw IDs to collect
     * @return a collection holding the stream's elements in encounter order
     */
    public static <T> EdgeIdArray<T> gather(IntStream stream) {
        final int[] ids = stream.toArray();
        return new EdgeIdArray<>(ids);
    }

    /** Returns the backing array itself, not a copy. */
    @Override
    public int[] values() {
        return this.values;
    }

    /** Returns {@code true} when the backing array has no elements. */
    @Override
    public boolean isEmpty() {
        return 0 == this.values.length;
    }

    /** Returns the length of the backing array. */
    @Override
    public int size() {
        return this.values.length;
    }

    /**
     * Returns the raw ID at the given position.
     *
     * @param idx zero-based index into the backing array
     */
    public int get(int idx) {
        final int[] ids = this.values;
        return ids[idx];
    }

    /** Sorts the backing array in place, in ascending numeric order. */
    public void sort() {
        Arrays.sort(this.values);
    }
}

View File

@ -1,29 +0,0 @@
package nu.marginalia.model.id;
import java.util.Arrays;
import java.util.Iterator;
import java.util.stream.IntStream;
/**
 * Read-only view of a group of integer IDs, iterable as {@link EdgeId} wrappers.
 *
 * <p>Implementations only have to provide {@link #size()}, {@link #isEmpty()} and
 * {@link #values()}; everything else is derived from {@code values()}.
 *
 * @param <T> the kind of entity the IDs refer to
 */
@Deprecated
public interface EdgeIdCollection<T> extends Iterable<EdgeId<T>> {

    /** Number of IDs in the collection. */
    int size();

    /** Whether the collection holds no IDs. */
    boolean isEmpty();

    /** The raw IDs as an array. */
    int[] values();

    /** Streams the raw IDs obtained from {@link #values()}. */
    default IntStream stream() {
        final int[] ids = values();
        return Arrays.stream(ids);
    }

    /** Iterates over the IDs from {@link #values()}, wrapping each one in an {@link EdgeId}. */
    default Iterator<EdgeId<T>> iterator() {
        final int[] ids = values();
        return Arrays.stream(ids)
                .mapToObj(id -> new EdgeId<T>(id))
                .iterator();
    }

    /** Creates an {@link EdgeIdArray} from the current {@link #values()}. */
    default EdgeIdArray<T> asArray() {
        final int[] ids = values();
        return new EdgeIdArray<>(ids);
    }

    /** Creates an {@link EdgeIdList} from the current {@link #values()}. */
    default EdgeIdList<T> asList() {
        final int[] ids = values();
        return new EdgeIdList<>(ids);
    }

    /** Creates an {@link EdgeIdSet} from the current {@link #values()}. */
    default EdgeIdSet<T> asSet() {
        final int[] ids = values();
        return new EdgeIdSet<>(ids);
    }
}

View File

@ -1,13 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
/**
 * Mix-in for ID collections that can grow: every {@code addAll} overload forwards
 * the other collection's IDs to {@link #underlyingCollection()}.
 *
 * @param <T> the kind of entity the IDs refer to
 */
@Deprecated
public interface EdgeIdCollectionMutable<T> {

    /** The Trove collection that receives added IDs. */
    TIntCollection underlyingCollection();

    /** Adds every ID of the given array-backed collection. */
    default void addAll(EdgeIdArray<T> other) {
        final TIntCollection target = underlyingCollection();
        target.addAll(other.values());
    }

    /** Adds every ID of the given list-backed collection, passing its Trove list directly. */
    default void addAll(EdgeIdList<T> other) {
        final TIntCollection target = underlyingCollection();
        target.addAll(other.list());
    }

    /** Adds every ID of an arbitrary ID collection via its {@code values()} array. */
    default void addAll(EdgeIdCollection<T> other) {
        final TIntCollection target = underlyingCollection();
        target.addAll(other.values());
    }
}

View File

@ -1,49 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
import gnu.trove.list.array.TIntArrayList;
import java.util.stream.IntStream;
/**
 * Growable, ordered collection of integer IDs backed by a Trove {@link TIntArrayList}.
 *
 * @param <T> the kind of entity the IDs refer to
 * @param list the backing Trove list; it is used as-is, not copied
 */
@Deprecated
public record EdgeIdList<T>(TIntArrayList list)
        implements EdgeIdCollection<T>, EdgeIdCollectionMutable<T> {

    /** Creates a list backed by a new {@link TIntArrayList} built from the given IDs. */
    public EdgeIdList(int... values) {
        this(new TIntArrayList(values));
    }

    /**
     * Collects the given stream into a new list.
     *
     * @param stream the raw IDs to collect
     * @return a list holding the stream's elements
     */
    public static <T> EdgeIdList<T> gather(IntStream stream) {
        return stream.collect(
                () -> new EdgeIdList<T>(),
                (acc, id) -> acc.add(id),
                (left, right) -> left.addAll(right));
    }

    /** Returns the result of {@code toArray()} on the backing list. */
    @Override
    public int[] values() {
        return this.list.toArray();
    }

    @Override
    public boolean isEmpty() {
        return this.list.isEmpty();
    }

    @Override
    public int size() {
        return this.list.size();
    }

    /**
     * Returns the raw ID at the given position.
     *
     * @param idx zero-based index into the backing list
     */
    public int get(int idx) {
        return this.list.get(idx);
    }

    /** Appends a raw ID to the end of the backing list. */
    public void add(int id) {
        this.list.add(id);
    }

    /** Sorts the backing list in place. */
    public void sort() {
        this.list.sort();
    }

    /** Exposes the backing list as the target for the inherited {@code addAll} overloads. */
    @Override
    public TIntCollection underlyingCollection() {
        return this.list;
    }
}

View File

@ -1,53 +0,0 @@
package nu.marginalia.model.id;
import gnu.trove.TIntCollection;
import gnu.trove.set.hash.TIntHashSet;
import java.util.stream.IntStream;
/**
 * Set of integer IDs backed by a Trove {@link TIntHashSet}.
 *
 * <p>Both convenience constructors pass {@code -1} as the third argument of the
 * {@code TIntHashSet} constructor (presumably Trove's "no entry" value; confirm against
 * the Trove documentation).
 *
 * @param <T> the kind of entity the IDs refer to
 * @param set the backing Trove set; it is used as-is, not copied
 */
@Deprecated
public record EdgeIdSet<T>(TIntHashSet set)
        implements EdgeIdCollection<T>, EdgeIdCollectionMutable<T> {

    /** Creates a set sized for the given IDs (load factor 0.5) and adds all of them. */
    public EdgeIdSet(int... values) {
        this(new TIntHashSet(values.length, 0.5f, -1));
        this.set.addAll(values);
    }

    /** Creates an empty set with the given initial capacity and load factor. */
    public EdgeIdSet(int initialCapacity, float loadFactor) {
        this(new TIntHashSet(initialCapacity, loadFactor, -1));
    }

    /** Exposes the backing set as the target for the inherited {@code addAll} overloads. */
    @Override
    public TIntCollection underlyingCollection() {
        return this.set;
    }

    /**
     * Drains the given stream into a new set.
     *
     * @param stream the raw IDs to collect
     * @return a set holding the distinct elements of the stream
     */
    public static <T> EdgeIdSet<T> gather(IntStream stream) {
        final int[] ids = stream.toArray();
        return new EdgeIdSet<>(ids);
    }

    /** Returns the result of {@code toArray()} on the backing set. */
    @Override
    public int[] values() {
        return this.set.toArray();
    }

    @Override
    public boolean isEmpty() {
        return this.set.isEmpty();
    }

    @Override
    public int size() {
        return this.set.size();
    }

    /** Whether the given raw ID is a member of the set. */
    public boolean contains(int id) {
        return this.set.contains(id);
    }

    /** Adds a raw ID and returns whatever the backing set's {@code add} reports. */
    public boolean add(int id) {
        return this.set.add(id);
    }

    /** Removes a raw ID and returns whatever the backing set's {@code remove} reports. */
    public boolean remove(int id) {
        return this.set.remove(id);
    }
}

View File

@ -1,8 +1,5 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {

View File

@ -1,71 +0,0 @@
package nu.marginalia.browse;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeIdCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
/**
 * Looks up the domains that a set of URL ids belong to and returns them as
 * {@link BrowseResult}s.
 */
@Singleton
public class DbBrowseDomainsFromUrlId {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final HikariDataSource dataSource;

    @Inject
    public DbBrowseDomainsFromUrlId(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** Renders the ids as a parenthesised, comma-separated SQL list, e.g. {@code (1,2,3)}. */
    private String idList(EdgeIdCollection<EdgeUrl> ids) {
        StringJoiner joiner = new StringJoiner(",", "(", ")");
        for (int id : ids.values()) {
            joiner.add(Integer.toString(id));
        }
        return joiner.toString();
    }

    /**
     * Fetches the domain id and name for each of the given URL ids, restricted to rows
     * with {@code KNOWN_URLS<5000} and {@code QUALITY>-10}.
     *
     * <p>SQL errors are logged rather than propagated; whatever rows were read before
     * the failure are still returned.
     *
     * @param urlIds ids of the URLs whose domains should be looked up
     * @return one result per matching row, each built from the domain's root URL and
     *         domain id; an immutable empty list when {@code urlIds} is empty
     */
    public List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlIds) {
        if (urlIds.isEmpty()) {
            return Collections.emptyList();
        }

        List<BrowseResult> ret = new ArrayList<>(urlIds.size());

        // this injection is safe, the id list is derived from concatenating a list of integers
        String query = """
                SELECT DOMAIN_ID, DOMAIN_NAME
                FROM EC_URL_VIEW
                INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID
                WHERE
                KNOWN_URLS<5000
                AND QUALITY>-10
                AND EC_URL_VIEW.ID IN
                """ + idList(urlIds);

        // A single try-with-resources closes the result set, statement and connection
        // in reverse order, even when reading a row fails.
        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement();
             var rsp = stmt.executeQuery(query)) {
            while (rsp.next()) {
                int id = rsp.getInt(1);
                String domain = rsp.getString(2);

                ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
            }
        }
        catch (SQLException ex) {
            logger.error("SQL error", ex);
        }

        return ret;
    }
}

View File

@ -6,7 +6,6 @@ import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -24,7 +23,7 @@ public class DbBrowseDomainsSimilarCosine {
this.dataSource = dataSource;
}
public List<BrowseResult> getDomainNeighborsAdjacentCosine(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
public List<BrowseResult> getDomainNeighborsAdjacentCosine(int domainId, DomainBlacklist blacklist, int count) {
List<BrowseResult> domains = new ArrayList<>(count);
String q = """
@ -43,7 +42,7 @@ public class DbBrowseDomainsSimilarCosine {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next() && domains.size() < count) {

View File

@ -5,10 +5,7 @@ import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.EdgeIdCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,7 +23,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
this.dataSource = dataSource;
}
public List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, DomainBlacklist blacklist, int count) {
public List<BrowseResult> getDomainNeighborsAdjacent(int domainId, DomainBlacklist blacklist, int count) {
final Set<BrowseResult> domains = new HashSet<>(count*3);
final String q = """
@ -49,7 +46,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(q)) {
stmt.setFetchSize(count);
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
stmt.setInt(2, count);
var rsp = stmt.executeQuery();
while (rsp.next()) {
@ -78,7 +75,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
try (var stmt = connection.prepareStatement(q2)) {
stmt.setFetchSize(count/2);
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
while (rsp.next() && domains.size() < count/2) {
@ -109,7 +106,7 @@ public class DbBrowseDomainsSimilarOldAlgo {
LIMIT ?""";
try (var stmt = connection.prepareStatement(q3)) {
stmt.setFetchSize(count/2);
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
stmt.setInt(2, count/2 - domains.size());
var rsp = stmt.executeQuery();
@ -165,49 +162,4 @@ public class DbBrowseDomainsSimilarOldAlgo {
return domains;
}
private <T> String idList(EdgeIdCollection<EdgeUrl> ids) {
StringJoiner j = new StringJoiner(",", "(", ")");
for (var id : ids.values()) {
j.add(Integer.toString(id));
}
return j.toString();
}
public List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlIds) {
if (urlIds.isEmpty())
return Collections.emptyList();
List<BrowseResult> ret = new ArrayList<>(urlIds.size());
try (var conn = dataSource.getConnection()) {
try (var stmt = conn.createStatement()) {
String inStmt = idList(urlIds);
var rsp = stmt.executeQuery("""
SELECT DOMAIN_ID, DOMAIN_NAME
FROM EC_URL_VIEW
INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID
WHERE
KNOWN_URLS<5000
AND QUALITY>-10
AND EC_URL_VIEW.ID IN
""" + inStmt); // this injection is safe, inStmt is derived from concatenating a list of integers
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);
ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0));
}
}
}
catch (SQLException ex) {
logger.error("SQL error", ex);
}
return ret;
}
}

View File

@ -4,9 +4,7 @@ import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.id.EdgeId;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -30,7 +28,7 @@ public class ScreenshotService {
this.dataSource = dataSource;
}
public boolean hasScreenshot(EdgeId<EdgeDomain> domainId) {
public boolean hasScreenshot(int domainId) {
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT TRUE
@ -38,7 +36,7 @@ public class ScreenshotService {
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, domainId.id());
ps.setInt(1, domainId);
var rs = ps.executeQuery();
if (rs.next()) {
return rs.getBoolean(1);
@ -86,7 +84,7 @@ public class ScreenshotService {
private Object serveSvgPlaceholder(Response response, int id) {
var name = domainQueries.getDomain(new EdgeId<>(id)).map(Object::toString)
var name = domainQueries.getDomain(id).map(Object::toString)
.orElse("[Screenshot Not Yet Captured]");
response.type("image/svg+xml");

View File

@ -15,9 +15,6 @@ import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -11,7 +11,6 @@ import nu.marginalia.db.storage.model.FileStorageId;
import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.EdgeIdList;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.server.*;
@ -224,7 +223,7 @@ public class ControlService extends Service {
}
});
randomExplorationService.removeRandomDomains(new EdgeIdList<>(idList.toArray()));
randomExplorationService.removeRandomDomains(idList.toArray());
String after = request.queryParams("after");

View File

@ -2,8 +2,6 @@ package nu.marginalia.control.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import java.sql.SQLException;
import java.util.ArrayList;
@ -18,7 +16,7 @@ public class RandomExplorationService {
this.dataSource = dataSource;
}
public void removeRandomDomains(EdgeIdList<EdgeDomain> ids) throws SQLException {
public void removeRandomDomains(int[] ids) throws SQLException {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
DELETE FROM EC_RANDOM_DOMAINS
@ -27,7 +25,7 @@ public class RandomExplorationService {
"""))
{
for (var id : ids) {
stmt.setInt(1, id.id());
stmt.setInt(1, id);
stmt.addBatch();
}
stmt.executeBatch();

View File

@ -2,13 +2,12 @@ package nu.marginalia.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TIntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import lombok.SneakyThrows;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.IndexServicesFactory;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeIdList;
import nu.marginalia.ranking.ReversePageRank;
import nu.marginalia.ranking.StandardPageRank;
import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator;
@ -168,7 +167,7 @@ public class IndexSearchSetsService {
@SneakyThrows
public void updateBlogsSet() {
EdgeIdList<EdgeDomain> knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
TIntList knownDomains = domainTypes.getKnownDomainsByType(DomainTypes.Type.BLOG);
if (knownDomains.isEmpty()) {
// FIXME: We don't want to reload the entire list every time, but we do want to do it sometimes. Actor maybe?
@ -177,7 +176,7 @@ public class IndexSearchSetsService {
}
synchronized (this) {
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.values()));
blogsSet = new RankingSearchSet(SearchSetIdentifier.BLOGS, blogsSet.source, new IntOpenHashSet(knownDomains.toArray()));
blogsSet.write();
}
}

View File

@ -106,16 +106,11 @@ public class SearchOperator {
}
private int getDomainId(String domain) {
int domainId = -1;
try {
if (domain != null) {
return domainQueries.getDomainId(new EdgeDomain(domain)).id();
}
if (domain == null) {
return -1;
}
catch (NoSuchElementException ex) {
}
return domainId;
return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1);
}
private List<String> getProblems(Context ctx, String evalResult, List<UrlDetails> queryResults, SearchQuery processedQuery) {

View File

@ -10,7 +10,6 @@ import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.linkdb.LinkdbReader;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.search.client.SearchMqEndpoints;
import nu.marginalia.search.db.DbUrlDetailsQuery;
import nu.marginalia.search.svc.SearchFrontPageService;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.control.ServiceEventLog;
@ -30,7 +29,6 @@ import java.nio.file.Path;
public class SearchService extends Service {
private final WebsiteUrl websiteUrl;
private final DbUrlDetailsQuery dbUrlDetailsQuery;
private final StaticResources staticResources;
private final FileStorageService fileStorageService;
private final LinkdbReader linkdbReader;
@ -42,7 +40,6 @@ public class SearchService extends Service {
@Inject
public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
DbUrlDetailsQuery dbUrlDetailsQuery,
StaticResources staticResources,
SearchFrontPageService frontPageService,
SearchErrorPageService errorPageService,
@ -57,7 +54,6 @@ public class SearchService extends Service {
this.eventLog = params.eventLog;
this.websiteUrl = websiteUrl;
this.dbUrlDetailsQuery = dbUrlDetailsQuery;
this.staticResources = staticResources;
this.fileStorageService = fileStorageService;
this.linkdbReader = linkdbReader;
@ -91,12 +87,6 @@ public class SearchService extends Service {
Spark.awaitInitialization();
}
@MqNotification(endpoint = SearchMqEndpoints.FLUSH_CACHES)
public void flushCaches(String unusedArg) {
logger.info("Flushing caches");
dbUrlDetailsQuery.clearCaches();
}
@SneakyThrows
@MqNotification(endpoint = SearchMqEndpoints.SWITCH_LINKDB)
public void switchLinkdb(String unusedArg) {

View File

@ -61,13 +61,13 @@ public class SiteListCommand implements SearchCommandInterface {
List<UrlDetails> resultSet;
Path screenshotPath = null;
Integer domainId = -1;
int domainId = -1;
if (null != domain) {
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
var maybeId = domainQueries.tryGetDomainId(domain);
if (maybeId.isPresent()) {
domainId = maybeId.get().id();
domainId = maybeId.getAsInt();
screenshotPath = Path.of("/screenshot/" + domainId);
}
else {

View File

@ -1,112 +0,0 @@
package nu.marginalia.search.db;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.EdgeIdCollection;
import nu.marginalia.search.model.PageScoreAdjustment;
import nu.marginalia.search.model.UrlDetails;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
 * Loads {@link UrlDetails} rows for a set of URL ids from {@code EC_URL_VIEW}.
 */
public class DbUrlDetailsQuery {

    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    private final Cache<EdgeUrl, EdgeId<EdgeUrl>> urlIdCache = CacheBuilder.newBuilder().maximumSize(100_000).build();

    /**
     * Quality threshold used when filtering rows in {@link #getUrlDetailsMulti}.
     * NOTE(review): this is a mutable public static; left non-final because other
     * code may assign it.
     */
    public static double QUALITY_LOWER_BOUND_CUTOFF = -15.;

    @Inject
    public DbUrlDetailsQuery(HikariDataSource dataSource)
    {
        this.dataSource = dataSource;
    }

    /** Drops every entry from the URL id cache. */
    public synchronized void clearCaches()
    {
        urlIdCache.invalidateAll();
    }

    /** Renders the ids as a parenthesised, comma-separated SQL list, e.g. {@code (1,2,3)}. */
    private String idList(EdgeIdCollection<EdgeUrl> ids) {
        StringJoiner joiner = new StringJoiner(",", "(", ")");
        for (int id : ids.values()) {
            joiner.add(Integer.toString(id));
        }
        return joiner.toString();
    }

    /**
     * Decides whether a row is dropped from the result: its quality is at or below
     * {@link #QUALITY_LOWER_BOUND_CUTOFF}, it has no description, and its URL path is
     * longer than one character.
     */
    private static boolean shouldSkip(UrlDetails val) {
        return val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                && Strings.isNullOrEmpty(val.description)
                && val.url.path.length() > 1;
    }

    /**
     * Fetches details for every given URL id whose row has a non-null title.
     *
     * <p>Fields that the query does not provide are filled with placeholders
     * (zero score adjustment, {@code Integer.MAX_VALUE} ranking id,
     * {@code Double.MAX_VALUE} term score, and so on). Rows matched by
     * {@link #shouldSkip} are left out.
     *
     * @param ids ids of the URLs to look up
     * @return the matching rows; an immutable empty list when {@code ids} is empty
     */
    @SneakyThrows
    public List<UrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids) {
        if (ids.isEmpty()) {
            return Collections.emptyList();
        }

        List<UrlDetails> result = new ArrayList<>(ids.size());

        // The id list is built purely from integers, so concatenating it into the
        // statement text cannot inject SQL.
        String query = """
                SELECT ID, DOMAIN_ID, URL,
                TITLE, DESCRIPTION,
                QUALITY,
                WORDS_TOTAL, FORMAT, FEATURES,
                IP, DOMAIN_STATE,
                DATA_HASH
                FROM EC_URL_VIEW
                WHERE TITLE IS NOT NULL
                AND ID IN
                """ + idList(ids);

        try (var connection = dataSource.getConnection();
             var stmt = connection.prepareStatement(query)) {
            stmt.setFetchSize(ids.size());

            // Close the result set explicitly instead of relying on the statement's close.
            try (var rsp = stmt.executeQuery()) {
                while (rsp.next()) {
                    var val = new UrlDetails(rsp.getInt(1),
                            rsp.getInt(2),
                            new EdgeUrl(rsp.getString(3)),
                            rsp.getString(4), // title
                            rsp.getString(5), // description
                            rsp.getDouble(6), // quality
                            rsp.getInt(7), // wordsTotal
                            rsp.getString(8), // format
                            rsp.getInt(9), // features
                            rsp.getString(10), // ip
                            DomainIndexingState.valueOf(rsp.getString(11)), // domainState
                            rsp.getLong(12), // dataHash
                            PageScoreAdjustment.zero(), // urlQualityAdjustment
                            Integer.MAX_VALUE, // rankingId
                            Double.MAX_VALUE, // termScore
                            1, // resultsFromSameDomain
                            "", // positions
                            null, // result item
                            null // keyword scores
                    );

                    if (shouldSkip(val)) {
                        continue;
                    }

                    result.add(val);
                }
            }
        }

        return result;
    }
}

View File

@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.model.id.EdgeId;
import java.util.HashSet;
import java.util.Set;
@ -22,7 +21,7 @@ public class BrowseResultCleaner {
public Predicate<BrowseResult> shouldRemoveResultPredicate() {
Set<String> domainHashes = new HashSet<>(100);
return (res) -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId()))
return (res) -> !screenshotService.hasScreenshot(res.domainId())
|| !domainHashes.add(res.domainHash());
}
}

View File

@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.search.model.DomainInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -13,10 +12,7 @@ import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.*;
/*
TODO: This class needs to be refactored, a lot of
@ -42,10 +38,11 @@ public class DomainInformationService {
public Optional<DomainInformation> domainInfo(String site) {
EdgeId<EdgeDomain> domainId = getDomainFromPartial(site);
if (domainId == null) {
OptionalInt maybeDomainId = getDomainFromPartial(site);
if (maybeDomainId.isEmpty()) {
return Optional.empty();
}
int domainId = maybeDomainId.getAsInt();
Optional<EdgeDomain> domain = dbDomainQueries.getDomain(domainId);
if (domain.isEmpty()) {
@ -85,7 +82,7 @@ public class DomainInformationService {
}
@SneakyThrows
private boolean inCrawlQueue(EdgeId<EdgeDomain> domainId) {
private boolean inCrawlQueue(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement(
"""
@ -94,21 +91,15 @@ public class DomainInformationService {
WHERE EC_DOMAIN.ID=?
"""))
{
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
return rsp.next();
}
}
}
private EdgeId<EdgeDomain> getDomainFromPartial(String site) {
try {
return dbDomainQueries.getDomainId(new EdgeDomain(site));
}
catch (Exception ex) {
return null;
}
private OptionalInt getDomainFromPartial(String site) {
return dbDomainQueries.tryGetDomainId(new EdgeDomain(site));
}
@SneakyThrows
@ -125,11 +116,11 @@ public class DomainInformationService {
}
@SneakyThrows
public int getPagesKnown(EdgeId<EdgeDomain> domainId) {
public int getPagesKnown(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT KNOWN_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@ -142,11 +133,11 @@ public class DomainInformationService {
}
@SneakyThrows
public int getPagesVisited(EdgeId<EdgeDomain> domainId) {
public int getPagesVisited(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT VISITED_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@ -160,11 +151,11 @@ public class DomainInformationService {
@SneakyThrows
public int getPagesIndexed(EdgeId<EdgeDomain> domainId) {
public int getPagesIndexed(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT GOOD_URLS FROM DOMAIN_METADATA WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@ -177,11 +168,11 @@ public class DomainInformationService {
}
@SneakyThrows
public int getIncomingLinks(EdgeId<EdgeDomain> domainId) {
public int getIncomingLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE DEST_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@ -193,11 +184,11 @@ public class DomainInformationService {
}
}
@SneakyThrows
public int getOutboundLinks(EdgeId<EdgeDomain> domainId) {
public int getOutboundLinks(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT COUNT(ID) FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getInt(1);
@ -210,11 +201,11 @@ public class DomainInformationService {
}
@SneakyThrows
public double getDomainQuality(EdgeId<EdgeDomain> domainId) {
public double getDomainQuality(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT QUALITY FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);
@ -226,11 +217,11 @@ public class DomainInformationService {
}
}
public DomainIndexingState getDomainState(EdgeId<EdgeDomain> domainId) {
public DomainIndexingState getDomainState(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT STATE FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return DomainIndexingState.valueOf(rsp.getString(1));
@ -244,11 +235,11 @@ public class DomainInformationService {
return DomainIndexingState.ERROR;
}
public List<EdgeDomain> getLinkingDomains(EdgeId<EdgeDomain> domainId) {
public List<EdgeDomain> getLinkingDomains(int domainId) {
try (var connection = dataSource.getConnection()) {
List<EdgeDomain> results = new ArrayList<>(25);
try (var stmt = connection.prepareStatement("SELECT SOURCE_DOMAIN FROM EC_RELATED_LINKS_VIEW WHERE DEST_DOMAIN_ID=? ORDER BY SOURCE_DOMAIN_ID LIMIT 25")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
while (rsp.next()) {
results.add(new EdgeDomain(rsp.getString(1)));
@ -264,11 +255,11 @@ public class DomainInformationService {
return Collections.emptyList();
}
public double getRank(EdgeId<EdgeDomain> domainId) {
public double getRank(int domainId) {
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT IFNULL(RANK, 1) FROM EC_DOMAIN WHERE ID=?")) {
stmt.setInt(1, domainId.id());
stmt.setInt(1, domainId);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return rsp.getDouble(1);

View File

@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
@ -61,7 +60,7 @@ public class SearchAddToCrawlQueueService {
}
private String getDomainName(int id) {
var domain = domainQueries.getDomain(new EdgeId<>(id));
var domain = domainQueries.getDomain(id);
if (domain.isEmpty())
Spark.halt(404);
return domain.get().toString();

View File

@ -9,7 +9,6 @@ import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.service.server.*;
import org.jetbrains.annotations.NotNull;
import spark.Request;
@ -156,7 +155,7 @@ public class DatingService extends Service {
var session = sessionObjectOpt.get();
int id = Integer.parseInt(request.params("id"));
BrowseResult res = session.nextSimilar(new EdgeId<>(id), browseSimilarCosine, blacklist);
BrowseResult res = session.nextSimilar(id, browseSimilarCosine, blacklist);
res = findViableDomain(session, res);
@ -168,7 +167,7 @@ public class DatingService extends Service {
@NotNull
private BrowseResult findViableDomain(DatingSessionObject session, BrowseResult res) {
while (!screenshotService.hasScreenshot(new EdgeId<>(res.domainId())) || session.isRecent(res)) {
while (!screenshotService.hasScreenshot(res.domainId()) || session.isRecent(res)) {
res = session.next(browseRandom, blacklist);
}
return res;

View File

@ -3,9 +3,7 @@ package nu.marginalia.dating;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.DbBrowseDomainsSimilarCosine;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.id.EdgeId;
import java.util.LinkedList;
@ -29,8 +27,8 @@ public class DatingSessionObject {
return queue.pollFirst();
}
public BrowseResult nextSimilar(EdgeId<EdgeDomain> id, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
adjacent.getDomainNeighborsAdjacentCosine(id, blacklist, 25).forEach(queue::addFirst);
public BrowseResult nextSimilar(int domainId, DbBrowseDomainsSimilarCosine adjacent, DomainBlacklist blacklist) {
adjacent.getDomainNeighborsAdjacentCosine(domainId, blacklist, 25).forEach(queue::addFirst);
while (queue.size() > MAX_QUEUE_SIZE) {
queue.removeLast();

View File

@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.service.module.DatabaseModule;
@ -40,8 +39,7 @@ public class WebsiteAdjacenciesCalculator {
System.out.println(Arrays.toString(domainName));
int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
.map(dataStoreDao::getDomainId)
.mapToInt(EdgeId::id)
.mapToInt(dataStoreDao::getDomainId)
.map(domainAliases::deAlias)
.toArray();
@ -49,7 +47,7 @@ public class WebsiteAdjacenciesCalculator {
findAdjacentDtoS(domainId, similarities -> {
for (var similarity : similarities.similarities()) {
if (adjacenciesData.isIndexedDomain(similarity.domainId)) System.out.print("*");
System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
System.out.println(dataStoreDao.getDomain(similarity.domainId).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
}
});
}