(domain-info) Reduce memory usage

This commit is contained in:
Viktor Lofgren 2024-02-27 20:25:52 +01:00
parent eaf836dc66
commit c943954bb4
2 changed files with 35 additions and 28 deletions

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.spark implementation libs.spark
implementation libs.opencsv implementation libs.opencsv
implementation libs.trove implementation libs.trove
implementation libs.roaringbitmap
implementation libs.fastutil implementation libs.fastutil
implementation libs.bundles.gson implementation libs.bundles.gson
implementation libs.bundles.mariadb implementation libs.bundles.mariadb

View File

@@ -8,10 +8,12 @@ import gnu.trove.map.hash.TIntDoubleHashMap;
import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.set.TIntSet; import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
import nu.marginalia.api.domains.*; import nu.marginalia.api.domains.*;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient; import nu.marginalia.api.indexdomainlinks.AggregateDomainLinksClient;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -32,12 +34,12 @@ public class SimilarDomainsService {
private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000); private volatile TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000);
private volatile int[] domainIdxToId; private volatile int[] domainIdxToId;
public volatile TIntDoubleHashMap[] relatedDomains; public volatile Int2DoubleArrayMap[] relatedDomains;
public volatile TIntList[] domainNeighbors = null; public volatile TIntList[] domainNeighbors = null;
public volatile BitSet screenshotDomains = null; public volatile RoaringBitmap screenshotDomains = null;
public volatile BitSet activeDomains = null; public volatile RoaringBitmap activeDomains = null;
public volatile BitSet indexedDomains = null; public volatile RoaringBitmap indexedDomains = null;
public volatile double[] domainRanks = null; public volatile TIntDoubleHashMap domainRanks = null;
public volatile String[] domainNames = null; public volatile String[] domainNames = null;
volatile boolean isReady = false; volatile boolean isReady = false;
@@ -69,13 +71,13 @@ public class SimilarDomainsService {
domainIdxToId[idx] = id; domainIdxToId[idx] = id;
return true; return true;
}); });
domainRanks = new double[domainIdToIdx.size()]; domainRanks = new TIntDoubleHashMap(100_000, 0.5f, -1, 0.);
domainNames = new String[domainIdToIdx.size()]; domainNames = new String[domainIdToIdx.size()];
domainNeighbors = new TIntList[domainIdToIdx.size()]; domainNeighbors = new TIntList[domainIdToIdx.size()];
screenshotDomains = new BitSet(domainIdToIdx.size()); screenshotDomains = new RoaringBitmap();
activeDomains = new BitSet(domainIdToIdx.size()); activeDomains = new RoaringBitmap();
indexedDomains = new BitSet(domainIdToIdx.size()); indexedDomains = new RoaringBitmap();
relatedDomains = new TIntDoubleHashMap[domainIdToIdx.size()]; relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()];
logger.info("Loaded {} domain IDs", domainIdToIdx.size()); logger.info("Loaded {} domain IDs", domainIdToIdx.size());
@@ -94,13 +96,17 @@ public class SimilarDomainsService {
int higherIndex = Math.max(didx, nidx); int higherIndex = Math.max(didx, nidx);
if (relatedDomains[lowerIndex] == null) if (relatedDomains[lowerIndex] == null)
relatedDomains[lowerIndex] = new TIntDoubleHashMap(32); relatedDomains[lowerIndex] = new Int2DoubleArrayMap(4);
relatedDomains[lowerIndex].put(higherIndex, Math.round(100 * rs.getDouble(3)));
double rank = Math.round(100 * rs.getDouble(3));
if (rank > 0.1) {
relatedDomains[lowerIndex].put(higherIndex, rank);
}
if (domainNeighbors[didx] == null) if (domainNeighbors[didx] == null)
domainNeighbors[didx] = new TIntArrayList(32); domainNeighbors[didx] = new TIntArrayList(4);
if (domainNeighbors[nidx] == null) if (domainNeighbors[nidx] == null)
domainNeighbors[nidx] = new TIntArrayList(32); domainNeighbors[nidx] = new TIntArrayList(4);
domainNeighbors[didx].add(nidx); domainNeighbors[didx].add(nidx);
domainNeighbors[nidx].add(didx); domainNeighbors[nidx].add(didx);
@@ -122,14 +128,14 @@ public class SimilarDomainsService {
final int id = rs.getInt("ID"); final int id = rs.getInt("ID");
final int idx = domainIdToIdx.get(id); final int idx = domainIdToIdx.get(id);
domainRanks[idx] = Math.round(100 * (1. - rs.getDouble("RANK"))); domainRanks.put(idx, Math.round(100 * (1. - rs.getDouble("RANK"))));
domainNames[idx] = rs.getString("DOMAIN_NAME"); domainNames[idx] = rs.getString("DOMAIN_NAME");
if (rs.getBoolean("INDEXED")) if (rs.getBoolean("INDEXED"))
indexedDomains.set(idx); indexedDomains.add(idx);
if (rs.getBoolean("ACTIVE")) if (rs.getBoolean("ACTIVE"))
activeDomains.set(idx); activeDomains.add(idx);
} }
@@ -142,10 +148,10 @@ public class SimilarDomainsService {
final int id = rs.getInt(1); final int id = rs.getInt(1);
final int idx = domainIdToIdx.get(id); final int idx = domainIdToIdx.get(id);
screenshotDomains.set(idx); screenshotDomains.add(idx);
} }
logger.info("Loaded {} domains", domainRanks.length); logger.info("Loaded {} domains", domainRanks.size());
isReady = true; isReady = true;
} }
} }
@@ -222,10 +228,10 @@ public class SimilarDomainsService {
.setDomainId(id) .setDomainId(id)
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) .setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString())
.setRelatedness(getRelatedness(domainId, id)) .setRelatedness(getRelatedness(domainId, id))
.setRank(domainRanks[idx]) .setRank(domainRanks.get(idx))
.setIndexed(indexedDomains.get(idx)) .setIndexed(indexedDomains.contains(idx))
.setActive(activeDomains.get(idx)) .setActive(activeDomains.contains(idx))
.setScreenshot(screenshotDomains.get(idx)) .setScreenshot(screenshotDomains.contains(idx))
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
.build()); .build());
@@ -291,7 +297,7 @@ public class SimilarDomainsService {
double[] ranksArray = new double[idsArray.length]; double[] ranksArray = new double[idsArray.length];
for (int i = 0; i < idxArray.length; i++) { for (int i = 0; i < idxArray.length; i++) {
ranksArray[i] = this.domainRanks[idxArray[i]]; ranksArray[i] = this.domainRanks.get(idxArray[i]);
} }
double[] relatednessArray = new double[idsArray.length]; double[] relatednessArray = new double[idsArray.length];
for (int i = 0; i < idsArray.length; i++) { for (int i = 0; i < idsArray.length; i++) {
@@ -337,10 +343,10 @@ public class SimilarDomainsService {
.setDomainId(id) .setDomainId(id)
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) .setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString())
.setRelatedness(getRelatedness(domainId, id)) .setRelatedness(getRelatedness(domainId, id))
.setRank(domainRanks[idx]) .setRank(ranksArray[id])
.setIndexed(indexedDomains.get(idx)) .setIndexed(indexedDomains.contains(idx))
.setActive(activeDomains.get(idx)) .setActive(activeDomains.contains(idx))
.setScreenshot(screenshotDomains.get(idx)) .setScreenshot(screenshotDomains.contains(idx))
.setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name()))
.build()); .build());