Merge pull request #99 from MarginaliaSearch/term-positions

Improve term positions accuracy
Viktor 2024-09-17 15:30:04 +02:00 committed by GitHub
commit 463b3ed0ce
GPG Key ID: B5690EEEBB952194
738 changed files with 13155 additions and 9735 deletions

View File

@ -6,7 +6,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.2' apply(false)
id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
}
group 'marginalia'
@ -44,10 +44,11 @@ subprojects.forEach {it ->
}
ext {
jvmVersion=21
dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b'
jvmVersion=22
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.3'
}
idea {

View File

@ -33,6 +33,7 @@ dependencies {
testImplementation project(':code:libraries:test-helpers')
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')

View File

@ -54,6 +54,7 @@ dependencies {
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')

View File

@ -41,6 +41,7 @@ dependencies {
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')

View File

@ -22,6 +22,12 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/** Reads the document database, which is a SQLite database
* containing the URLs and metadata of the documents in the
* index.
* <p></p>
* The database is created by the DocumentDbWriter class.
* */
@Singleton
public class DocumentDbReader {
private final Path dbFile;
@ -52,6 +58,11 @@ public class DocumentDbReader {
}
}
/** Switches the input database file to a new file.
* <p></p>
* This is used to switch over to a new database file
* when the index is re-indexed.
* */
public void switchInput(Path newDbFile) throws IOException, SQLException {
if (!Files.isRegularFile(newDbFile)) {
logger.error("Source is not a file, refusing switch-over {}", newDbFile);
@ -78,35 +89,11 @@ public class DocumentDbReader {
connection = createConnection();
}
public List<String> getUrlsFromDomain(int domainId) throws SQLException {
if (connection == null ||
connection.isClosed())
{
throw new RuntimeException("URL query temporarily unavailable due to database switch");
}
long minId = UrlIdCodec.encodeId(domainId, 0);
long maxId = UrlIdCodec.encodeId(domainId+1, 0);
List<String> ret = new ArrayList<>();
try (var stmt = connection.prepareStatement("""
SELECT URL
FROM DOCUMENT
WHERE ID >= ? AND ID < ?
"""))
{
stmt.setLong(1, minId);
stmt.setLong(2, maxId);
var rs = stmt.executeQuery();
while (rs.next()) {
ret.add(rs.getString(1));
}
}
return ret;
}
/** Returns the URL details for the given document ids.
* <p></p>
* This is used to get the URL details for the search
* results.
* */
public List<DocdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
List<DocdbUrlDetail> ret = new ArrayList<>(ids.size());

View File

@ -9,6 +9,10 @@ import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;
/** Writes the document database, which is a SQLite database
* containing the URLs and metadata of the documents in the
* index.
* */
public class DocumentDbWriter {
private final Connection connection;

View File

@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:coded-sequence')
implementation libs.bundles.slf4j

View File

@ -37,9 +37,24 @@ public class UrlIdCodec {
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
return ((long) domainId << 26) | documentOrdinal;
}
/** Encode a URL id with a ranking element */
public static long encodeId(int rank, int domainId, int documentOrdinal) {
assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank;
assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
rank &= 0x3F;
return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
}
/** Add a ranking element to an existing combined URL id.
*
* @param rank [0,1] the importance of the domain, low is good
@ -67,7 +82,7 @@ public class UrlIdCodec {
/** Extract the document ordinal component from this URL id */
public static int getRank(long combinedId) {
return (int) (combinedId >>> 57);
return (int) (combinedId >>> 57) & 0x3F;
}
/** Mask out the ranking element from this URL id */

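For orientation, a minimal sketch (not part of the patch; the class and values below are invented for illustration) of the combined-id bit layout that the new encodeId overload and the corrected getRank() imply:

```java
// Illustrative sketch only:
//   bit 63       unused (sign bit left clear)
//   bits 57..62  rank             (6 bits)
//   bits 26..56  domain id        (31 bits)
//   bits  0..25  document ordinal (26 bits)
public class UrlIdLayoutSketch {
    public static void main(String[] args) {
        long rank = 5, domainId = 123_456, ordinal = 789; // example values
        long combined = (rank << 57) | (domainId << 26) | ordinal;

        assert ((combined >>> 57) & 0x3F) == rank;            // what the fixed getRank() extracts
        assert ((combined >>> 26) & 0x7FFF_FFFF) == domainId;
        assert (combined & 0x03FF_FFFF) == ordinal;
    }
}
```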
View File

@ -0,0 +1,6 @@
package nu.marginalia.model.idx;
import nu.marginalia.sequence.VarintCodedSequence;
public record CodedWordSpan(byte code, VarintCodedSequence spans) {
}

View File

@ -4,16 +4,12 @@ package nu.marginalia.model.idx;
import java.util.EnumSet;
public enum WordFlags {
/** Word appears in title */
Title,
/** Word appears to be the subject in several sentences */
Subjects,
/** Word has high tf-idf */
TfIdfHigh,
/** Word is a likely named object. This is a weaker version of Subjects. */
NamesWords,
@ -42,19 +38,27 @@ public enum WordFlags {
ExternalLink
;
public int asBit() {
return 1 << ordinal();
public byte asBit() {
return (byte) (1 << ordinal());
}
public boolean isPresent(long value) {
public boolean isPresent(byte value) {
return (asBit() & value) > 0;
}
public boolean isAbsent(long value) {
public boolean isAbsent(byte value) {
return (asBit() & value) == 0;
}
public static EnumSet<WordFlags> decode(long encodedValue) {
public static byte encode(EnumSet<WordFlags> flags) {
byte ret = 0;
for (WordFlags f : flags) {
ret |= f.asBit();
}
return ret;
}
public static EnumSet<WordFlags> decode(byte encodedValue) {
EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
for (WordFlags f : values()) {

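A short usage sketch of the new byte-based flag packing (illustrative only, not code from the patch; the local names are made up):

```java
import java.util.EnumSet;
import nu.marginalia.model.idx.WordFlags;

// Sketch: packing and unpacking flags with the byte-based API.
class WordFlagsSketch {
    static void roundTrip() {
        EnumSet<WordFlags> flags = EnumSet.of(WordFlags.Title, WordFlags.Subjects);

        byte packed = WordFlags.encode(flags);        // one bit per enum ordinal
        assert WordFlags.Title.isPresent(packed);
        assert WordFlags.decode(packed).equals(flags);
    }
}
```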
View File

@ -1,89 +0,0 @@
package nu.marginalia.model.idx;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import java.util.EnumSet;
import java.util.Set;
/** Word level metadata designed to fit in a single 64 bit long.
*
* @param positions bitmask of term positions within the document
* @param flags word flags (see {@link WordFlags})
*/
public record WordMetadata(long positions,
int flags) {
public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
public static final int POSITIONS_SHIFT = WordFlags.values().length;
public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;
public WordMetadata() {
this(emptyValue());
}
public WordMetadata(long value) {
this(
((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
(int)(value & FLAGS_MASK)
);
}
public WordMetadata(long positions,
Set<WordFlags> flags)
{
this(positions, encodeFlags(flags));
}
private static int encodeFlags(Set<WordFlags> flags) {
int ret = 0;
for (var flag : flags) { ret |= flag.asBit(); }
return ret;
}
public static boolean hasFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) == metadataBitMask;
}
public static boolean hasAnyFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) != 0;
}
public static long decodePositions(long meta) {
return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK;
}
public boolean hasFlag(WordFlags flag) {
return (flags & flag.asBit()) != 0;
}
public String toString() {
return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet());
}
/* Encoded in a 64 bit long
*/
public long encode() {
long ret = 0;
ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;
return ret;
}
public boolean isEmpty() {
return positions == 0 && flags == 0;
}
public static long emptyValue() {
return 0L;
}
public EnumSet<WordFlags> flagSet() {
return WordFlags.decode(flags);
}
}

View File

@ -10,7 +10,6 @@ import java.util.StringJoiner;
public class QueryParams {
@Nullable
public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
if (queryParams == null) {

View File

@ -1,70 +0,0 @@
package nu.marginalia.util;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Objects;
public class StringPool {
private final HashMap<String, String> words;
private final Object2LongOpenHashMap<String> ages;
private final int maxCap;
long idx;
private StringPool(int capacity, int maxCap) {
this.ages = new Object2LongOpenHashMap<>(capacity);
this.words = new HashMap<>(capacity);
this.maxCap = maxCap;
}
public static StringPool create(int capacity) {
return new StringPool(capacity, capacity * 10);
}
public String internalize(String str) {
prune();
final String ret = words.putIfAbsent(str, str);
ages.put(ret, idx++);
return Objects.requireNonNullElse(ret, str);
}
public String[] internalize(String[] str) {
for (int i = 0; i < str.length; i++) {
str[i] = internalize(str[i]);
}
return str;
}
public void prune() {
if (words.size() < maxCap)
return;
long[] ageValues = ages.values().toLongArray();
Arrays.sort(ageValues);
long cutoff = ageValues[ageValues.length - maxCap / 10];
words.clear();
ages.forEach((word, cnt) -> {
if (cnt >= cutoff) {
words.put(word, word);
}
});
ages.clear();
words.forEach((w,w2) -> {
ages.put(w, idx);
});
}
public void flush() {
words.clear();
}
}

View File

@ -8,5 +8,4 @@ This package contains common models to the search engine
* [EdgeUrl](java/nu/marginalia/model/EdgeUrl.java)
* [DocumentMetadata](java/nu/marginalia/model/idx/DocumentMetadata.java)
* [DocumentFlags](java/nu/marginalia/model/idx/DocumentFlags.java)
* [WordMetadata](java/nu/marginalia/model/idx/WordMetadata.java)
* [WordFlags](java/nu/marginalia/model/idx/WordFlags.java)

View File

@ -1,41 +0,0 @@
package nu.marginalia.model;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;
import java.util.EnumSet;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WordMetadataTest {
@Test
public void codecTest() {
verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
System.out.println(new WordMetadata(131973L));
}
public void verifyCodec(String message, WordMetadata data) {
System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64));
assertEquals(data, new WordMetadata(data.encode()), message);
}
}

View File

@ -1,13 +1,18 @@
package nu.marginalia.process.control;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** Dummy implementation of ProcessHeartbeat that does nothing */
public class FakeProcessHeartbeat implements ProcessHeartbeat {
private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
@Override
public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
return new ProcessTaskHeartbeat<>() {
@Override
public void progress(T step) {}
public void progress(T step) {
logger.info("Progress: {}", step);
}
@Override
public void shutDown() {}
@ -21,7 +26,9 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
return new ProcessAdHocTaskHeartbeat() {
@Override
public void progress(String step, int progress, int total) {}
public void progress(String step, int progress, int total) {
logger.info("Progress: {}, {}/{}", step, progress, total);
}
@Override
public void close() {}

View File

@ -46,6 +46,7 @@ dependencies {
implementation libs.bundles.mariadb
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')

View File

@ -64,6 +64,11 @@ public class GrpcMultiNodeChannelPool<STUB> {
return nodeConfigurationWatcher.getQueryNodes();
}
/** Return the number of nodes that are eligible for broadcast-style requests */
public int getNumNodes() {
return nodeConfigurationWatcher.getQueryNodes().size();
}
/** Create a new call builder for the given method. This is a fluent-style
* method, where you can chain calls to specify how to run the method.
* <p></p>

View File

@ -2,22 +2,7 @@
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
<Filters>
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
ignoreExceptions="false">
<PatternLayout>
<Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>
</PatternLayout>
<SizeBasedTriggeringPolicy size="10MB" />
<Filters>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</RollingFile>
</Appenders>
<Loggers>
<Logger name="org.apache.zookeeper" level="WARN" />

View File

@ -38,15 +38,13 @@ dependencies {
implementation project(':code:functions:search-query')
implementation project(':code:execution:api')
implementation project(':code:process-models:crawl-spec')
implementation project(':code:process-models:crawling-model')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:data-extractors')
implementation project(':code:features-convert:stackexchange-xml')
implementation project(':code:features-convert:reddit-json')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:execution:data-extractors')
implementation project(':code:index:index-journal')
implementation project(':code:index:api')
implementation project(':code:process-mqapi')
implementation project(':code:processes:process-mq-api')
implementation project(':third-party:encyclopedia-marginalia-nu')
implementation libs.bundles.slf4j
@ -84,6 +82,7 @@ dependencies {
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')

View File

@ -22,9 +22,9 @@ dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:processes:converting-process:ft-anchor-keywords')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:converting-process')
implementation project(':third-party:commons-codec')

View File

@ -3,13 +3,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@ -2,13 +2,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
@ -27,7 +27,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@ -97,8 +97,13 @@ public class TermFrequencyExporter implements ExporterIf {
}
private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
TLongHashSet words = new TLongHashSet(10_000);
private void processFile(Path crawlDataPath,
TLongIntHashMap counts,
AtomicInteger docCount,
SentenceExtractor se)
{
TLongHashSet words = new TLongHashSet(1000);
try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
while (stream.hasNext()) {
if (Thread.interrupted())
@ -120,19 +125,33 @@ public class TermFrequencyExporter implements ExporterIf {
return;
}
for (var sent : dld.sentences) {
for (var sent : dld) {
// Skip sentences with non-language tags, e.g. program code
if (sent.htmlTags.stream().anyMatch(t -> t.nonLanguage))
continue;
for (var word : sent) {
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
for (var ngram : sent.ngramStemmed) {
words.add(longHash(ngram.getBytes()));
}
}
var random = ThreadLocalRandom.current();
synchronized (counts) {
words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1);
// Mathematicians hate him for this one weird trick:
//
// We generally aren't interested in low-frequency entries,
// but due to Zipf's law, there are a lot of them, in fact
// almost the entire term frequency dictionary is full of them.
//
// So we use a simple statistical trick to reduce the number
// of nearly unique entries in the dictionary, while still keeping the
// distribution of higher-frequency entries relatively intact
if (random.nextDouble() < 0.2) {
counts.adjustOrPutValue(w, 5, 5);
}
return true;
});
}

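The comment above can be made concrete with a small sketch (illustrative, names invented): counting each observation with probability p = 0.2 and weight 1/p = 5 leaves the expected count unchanged, while terms seen only once or twice usually never get recorded at all.

```java
import java.util.concurrent.ThreadLocalRandom;

// Sketch of the down-sampling trick: unbiased in expectation, but the
// near-unique tail of the term distribution mostly never makes it in.
class SamplingSketch {
    static final double P = 0.2;
    static final int WEIGHT = 5; // 1 / P

    /** Simulate how many counts a term with the given true frequency would accumulate. */
    static int sampledCount(int trueOccurrences) {
        int counted = 0;
        var rnd = ThreadLocalRandom.current();
        for (int i = 0; i < trueOccurrences; i++) {
            if (rnd.nextDouble() < P) {
                counted += WEIGHT;
            }
        }
        return counted; // E[counted] == trueOccurrences
    }
}
```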
View File

@ -6,19 +6,11 @@ import com.google.inject.Singleton;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.With;
import nu.marginalia.IndexLocations;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.svc.BackupService;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.index.api.IndexMqEndpoints;
import nu.marginalia.mq.MqMessageState;
@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.mqapi.loading.LoadRequest;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.svc.BackupService;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.sql.SQLException;
import java.util.List;
@ -110,9 +113,30 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
if (rsp.state() != MqMessageState.OK)
yield new Error("Converter failed");
if (!shouldAutoClean()) {
// If we're not auto-cleaning, we need to clear the NEW flag for the processed storage
storageService.setFileStorageState(processedId, FileStorageState.UNSET);
// (if we do auto-clean, we skip this step and purge the items after loading)
}
yield new Load(List.of(processedId));
}
case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> {
// clear the loader's output directory of any debris left by partial jobs that have been aborted
Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> {
try {
if (Files.isDirectory(path)) {
FileUtils.deleteDirectory(path.toFile());
}
else if (Files.isRegularFile(path)) {
Files.delete(path);
}
} catch (Exception e) {
logger.error("Error clearing staging area", e);
}
});
long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds));
yield new Load(processedIds, id);
@ -122,9 +146,20 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
if (rsp.state() != MqMessageState.OK) {
yield new Error("Loader failed");
} else {
cleanProcessedStorage(processedIds);
}
// If we're auto-cleaning, flag the processed files for deletion if they have the NEW flag,
// indicating they've recently been created. We need to check this, so we don't delete archived
// stuff that's being loaded manually
if (shouldAutoClean()) {
for (var id : processedIds) {
if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) {
storageService.flagFileForDeletion(id);
}
}
}
yield new Backup(processedIds);
}
case Backup(List<FileStorageId> processedIds) -> {
@ -146,7 +181,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
if (rsp.state() != MqMessageState.OK)
yield new Error("Repartition failed");
yield new Error("Forward index construction failed");
else
yield new ReindexFull();
}
@ -155,7 +190,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
if (rsp.state() != MqMessageState.OK)
yield new Error("Repartition failed");
yield new Error("Full index construction failed");
else
yield new ReindexPrio();
}
@ -164,7 +199,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
if (rsp.state() != MqMessageState.OK)
yield new Error("Repartition failed");
yield new Error("Prio index construction failed");
else
yield new SwitchIndex();
}
@ -186,6 +221,16 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
return mqIndexConstructorOutbox.sendAsync(new CreateIndexRequest(index));
}
private boolean shouldAutoClean() {
try {
return nodeConfigurationService.get(nodeId).autoClean();
}
catch (SQLException ex) {
logger.error("Error getting node configuration", ex);
return false; // safe default
}
}
@Override
public String describe() {
@ -215,24 +260,5 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
this.nodeId = serviceConfiguration.node();
}
private void cleanProcessedStorage(List<FileStorageId> processedStorageId) {
try {
var config = nodeConfigurationService.get(nodeId);
for (var id : processedStorageId) {
if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) {
if (config.autoClean()) {
storageService.flagFileForDeletion(id);
}
else {
storageService.setFileStorageState(id, FileStorageState.UNSET);
}
}
}
}
catch (SQLException ex) {
logger.error("Error in clean-up", ex);
}
}
}

View File

@ -19,6 +19,8 @@ import org.slf4j.MarkerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -32,6 +34,7 @@ public class ProcessService {
private final ServiceEventLog eventLog;
private final ConcurrentHashMap<ProcessId, Process> processes = new ConcurrentHashMap<>();
private final int node;
public static ProcessService.ProcessId translateExternalIdBase(String id) {
@ -78,6 +81,7 @@ public class ProcessService {
@Inject
public ProcessService(BaseServiceParams params) {
this.eventLog = params.eventLog;
this.node = params.configuration.node();
}
@ -86,7 +90,7 @@ public class ProcessService {
List<String> args = new ArrayList<>();
String javaHome = System.getProperty("java.home");
args.add(STR."\{javaHome}/bin/java");
args.add(javaHome + "/bin/java");
args.add("-cp");
args.add(System.getProperty("java.class.path"));
@ -94,6 +98,7 @@ public class ProcessService {
else args.add("-da");
args.add("--enable-preview");
args.add("--enable-native-access=ALL-UNNAMED");
String loggingOpts = System.getProperty("log4j2.configurationFile");
if (loggingOpts != null) {
@ -104,6 +109,17 @@ public class ProcessService {
args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
}
if (Boolean.getBoolean("system.profile")) {
// add jfr options
args.add("-XX:+FlightRecorder");
String jfrFileName = "/var/log/wmsa/profile-%s-%d-%s.jfr".formatted(
processId.toString(),
node,
LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME).replace(':', '.')
);
args.add("-XX:StartFlightRecording=filename=%s,name=%s".formatted(jfrFileName, processId.toString()));
}
args.addAll(processId.envOpts());
args.add(processId.mainClass);
args.addAll(Arrays.asList(extraArgs));

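For reference, a sketch of the flight-recorder arguments the profiling block generates (illustrative only; the process id and timestamp below are hypothetical):

```java
// Illustrative only: what the generated arguments look like for a hypothetical
// index-constructor process on node 0 (process id and timestamp are made up).
class JfrFlagSketch {
    public static void main(String[] args) {
        String jfrFileName = "/var/log/wmsa/profile-INDEX_CONSTRUCTOR-0-2024-09-17T15.30.04.jfr";
        System.out.println("-XX:+FlightRecorder");
        System.out.println("-XX:StartFlightRecording=filename=%s,name=%s"
                .formatted(jfrFileName, "INDEX_CONSTRUCTOR"));
    }
}
```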
View File

@ -2,22 +2,25 @@ package nu.marginalia.svc;
import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.linkdb.LinkdbFileNames;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import com.google.inject.Inject;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Optional;
public class BackupService {
@ -97,35 +100,20 @@ public class BackupService {
private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
{
for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) {
var dest = backupStorage.resolve(source.toFile().getName());
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
if (journal.isEmpty()) {
throw new FileNotFoundException("No journal found in input storage");
}
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
}
private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {
// Remove any old journal files first to avoid them getting loaded
for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) {
Files.delete(garbage);
Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
if (journal.isEmpty()) {
throw new FileNotFoundException("No journal found in backup");
}
for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) {
var dest = destStorage.resolve(source.toFile().getName());
try (var is = Files.newInputStream(source);
var os = Files.newOutputStream(dest)
) {
IOUtils.copyLarge(is, os);
}
}
FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
}
private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException

View File

@ -1,33 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -1,8 +0,0 @@
# Adblock
Contains an adblock simulator that reads an adblock specifications file and
uses it to identify if a document has ads.
## Central Classes
* [AdblockSimulator](java/nu/marginalia/adblock/AdblockSimulator.java)

View File

@ -1,189 +0,0 @@
package nu.marginalia.keyword;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;
import com.google.inject.Inject;
import java.util.*;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
private final NgramLexicon ngramLexicon;
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
this.dict = dict;
this.ngramLexicon = ngramLexicon;
this.keywordExtractor = new KeywordExtractor();
}
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {
var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
var titleKeywords = new TitleKeywords(keywordExtractor, dld);
var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
var artifactKeywords = new ArtifactKeywords(dld);
var urlKeywords = new UrlKeywords(url);
var keywordMetadata = KeywordMetadata.builder()
.bitmask(bitmask)
.tfIdfCounts(tfIdfCounts)
.titleKeywords(titleKeywords)
.nameLikeKeywords(nameLikeKeywords)
.subjectLikeKeywords(subjectLikeKeywords)
.urlKeywords(urlKeywords)
.build();
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
createSimpleWords(wordsBuilder, keywordMetadata, dld);
createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
wordsBuilder.addImportantWords(importantWords);
wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
return wordsBuilder;
}
private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords)
.flatMap(k -> k.getReps().stream())
.filter(w -> {
if (w.word.length() < 3)
return false;
if (w.word.contains("_"))
return false;
return true;
})
.sorted(tfIdfCounts.reversed())
.limit(16)
.filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100)
.sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w)))
.limit(6)
.map(w -> w.word)
.toList();
}
private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
WordReps words) {
for (var rep : words.getReps()) {
var word = rep.word;
if (!word.isBlank()) {
long meta = metadata.getMetadataForWord(rep.stemmed);
assert meta != 0L : "Missing meta for " + rep.word;
wordsBuilder.add(word, meta);
}
}
}
private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData documentLanguageData)
{
for (var sent : documentLanguageData.sentences) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (word.isStopWord()) {
continue;
}
String w = word.wordLowerCase();
if (matchesWordPattern(w)) {
long meta = metadata.getMetadataForWord(word.stemmed());
assert meta != 0L : "Missing meta for " + word.word();
wordsBuilder.add(w, meta);
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
long meta = metadata.getMetadataForWord(rep.stemmed);
assert meta != 0L : "Missing meta for " + rep.word;
wordsBuilder.add(rep.word, meta);
}
for (int i = 0; i < sent.ngrams.length; i++) {
var ngram = sent.ngrams[i];
var ngramStemmed = sent.ngramStemmed[i];
long meta = metadata.getMetadataForWord(ngramStemmed);
assert meta != 0L : "Missing meta for " + ngram;
wordsBuilder.add(ngram, meta);
}
}
}
boolean matchesWordPattern(String s) {
// this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}
String wordPartSeparator = ".-_/:+*";
int i = 0;
for (int run = 0; run < 15 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
if (i == 0)
return false;
for (int j = 0; j < 5; j++) {
if (i == s.length()) return true;
if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
return false;
}
i++;
for (int run = 0; run < 10 && i < s.length(); run++, i++) {
char c = s.charAt(i);
if (c >= 'a' && c <= 'z') continue;
if (c >= 'A' && c <= 'Z') continue;
if (c >= '0' && c <= '9') continue;
break;
}
}
return false;
}
}

View File

@ -1,64 +0,0 @@
package nu.marginalia.keyword;
import lombok.Builder;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;
import java.util.EnumSet;
class KeywordMetadata {
private final KeywordPositionBitmask bitmask;
private final TitleKeywords titleKeywords;
private final NameLikeKeywords nameLikeKeywords;
private final SubjectLikeKeywords subjectLikeKeywords;
private final UrlKeywords urlKeywords;
private final WordsTfIdfCounts tfIdfCounts;
@Builder
public KeywordMetadata(
KeywordPositionBitmask bitmask,
TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords,
WordsTfIdfCounts tfIdfCounts) {
this.bitmask = bitmask;
this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords;
this.urlKeywords = urlKeywords;
this.tfIdfCounts = tfIdfCounts;
}
public long getMetadataForWord(String stemmed) {
int tfidf = tfIdfCounts.getTfIdf(stemmed);
EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);
if (tfidf > 100)
flags.add(WordFlags.TfIdfHigh);
if (subjectLikeKeywords.contains(stemmed))
flags.add(WordFlags.Subjects);
if (nameLikeKeywords.contains(stemmed))
flags.add(WordFlags.NamesWords);
if (titleKeywords.contains(stemmed))
flags.add(WordFlags.Title);
if (urlKeywords.containsUrl(stemmed))
flags.add(WordFlags.UrlPath);
if (urlKeywords.containsDomain(stemmed))
flags.add(WordFlags.UrlDomain);
long positions = bitmask.get(stemmed);
return new WordMetadata(positions, flags).encode();
}
}

View File

@ -1,105 +0,0 @@
package nu.marginalia.keyword.extractors;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.WordMetadata;
/** Generates a position bitmask for each word in a document */
public class KeywordPositionBitmask {
private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
private static final int unmodulatedPortion = 16;
@Inject
public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
DocumentLanguageData dld)
{
// Mark the title words as position 0
for (var sent : dld.titleSentences) {
int posBit = 1;
for (var word : sent) {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var ngram : sent.ngramStemmed) {
positionMask.merge(ngram, posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
// Mark subsequent sentences in subsequent positions, with increasing sentence step size
LinePosition linePos = new LinePosition();
for (var sent : dld.sentences) {
long posBit = (1L << linePos.pos()) & positionBitmask;
for (var word : sent) {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var ngram : sent.ngramStemmed) {
positionMask.merge(ngram, posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
linePos.next(sent.length());
}
}
public long get(String stemmed) {
return positionMask.getOrDefault(stemmed, 0);
}
private long bitwiseOr(long a, long b) {
return a | b;
}
private static class LinePosition {
private int lineLengthCtr = 0;
private int bitMaskPos = 1;
public int pos() {
if (bitMaskPos < unmodulatedPortion) {
return bitMaskPos;
}
else {
return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion));
}
}
public void next(int sentenceLength)
{
if (sentenceLength > 10) {
lineLengthCtr = 0;
++bitMaskPos;
}
lineLengthCtr += sentenceLength;
if (lineLengthCtr > 15) {
lineLengthCtr = 0;
++bitMaskPos;
}
}
}
}

View File

@ -1,68 +0,0 @@
package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.WordMetadata;
import java.io.Serial;
import java.io.Serializable;
public final class DocumentKeywords implements Serializable {
@Serial
private static final long serialVersionUID = 1387282293082091432L;
public final String[] keywords;
public final long[] metadata;
public DocumentKeywords(String[] keywords,
long[] metadata)
{
this.keywords = keywords;
this.metadata = metadata;
assert keywords.length == metadata.length;
if (DocumentKeywords.class.desiredAssertionStatus()) {
for (int i = 0; i < metadata.length; i++) {
if (metadata[i] == 0) {
System.err.println("Bad metadata for keyword " + keywords[i]);
}
}
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append('[');
var pointer = newPointer();
while (pointer.advancePointer()) {
sb.append("\n\t ");
long metadata = pointer.getMetadata();
String keyword = pointer.getKeyword();
sb.append(keyword);
if (metadata != 0) {
sb.append("/").append(new WordMetadata(metadata));
}
}
return sb.append("\n]").toString();
}
public boolean isEmpty() {
return keywords.length == 0;
}
public int size() {
return keywords.length;
}
/** Return a pointer for traversing this structure */
public DocumentKeywordsPointer newPointer() {
return new DocumentKeywordsPointer(this);
}
}

View File

@ -1,122 +0,0 @@
package nu.marginalia.keyword.model;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.*;
@Getter
public class DocumentKeywordsBuilder {
public final Object2LongLinkedOpenHashMap<String> words;
/** These ware keywords that had signals of high relevance */
public final Set<String> importantWords = new HashSet<>();
// |------64 letters is this long-------------------------------|
// granted, some of these words are word n-grams, but 64 ought to
// be plenty. The lexicon writer has another limit that's higher.
private final int MAX_WORD_LENGTH = 64;
public DocumentKeywordsBuilder() {
this(1600);
}
public DocumentKeywords build() {
final String[] wordArray = new String[words.size()];
final long[] meta = new long[words.size()];
var iter = words.object2LongEntrySet().fastIterator();
for (int i = 0; iter.hasNext(); i++) {
var entry = iter.next();
meta[i] = entry.getLongValue();
wordArray[i] = entry.getKey();
}
return new DocumentKeywords(wordArray, meta);
}
public DocumentKeywordsBuilder(int capacity) {
words = new Object2LongLinkedOpenHashMap<>(capacity);
}
public void add(String word, long meta) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.put(word, meta);
}
public void addImportantWords(Collection<String> words) {
importantWords.addAll(words);
}
public void addJustNoMeta(String word) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.putIfAbsent(word, 0);
}
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
flagWords.forEach(word ->
words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
);
}
public void addAllSyntheticTerms(Collection<String> newWords) {
long meta = WordFlags.Synthetic.asBit();
// Only add the synthetic flag if the words aren't already present
newWords.forEach(word -> words.putIfAbsent(word, meta));
}
public void addAnchorTerms(Map<String, Integer> keywords) {
long flagA = WordFlags.ExternalLink.asBit();
long flagB = flagA | WordFlags.Site.asBit();
long flagC = flagB | WordFlags.SiteAdjacent.asBit();
keywords.forEach((word, count) -> {
if (count > 5) {
words.mergeLong(word, flagC, (a, b) -> a|b);
} else if (count > 2) {
words.mergeLong(word, flagB, (a, b) -> a|b);
} else {
words.mergeLong(word, flagA, (a, b) -> a|b);
}
});
}
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();
for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
var entry = iter.next();
if ((flags & entry.getLongValue()) != 0) {
ret.add(entry.getKey());
}
}
return ret;
}
public int size() {
return words.size();
}
public WordMetadata getMetaForWord(String word) {
return new WordMetadata(words.getLong(word));
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[ ");
words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
return sb.append(']').toString();
}
}

View File

@ -1,41 +0,0 @@
package nu.marginalia.keyword.model;
/** Pointer into a {@see DocumentKeywords}. It starts out before the first position,
* forward with advancePointer().
* */
public class DocumentKeywordsPointer {
private int pos = -1;
private final DocumentKeywords keywords;
DocumentKeywordsPointer(DocumentKeywords keywords) {
this.keywords = keywords;
}
/** Number of positions remaining */
public int remaining() {
return keywords.size() - Math.max(0, pos);
}
/** Return the keyword associated with the current position */
public String getKeyword() {
return keywords.keywords[pos];
}
/** Return the metadata associated with the current position */
public long getMetadata() {
return keywords.metadata[pos];
}
/** Advance the current position,
* returns false if this was the
* last position */
public boolean advancePointer() {
return ++pos < keywords.size();
}
/** Returns true unless the pointer is beyond the last position in the keyword set */
public boolean hasMore() {
return pos + 1 < keywords.size();
}
}

View File

@ -1,149 +0,0 @@
package nu.marginalia.keyword;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
class DocumentKeywordExtractorTest {
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
new TermFrequencyDict(WmsaHome.getLanguageModels()),
new NgramLexicon(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Test
public void testWordPattern() {
Assertions.assertTrue(extractor.matchesWordPattern("test"));
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
}
@Test
public void testEmptyMetadata() throws URISyntaxException {
var dld = se.extractSentences("""
Some sample text, I'm not sure what even triggers this
""", "A title perhaps?");
var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
var keywords = keywordBuilder.build();
var pointer = keywords.newPointer();
while (pointer.advancePointer()) {
if (pointer.getMetadata() == 0L) {
System.out.println("Aha! " + pointer.getKeyword());
}
}
}
@Test
public void testKeyboards2() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
keywords.getWords().forEach((k, v) -> {
if (k.contains("_")) {
System.out.println(k + " " + new WordMetadata(v));
}
});
}
@Test
public void testKeyboards() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
System.out.println(keywords.getMetaForWord("mechanical"));
System.out.println(keywords.getMetaForWord("keyboard"));
System.out.println(keywords.getMetaForWord("keyboards"));
System.out.println(new WordMetadata(8894889328781L));
System.out.println(new WordMetadata(4294967297L));
System.out.println(new WordMetadata(566820053975498886L));
// -
System.out.println(new WordMetadata(1198298103937L));
System.out.println(new WordMetadata(1103808168065L));
}
@Test
public void testMadonna() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
var keywords = extractor.extractKeywords(
se.extractSentences(doc),
new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
);
var keywordsBuilt = keywords.build();
var ptr = keywordsBuilt.newPointer();
Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
while (ptr.advancePointer()) {
if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
Assertions.assertNull(
dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()))
);
}
}
Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
Assertions.assertNotEquals(
dirtyAndBlues.get("dirty"),
dirtyAndBlues.get("blues")
);
}
@Test
public void testSpam() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
"Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
new TermFrequencyDict(WmsaHome.getLanguageModels()),
new NgramLexicon(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
System.out.println(keywords.getMetaForWord("knitting"));
}
}

View File

@ -1,34 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:model')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.bundles.gson
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:common:config')
}

View File

@ -1,7 +0,0 @@
# Pubdate
Contains advanced haruspicy for figuring out when a document was published.
## Central Classes
* [PubDateSniffer](java/nu/marginalia/pubdate/PubDateSniffer.java)

View File

@ -1,13 +0,0 @@
# Converter Features
## Major features
* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document
* [summary-extraction](summary-extraction/) - Generate an excerpt/quote from a website to display on the search results page.
## Smaller features:
* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website

View File

@ -1,44 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:common:model')
implementation libs.notnull
implementation libs.jsoup
implementation libs.sqlite
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.gson
implementation libs.zstd
implementation libs.trove
implementation libs.commons.compress
implementation libs.xz
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}

View File

@ -1,43 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:common:model')
implementation libs.notnull
implementation libs.jsoup
implementation libs.sqlite
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.zstd
implementation libs.trove
implementation libs.commons.compress
implementation libs.xz
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}
test {
maxHeapSize = "8G"
useJUnitPlatform()
}

View File

@ -1,18 +0,0 @@
Stackexchange's data is a jumble of questions and answers,
where the answers refer to the questions with a parentId field.
e.g.
```xml
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="&lt;p&gt;When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?&lt;/p&gt;&#xA;" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="&lt;resolution&gt;&lt;speed&gt;&lt;quality&gt;" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="&lt;p&gt;I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be &lt;a href=&quot;http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/&quot;&gt;harmful&lt;/a&gt; for humans.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?&lt;/p&gt;&#xA;" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="&lt;print-material&gt;&lt;safety&gt;&lt;health&gt;" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="&lt;p&gt;There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
</posts>
```
Since the search engine wants to extract keywords for each thread
holistically, not by question or answer, it is necessary to re-arrange
the data (which is very large). SQLite does a decent job of enabling
this task.
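As a rough sketch of the idea (illustrative only; the table layout and column names are assumptions based on the XML attributes above, not the converter's actual schema), the re-arrangement amounts to loading the posts into SQLite and joining each answer onto its parent question before extracting keywords:
```java
// Illustrative sketch: group answers under their parent question with SQLite.
// The table and column names are assumptions, not the actual implementation.
// Requires a SQLite JDBC driver on the classpath.
import java.sql.Connection;
import java.sql.DriverManager;

public class StackexchangeGroupingSketch {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:posts.db");
             var stmt = conn.createStatement())
        {
            stmt.execute("""
                CREATE TABLE IF NOT EXISTS post (
                    id INTEGER PRIMARY KEY,
                    parentId INTEGER,   -- NULL for questions, set for answers
                    title TEXT,
                    body TEXT
                )""");

            // Re-assemble each thread by joining answers onto their parent question,
            // so keywords can be extracted per thread rather than per post.
            var rs = stmt.executeQuery("""
                SELECT q.id, q.title, group_concat(a.body, ' ')
                FROM post q
                LEFT JOIN post a ON a.parentId = q.id
                WHERE q.parentId IS NULL
                GROUP BY q.id""");

            while (rs.next()) {
                System.out.println(rs.getLong(1) + ": " + rs.getString(2));
            }
        }
    }
}
```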

View File

@ -1,42 +0,0 @@
plugins {
id 'java'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.jsoup
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.guava
implementation libs.bundles.gson
implementation libs.trove
implementation libs.fastutil
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:features-convert:keyword-extraction')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:common:config')
testImplementation project(':code:common:model')
}

View File

@ -1,25 +0,0 @@
# Summary Extraction
This feature attempts to find a descriptive passage of text that summarizes
what a search result "is about". It's the text you see below a search result.
It must solve two problems:
1. Identify which part of the document contains "the text".
The crux is that the document may be anywhere from 1993 to the present, with era-appropriate
formatting. It may be formatted with `<center>`ed `<font>`-tags, or semantic HTML5.
2. Identify which part of "the text" best describes the document.
It uses several naive heuristics to try to find something that makes sense,
and there is probably room for improvement.
There are many good techniques for doing this, but they've sadly not proved
particularly fast. Whatever solution is used needs to be able to summarize on the
order of 100,000,000 documents within a time budget of a couple of hours.
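For a sense of what such a heuristic can look like, here is a minimal sketch (illustrative only, not the actual SummaryExtractor logic) that prefers the first sufficiently long paragraph and falls back to the start of the body text:
```java
// Minimal illustration of a naive summary heuristic; the real SummaryExtractor
// uses different and more elaborate rules.
import org.jsoup.Jsoup;

public class SummaryHeuristicSketch {
    public static String summarize(String html, int maxLength) {
        var doc = Jsoup.parse(html);

        // Heuristic 1: the first reasonably long <p> tag is often "the text"
        for (var p : doc.select("p")) {
            String text = p.text();
            if (text.length() > 100) {
                return text.substring(0, Math.min(text.length(), maxLength));
            }
        }

        // Fallback: the start of the body text, which also covers era-appropriate
        // markup that doesn't use <p> tags at all
        String text = doc.body().text();
        return text.substring(0, Math.min(text.length(), maxLength));
    }
}
```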
## Central Classes
* [SummaryExtractor](java/nu/marginalia/summary/SummaryExtractor.java)

View File

@ -1,34 +0,0 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:config')
implementation project(':code:libraries:language-processing')
implementation project(':third-party:porterstemmer')
implementation libs.bundles.slf4j
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -1,4 +0,0 @@
# Topic Detection
This is an experiment in using hand-crafted naive Bayesian filters to detect the topic of a website.
It's noteworthy that it detects recipes very well.
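A toy sketch of the general shape of such a filter (the terms, weights, and threshold below are invented for illustration and are not the real model):
```java
// Toy naive Bayesian-style topic score: sum assumed log-likelihood ratios of
// indicator terms and compare against a threshold. All values are made up.
import java.util.Map;

public class RecipeDetectorSketch {
    private static final Map<String, Double> TERM_WEIGHTS = Map.of(
            "tablespoon", 2.3,
            "preheat", 2.0,
            "simmer", 1.8,
            "ingredients", 1.5,
            "oven", 0.9
    );

    public static boolean looksLikeRecipe(Iterable<String> documentTerms) {
        double score = 0;
        for (String term : documentTerms) {
            score += TERM_WEIGHTS.getOrDefault(term.toLowerCase(), 0.0);
        }
        return score > 5.0;  // arbitrary cutoff for the sketch
    }
}
```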

View File

@ -1,8 +0,0 @@
# Crawl Features
These are bits of search-engine-related code that are relatively isolated pieces of business logic
that benefit from the clarity of being kept separate from the rest of the crawling code.
* [content-type](content-type/) - Content Type identification
* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
* [link-parser](link-parser/) - Code for parsing and normalizing links

View File

@ -2,6 +2,11 @@ package nu.marginalia.api.math;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.math.MathProtobufCodec.DictionaryLookup;
import nu.marginalia.api.math.MathProtobufCodec.EvalMath;
import nu.marginalia.api.math.MathProtobufCodec.SpellCheck;
import nu.marginalia.api.math.MathProtobufCodec.UnitConversion;
import nu.marginalia.api.math.model.DictionaryResponse;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@ -9,14 +14,11 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import nu.marginalia.api.math.model.*;
import nu.marginalia.api.math.MathProtobufCodec.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@Singleton
@ -49,24 +51,14 @@ public class MathClient {
.thenApply(SpellCheck::convertResponse);
}
public Map<String, List<String>> spellCheck(List<String> words, Duration timeout) throws InterruptedException {
// This looks a bit different because we need to spell check multiple words, and we want to do it in parallel
public Future<Map<String, List<String>>> spellCheck(List<String> words) throws InterruptedException {
List<RpcSpellCheckRequest> requests = words.stream().map(SpellCheck::createRequest).toList();
var future = channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck)
return channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck)
.async(executor)
.runFor(requests);
try {
var results = future.get();
Map<String, List<String>> map = new HashMap<>();
for (int i = 0; i < words.size(); i++) {
map.put(words.get(i), SpellCheck.convertResponse(results.get(i)));
}
return map;
}
catch (ExecutionException e) {
throw new RuntimeException(e);
}
.runFor(requests)
.thenApply(rsp -> SpellCheck.convertResponses(words, rsp));
}
public Future<String> unitConversion(String value, String from, String to) {

View File

@ -3,7 +3,9 @@ package nu.marginalia.api.math;
import nu.marginalia.api.math.model.DictionaryEntry;
import nu.marginalia.api.math.model.DictionaryResponse;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class MathProtobufCodec {
@ -35,6 +37,15 @@ public class MathProtobufCodec {
public static List<String> convertResponse(RpcSpellCheckResponse rsp) {
return rsp.getSuggestionsList();
}
public static Map<String, List<String>> convertResponses(List<String> words, List<RpcSpellCheckResponse> responses) {
var map = new HashMap<String, List<String>>();
for (int i = 0; i < words.size(); i++) {
map.put(words.get(i), responses.get(i).getSuggestionsList());
}
return map;
}
}
public static class UnitConversion {

View File

@ -23,6 +23,7 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation libs.bundles.slf4j

View File

@ -1,11 +1,9 @@
package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
@ -48,11 +46,22 @@ public class IndexProtobufCodec {
}
public static SearchQuery convertRpcQuery(RpcQuery query) {
List<List<String>> coherences = new ArrayList<>();
List<SearchPhraseConstraint> phraseConstraints = new ArrayList<>();
for (int j = 0; j < query.getCoherencesCount(); j++) {
var coh = query.getCoherences(j);
coherences.add(new ArrayList<>(coh.getCoherencesList()));
for (int j = 0; j < query.getPhrasesCount(); j++) {
var coh = query.getPhrases(j);
if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
}
else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
}
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
}
else {
throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
}
}
return new SearchQuery(
@ -61,7 +70,7 @@ public class IndexProtobufCodec {
query.getExcludeList(),
query.getAdviceList(),
query.getPriorityList(),
coherences
phraseConstraints
);
}
@ -74,8 +83,21 @@ public class IndexProtobufCodec {
.addAllExclude(searchQuery.getSearchTermsExclude())
.addAllPriority(searchQuery.getSearchTermsPriority());
for (var coherences : searchQuery.searchTermCoherences) {
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
for (var constraint : searchQuery.phraseConstraints) {
switch (constraint) {
case SearchPhraseConstraint.Optional(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.OPTIONAL);
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.MANDATORY);
case SearchPhraseConstraint.Full(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.FULL);
}
}
return subqueryBuilder.build();
@ -86,19 +108,17 @@ public class IndexProtobufCodec {
return ResultRankingParameters.sensibleDefaults();
return new ResultRankingParameters(
new Bm25Parameters(params.getFullK(), params.getFullB()),
new Bm25Parameters(params.getPrioK(), params.getPrioB()),
new Bm25Parameters(params.getBm25K(), params.getBm25B()),
params.getShortDocumentThreshold(),
params.getShortDocumentPenalty(),
params.getDomainRankBonus(),
params.getQualityPenalty(),
params.getShortSentenceThreshold(),
params.getShortSentencePenalty(),
params.getBm25FullWeight(),
params.getBm25NgramWeight(),
params.getBm25PrioWeight(),
params.getTcfJaccardWeight(),
params.getTcfOverlapWeight(),
params.getBm25Weight(),
params.getTcfFirstPositionWeight(),
params.getTcfVerbatimWeight(),
params.getTcfProximityWeight(),
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
params.getTemporalBiasWeight(),
params.getExportDebugData()
@ -113,21 +133,18 @@ public class IndexProtobufCodec {
}
var builder = RpcResultRankingParameters.newBuilder()
.setFullB(rankingParams.fullParams.b())
.setFullK(rankingParams.fullParams.k())
.setPrioB(rankingParams.prioParams.b())
.setPrioK(rankingParams.prioParams.k())
.setBm25B(rankingParams.bm25Params.b())
.setBm25K(rankingParams.bm25Params.k())
.setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
.setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
.setDomainRankBonus(rankingParams.domainRankBonus)
.setQualityPenalty(rankingParams.qualityPenalty)
.setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
.setShortSentencePenalty(rankingParams.shortSentencePenalty)
.setBm25FullWeight(rankingParams.bm25FullWeight)
.setBm25NgramWeight(rankingParams.bm25NgramWeight)
.setBm25PrioWeight(rankingParams.bm25PrioWeight)
.setTcfOverlapWeight(rankingParams.tcfOverlapWeight)
.setTcfJaccardWeight(rankingParams.tcfJaccardWeight)
.setBm25Weight(rankingParams.bm25Weight)
.setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
.setTcfProximityWeight(rankingParams.tcfProximity)
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
.setExportDebugData(rankingParams.exportDebugData);
@ -142,45 +159,4 @@ public class IndexProtobufCodec {
return builder.build();
}
public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) {
if (rankingDetails == null) {
return null;
}
return RpcResultRankingDetails.newBuilder()
.setInputs(convertRankingInputs(rankingDetails.inputs()))
.setOutput(convertRankingOutput(rankingDetails.outputs()))
.build();
}
private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) {
return RpcResultRankingOutputs.newBuilder()
.setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty())
.setQualityPenalty(outputs.qualityPenalty())
.setRankingBonus(outputs.rankingBonus())
.setTopologyBonus(outputs.topologyBonus())
.setDocumentLengthPenalty(outputs.documentLengthPenalty())
.setTemporalBias(outputs.temporalBias())
.setFlagsPenalty(outputs.flagsPenalty())
.setOverallPart(outputs.overallPart())
.setTcfOverlap(outputs.tcfOverlap())
.setTcfJaccard(outputs.tcfJaccard())
.setBM25F(outputs.bM25F())
.setBM25N(outputs.bM25N())
.setBM25P(outputs.bM25P())
.build();
}
private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) {
return RpcResultRankingInputs.newBuilder()
.setRank(inputs.rank())
.setAsl(inputs.asl())
.setQuality(inputs.quality())
.setSize(inputs.size())
.setTopology(inputs.topology())
.setYear(inputs.year())
.addAllFlags(inputs.flags())
.build();
}
}

View File

@ -1,21 +1,25 @@
package nu.marginalia.api.searchquery;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class QueryProtobufCodec {
@ -130,6 +134,7 @@ public class QueryProtobufCodec {
results.getWordsTotal(),
results.getBestPositions(),
results.getRankingScore(),
results.getResultsFromDomain(),
convertRankingDetails(results.getRankingDetails())
);
}
@ -137,46 +142,109 @@ public class QueryProtobufCodec {
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
if (rankingDetails == null)
return null;
var inputs = rankingDetails.getInputs();
var outputs = rankingDetails.getOutput();
var docData = rankingDetails.getDocumentOutputs();
var termData = rankingDetails.getTermOutputs();
return new ResultRankingDetails(
convertRankingInputs(inputs),
convertRankingOutputs(outputs)
convertDocumentOutputs(docData),
convertTermData(termData)
);
}
private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) {
return new ResultRankingOutputs(
outputs.getAverageSentenceLengthPenalty(),
outputs.getQualityPenalty(),
outputs.getRankingBonus(),
outputs.getTopologyBonus(),
outputs.getDocumentLengthPenalty(),
outputs.getTemporalBias(),
outputs.getFlagsPenalty(),
outputs.getOverallPart(),
outputs.getTcfOverlap(),
outputs.getTcfJaccard(),
outputs.getBM25F(),
outputs.getBM25N(),
outputs.getBM25P()
);
private static List<DebugTermFactorGroup> convertTermData(RpcResultTermRankingOutputs termData) {
Map<String, Long> termIdByName = new HashMap<>();
Map<String, List<DebugFactor>> factorsByTerm = new HashMap<>();
for (int i = 0; i < termData.getTermCount(); i++) {
termIdByName.put(termData.getTerm(i), termData.getTermId(i));
factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>())
.add(new DebugFactor(termData.getFactor(i), termData.getValue(i)));
}
Map<String, List<DebugFactorGroup>> factorGroupsByTerm = new HashMap<>();
for (var entry : factorsByTerm.entrySet()) {
String term = entry.getKey();
var factorsList = entry.getValue();
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
for (var factor : factorsList) {
String[] parts = factor.factor().split("\\.");
String group, name;
if (parts.length != 2) {
group = "unknown";
name = parts[0];
} else {
group = parts[0];
name = parts[1];
}
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
.add(new DebugFactor(name, factor.value()));
}
factorsByGroup.forEach((groupName, groupData) -> {
factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>())
.add(new DebugFactorGroup(groupName, groupData));
});
}
List<DebugTermFactorGroup> groups = new ArrayList<>();
for (var entry : factorGroupsByTerm.entrySet()) {
groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue()));
}
return groups;
}
private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) {
return new ResultRankingInputs(
inputs.getRank(),
inputs.getAsl(),
inputs.getQuality(),
inputs.getSize(),
inputs.getTopology(),
inputs.getYear(),
inputs.getFlagsList()
);
private static List<DebugFactorGroup> convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) {
List<DebugFactor> unclusteredFactors = new ArrayList<>();
for (int i = 0; i < docData.getFactorCount(); i++) {
String factor = docData.getFactor(i);
String value = docData.getValue(i);
unclusteredFactors.add(new DebugFactor(factor, value));
}
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
for (var factor : unclusteredFactors) {
String factorName = factor.factor();
String value = factor.value();
String[] parts = factorName.split("\\.");
String group, name;
if (parts.length != 2) {
group = "unknown";
name = factorName;
}
else {
group = parts[0];
name = parts[1];
}
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
.add(new DebugFactor(name, value));
}
List<DebugFactorGroup> groups = new ArrayList<>();
for (var entry : factorsByGroup.entrySet()) {
groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue()));
}
return groups;
}
private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) {
var keywordScores = new ArrayList<SearchResultKeywordScore>(rawItem.getKeywordScoresCount());
@ -188,8 +256,9 @@ public class QueryProtobufCodec {
rawItem.getEncodedDocMetadata(),
rawItem.getHtmlFeatures(),
keywordScores,
rawItem.getResultsFromDomain(),
rawItem.getHasPriorityTerms(),
0, // Not set
null, // Not set
Double.NaN // Not set
);
}
@ -198,7 +267,8 @@ public class QueryProtobufCodec {
return new SearchResultKeywordScore(
keywordScores.getKeyword(),
-1, // termId is internal to index service
keywordScores.getEncodedWordMetadata()
(byte) keywordScores.getFlags(),
keywordScores.getPositions()
);
}
@ -257,6 +327,7 @@ public class QueryProtobufCodec {
rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getRankingScore(),
rpcDecoratedResultItem.getResultsFromDomain(),
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
);
}

View File

@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.function.*;
import java.util.function.Function;
import java.util.function.ToIntFunction;
import java.util.function.ToLongFunction;
import java.util.stream.IntStream;
import java.util.stream.Stream;
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
return new CompiledQueryLong(root, data.mapToLong(mapper));
}
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
return new CompiledQueryLong(root, data.mapToInt(mapper));
public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
return new CompiledQueryInt(root, data.mapToInt(mapper));
}
public CqExpression root() {

View File

@ -5,8 +5,8 @@ import java.util.stream.IntStream;
/** A compiled index service query */
public class CompiledQueryInt {
private final CqExpression root;
private final CqDataInt data;
public final CqExpression root;
public final CqDataInt data;
public CompiledQueryInt(CqExpression root, CqDataInt data) {
this.root = root;
@ -26,7 +26,7 @@ public class CompiledQueryInt {
return IntStream.range(0, data.size());
}
public long at(int index) {
public int at(int index) {
return data.get(index);
}

View File

@ -61,7 +61,8 @@ public class CompiledQueryParser {
String[] cqData = new String[wordIds.size()];
wordIds.forEach((w, i) -> cqData[i] = w);
return new CompiledQuery<>(root, new CqData<>(cqData));
return root.newQuery(cqData);
}

View File

@ -33,13 +33,13 @@ public class CqData<T> {
return new CqDataLong(newData);
}
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
long[] newData = new long[data.length];
public CqDataInt mapToInt(ToIntFunction<T> mapper) {
int[] newData = new int[data.length];
for (int i = 0; i < data.length; i++) {
newData[i] = mapper.applyAsInt((T) data[i]);
newData[i] = mapper.applyAsInt(data[i]);
}
return new CqDataLong(newData);
return new CqDataInt(newData);
}
public T get(int i) {

View File

@ -8,6 +8,18 @@ import java.util.stream.Stream;
*
*/
public sealed interface CqExpression {
/** Create a new query for the provided data using this expression as the root */
default <T> CompiledQuery<T> newQuery(T[] data) {
return new CompiledQuery<>(this, data);
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryInt newQuery(int[] data) {
return new CompiledQueryInt(this, new CqDataInt(data));
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryLong newQuery(long[] data) {
return new CompiledQueryLong(this, new CqDataLong(data));
}
Stream<Word> stream();

View File

@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import java.util.ArrayList;
@ -36,7 +37,10 @@ public class CompiledQueryAggregates {
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static <T> int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
@ -55,13 +59,4 @@ public class CompiledQueryAggregates {
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
}
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
@Override
public int onAnd(List<? extends CqExpression> parts) {
int value = parts.getFirst().visit(this);

View File

@ -1,85 +0,0 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
private final IntToLongFunction operator;
public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
this.operator = idx -> operator.applyAsLong(query.at(idx));
}
public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
this.operator = idx -> operator.applyAsLong(query.at(idx));
}
@Override
public LongSet onAnd(List<? extends CqExpression> parts) {
LongSet ret = new LongArraySet();
for (var part : parts) {
ret = combineSets(ret, part.visit(this));
}
return ret;
}
private LongSet combineSets(LongSet a, LongSet b) {
if (a.isEmpty())
return b;
if (b.isEmpty())
return a;
LongSet ret = newSet(a.size() * b.size());
var ai = a.longIterator();
while (ai.hasNext()) {
long aval = ai.nextLong();
var bi = b.longIterator();
while (bi.hasNext()) {
ret.add(aval & bi.nextLong());
}
}
return ret;
}
@Override
public LongSet onOr(List<? extends CqExpression> parts) {
LongSet ret = newSet(parts.size());
for (var part : parts) {
ret.addAll(part.visit(this));
}
return ret;
}
@Override
public LongSet onLeaf(int idx) {
var set = newSet(1);
set.add(operator.applyAsLong(idx));
return set;
}
/** Allocate a new set suitable for a collection with the provided cardinality */
private LongSet newSet(int cardinality) {
if (cardinality < 8)
return new LongArraySet(cardinality);
else
return new LongOpenHashSet(cardinality);
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import javax.annotation.Nullable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -10,7 +11,7 @@ public record QueryResponse(SearchSpecification specs,
List<DecoratedSearchResultItem> results,
List<String> searchTermsHuman,
List<String> problems,
String domain)
@Nullable String domain)
{
public Set<String> getAllKeywords() {
return new HashSet<>(specs.query.searchTermsInclude);

View File

@ -0,0 +1,85 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
public sealed interface SearchPhraseConstraint {
record Mandatory(List<String> terms) implements SearchPhraseConstraint {
public Mandatory(String... terms) {
this(List.of(terms));
}
}
record Optional(List<String> terms) implements SearchPhraseConstraint {
public Optional(String... terms) {
this(List.of(terms));
}
}
record Full(List<String> terms) implements SearchPhraseConstraint {
public Full(String... terms) {
this(List.of(terms));
}
}
List<String> terms();
default int size() {
return terms().size();
}
static SearchPhraseConstraint mandatory(String... terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint mandatory(List<String> terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint optional(String... terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint optional(List<String> terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint full(String... terms) {
return new Full(trimStopWords(terms));
}
static SearchPhraseConstraint full(List<String> terms) {
return new Full(trimStopWords(terms));
}
private static List<String> trimStopWords(List<String> terms) {
List<String> ret = new ArrayList<>(terms.size());
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
return List.copyOf(ret);
}
private static List<String> trimStopWords(String... terms) {
List<String> ret = new ArrayList<>(terms.length);
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
ret.removeFirst();
}
while (!ret.isEmpty() && "".equals(ret.getLast())) {
ret.removeLast();
}
return List.copyOf(ret);
}
}

View File

@ -31,18 +31,22 @@ public class SearchQuery {
public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */
public final List<List<String>> searchTermCoherences;
public final List<SearchPhraseConstraint> phraseConstraints;
@Deprecated // why does this exist?
private double value = 0;
public static SearchQueryBuilder builder() {
return new SearchQueryBuilder();
}
public SearchQuery() {
this.compiledQuery = "";
this.searchTermsInclude = new ArrayList<>();
this.searchTermsExclude = new ArrayList<>();
this.searchTermsAdvice = new ArrayList<>();
this.searchTermsPriority = new ArrayList<>();
this.searchTermCoherences = new ArrayList<>();
this.phraseConstraints = new ArrayList<>();
}
public SearchQuery(String compiledQuery,
@ -50,13 +54,13 @@ public class SearchQuery {
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority,
List<List<String>> searchTermCoherences) {
List<SearchPhraseConstraint> phraseConstraints) {
this.compiledQuery = compiledQuery;
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice;
this.searchTermsPriority = searchTermsPriority;
this.searchTermCoherences = searchTermCoherences;
this.phraseConstraints = phraseConstraints;
}
@Deprecated // why does this exist?
@ -76,10 +80,62 @@ public class SearchQuery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString();
}
public static class SearchQueryBuilder {
private String compiledQuery;
public final List<String> searchTermsInclude = new ArrayList<>();
public final List<String> searchTermsExclude = new ArrayList<>();
public final List<String> searchTermsAdvice = new ArrayList<>();
public final List<String> searchTermsPriority = new ArrayList<>();
public final List<SearchPhraseConstraint> searchPhraseConstraints = new ArrayList<>();
private SearchQueryBuilder() {
}
public SearchQueryBuilder compiledQuery(String query) {
this.compiledQuery = query;
return this;
}
public SearchQueryBuilder include(String... terms) {
searchTermsInclude.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder exclude(String... terms) {
searchTermsExclude.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder advice(String... terms) {
searchTermsAdvice.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder priority(String... terms) {
searchTermsPriority.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) {
searchPhraseConstraints.add(constraint);
return this;
}
public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
}
/** If there are no ranking terms, promote the advice terms to ranking terms */
public void promoteNonRankingTerms() {
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
}
}
}

View File

@ -19,10 +19,14 @@ public class SearchSpecification {
public final String humanQuery;
public final SpecificationLimit quality;
public final SpecificationLimit year;
public final SpecificationLimit size;
public final SpecificationLimit rank;
@Builder.Default
public final SpecificationLimit quality = SpecificationLimit.none();
@Builder.Default
public final SpecificationLimit year = SpecificationLimit.none();
@Builder.Default
public final SpecificationLimit size = SpecificationLimit.none();
@Builder.Default
public final SpecificationLimit rank = SpecificationLimit.none();
public final QueryLimits queryLimits;

View File

@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public final long bestPositions;
public final double rankingScore;
public final int resultsFromDomain;
@Nullable
public ResultRankingDetails rankingDetails;
@ -43,9 +45,6 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public int domainId() {
return rawIndexResult.getDomainId();
}
public int resultsFromDomain() {
return rawIndexResult.getResultsFromDomain();
}
public List<SearchResultKeywordScore> keywordScores() {
return rawIndexResult.getKeywordScores();
@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
int wordsTotal,
long bestPositions,
double rankingScore,
int resultsFromDomain,
@Nullable
ResultRankingDetails rankingDetails
)
@ -88,6 +88,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
this.wordsTotal = wordsTotal;
this.bestPositions = bestPositions;
this.rankingScore = rankingScore;
this.resultsFromDomain = resultsFromDomain;
this.rankingDetails = rankingDetails;
}

View File

@ -10,9 +10,7 @@ import lombok.*;
public class ResultRankingParameters {
/** Tuning for BM25 when applied to full document matches */
public final Bm25Parameters fullParams;
/** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */
public final Bm25Parameters prioParams;
public final Bm25Parameters bm25Params;
/** Documents below this length are penalized */
public int shortDocumentThreshold;
@ -32,11 +30,10 @@ public class ResultRankingParameters {
/** Magnitude of penalty for documents with low average sentence length */
public double shortSentencePenalty;
public double bm25FullWeight;
public double bm25NgramWeight;
public double bm25PrioWeight;
public double tcfJaccardWeight;
public double tcfOverlapWeight;
public double bm25Weight;
public double tcfFirstPosition;
public double tcfVerbatim;
public double tcfProximity;
public TemporalBias temporalBias;
public double temporalBiasWeight;
@ -45,19 +42,17 @@ public class ResultRankingParameters {
public static ResultRankingParameters sensibleDefaults() {
return builder()
.fullParams(new Bm25Parameters(1.2, 0.5))
.prioParams(new Bm25Parameters(1.5, 0))
.bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000)
.shortDocumentPenalty(2.)
.domainRankBonus(1/25.)
.qualityPenalty(1/15.)
.shortSentenceThreshold(2)
.shortSentencePenalty(5)
.bm25FullWeight(1.)
.bm25NgramWeight(.25)
.bm25PrioWeight(1.)
.tcfOverlapWeight(3.)
.tcfJaccardWeight(1)
.bm25Weight(1.)
.tcfVerbatim(2.)
.tcfProximity(2.)
.tcfFirstPosition(25)
.temporalBias(TemporalBias.NONE)
.temporalBiasWeight(1. / (5.))
.exportDebugData(false)

View File

@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;
@ -25,20 +26,23 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/** How did the subqueries match against the document ? */
public final List<SearchResultKeywordScore> keywordScores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public boolean hasPrioTerm;
public long bestPositions;
public DebugRankingFactors debugRankingFactors;
public SearchResultItem(long combinedId,
long encodedDocMetadata,
int htmlFeatures,
boolean hasPrioTerm) {
double score,
long bestPositions) {
this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata;
this.bestPositions = bestPositions;
this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures;
this.hasPrioTerm = hasPrioTerm;
this.scoreValue = score;
}
@ -84,7 +88,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
@Override
public int compareTo(@NotNull SearchResultItem o) {
// this looks like a bug, but we actually want this in a reversed order
int diff = Double.compare(getScore(), o.getScore());
if (diff != 0)
return diff;

View File

@ -1,40 +1,32 @@
package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.Objects;
public final class SearchResultKeywordScore {
public final long termId;
public final String keyword;
private final long encodedWordMetadata;
public byte flags;
public int positionCount;
public SearchResultKeywordScore(String keyword,
long termId,
long encodedWordMetadata) {
byte flags,
int positionCount) {
this.termId = termId;
this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
}
public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
return (flags & flag.asBit()) != 0;
}
public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata);
}
public boolean isKeywordSpecial() {
return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
}
public long encodedWordMetadata() {
return encodedWordMetadata;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
@ -51,8 +43,7 @@ public final class SearchResultKeywordScore {
@Override
public String toString() {
return "SearchResultKeywordScore[" +
"keyword=" + keyword + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']';
"keyword=" + keyword + ']';
}
}

View File

@ -1,22 +0,0 @@
package nu.marginalia.api.searchquery.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@AllArgsConstructor @Getter @ToString
public class SearchResultSet {
public SearchResultSet() {
results = new ArrayList<>();
}
public List<DecoratedSearchResultItem> results;
public int size() {
return results.size();
}
}

View File

@ -0,0 +1,4 @@
package nu.marginalia.api.searchquery.model.results.debug;
public record DebugFactor(String factor, String value) {
}

View File

@ -0,0 +1,5 @@
package nu.marginalia.api.searchquery.model.results.debug;
import java.util.List;
public record DebugFactorGroup(String name, List<DebugFactor> factors) {}

View File

@ -0,0 +1,39 @@
package nu.marginalia.api.searchquery.model.results.debug;
import it.unimi.dsi.fastutil.ints.IntIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
/** Utility for capturing debug information about ranking factors */
public class DebugRankingFactors {
private final List<DebugFactor> documentFactors = new ArrayList<>();
private final List<DebugTermFactor> termFactors = new ArrayList<>();
public DebugRankingFactors() {}
public void addDocumentFactor(String factor, String value) {
documentFactors.add(new DebugFactor(factor, value));
}
public void addTermFactor(long termId, String factor, String value) {
termFactors.add(new DebugTermFactor(termId, null, factor, value));
}
public void addTermFactor(long termId, String factor, IntIterator sequenceIter) {
if (!sequenceIter.hasNext()) return;
StringJoiner joiner = new StringJoiner(",");
while (sequenceIter.hasNext()) {
joiner.add(String.valueOf(sequenceIter.nextInt()));
}
termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString()));
}
public List<DebugFactor> getDocumentFactors() {
return documentFactors;
}
public List<DebugTermFactor> getTermFactors() {
return termFactors;
}
}

View File

@ -0,0 +1,4 @@
package nu.marginalia.api.searchquery.model.results.debug;
public record DebugTermFactor(long termId, String term, String factor, String value) {
}

View File

@ -0,0 +1,6 @@
package nu.marginalia.api.searchquery.model.results.debug;
import java.util.List;
public record DebugTermFactorGroup(String term, long termId, List<DebugFactorGroup> factorList) {
}

View File

@ -1,6 +1,9 @@
package nu.marginalia.api.searchquery.model.results.debug;
public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs)
import java.util.List;
public record ResultRankingDetails(List<DebugFactorGroup> docFactorGroups,
List<DebugTermFactorGroup> termFactorGroups)
{
}

View File

@ -1,5 +0,0 @@
package nu.marginalia.api.searchquery.model.results.debug;
import java.util.List;
public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List<String> flags) {}

View File

@ -1,17 +0,0 @@
package nu.marginalia.api.searchquery.model.results.debug;
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
double qualityPenalty,
double rankingBonus,
double topologyBonus,
double documentLengthPenalty,
double temporalBias,
double flagsPenalty,
double overallPart,
double tcfOverlap,
double tcfJaccard,
double bM25F,
double bM25N,
double bM25P)
{
}

View File

@ -93,22 +93,30 @@ message RpcDecoratedResultItem {
double rankingScore = 11; // The ranking score of this search result item, lower is better
int64 bestPositions = 12;
RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters
int32 resultsFromDomain = 14;
}
/** A raw index-service view of a search result */
message RpcRawResultItem {
int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
int32 resultsFromDomain = 2; // number of other results from the same domain
int64 encodedDocMetadata = 3; // bit encoded document metadata
int32 htmlFeatures = 4; // bitmask encoding features of the document
repeated RpcResultKeywordScore keywordScores = 5;
bool hasPriorityTerms = 6; // true if this word is important to the document
MATCH_TYPE matchType = 7; // the type of match this result represents
enum MATCH_TYPE {
FLAGS = 0;
PROXIMITY = 1;
PHRASE = 2;
};
}
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata
int32 flags = 2;
int32 positions = 3;
}
/* Query execution parameters */
@ -119,30 +127,32 @@ message RpcQueryLimits {
int32 fetchSize = 4; // Size of the fetch buffer in the index service
}
/** Parameters for the result ranking function */
message RpcResultRankingParameters {
double fullK = 1; // BM25 parameter
double fullB = 2; // BM25 parameter
double prioK = 3; // BM25 parameter
double prioB = 4; // BM25 parameter
double bm25K = 1; // BM25 parameter
double bm25B = 2; // BM25 parameter
int32 shortDocumentThreshold = 5;
double shortDocumentPenalty = 6;
double domainRankBonus = 7;
double qualityPenalty = 8;
int32 shortSentenceThreshold = 9;
double shortSentencePenalty = 10;
double bm25FullWeight = 11;
double bm25NgramWeight = 12;
double bm25PrioWeight = 13;
double tcfOverlapWeight = 14;
double tcfJaccardWeight = 15;
double bm25Weight = 11;
// -- 12 unused --
double tcfFirstPositionWeight = 13;
double tcfVerbatimWeight = 14;
double tcfProximityWeight = 15;
RpcTemporalBias temporalBias = 16;
double temporalBiasWeight = 17;
bool exportDebugData = 18;
}
message RpcResultRankingDetails {
RpcResultRankingInputs inputs = 1;
RpcResultRankingOutputs output = 2;
RpcResultDocumentRankingOutputs documentOutputs = 1;
RpcResultTermRankingOutputs termOutputs = 2;
}
message RpcResultRankingInputs {
@ -155,20 +165,17 @@ message RpcResultRankingInputs {
repeated string flags = 7;
}
message RpcResultRankingOutputs {
double averageSentenceLengthPenalty = 1;
double qualityPenalty = 2;
double rankingBonus = 3;
double topologyBonus = 4;
double documentLengthPenalty = 5;
double temporalBias = 6;
double flagsPenalty = 7;
double overallPart = 8;
double tcfOverlap = 9;
double tcfJaccard = 10;
double bM25F = 11;
double bM25N = 12;
double bM25P = 13;
/** Summary of the output of the ranking function */
message RpcResultDocumentRankingOutputs {
repeated string factor = 1;
repeated string value = 2;
}
message RpcResultTermRankingOutputs {
repeated int64 termId = 1;
repeated string term = 2;
repeated string factor = 3;
repeated string value = 4;
}
/* Defines a single subquery */
@ -177,11 +184,18 @@ message RpcQuery {
repeated string exclude = 2; // These terms must be absent
repeated string advice = 3; // These terms must be present, but do not affect ranking
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other
string compiledQuery = 6; // Compiled query in infix notation
}
/* Defines a group of search terms that must exist in close proximity within the document */
message RpcCoherences {
repeated string coherences = 1;
/* Defines a group of search terms that must exist in the specified order within the document */
message RpcPhrases {
repeated string terms = 1;
TYPE type = 2;
enum TYPE {
OPTIONAL = 0;
MANDATORY = 1;
FULL = 2;
};
}

View File

@ -1,10 +1,11 @@
package nu.marginalia.api.searchquery.model.compiled;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class CompiledQueryParserTest {
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
assertEquals(w(q, "foo"), q.root);
}
@Test
public void testCohen() {
CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");
int val = CompiledQueryAggregates.intMaxMinAggregate(q, s ->
switch (s) {
case "brief" -> 3;
case "tube" -> 2;
case "of" -> 1;
default -> 0;
});
assertEquals(0, val);
System.out.println(q.stream().toList());
}
@Test
public void testAndTwoWords() {
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");

View File

@ -1,6 +1,7 @@
package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
@ -10,7 +11,7 @@ import org.junit.jupiter.api.Test;
import java.util.List;
import java.util.function.Function;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class IndexProtobufCodecTest {
@Test
@ -41,7 +42,9 @@ class IndexProtobufCodecTest {
List.of("c", "d"),
List.of("e", "f"),
List.of("g", "h"),
List.of(List.of("i", "j"), List.of("k"))
List.of(
SearchPhraseConstraint.mandatory(List.of("i", "j")),
SearchPhraseConstraint.optional(List.of("k")))
),
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
);

View File

@ -31,7 +31,7 @@ dependencies {
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:processes:converting-process:ft-keyword-extraction')
implementation libs.bundles.slf4j

View File

@ -1,18 +1,15 @@
package nu.marginalia.functions.searchquery.svc;
package nu.marginalia.functions.searchquery;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -56,11 +53,7 @@ public class QueryFactory {
basicQuery.clear();
}
List<String> searchTermsExclude = new ArrayList<>();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<List<String>> searchTermCoherences = new ArrayList<>();
SearchQuery.SearchQueryBuilder queryBuilder = SearchQuery.builder();
SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none();
@ -78,58 +71,50 @@ public class QueryFactory {
String[] parts = StringUtils.split(str, '_');
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.
if (parts.length > 1) {
// Require that the terms appear in sequence
queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));
// Construct a regular query from the parts in the quoted string
queryBuilder.include(parts);
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts));
queryBuilder.priority(str);
}
else {
searchTermsInclude.add(str);
// If the quoted word is a single word, we don't need to do more than include it in the search
queryBuilder.include(str);
}
}
case QueryToken.LiteralTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));
searchTermsInclude.add(str);
queryBuilder.include(str);
}
case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
case QueryToken.ExcludeTerm(String str, String displayStr) -> queryBuilder.exclude(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> queryBuilder.priority(str);
case QueryToken.AdviceTerm(String str, String displayStr) -> {
searchTermsAdvice.add(str);
queryBuilder.advice(str);
if (str.toLowerCase().startsWith("site:")) {
domain = str.substring("site:".length());
}
}
case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str);
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str);
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str);
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str);
case QueryToken.YearTerm(SpecificationLimit limit, String displayStr) -> year = limit;
case QueryToken.SizeTerm(SpecificationLimit limit, String displayStr) -> size = limit;
case QueryToken.RankTerm(SpecificationLimit limit, String displayStr) -> rank = limit;
case QueryToken.QualityTerm(SpecificationLimit limit, String displayStr) -> qualityLimit = limit;
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);
default -> {}
}
}
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
queryBuilder.promoteNonRankingTerms();
List<Integer> domainIds = params.domainIds();
@ -139,20 +124,21 @@ public class QueryFactory {
limits = limits.forSingleDomain();
}
var expansion = queryExpansion.expandQuery(searchTermsInclude);
searchTermCoherences.addAll(expansion.extraCoherences());
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
var searchQuery = new SearchQuery(
expansion.compiledQuery(),
searchTermsInclude,
searchTermsExclude,
searchTermsAdvice,
searchTermsPriority,
searchTermCoherences
);
// Query expansion may produce suggestions for phrase constraints,
// add these to the query
for (var coh : expansion.optionalPharseConstraints()) {
queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh));
}
// add a pseudo-constraint for the full query
queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint()));
queryBuilder.compiledQuery(expansion.compiledQuery());
var specsBuilder = SearchSpecification.builder()
.query(searchQuery)
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
@ -183,20 +169,7 @@ public class QueryFactory {
problems.add("Search term \"" + displayStr + "\" too long");
}
}
private SpecificationLimit parseSpecificationLimit(String str) {
int startChar = str.charAt(0);
int val = Integer.parseInt(str.substring(1));
if (startChar == '=') {
return SpecificationLimit.equals(val);
} else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
} else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
} else {
return SpecificationLimit.none();
}
}
private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) {
@ -211,14 +184,4 @@ public class QueryFactory {
default -> QueryStrategy.AUTO;
};
}
private boolean anyPartIsStopWord(String[] parts) {
for (String part : parts) {
if (WordPatterns.isStopWord(part)) {
return true;
}
}
return false;
}
}
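
The QueryFactory changes above attach phrase constraints at three strengths: SearchPhraseConstraint.mandatory for quoted phrases whose terms must appear in sequence, SearchPhraseConstraint.optional for n-gram segments suggested by query expansion, and SearchPhraseConstraint.full as a weak pseudo-constraint covering the whole query. The sketch below only illustrates that idea with simplified stand-in types; apart from the three factory names visible in the diff, nothing here is the actual Marginalia API.

    // Illustrative stand-ins; not the real Marginalia classes.
    import java.util.ArrayList;
    import java.util.List;

    sealed interface PhraseConstraint {
        record Mandatory(List<String> terms) implements PhraseConstraint {}
        record Optional(List<String> terms) implements PhraseConstraint {}
        record Full(List<String> terms) implements PhraseConstraint {}
    }

    class PhraseConstraintDemo {
        public static void main(String[] args) {
            List<PhraseConstraint> constraints = new ArrayList<>();

            // "plato republic" in quotes: the terms must appear in sequence
            constraints.add(new PhraseConstraint.Mandatory(List.of("plato", "republic")));

            // a segment found by query expansion: preferred, not required
            constraints.add(new PhraseConstraint.Optional(List.of("the", "republic")));

            // the full query acts as a weak pseudo-constraint used for ranking
            constraints.add(new PhraseConstraint.Full(List.of("plato", "the", "republic")));

            constraints.forEach(System.out::println);
        }
    }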

View File

@ -1,19 +1,16 @@
package nu.marginalia.functions.searchquery;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Histogram;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.index.api.IndexClient;
import nu.marginalia.functions.searchquery.svc.QueryFactory;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -33,18 +30,18 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
private final QueryFactory queryFactory;
private final DomainBlacklist blacklist;
private final IndexClient indexClient;
@Inject
public QueryGRPCService(QueryFactory queryFactory,
DomainBlacklist blacklist,
IndexClient indexClient)
{
this.queryFactory = queryFactory;
this.blacklist = blacklist;
this.indexClient = indexClient;
}
/** GRPC endpoint that parses a query, delegates it to the index partitions, and then collects the results.
*/
public void query(RpcQsQuery request, StreamObserver<RpcQsResponse> responseObserver)
{
try {
@ -55,16 +52,20 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
var params = QueryProtobufCodec.convertRequest(request);
var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults());
RpcIndexQuery indexRequest = QueryProtobufCodec.convertQuery(request, query);
List<RpcDecoratedResultItem> bestItems = executeQueries(indexRequest, request.getQueryLimits().getResultsTotal());
var indexRequest = QueryProtobufCodec.convertQuery(request, query);
// Execute the query on the index partitions
List<RpcDecoratedResultItem> bestItems = indexClient.executeQueries(indexRequest);
// Convert results to response and send it back
var responseBuilder = RpcQsResponse.newBuilder()
.addAllResults(bestItems)
.setSpecs(indexRequest)
.addAllSearchTermsHuman(query.searchTermsHuman);
if (query.domain != null)
if (query.domain != null) {
responseBuilder.setDomain(query.domain);
}
responseObserver.onNext(responseBuilder.build());
responseObserver.onCompleted();
@ -75,44 +76,19 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
}
}
private static final Comparator<RpcDecoratedResultItem> comparator =
Comparator.comparing(RpcDecoratedResultItem::getRankingScore);
private boolean isBlacklisted(RpcDecoratedResultItem item) {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
}
public record DetailedDirectResult(ProcessedQuery processedQuery,
List<DecoratedSearchResultItem> result) {}
/** Local query execution, without GRPC. */
public DetailedDirectResult executeDirect(
String originalQuery,
QueryParams params,
ResultRankingParameters rankingParameters,
int count) {
ResultRankingParameters rankingParameters) {
var query = queryFactory.createQuery(params, rankingParameters);
var items = indexClient.executeQueries(QueryProtobufCodec.convertQuery(originalQuery, query));
var items = executeQueries(
QueryProtobufCodec.convertQuery(originalQuery, query),
count)
.stream().map(QueryProtobufCodec::convertQueryResult)
.toList();
return new DetailedDirectResult(query, items);
}
public record DetailedDirectResult(ProcessedQuery processedQuery,
List<DecoratedSearchResultItem> result) {}
@SneakyThrows
List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest, int totalSize) {
var results = indexClient.executeQueries(indexRequest);
results.sort(comparator);
results.removeIf(this::isBlacklisted);
if (results.size() > totalSize) {
results = results.subList(0, totalSize);
}
return results;
return new DetailedDirectResult(query, Lists.transform(items, QueryProtobufCodec::convertQueryResult));
}
}
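
The reworked executeDirect above returns its results through Guava's Lists.transform, which wraps the source list in a lazy view instead of copying it: the conversion function runs when an element is read. Below is a small self-contained example of the same call; the record types are placeholders, not the Marginalia result classes.

    import com.google.common.collect.Lists;
    import java.util.List;

    class LazyTransformDemo {
        record RawResult(int id) {}
        record DecoratedResult(String label) {}

        static DecoratedResult decorate(RawResult raw) {
            return new DecoratedResult("result-" + raw.id());
        }

        public static void main(String[] args) {
            List<RawResult> raw = List.of(new RawResult(1), new RawResult(2));

            // Lazy view: decorate() runs when elements are accessed, not here
            List<DecoratedResult> decorated = Lists.transform(raw, LazyTransformDemo::decorate);

            decorated.forEach(System.out::println);
        }
    }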

View File

@ -5,7 +5,6 @@ import com.google.inject.Inject;
import nu.marginalia.functions.searchquery.query_parser.model.QWord;
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
import nu.marginalia.functions.searchquery.query_parser.model.QWordPathsRenderer;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;
@ -45,11 +44,17 @@ public class QueryExpansion {
strategy.expand(graph);
}
List<List<String>> coherences = createSegments(graph);
List<List<String>> optionalPhraseConstraints = createSegments(graph);
// also create a segmentation that is just the entire query
List<String> fullPhraseConstraint = new ArrayList<>();
for (var qw : graph) {
fullPhraseConstraint.add(qw.word());
}
var compiled = QWordPathsRenderer.render(graph);
return new Expansion(compiled, coherences);
return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint);
}
private static final Pattern dashPattern = Pattern.compile("-");
@ -131,6 +136,10 @@ public class QueryExpansion {
nodes.add(qw);
}
if (nodes.size() <= 1) {
return List.of();
}
String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new);
// Grab all segments
@ -141,15 +150,11 @@ public class QueryExpansion {
}
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
if (allSegments.isEmpty()) {
return List.of();
}
Set<List<String>> constraints = new HashSet<>();
Set<NgramLexicon.SentenceSegment> bestSegmentation =
findBestSegmentation(allSegments);
List<List<String>> coherences = new ArrayList<>();
for (var segment : bestSegmentation) {
int start = segment.start();
@ -159,14 +164,14 @@ public class QueryExpansion {
for (int i = start; i < end; i++) {
components.add(nodes.get(i).word());
}
coherences.add(components);
constraints.add(components);
// Create an n-gram search term for the segment
String word = String.join("_", components);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
}
return coherences;
return new ArrayList<>(constraints);
}
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
@ -209,5 +214,5 @@ public class QueryExpansion {
void expand(QWordGraph graph);
}
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {}
public record Expansion(String compiledQuery, List<List<String>> optionalPhraseConstraints, List<String> fullPhraseConstraint) {}
}
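
Each segment that survives the best-segmentation step above is used twice: its component words become an optional phrase constraint, and the underscore-joined form becomes an n-gram variant added to the word graph. The following stand-alone sketch shows that step with plain collections in place of the QWordGraph and NgramLexicon types, which are assumed here rather than reproduced.

    import java.util.*;

    class SegmentToNgramDemo {
        // A segment is a [start, end) span over the query words
        record Segment(int start, int end) {}

        public static void main(String[] args) {
            List<String> words = List.of("burning", "nerves", "in", "the", "neck");
            List<Segment> segments = List.of(new Segment(0, 2), new Segment(3, 5));

            Set<List<String>> constraints = new HashSet<>();

            for (Segment segment : segments) {
                List<String> components = words.subList(segment.start(), segment.end());

                // Each segment becomes an optional phrase constraint...
                constraints.add(List.copyOf(components));

                // ...and an underscore-joined n-gram term for the same span
                String ngramTerm = String.join("_", components);
                System.out.println(ngramTerm);   // burning_nerves, then the_neck
            }

            System.out.println(constraints);
        }
    }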

View File

@ -1,22 +1,24 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.util.transform_list.TransformList;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
public class QueryParser {
private final QueryTokenizer tokenizer = new QueryTokenizer();
public List<QueryToken> parse(String query) {
List<QueryToken> basicTokens = tokenizer.tokenizeQuery(query);
List<QueryToken> basicTokens = tokenizeQuery(query);
TransformList<QueryToken> list = new TransformList<>(basicTokens);
list.transformEach(QueryParser::handleQuoteTokens);
list.transformEach(QueryParser::trimLiterals);
list.transformEach(QueryParser::handleQuoteTokens);
list.transformEachPair(QueryParser::createNegatedTerms);
list.transformEachPair(QueryParser::createPriorityTerms);
list.transformEach(QueryParser::handleSpecialOperations);
@ -26,6 +28,96 @@ public class QueryParser {
return list.getBackingList();
}
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
public List<QueryToken> tokenizeQuery(String rawQuery) {
List<QueryToken> tokens = new ArrayList<>();
String query = AsciiFlattener.flattenUnicode(rawQuery);
query = noisePattern.matcher(query).replaceAll(" ");
int chr = -1;
int parenDepth = 0;
for (int i = 0; i < query.length(); i++) {
chr = query.charAt(i);
if ('(' == chr) {
parenDepth++;
tokens.add(new QueryToken.LParen());
}
else if (')' == chr) {
parenDepth--;
tokens.add(new QueryToken.RParen());
}
else if ('"' == chr) {
int end = query.indexOf('"', i+1);
if (end == -1) {
end = query.length();
}
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
i = end;
}
else if ('-' == chr) {
tokens.add(new QueryToken.Minus());
}
else if ('?' == chr) {
tokens.add(new QueryToken.QMark());
}
else if (!Character.isSpaceChar(chr)) {
// search for the end of the term
int end = i+1;
int prevC = -1;
int c = -1;
for (; end < query.length(); end++) {
prevC = c;
c = query.charAt(end);
if (prevC == '\\')
continue;
if (c == ' ')
break;
// special case to deal with possible RPAREN token at the end,
// but we don't want to break if it's likely part of the search term
if (c == '(' && prevC != ')' && parenDepth > 0)
break;
}
String displayStr = query.substring(i, end);
String str = trimEscape(displayStr.toLowerCase());
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
i = end-1;
}
}
return tokens;
}
private String trimEscape(String str) {
if (!str.contains("\\")) {
return str;
}
StringBuilder sb = new StringBuilder(str.length());
for (int j = 0; j < str.length(); j++) {
char c = str.charAt(j);
if (c == '\\') {
if (j + 1 < str.length()) {
sb.append(str.charAt(j + 1));
j++;
}
} else {
sb.append(c);
}
}
return sb.toString();
}
private static void normalizeDomainName(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
@ -60,10 +152,22 @@ public class QueryParser {
if (str.isBlank())
return;
if (str.endsWith(":") || str.endsWith(".")) {
// Remove trailing punctuation
int lastChar = str.charAt(str.length() - 1);
if (":.,!?$'".indexOf(lastChar) >= 0)
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
}
// Remove term elements that aren't indexed by the search engine
if (str.endsWith("'s"))
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
if (str.endsWith("()"))
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
if (str.startsWith("$"))
entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr()));
if (entity.isBlank()) {
entity.remove();
}
}
private static void createNegatedTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
@ -104,15 +208,19 @@ public class QueryParser {
String str = t.str();
if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
entity.replace(new QueryToken.QualityTerm(str.substring(1)));
var limit = parseSpecificationLimit(str.substring(1));
entity.replace(new QueryToken.QualityTerm(limit, str));
} else if (str.startsWith("near:")) {
entity.replace(new QueryToken.NearTerm(str.substring(5)));
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
entity.replace(new QueryToken.YearTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.YearTerm(limit, str));
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
entity.replace(new QueryToken.SizeTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.SizeTerm(limit, str));
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
entity.replace(new QueryToken.RankTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
@ -120,6 +228,21 @@ public class QueryParser {
}
}
private static SpecificationLimit parseSpecificationLimit(String str) {
int startChar = str.charAt(0);
int val = Integer.parseInt(str.substring(1));
if (startChar == '=') {
return SpecificationLimit.equals(val);
} else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
} else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
} else {
return SpecificationLimit.none();
}
}
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t instanceof QueryToken.LParen) {
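
The parseSpecificationLimit helper relocated into QueryParser above maps filter strings such as year>2010, size<1000 or q=5 to comparison limits: the first character selects the comparison and the remainder is the integer operand. The sketch below mirrors that mapping in a self-contained form; the nested Limit type is a stand-in for SpecificationLimit, not the real class.

    class SpecificationLimitDemo {
        // Simplified stand-in for SpecificationLimit, for illustration only
        sealed interface Limit {
            record Equals(int value) implements Limit {}
            record LessThan(int value) implements Limit {}
            record GreaterThan(int value) implements Limit {}
            record None() implements Limit {}
        }

        // Mirrors the parsing rule above: first character picks the comparison,
        // the rest of the string is the integer operand
        static Limit parse(String str) {
            char startChar = str.charAt(0);
            int val = Integer.parseInt(str.substring(1));

            return switch (startChar) {
                case '=' -> new Limit.Equals(val);
                case '<' -> new Limit.LessThan(val);
                case '>' -> new Limit.GreaterThan(val);
                default -> new Limit.None();
            };
        }

        public static void main(String[] args) {
            System.out.println(parse(">2010"));  // GreaterThan[value=2010]
            System.out.println(parse("<1000"));  // LessThan[value=1000]
            System.out.println(parse("=5"));     // Equals[value=5]
        }
    }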

View File

@ -1,69 +0,0 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.sentence.SentenceExtractorStringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
public class QueryTokenizer {
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
public List<QueryToken> tokenizeQuery(String rawQuery) {
List<QueryToken> tokens = new ArrayList<>();
String query = AsciiFlattener.flattenUnicode(rawQuery);
query = noisePattern.matcher(query).replaceAll(" ");
for (int i = 0; i < query.length(); i++) {
int chr = query.charAt(i);
if ('(' == chr) {
tokens.add(new QueryToken.LParen());
}
else if (')' == chr) {
tokens.add(new QueryToken.RParen());
}
else if ('"' == chr) {
int end = query.indexOf('"', i+1);
if (end == -1) {
end = query.length();
}
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
i = end;
}
else if ('-' == chr) {
tokens.add(new QueryToken.Minus());
}
else if ('?' == chr) {
tokens.add(new QueryToken.QMark());
}
else if (Character.isSpaceChar(chr)) {
//
}
else {
int end = i+1;
for (; end < query.length(); end++) {
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
break;
}
String displayStr = query.substring(i, end);
String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr);
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
i = end-1;
}
}
return tokens;
}
}

View File

@ -248,16 +248,29 @@ public class QWordGraph implements Iterable<QWord> {
@Override
public Iterator<QWord> iterator() {
return new Iterator<>() {
QWord next = null;
QWord pos = QWord.beg();
@Override
public boolean hasNext() {
return !pos.isEnd();
if (next == null) {
if (pos.isEnd()) {
return false;
}
next = getNextOriginal(pos).getFirst();
}
return !next.isEnd();
}
@Override
public QWord next() {
pos = getNextOriginal(pos).getFirst();
if (!hasNext()) {
throw new NoSuchElementException();
}
pos = next;
next = null;
return pos;
}
};
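
The iterator fix above applies the standard look-ahead pattern: hasNext() lazily computes and caches the next element, and next() consumes the cached value or throws NoSuchElementException once the sequence is exhausted. Below is a generic, self-contained example of the same pattern, unrelated to QWordGraph itself.

    import java.util.Iterator;
    import java.util.List;
    import java.util.NoSuchElementException;

    /** Look-ahead iterator over a list that skips blank strings,
     *  following the same hasNext()/next() contract as the fix above. */
    class SkipBlanksIterator implements Iterator<String> {
        private final List<String> items;
        private int pos = 0;
        private String next = null;

        SkipBlanksIterator(List<String> items) {
            this.items = items;
        }

        @Override
        public boolean hasNext() {
            // Compute and cache the next non-blank element lazily
            while (next == null && pos < items.size()) {
                String candidate = items.get(pos++);
                if (!candidate.isBlank()) {
                    next = candidate;
                }
            }
            return next != null;
        }

        @Override
        public String next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            String result = next;
            next = null;   // consume the cached element
            return result;
        }

        public static void main(String[] args) {
            var it = new SkipBlanksIterator(List.of("a", " ", "b", ""));
            while (it.hasNext()) {
                System.out.println(it.next());   // prints a, then b
            }
        }
    }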

View File

@ -1,6 +1,8 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import nu.marginalia.index.query.limit.SpecificationLimit;
public sealed interface QueryToken {
String str();
String displayStr();
@ -11,25 +13,18 @@ public sealed interface QueryToken {
record AdviceTerm(String str, String displayStr) implements QueryToken {}
record PriorityTerm(String str, String displayStr) implements QueryToken {}
record QualityTerm(String str) implements QueryToken {
public String displayStr() {
return "q" + str;
}
record QualityTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record YearTerm(String str) implements QueryToken {
public String displayStr() {
return "year" + str;
}
record YearTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record SizeTerm(String str) implements QueryToken {
public String displayStr() {
return "size" + str;
}
record SizeTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record RankTerm(String str) implements QueryToken {
public String displayStr() {
return "rank" + str;
}
record RankTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record NearTerm(String str) implements QueryToken {
public String displayStr() {

View File

@ -1,5 +1,7 @@
package nu.marginalia.util.transform_list;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
@ -30,7 +32,7 @@ import java.util.function.Predicate;
* </pre>
* </code>
*/
public class TransformList<T> {
public class TransformList<T extends QueryToken> {
private final List<T> backingList;
public TransformList(List<T> backingList) {
@ -138,6 +140,10 @@ public class TransformList<T> {
value = newValue;
}
public boolean isBlank() {
return value == null || value.str().isBlank();
}
public void remove() {
action = Action.REMOVE;
}
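
TransformList, now bounded to QueryToken above, lets a visitor inspect each element through an Entity handle that can replace or remove it in place, which is how trimLiterals and handleSpecialOperations rewrite tokens earlier in this diff. The following is a tiny re-implementation of that idea for illustration; it is not the real class (which also supports pairwise transforms and an action enum).

    import java.util.ArrayList;
    import java.util.List;
    import java.util.function.Consumer;

    class MiniTransformList<T> {
        private final List<T> backing;

        MiniTransformList(List<T> backing) { this.backing = backing; }

        class Entity {
            private final int index;
            private boolean removed = false;

            Entity(int index) { this.index = index; }

            T value() { return backing.get(index); }
            void replace(T newValue) { backing.set(index, newValue); }
            void remove() { removed = true; }
        }

        void transformEach(Consumer<Entity> consumer) {
            List<T> result = new ArrayList<>(backing.size());
            for (int i = 0; i < backing.size(); i++) {
                Entity e = new Entity(i);
                consumer.accept(e);
                if (!e.removed) result.add(backing.get(i));
            }
            backing.clear();
            backing.addAll(result);
        }

        public static void main(String[] args) {
            var list = new ArrayList<>(List.of("site:", "strlen()", ""));
            var transform = new MiniTransformList<>(list);

            // Trim trailing punctuation and drop blank terms, like trimLiterals above
            transform.transformEach(e -> {
                String v = e.value();
                if (v.endsWith("()")) e.replace(v.substring(0, v.length() - 2));
                else if (v.endsWith(":")) e.replace(v.substring(0, v.length() - 1));
                if (e.value().isBlank()) e.remove();
            });

            System.out.println(list);   // [site, strlen]
        }
    }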

View File

@ -1,5 +1,6 @@
package nu.marginalia.functions.searchquery.query_parser.model;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.util.Comparator;
@ -100,7 +101,8 @@ class QWordGraphTest {
assertEquals("q b ( c | d )", graph.compileToQuery());
}
@Test // this test is a bit flaky, the order of the variants is not guaranteed
@Disabled // flaky, the order of the variants is not guaranteed
@Test
void testCompile5() {
// Construct a graph like

View File

@ -1,16 +1,16 @@
package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.svc.QueryFactory;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@ -57,7 +57,12 @@ public class QueryFactoryTest {
@Test
void qsec10() {
try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) {
Path webis = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");
if (!Files.exists(webis))
return;
try (var lines = Files.lines(webis)) {
lines.limit(1000).forEach(line -> {
String[] parts = line.split("\t");
if (parts.length == 2) {
@ -124,24 +129,6 @@ public class QueryFactoryTest {
assertEquals(2000, size.value());
}
@Test
public void testQuotedStopwords() {
{
// the is a stopword, so it should generate an ngram search term
var specs = parseAndGetSpecs("\"the shining\"");
assertEquals("the_shining", specs.query.compiledQuery);
}
{
// tde isn't a stopword, so we should get the normal behavior
var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
}
}
@Test
public void testParseQualityEq() {
var quality = parseAndGetSpecs("q=2000").quality;
@ -212,12 +199,38 @@ public class QueryFactoryTest {
var subquery = parseAndGetSpecs("The");
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
} @Test
}
@Test
public void testExpansion6() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("burning the nerves in the neck");
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
}
@Test
public void testExpansion7() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("amazing work being done");
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
}
@Test
public void testExpansion8() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("success often consists of");
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
}
@Test
public void testParsing() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("strlen()");
assertEquals("strlen", subquery.query.compiledQuery);
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
}
}

View File

@ -15,6 +15,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:db')
implementation project(':code:libraries:message-queue')
implementation project(':code:functions:search-query:api')

View File

@ -6,6 +6,8 @@ import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@ -14,6 +16,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@ -22,21 +25,34 @@ import java.util.concurrent.Executors;
public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private final DomainBlacklistImpl blacklist;
private static final ExecutorService executor = Executors.newFixedThreadPool(32);
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory) {
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
this.channelPool = channelPoolFactory.createMulti(
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
IndexApiGrpc::newBlockingStub);
this.blacklist = blacklist;
}
private static final Comparator<RpcDecoratedResultItem> comparator =
Comparator.comparing(RpcDecoratedResultItem::getRankingScore);
/** Execute a query on the index partitions and return the combined results. */
@SneakyThrows
public List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest) {
var futures =
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
.async(executor)
.runEach(indexRequest);
List<RpcDecoratedResultItem> results = new ArrayList<>();
final int resultsTotal = indexRequest.getQueryLimits().getResultsTotal();
final int resultsUpperBound = resultsTotal * channelPool.getNumNodes();
List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
for (var future : futures) {
try {
future.get().forEachRemaining(results::add);
@ -46,7 +62,20 @@ public class IndexClient {
}
}
// Sort the results by ranking score and remove blacklisted domains
results.sort(comparator);
results.removeIf(this::isBlacklisted);
// Keep only as many results as were requested
if (results.size() > resultsTotal) {
results = results.subList(0, resultsTotal);
}
return results;
}
private boolean isBlacklisted(RpcDecoratedResultItem item) {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
}
}
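
After the change above, IndexClient owns the post-processing that previously lived in QueryGRPCService: results from all partitions are merged, sorted by ranking score, filtered against the domain blacklist, and truncated to the requested total. A compact sketch of that flow follows; the record and the blacklist set are placeholders, not the Marginalia classes.

    import java.util.*;

    class MergeResultsDemo {
        record Result(int domainId, double rankingScore) {}

        public static void main(String[] args) {
            Set<Integer> blacklistedDomains = Set.of(42);
            int resultsTotal = 2;   // as requested by the query limits

            // Results gathered from two index partitions
            List<Result> results = new ArrayList<>(List.of(
                    new Result(1, 0.9),
                    new Result(42, 0.1),   // blacklisted domain
                    new Result(7, 0.3),
                    new Result(3, 0.5)));

            // Sort by ranking score and drop blacklisted domains
            results.sort(Comparator.comparing(Result::rankingScore));
            results.removeIf(r -> blacklistedDomains.contains(r.domainId()));

            // Keep only as many results as were requested
            if (results.size() > resultsTotal) {
                results = results.subList(0, resultsTotal);
            }

            System.out.println(results);   // domains 7 and 3 remain
        }
    }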

View File

@ -5,7 +5,5 @@ public class IndexMqEndpoints {
public static final String INDEX_RERANK = "INDEX-RERANK";
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
public static final String SWITCH_INDEX = "SWITCH-INDEX";
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
}

View File

@ -15,12 +15,15 @@ dependencies {
implementation 'org.jgrapht:jgrapht-core:1.5.2'
implementation project(':third-party:commons-codec')
implementation project(':third-party:parquet-floor')
implementation project(':code:index:api')
implementation project(':code:functions:link-graph:api')
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:common:db')
implementation project(':code:common:config')
@ -28,14 +31,16 @@ dependencies {
implementation project(':code:common:linkdb')
implementation project(':code:common:service')
implementation project(':code:functions:search-query:api')
implementation project(':code:processes:converting-process:model')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation libs.slop
implementation libs.bundles.slf4j
implementation libs.prometheus
@ -66,9 +71,11 @@ dependencies {
testImplementation project(':code:libraries:array')
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

View File

@ -15,10 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:process')
implementation project(':code:processes:converting-process:model')
implementation libs.bundles.slf4j
@ -26,7 +29,9 @@ dependencies {
implementation libs.roaringbitmap
implementation libs.fastutil
implementation libs.trove
implementation libs.slop
testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito

View File

@ -1,127 +0,0 @@
package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class ForwardIndexConverter {
private final ProcessHeartbeat heartbeat;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final IndexJournalReader journalReader;
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;
public ForwardIndexConverter(ProcessHeartbeat heartbeat,
IndexJournalReader journalReader,
Path outputFileDocsId,
Path outputFileDocsData,
DomainRankings domainRankings
) {
this.heartbeat = heartbeat;
this.journalReader = journalReader;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.domainRankings = domainRankings;
}
public enum TaskSteps {
GET_DOC_IDS,
GATHER_OFFSETS,
SUPPLEMENTAL_INDEXES,
FORCE,
FINISHED
}
public void convert() throws IOException {
deleteOldFiles();
logger.info("Domain Rankings size = {}", domainRankings.size());
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
progress.progress(TaskSteps.GET_DOC_IDS);
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
progress.progress(TaskSteps.GATHER_OFFSETS);
// doc ids -> sorted list of ids
Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size());
docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos));
progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES);
// docIdToIdx -> file offset for id
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
var pointer = journalReader.newPointer();
while (pointer.nextDocument()) {
long docId = pointer.documentId();
int domainId = UrlIdCodec.getDomainId(docId);
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);
int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures());
}
progress.progress(TaskSteps.FORCE);
docFileData.force();
docsFileId.force();
docFileData.close();
docsFileId.close();
progress.progress(TaskSteps.FINISHED);
} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
}
}
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);
LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() {
int offset;
@Override
public void accept(long value) {
ret.set(offset++, value);
}
});
return ret;
}
private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileDocsId);
Files.deleteIfExists(outputFileDocsData);
}
}

Some files were not shown because too many files have changed in this diff.