Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)

Merge pull request #99 from MarginaliaSearch/term-positions: Improve term positions accuracy

Commit 463b3ed0ce
@ -6,7 +6,7 @@ plugins {
    // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
    // https://github.com/GoogleContainerTools/jib/issues/3347
    id 'com.google.cloud.tools.jib' version '3.4.2' apply(false)
    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
}

group 'marginalia'

@ -44,10 +44,11 @@ subprojects.forEach {it ->
}

ext {
    jvmVersion=21
    dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b'
    jvmVersion=22
    dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
    dockerImageTag='latest'
    dockerImageRegistry='marginalia'
    jibVersion = '3.4.3'
}

idea {

@ -33,6 +33,7 @@ dependencies {
    testImplementation project(':code:libraries:test-helpers')

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

@ -54,6 +54,7 @@ dependencies {

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

@ -41,6 +41,7 @@ dependencies {
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

@ -22,6 +22,12 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

/** Reads the document database, which is a SQLite database
 * containing the URLs and metadata of the documents in the
 * index.
 * <p></p>
 * The database is created by the DocumentDbWriter class.
 * */
@Singleton
public class DocumentDbReader {
    private final Path dbFile;

@ -52,6 +58,11 @@ public class DocumentDbReader {
        }
    }

    /** Switches the input database file to a new file.
     * <p></p>
     * This is used to switch over to a new database file
     * when the index is re-indexed.
     * */
    public void switchInput(Path newDbFile) throws IOException, SQLException {
        if (!Files.isRegularFile(newDbFile)) {
            logger.error("Source is not a file, refusing switch-over {}", newDbFile);

@ -78,35 +89,11 @@ public class DocumentDbReader {
        connection = createConnection();
    }

    public List<String> getUrlsFromDomain(int domainId) throws SQLException {
        if (connection == null ||
            connection.isClosed())
        {
            throw new RuntimeException("URL query temporarily unavailable due to database switch");
        }

        long minId = UrlIdCodec.encodeId(domainId, 0);
        long maxId = UrlIdCodec.encodeId(domainId+1, 0);

        List<String> ret = new ArrayList<>();

        try (var stmt = connection.prepareStatement("""
                SELECT URL
                FROM DOCUMENT
                WHERE ID >= ? AND ID < ?
                """))
        {
            stmt.setLong(1, minId);
            stmt.setLong(2, maxId);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                ret.add(rs.getString(1));
            }
        }

        return ret;
    }

    /** Returns the URL details for the given document ids.
     * <p></p>
     * This is used to get the URL details for the search
     * results.
     * */
    public List<DocdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
        List<DocdbUrlDetail> ret = new ArrayList<>(ids.size());

@ -9,6 +9,10 @@ import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;

/** Writes the document database, which is a SQLite database
 * containing the URLs and metadata of the documents in the
 * index.
 * */
public class DocumentDbWriter {

    private final Connection connection;

@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:libraries:braille-block-punch-cards')
    implementation project(':code:libraries:coded-sequence')

    implementation libs.bundles.slf4j

@ -37,9 +37,24 @@ public class UrlIdCodec {
        domainId &= 0x7FFF_FFFF;
        documentOrdinal &= 0x03FF_FFFF;

        assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
        assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;

        return ((long) domainId << 26) | documentOrdinal;
    }

    /** Encode a URL id with a ranking element */
    public static long encodeId(int rank, int domainId, int documentOrdinal) {
        assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank;
        assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId;
        assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal;

        domainId &= 0x7FFF_FFFF;
        documentOrdinal &= 0x03FF_FFFF;
        rank &= 0x3F;

        return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
    }

    /** Add a ranking element to an existing combined URL id.
     *
     * @param rank [0,1] the importance of the domain, low is good

@ -67,7 +82,7 @@ public class UrlIdCodec {

    /** Extract the document ordinal component from this URL id */
    public static int getRank(long combinedId) {
        return (int) (combinedId >>> 57);
        return (int) (combinedId >>> 57) & 0x3F;
    }

    /** Mask out the ranking element from this URL id */
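For reference, the combined URL id packs three fields into a single 64-bit long: a 6-bit rank in bits 57-62, a 31-bit domain id in bits 26-56, and a 26-bit document ordinal in bits 0-25. The getRank() fix above masks the shifted value with 0x3F so only the rank bits are returned. A minimal round-trip sketch with illustrative values (not part of the diff):

    long id = UrlIdCodec.encodeId(5, 1234, 42);        // rank=5, domainId=1234, ordinal=42
    int rank    = (int) (id >>> 57) & 0x3F;            // 5
    int domain  = (int) (id >>> 26) & 0x7FFF_FFFF;     // 1234
    int ordinal = (int) (id & 0x03FF_FFFF);            // 42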
@ -0,0 +1,6 @@
package nu.marginalia.model.idx;

import nu.marginalia.sequence.VarintCodedSequence;

public record CodedWordSpan(byte code, VarintCodedSequence spans) {
}

@ -4,16 +4,12 @@ package nu.marginalia.model.idx;
import java.util.EnumSet;

public enum WordFlags {

    /** Word appears in title */
    Title,

    /** Word appears to be the subject in several sentences */
    Subjects,

    /** Word has high tf-idf */
    TfIdfHigh,

    /** Word is a likely named object. This is a weaker version of Subjects. */
    NamesWords,

@ -42,19 +38,27 @@ public enum WordFlags {
    ExternalLink
    ;

    public int asBit() {
        return 1 << ordinal();
    public byte asBit() {
        return (byte) (1 << ordinal());
    }

    public boolean isPresent(long value) {
    public boolean isPresent(byte value) {
        return (asBit() & value) > 0;
    }

    public boolean isAbsent(long value) {
    public boolean isAbsent(byte value) {
        return (asBit() & value) == 0;
    }

    public static EnumSet<WordFlags> decode(long encodedValue) {
    public static byte encode(EnumSet<WordFlags> flags) {
        byte ret = 0;
        for (WordFlags f : flags) {
            ret |= f.asBit();
        }
        return ret;
    }

    public static EnumSet<WordFlags> decode(byte encodedValue) {
        EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);

        for (WordFlags f : values()) {
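With this change the flag set fits in a single byte, so encode() and decode() act as inverses as long as the enum keeps at most eight values. A brief usage sketch with hypothetical values, assuming the flag names shown above:

    byte packed = WordFlags.encode(EnumSet.of(WordFlags.Title, WordFlags.Subjects)); // 0b0000_0011
    boolean inTitle = WordFlags.Title.isPresent(packed);                             // true
    EnumSet<WordFlags> roundTrip = WordFlags.decode(packed);                         // {Title, Subjects}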
@ -1,89 +0,0 @@
package nu.marginalia.model.idx;

import nu.marginalia.bbpc.BrailleBlockPunchCards;

import java.util.EnumSet;
import java.util.Set;

/** Word level metadata designed to fit in a single 64 bit long.
 *
 * @param positions bitmask of term positions within the document
 * @param flags word flags (see {@link WordFlags})
 */
public record WordMetadata(long positions,
                           int flags) {

    public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
    public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
    public static final int POSITIONS_SHIFT = WordFlags.values().length;
    public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;

    public WordMetadata() {
        this(emptyValue());
    }

    public WordMetadata(long value) {
        this(
                ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
                (int)(value & FLAGS_MASK)
        );
    }

    public WordMetadata(long positions,
                        Set<WordFlags> flags)
    {
        this(positions, encodeFlags(flags));
    }

    private static int encodeFlags(Set<WordFlags> flags) {
        int ret = 0;
        for (var flag : flags) { ret |= flag.asBit(); }
        return ret;
    }

    public static boolean hasFlags(long encoded, long metadataBitMask) {
        return (encoded & metadataBitMask) == metadataBitMask;
    }
    public static boolean hasAnyFlags(long encoded, long metadataBitMask) {
        return (encoded & metadataBitMask) != 0;
    }
    public static long decodePositions(long meta) {
        return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK;
    }

    public boolean hasFlag(WordFlags flag) {
        return (flags & flag.asBit()) != 0;
    }

    public String toString() {
        return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet());
    }

    /* Encoded in a 64 bit long
     */
    public long encode() {
        long ret = 0;

        ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
        ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;

        return ret;
    }

    public boolean isEmpty() {
        return positions == 0 && flags == 0;
    }

    public static long emptyValue() {
        return 0L;
    }

    public EnumSet<WordFlags> flagSet() {
        return WordFlags.decode(flags);
    }

}
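The removed WordMetadata record above packed both fields into one long: the flags occupy the low WordFlags.values().length bits and the position bitmask fills the remaining upper bits. A standalone sketch of that old layout, assuming eight flags so that POSITIONS_SHIFT is 8 (hypothetical values, not part of the diff):

    int POSITIONS_SHIFT = 8;                        // WordFlags.values().length in the old code
    long FLAGS_MASK = (1L << POSITIONS_SHIFT) - 1;
    long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;

    long positions = 0b1011;                        // hypothetical position bitmask
    int flags = 0b0001;                             // hypothetical flag bits

    long encoded = ((positions & POSITIONS_MASK) << POSITIONS_SHIFT) | (flags & FLAGS_MASK);
    long decodedPositions = (encoded >>> POSITIONS_SHIFT) & POSITIONS_MASK;  // == positions
    int decodedFlags = (int) (encoded & FLAGS_MASK);                         // == flags

Cramming positions into a single bitmask is the scheme this pull request retires in favour of explicit position lists (VarintCodedSequence / CodedWordSpan), which appears to be where the improved term position accuracy comes from.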
@ -10,7 +10,6 @@ import java.util.StringJoiner;

public class QueryParams {

    @Nullable
    public static String queryParamsSanitizer(String path, @Nullable String queryParams) {
        if (queryParams == null) {

@ -1,70 +0,0 @@
package nu.marginalia.util;

import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Objects;

public class StringPool {

    private final HashMap<String, String> words;
    private final Object2LongOpenHashMap<String> ages;
    private final int maxCap;

    long idx;

    private StringPool(int capacity, int maxCap) {
        this.ages = new Object2LongOpenHashMap<>(capacity);
        this.words = new HashMap<>(capacity);
        this.maxCap = maxCap;
    }

    public static StringPool create(int capacity) {
        return new StringPool(capacity, capacity * 10);
    }

    public String internalize(String str) {
        prune();

        final String ret = words.putIfAbsent(str, str);
        ages.put(ret, idx++);

        return Objects.requireNonNullElse(ret, str);
    }

    public String[] internalize(String[] str) {

        for (int i = 0; i < str.length; i++) {
            str[i] = internalize(str[i]);
        }

        return str;
    }

    public void prune() {

        if (words.size() < maxCap)
            return;

        long[] ageValues = ages.values().toLongArray();
        Arrays.sort(ageValues);

        long cutoff = ageValues[ageValues.length - maxCap / 10];

        words.clear();
        ages.forEach((word, cnt) -> {
            if (cnt >= cutoff) {
                words.put(word, word);
            }
        });
        ages.clear();
        words.forEach((w,w2) -> {
            ages.put(w, idx);
        });
    }

    public void flush() {
        words.clear();
    }
}
@ -8,5 +8,4 @@ This package contains common models to the search engine
* [EdgeUrl](java/nu/marginalia/model/EdgeUrl.java)
* [DocumentMetadata](java/nu/marginalia/model/idx/DocumentMetadata.java)
* [DocumentFlags](java/nu/marginalia/model/idx/DocumentFlags.java)
* [WordMetadata](java/nu/marginalia/model/idx/WordMetadata.java)
* [WordFlags](java/nu/marginalia/model/idx/WordFlags.java)

@ -1,41 +0,0 @@
package nu.marginalia.model;

import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;

import java.util.EnumSet;

import static org.junit.jupiter.api.Assertions.assertEquals;

class WordMetadataTest {

    @Test
    public void codecTest() {
        verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class)));
        verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
        verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
        verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
        verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
        verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
        verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
        System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
        System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
        System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
        System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
        System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
        System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
        System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
        System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
        System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
        System.out.println(new WordMetadata(131973L));
    }

    public void verifyCodec(String message, WordMetadata data) {
        System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64));
        assertEquals(data, new WordMetadata(data.encode()), message);
    }

}
@ -1,13 +1,18 @@
package nu.marginalia.process.control;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Dummy implementation of ProcessHeartbeat that does nothing */
public class FakeProcessHeartbeat implements ProcessHeartbeat {

    private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
    @Override
    public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
        return new ProcessTaskHeartbeat<>() {
            @Override
            public void progress(T step) {}
            public void progress(T step) {
                logger.info("Progress: {}", step);
            }

            @Override
            public void shutDown() {}

@ -21,7 +26,9 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
    public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
        return new ProcessAdHocTaskHeartbeat() {
            @Override
            public void progress(String step, int progress, int total) {}
            public void progress(String step, int progress, int total) {
                logger.info("Progress: {}, {}/{}", step, progress, total);
            }

            @Override
            public void close() {}

@ -46,6 +46,7 @@ dependencies {
    implementation libs.bundles.mariadb

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

@ -64,6 +64,11 @@ public class GrpcMultiNodeChannelPool<STUB> {
        return nodeConfigurationWatcher.getQueryNodes();
    }

    /** Return the number of nodes that are eligible for broadcast-style requests */
    public int getNumNodes() {
        return nodeConfigurationWatcher.getQueryNodes().size();
    }

    /** Create a new call builder for the given method. This is a fluent-style
     * method, where you can chain calls to specify how to run the method.
     * <p></p>

@ -2,22 +2,7 @@
    <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
            <Filters>
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
        </Console>
        <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
                     ignoreExceptions="false">
            <PatternLayout>
                <Pattern>%-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n</Pattern>
            </PatternLayout>
            <SizeBasedTriggeringPolicy size="10MB" />
            <Filters>
                <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
                <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
            </Filters>
        </RollingFile>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="WARN" />
@ -38,15 +38,13 @@ dependencies {
    implementation project(':code:functions:search-query')
    implementation project(':code:execution:api')

    implementation project(':code:process-models:crawl-spec')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-convert:data-extractors')
    implementation project(':code:features-convert:stackexchange-xml')
    implementation project(':code:features-convert:reddit-json')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:crawling-process:ft-link-parser')
    implementation project(':code:execution:data-extractors')
    implementation project(':code:index:index-journal')
    implementation project(':code:index:api')
    implementation project(':code:process-mqapi')
    implementation project(':code:processes:process-mq-api')
    implementation project(':third-party:encyclopedia-marginalia-nu')

    implementation libs.bundles.slf4j

@ -84,6 +82,7 @@ dependencies {
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

@ -22,9 +22,9 @@ dependencies {
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:term-frequency-dict')
    implementation project(':code:libraries:blocking-thread-pool')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-convert:anchor-keywords')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:processes:crawling-process:ft-link-parser')
    implementation project(':code:processes:converting-process:ft-anchor-keywords')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:converting-process')
    implementation project(':third-party:commons-codec')

@ -3,13 +3,13 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

@ -2,13 +2,13 @@ package nu.marginalia.extractor;

import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

@ -27,7 +27,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

@ -97,8 +97,13 @@ public class TermFrequencyExporter implements ExporterIf {

    }

    private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) {
        TLongHashSet words = new TLongHashSet(10_000);
    private void processFile(Path crawlDataPath,
                             TLongIntHashMap counts,
                             AtomicInteger docCount,
                             SentenceExtractor se)
    {
        TLongHashSet words = new TLongHashSet(1000);

        try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
            while (stream.hasNext()) {
                if (Thread.interrupted())

@ -120,19 +125,33 @@ public class TermFrequencyExporter implements ExporterIf {
            return;
        }

        for (var sent : dld.sentences) {
        for (var sent : dld) {
            // Skip sentences with non-language tags, e.g. program code
            if (sent.htmlTags.stream().anyMatch(t -> t.nonLanguage))
                continue;

            for (var word : sent) {
                words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
            }

            for (var ngram : sent.ngramStemmed) {
                words.add(longHash(ngram.getBytes()));
            }
        }

        var random = ThreadLocalRandom.current();
        synchronized (counts) {
            words.forEach(w -> {
                counts.adjustOrPutValue(w, 1, 1);
                // Mathematicians hate him for this one weird trick:
                //
                // We generally aren't interested in low-frequency entries,
                // but due to zipf's law, there are a lot of them, in fact
                // almost the entire term frequency dictionary is full of them.
                //
                // So we use a simple statistical trick to reduce the number
                // of nearly unique entries in the dictionary, while still keeping the
                // distribution of higher-frequency entries relatively intact

                if (random.nextDouble() < 0.2) {
                    counts.adjustOrPutValue(w, 5, 5);
                }

                return true;
            });
        }
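A note on the sampling change above: replacing the unconditional adjustOrPutValue(w, 1, 1) with an increment of 5 that fires with probability 0.2 keeps the expected count per observed term unchanged (0.2 × 5 = 1), so frequently seen terms end up with roughly the same relative counts. A term observed only once, however, has an 80% chance of never entering the map at all (64% if observed twice), which is what thins out the near-unique Zipf tail of the dictionary.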
@ -6,19 +6,11 @@ import com.google.inject.Singleton;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.With;
import nu.marginalia.IndexLocations;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.svc.BackupService;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.index.api.IndexMqEndpoints;
import nu.marginalia.mq.MqMessageState;

@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.mqapi.loading.LoadRequest;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.process.ProcessOutboxes;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageState;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.svc.BackupService;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.sql.SQLException;
import java.util.List;

@ -110,9 +113,30 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
                if (rsp.state() != MqMessageState.OK)
                    yield new Error("Converter failed");

                if (!shouldAutoClean()) {
                    // If we're not auto-cleaning, we need to clean the NEW flag for the processed storage
                    storageService.setFileStorageState(processedId, FileStorageState.UNSET);
                    // (if we do auto-clean, we skip this step and purge the items after loading)
                }

                yield new Load(List.of(processedId));
            }
            case Load(List<FileStorageId> processedIds, long msgId) when msgId < 0 -> {
                // clear the output directory of the loader from any debris from partial jobs that have been aborted
                Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> {
                    try {
                        if (Files.isDirectory(path)) {
                            FileUtils.deleteDirectory(path.toFile());
                        }
                        else if (Files.isRegularFile(path)) {
                            Files.delete(path);
                        }
                    } catch (Exception e) {
                        logger.error("Error clearing staging area", e);
                    }
                });

                long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds));

                yield new Load(processedIds, id);

@ -122,9 +146,20 @@ public class ConvertAndLoadActor extends RecordActorPrototype {

                if (rsp.state() != MqMessageState.OK) {
                    yield new Error("Loader failed");
                } else {
                    cleanProcessedStorage(processedIds);
                }

                // If we're auto-cleaning, flag the processed files for deletion if they have the NEW flag,
                // indicating they've recently been created. We need to check this, so we don't delete archived
                // stuff that's being loaded manually

                if (shouldAutoClean()) {
                    for (var id : processedIds) {
                        if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) {
                            storageService.flagFileForDeletion(id);
                        }
                    }
                }

                yield new Backup(processedIds);
            }
            case Backup(List<FileStorageId> processedIds) -> {

@ -146,7 +181,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
                var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);

                if (rsp.state() != MqMessageState.OK)
                    yield new Error("Repartition failed");
                    yield new Error("Forward index construction failed");
                else
                    yield new ReindexFull();
            }

@ -155,7 +190,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
                var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);

                if (rsp.state() != MqMessageState.OK)
                    yield new Error("Repartition failed");
                    yield new Error("Full index construction failed");
                else
                    yield new ReindexPrio();
            }

@ -164,7 +199,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
                var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);

                if (rsp.state() != MqMessageState.OK)
                    yield new Error("Repartition failed");
                    yield new Error("Prio index construction failed");
                else
                    yield new SwitchIndex();
            }

@ -186,6 +221,16 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
        return mqIndexConstructorOutbox.sendAsync(new CreateIndexRequest(index));
    }

    private boolean shouldAutoClean() {
        try {
            return nodeConfigurationService.get(nodeId).autoClean();
        }
        catch (SQLException ex) {
            logger.error("Error getting node configuration", ex);
            return false; // safe default
        }
    }

    @Override
    public String describe() {

@ -215,24 +260,5 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
        this.nodeId = serviceConfiguration.node();
    }

    private void cleanProcessedStorage(List<FileStorageId> processedStorageId) {
        try {
            var config = nodeConfigurationService.get(nodeId);

            for (var id : processedStorageId) {
                if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) {
                    if (config.autoClean()) {
                        storageService.flagFileForDeletion(id);
                    }
                    else {
                        storageService.setFileStorageState(id, FileStorageState.UNSET);
                    }
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Error in clean-up", ex);
        }
    }

}
@ -19,6 +19,8 @@ import org.slf4j.MarkerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

@ -32,6 +34,7 @@ public class ProcessService {
    private final ServiceEventLog eventLog;

    private final ConcurrentHashMap<ProcessId, Process> processes = new ConcurrentHashMap<>();
    private final int node;

    public static ProcessService.ProcessId translateExternalIdBase(String id) {

@ -78,6 +81,7 @@ public class ProcessService {
    @Inject
    public ProcessService(BaseServiceParams params) {
        this.eventLog = params.eventLog;
        this.node = params.configuration.node();
    }

@ -86,7 +90,7 @@ public class ProcessService {
        List<String> args = new ArrayList<>();
        String javaHome = System.getProperty("java.home");

        args.add(STR."\{javaHome}/bin/java");
        args.add(javaHome + "/bin/java");
        args.add("-cp");
        args.add(System.getProperty("java.class.path"));

@ -94,6 +98,7 @@ public class ProcessService {
        else args.add("-da");

        args.add("--enable-preview");
        args.add("--enable-native-access=ALL-UNNAMED");

        String loggingOpts = System.getProperty("log4j2.configurationFile");
        if (loggingOpts != null) {

@ -104,6 +109,17 @@ public class ProcessService {
            args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode"));
        }

        if (Boolean.getBoolean("system.profile")) {
            // add jfr options
            args.add("-XX:+FlightRecorder");
            String jfrFileName = "/var/log/wmsa/profile-%s-%d-%s.jfr".formatted(
                    processId.toString(),
                    node,
                    LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME).replace(':', '.')
            );
            args.add("-XX:StartFlightRecording=filename=%s,name=%s".formatted(jfrFileName, processId.toString()));
        }

        args.addAll(processId.envOpts());
        args.add(processId.mainClass);
        args.addAll(Arrays.asList(extraArgs));
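With the change above, setting the system.profile property on the parent service makes each spawned child process record a JFR profile. An illustrative fragment of the resulting child command line (the process id, node number, timestamp and main class are placeholders, not taken from the diff):

    java -cp ... --enable-preview --enable-native-access=ALL-UNNAMED \
        -XX:+FlightRecorder \
        -XX:StartFlightRecording=filename=/var/log/wmsa/profile-<PROCESS_ID>-<node>-<timestamp>.jfr,name=<PROCESS_ID> \
        <mainClass> <extraArgs>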
@ -2,22 +2,25 @@ package nu.marginalia.svc;

import com.github.luben.zstd.ZstdInputStream;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.linkdb.LinkdbFileNames;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;

import com.google.inject.Inject;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Optional;

public class BackupService {

@ -97,35 +100,20 @@ public class BackupService {

    private void backupJournal(Path inputStorage, Path backupStorage) throws IOException
    {
        for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) {
            var dest = backupStorage.resolve(source.toFile().getName());

            try (var is = Files.newInputStream(source);
                 var os = Files.newOutputStream(dest)
            ) {
                IOUtils.copyLarge(is, os);
            }
        Optional<IndexJournal> journal = IndexJournal.findJournal(inputStorage);
        if (journal.isEmpty()) {
            throw new FileNotFoundException("No journal found in input storage");
        }

        FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile());
    }

    private void restoreJournal(Path destStorage, Path backupStorage) throws IOException {

        // Remove any old journal files first to avoid them getting loaded
        for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) {
            Files.delete(garbage);
        Optional<IndexJournal> journal = IndexJournal.findJournal(backupStorage);
        if (journal.isEmpty()) {
            throw new FileNotFoundException("No journal found in backup");
        }

        for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) {
            var dest = destStorage.resolve(source.toFile().getName());

            try (var is = Files.newInputStream(source);
                 var os = Files.newOutputStream(dest)
            ) {
                IOUtils.copyLarge(is, os);
            }
        }

        FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile());
    }

    private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException

@ -1,33 +0,0 @@
plugins {
    id 'java'

    id "de.undercouch.download" version "5.1.0"

    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')

    implementation libs.bundles.slf4j
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

@ -1,8 +0,0 @@
# Adblock

Contains an adblock simulator that reads an adblock specifications file and
uses it to identify if a document has ads.

## Central Classes

* [AdblockSimulator](java/nu/marginalia/adblock/AdblockSimulator.java)
@ -1,189 +0,0 @@
package nu.marginalia.keyword;

import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;

import com.google.inject.Inject;
import java.util.*;
import java.util.stream.Stream;

public class DocumentKeywordExtractor {

    private final KeywordExtractor keywordExtractor;
    private final TermFrequencyDict dict;
    private final NgramLexicon ngramLexicon;

    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
        this.dict = dict;
        this.ngramLexicon = ngramLexicon;
        this.keywordExtractor = new KeywordExtractor();
    }

    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {

        var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
        var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);

        var titleKeywords = new TitleKeywords(keywordExtractor, dld);
        var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2);
        var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld);
        var artifactKeywords = new ArtifactKeywords(dld);
        var urlKeywords = new UrlKeywords(url);

        var keywordMetadata = KeywordMetadata.builder()
                .bitmask(bitmask)
                .tfIdfCounts(tfIdfCounts)
                .titleKeywords(titleKeywords)
                .nameLikeKeywords(nameLikeKeywords)
                .subjectLikeKeywords(subjectLikeKeywords)
                .urlKeywords(urlKeywords)
                .build();

        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

        createSimpleWords(wordsBuilder, keywordMetadata, dld);

        createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
        createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);

        var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
        wordsBuilder.addImportantWords(importantWords);

        wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());

        return wordsBuilder;
    }

    private static Collection<String> getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) {
        return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords)
                .flatMap(k -> k.getReps().stream())
                .filter(w -> {
                    if (w.word.length() < 3)
                        return false;
                    if (w.word.contains("_"))
                        return false;
                    return true;
                })
                .sorted(tfIdfCounts.reversed())
                .limit(16)
                .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100)
                .sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w)))
                .limit(6)
                .map(w -> w.word)
                .toList();
    }

    private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
                                    KeywordMetadata metadata,
                                    WordReps words) {

        for (var rep : words.getReps()) {

            var word = rep.word;

            if (!word.isBlank()) {
                long meta = metadata.getMetadataForWord(rep.stemmed);

                assert meta != 0L : "Missing meta for " + rep.word;

                wordsBuilder.add(word, meta);
            }
        }
    }

    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                                   KeywordMetadata metadata,
                                   DocumentLanguageData documentLanguageData)
    {
        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
                break;

            for (var word : sent) {
                if (word.isStopWord()) {
                    continue;
                }

                String w = word.wordLowerCase();
                if (matchesWordPattern(w)) {
                    long meta = metadata.getMetadataForWord(word.stemmed());
                    assert meta != 0L : "Missing meta for " + word.word();

                    wordsBuilder.add(w, meta);
                }
            }

            for (var names : keywordExtractor.getProperNames(sent)) {
                var rep = new WordRep(sent, names);

                long meta = metadata.getMetadataForWord(rep.stemmed);
                assert meta != 0L : "Missing meta for " + rep.word;

                wordsBuilder.add(rep.word, meta);
            }

            for (int i = 0; i < sent.ngrams.length; i++) {
                var ngram = sent.ngrams[i];
                var ngramStemmed = sent.ngramStemmed[i];

                long meta = metadata.getMetadataForWord(ngramStemmed);
                assert meta != 0L : "Missing meta for " + ngram;

                wordsBuilder.add(ngram, meta);
            }

        }
    }

    boolean matchesWordPattern(String s) {
        // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4}

        String wordPartSeparator = ".-_/:+*";

        int i = 0;

        for (int run = 0; run < 15 && i < s.length(); run++, i++) {
            char c = s.charAt(i);
            if (c >= 'a' && c <= 'z') continue;
            if (c >= 'A' && c <= 'Z') continue;
            if (c >= '0' && c <= '9') continue;
            break;
        }

        if (i == 0)
            return false;

        for (int j = 0; j < 5; j++) {
            if (i == s.length()) return true;

            if (wordPartSeparator.indexOf(s.charAt(i)) < 0) {
                return false;
            }

            i++;

            for (int run = 0; run < 10 && i < s.length(); run++, i++) {
                char c = s.charAt(i);
                if (c >= 'a' && c <= 'z') continue;
                if (c >= 'A' && c <= 'Z') continue;
                if (c >= '0' && c <= '9') continue;
                break;
            }
        }

        return false;
    }
}
@ -1,64 +0,0 @@
package nu.marginalia.keyword;

import lombok.Builder;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;

import java.util.EnumSet;

class KeywordMetadata {

    private final KeywordPositionBitmask bitmask;
    private final TitleKeywords titleKeywords;
    private final NameLikeKeywords nameLikeKeywords;
    private final SubjectLikeKeywords subjectLikeKeywords;
    private final UrlKeywords urlKeywords;
    private final WordsTfIdfCounts tfIdfCounts;

    @Builder
    public KeywordMetadata(
            KeywordPositionBitmask bitmask,
            TitleKeywords titleKeywords,
            NameLikeKeywords nameLikeKeywords,
            SubjectLikeKeywords subjectLikeKeywords,
            UrlKeywords urlKeywords,
            WordsTfIdfCounts tfIdfCounts) {

        this.bitmask = bitmask;
        this.titleKeywords = titleKeywords;
        this.nameLikeKeywords = nameLikeKeywords;
        this.subjectLikeKeywords = subjectLikeKeywords;
        this.urlKeywords = urlKeywords;
        this.tfIdfCounts = tfIdfCounts;
    }

    public long getMetadataForWord(String stemmed) {

        int tfidf = tfIdfCounts.getTfIdf(stemmed);
        EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);

        if (tfidf > 100)
            flags.add(WordFlags.TfIdfHigh);

        if (subjectLikeKeywords.contains(stemmed))
            flags.add(WordFlags.Subjects);

        if (nameLikeKeywords.contains(stemmed))
            flags.add(WordFlags.NamesWords);

        if (titleKeywords.contains(stemmed))
            flags.add(WordFlags.Title);

        if (urlKeywords.containsUrl(stemmed))
            flags.add(WordFlags.UrlPath);

        if (urlKeywords.containsDomain(stemmed))
            flags.add(WordFlags.UrlDomain);

        long positions = bitmask.get(stemmed);

        return new WordMetadata(positions, flags).encode();
    }

}
@ -1,105 +0,0 @@
package nu.marginalia.keyword.extractors;

import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.WordMetadata;

/** Generates a position bitmask for each word in a document */
public class KeywordPositionBitmask {
    private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
    private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
    private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
    private static final int unmodulatedPortion = 16;

    @Inject
    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
                                  DocumentLanguageData dld)
    {

        // Mark the title words as position 0
        for (var sent : dld.titleSentences) {
            int posBit = 1;

            for (var word : sent) {
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var ngram : sent.ngramStemmed) {
                positionMask.merge(ngram, posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }
        }

        // Mark subsequent sentences in subsequent positions, with increasing sentence step size
        LinePosition linePos = new LinePosition();
        for (var sent : dld.sentences) {

            long posBit = (1L << linePos.pos()) & positionBitmask;

            for (var word : sent) {
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var ngram : sent.ngramStemmed) {
                positionMask.merge(ngram, posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            linePos.next(sent.length());
        }
    }

    public long get(String stemmed) {
        return positionMask.getOrDefault(stemmed, 0);
    }

    private long bitwiseOr(long a, long b) {
        return a | b;
    }

    private static class LinePosition {
        private int lineLengthCtr = 0;
        private int bitMaskPos = 1;

        public int pos() {
            if (bitMaskPos < unmodulatedPortion) {
                return bitMaskPos;
            }
            else {
                return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion));
            }
        }

        public void next(int sentenceLength)
        {
            if (sentenceLength > 10) {
                lineLengthCtr = 0;
                ++bitMaskPos;
            }

            lineLengthCtr += sentenceLength;
            if (lineLengthCtr > 15) {
                lineLengthCtr = 0;
                ++bitMaskPos;
            }

        }

    }
}
@ -1,68 +0,0 @@
package nu.marginalia.keyword.model;

import nu.marginalia.model.idx.WordMetadata;

import java.io.Serial;
import java.io.Serializable;

public final class DocumentKeywords implements Serializable {

    @Serial
    private static final long serialVersionUID = 1387282293082091432L;

    public final String[] keywords;
    public final long[] metadata;

    public DocumentKeywords(String[] keywords,
                            long[] metadata)
    {
        this.keywords = keywords;
        this.metadata = metadata;

        assert keywords.length == metadata.length;

        if (DocumentKeywords.class.desiredAssertionStatus()) {
            for (int i = 0; i < metadata.length; i++) {
                if (metadata[i] == 0) {
                    System.err.println("Bad metadata for keyword " + keywords[i]);
                }
            }
        }
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass().getSimpleName());
        sb.append('[');
        var pointer = newPointer();
        while (pointer.advancePointer()) {
            sb.append("\n\t ");

            long metadata = pointer.getMetadata();
            String keyword = pointer.getKeyword();
            sb.append(keyword);

            if (metadata != 0) {
                sb.append("/").append(new WordMetadata(metadata));
            }
        }
        return sb.append("\n]").toString();
    }

    public boolean isEmpty() {
        return keywords.length == 0;
    }

    public int size() {
        return keywords.length;
    }

    /** Return a pointer for traversing this structure */
    public DocumentKeywordsPointer newPointer() {
        return new DocumentKeywordsPointer(this);
    }

}
@ -1,122 +0,0 @@
package nu.marginalia.keyword.model;

import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

import java.util.*;

@Getter
public class DocumentKeywordsBuilder {
    public final Object2LongLinkedOpenHashMap<String> words;

    /** These were keywords that had signals of high relevance */
    public final Set<String> importantWords = new HashSet<>();

    // |------64 letters is this long-------------------------------|
    // granted, some of these words are word n-grams, but 64 ought to
    // be plenty. The lexicon writer has another limit that's higher.
    private final int MAX_WORD_LENGTH = 64;

    public DocumentKeywordsBuilder() {
        this(1600);
    }

    public DocumentKeywords build() {
        final String[] wordArray = new String[words.size()];
        final long[] meta = new long[words.size()];

        var iter = words.object2LongEntrySet().fastIterator();

        for (int i = 0; iter.hasNext(); i++) {
            var entry = iter.next();

            meta[i] = entry.getLongValue();
            wordArray[i] = entry.getKey();
        }

        return new DocumentKeywords(wordArray, meta);
    }

    public DocumentKeywordsBuilder(int capacity) {
        words = new Object2LongLinkedOpenHashMap<>(capacity);
    }

    public void add(String word, long meta) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.put(word, meta);
    }

    public void addImportantWords(Collection<String> words) {
        importantWords.addAll(words);
    }

    public void addJustNoMeta(String word) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.putIfAbsent(word, 0);
    }

    public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
        flagWords.forEach(word ->
                words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
        );
    }

    public void addAllSyntheticTerms(Collection<String> newWords) {
        long meta = WordFlags.Synthetic.asBit();

        // Only add the synthetic flag if the words aren't already present

        newWords.forEach(word -> words.putIfAbsent(word, meta));
    }

    public void addAnchorTerms(Map<String, Integer> keywords) {
        long flagA = WordFlags.ExternalLink.asBit();
        long flagB = flagA | WordFlags.Site.asBit();
        long flagC = flagB | WordFlags.SiteAdjacent.asBit();

        keywords.forEach((word, count) -> {
            if (count > 5) {
                words.mergeLong(word, flagC, (a, b) -> a|b);
            } else if (count > 2) {
                words.mergeLong(word, flagB, (a, b) -> a|b);
            } else {
                words.mergeLong(word, flagA, (a, b) -> a|b);
            }
        });
    }

    public List<String> getWordsWithAnyFlag(long flags) {
        List<String> ret = new ArrayList<>();

        for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
            var entry = iter.next();
            if ((flags & entry.getLongValue()) != 0) {
                ret.add(entry.getKey());
            }
        }

        return ret;
    }

    public int size() {
        return words.size();
    }

    public WordMetadata getMetaForWord(String word) {
        return new WordMetadata(words.getLong(word));
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("[ ");
        words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
        return sb.append(']').toString();
    }

}
@ -1,41 +0,0 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
/** Pointer into a {@see DocumentKeywords}. It starts out before the first position,
|
||||
* and is stepped forward with advancePointer().
|
||||
* */
|
||||
public class DocumentKeywordsPointer {
|
||||
private int pos = -1;
|
||||
|
||||
private final DocumentKeywords keywords;
|
||||
|
||||
DocumentKeywordsPointer(DocumentKeywords keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
/** Number of positions remaining */
|
||||
public int remaining() {
|
||||
return keywords.size() - Math.max(0, pos);
|
||||
}
|
||||
|
||||
/** Return the keyword associated with the current position */
|
||||
public String getKeyword() {
|
||||
return keywords.keywords[pos];
|
||||
}
|
||||
|
||||
/** Return the metadata associated with the current position */
|
||||
public long getMetadata() {
|
||||
return keywords.metadata[pos];
|
||||
}
|
||||
|
||||
/** Advance the current position,
|
||||
* returns false if this was the
|
||||
* last position */
|
||||
public boolean advancePointer() {
|
||||
return ++pos < keywords.size();
|
||||
}
|
||||
|
||||
/** Returns true unless the pointer is beyond the last position in the keyword set */
|
||||
public boolean hasMore() {
|
||||
return pos + 1 < keywords.size();
|
||||
}
|
||||
}
|
@ -1,149 +0,0 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
class DocumentKeywordExtractorTest {
|
||||
|
||||
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
|
||||
new TermFrequencyDict(WmsaHome.getLanguageModels()),
|
||||
new NgramLexicon(WmsaHome.getLanguageModels()));
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
@Test
|
||||
public void testWordPattern() {
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("test"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));
|
||||
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("std::vector"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("c++"));
|
||||
Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h"));
|
||||
Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testEmptyMetadata() throws URISyntaxException {
|
||||
var dld = se.extractSentences("""
|
||||
Some sample text, I'm not sure what even triggers this
|
||||
""", "A title perhaps?");
|
||||
var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
|
||||
var keywords = keywordBuilder.build();
|
||||
|
||||
var pointer = keywords.newPointer();
|
||||
while (pointer.advancePointer()) {
|
||||
if (pointer.getMetadata() == 0L) {
|
||||
System.out.println("Aha! " + pointer.getKeyword());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeyboards2() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
|
||||
|
||||
keywords.getWords().forEach((k, v) -> {
|
||||
if (k.contains("_")) {
|
||||
System.out.println(k + " " + new WordMetadata(v));
|
||||
}
|
||||
});
|
||||
}
|
||||
@Test
|
||||
public void testKeyboards() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
|
||||
System.out.println(keywords.getMetaForWord("mechanical"));
|
||||
System.out.println(keywords.getMetaForWord("keyboard"));
|
||||
System.out.println(keywords.getMetaForWord("keyboards"));
|
||||
|
||||
System.out.println(new WordMetadata(8894889328781L));
|
||||
System.out.println(new WordMetadata(4294967297L));
|
||||
System.out.println(new WordMetadata(566820053975498886L));
|
||||
// -
|
||||
System.out.println(new WordMetadata(1198298103937L));
|
||||
System.out.println(new WordMetadata(1103808168065L));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMadonna() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
var keywords = extractor.extractKeywords(
|
||||
se.extractSentences(doc),
|
||||
new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
|
||||
);
|
||||
|
||||
var keywordsBuilt = keywords.build();
|
||||
var ptr = keywordsBuilt.newPointer();
|
||||
|
||||
Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
|
||||
|
||||
while (ptr.advancePointer()) {
|
||||
if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
|
||||
Assertions.assertNull(
|
||||
dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
|
||||
Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
|
||||
Assertions.assertNotEquals(
|
||||
dirtyAndBlues.get("dirty"),
|
||||
dirtyAndBlues.get("blues")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSpam() throws IOException, URISyntaxException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
var doc = Jsoup.parse(html);
|
||||
doc.filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
|
||||
new TermFrequencyDict(WmsaHome.getLanguageModels()),
|
||||
new NgramLexicon(WmsaHome.getLanguageModels()));
|
||||
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
|
||||
|
||||
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
|
||||
System.out.println(keywords.getMetaForWord("knitting"));
|
||||
}
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
|
||||
id "de.undercouch.download" version "5.1.0"
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:model')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.notnull
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.jsoup
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation project(':code:common:config')
|
||||
}
|
@ -1,7 +0,0 @@
# Pubdate

Contains advanced haruspicy for figuring out when a document was published.

## Central Classes

* [PubDateSniffer](java/nu/marginalia/pubdate/PubDateSniffer.java)
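The PubDateSniffer referenced above is not reproduced in this diff. The following is a minimal, hypothetical sketch of the kind of date sniffing such a class performs, using Jsoup (already a converter dependency); the metadata conventions checked and the URL fallback are illustrative assumptions, not the actual implementation.

```java
// A minimal, hypothetical sketch of publication date sniffing -- not the actual
// PubDateSniffer. It checks a couple of common metadata conventions and falls
// back to a year embedded in the URL path.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class PubDateSketch {
    private static final Pattern URL_YEAR = Pattern.compile("/(199\\d|20[0-4]\\d)/");

    static Optional<String> sniff(Document doc, String url) {
        // <meta property="article:published_time" content="..."> and friends
        for (Element meta : doc.getElementsByTag("meta")) {
            String key = meta.hasAttr("property") ? meta.attr("property") : meta.attr("name");
            if (key.equalsIgnoreCase("article:published_time") || key.equalsIgnoreCase("date")) {
                String content = meta.attr("content");
                if (!content.isBlank()) return Optional.of(content);
            }
        }

        // <time datetime="..."> elements
        for (Element time : doc.getElementsByTag("time")) {
            if (time.hasAttr("datetime")) return Optional.of(time.attr("datetime"));
        }

        // Fall back to a year-looking path segment, e.g. /2021/12/24/
        Matcher m = URL_YEAR.matcher(url);
        if (m.find()) return Optional.of(m.group(1));

        return Optional.empty();
    }

    public static void main(String[] args) {
        Document doc = Jsoup.parse("<html><head><meta property='article:published_time' content='2021-12-24'></head></html>");
        System.out.println(sniff(doc, "https://example.com/2021/12/24/post"));
    }
}
```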
@ -1,13 +0,0 @@
# Converter Features

## Major features

* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document
* [summary-extraction](summary-extraction/) - Generate an excerpt/quote from a website to display on the search results page.


## Smaller features:

* [adblock](adblock/) - Simulates Adblock
* [pubdate](pubdate/) - Determines when a document was published
* [topic-detection](topic-detection/) - Tries to identify the topic of a website
@ -1,44 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:common:model')
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation libs.sqlite
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.guava
|
||||
implementation libs.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.trove
|
||||
implementation libs.commons.compress
|
||||
implementation libs.xz
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
maxHeapSize = "8G"
|
||||
useJUnitPlatform()
|
||||
}
|
@ -1,43 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:common:model')
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation libs.sqlite
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.guava
|
||||
implementation libs.zstd
|
||||
implementation libs.trove
|
||||
implementation libs.commons.compress
|
||||
implementation libs.xz
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
maxHeapSize = "8G"
|
||||
useJUnitPlatform()
|
||||
}
|
@ -1,18 +0,0 @@
Stackexchange's data is a jumble of questions and answers,
where the answers refer to the questions with a parentId field.

e.g.
```xml
<?xml version="1.0" encoding="utf-8"?>
<posts>
<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="<p>When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?</p>" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="<resolution><speed><quality>" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="<p>I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be <a href="http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/">harmful</a> for humans.</p> <p>What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?</p>" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="<print-material><safety><health>" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="<p>There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
</posts>
```

Since the search engine wants to extract keywords for each thread
holistically, not by question or answer, it is necessary to re-arrange
the data (which is very large). SQLite does a decent job of enabling
this task.
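To make the re-arrangement described above concrete, here is a small hypothetical sketch of reading a whole thread back out of SQLite once the posts have been grouped under their question. The table and column names are assumptions for illustration only; the actual loader and schema live elsewhere in the repository.

```java
// Hypothetical sketch of the re-arrangement described above: group answers under
// their parent question so a whole thread can be processed as one unit.
// Assumes the XML dump has already been loaded into a SQLite table
// posts(id, parentId, postTypeId, title, body); names are illustrative only.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

class StackexchangeThreadReader {
    record PostThread(int questionId, String title, List<String> bodies) {}

    static PostThread readThread(Connection conn, int questionId) throws SQLException {
        List<String> bodies = new ArrayList<>();
        String title = null;

        try (var stmt = conn.prepareStatement("""
                SELECT title, body FROM posts
                WHERE id = ? OR parentId = ?
                ORDER BY id
                """)) {
            stmt.setInt(1, questionId);
            stmt.setInt(2, questionId);
            try (var rs = stmt.executeQuery()) {
                while (rs.next()) {
                    if (title == null) title = rs.getString("title");
                    bodies.add(rs.getString("body"));
                }
            }
        }
        return new PostThread(questionId, title, bodies);
    }

    public static void main(String[] args) throws SQLException {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:stackexchange.db")) {
            var thread = readThread(conn, 2);
            System.out.println(thread.title() + ": " + thread.bodies().size() + " posts");
        }
    }
}
```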
@ -1,42 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.jsoup
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.guava
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.trove
|
||||
implementation libs.fastutil
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation project(':code:features-convert:keyword-extraction')
|
||||
testImplementation project(':code:libraries:language-processing')
|
||||
testImplementation project(':code:libraries:term-frequency-dict')
|
||||
testImplementation project(':code:common:config')
|
||||
testImplementation project(':code:common:model')
|
||||
}
|
||||
|
@ -1,25 +0,0 @@
# Summary Extraction

This feature attempts to find a descriptive passage of text that summarizes
what a search result "is about". It's the text you see below a search result.

It must solve two problems:

1. Identify which part of the document contains "the text".
The crux is that the document may be anywhere from 1993 to the present, with era-appropriate
formatting. It may be formatted with <center>ed <font>-tags, or semantic HTML5.

2. Identify which part of "the text" best describes the document.

It uses several naive heuristics to try to find something that makes sense,
and there is probably room for improvement.

There are many good techniques for doing this, but they've sadly not proved
particularly fast. Whatever solution is used needs to be able to summarize on the
order of 100,000,000 documents with a time budget of a couple of hours.


## Central Classes

* [SummaryExtractor](java/nu/marginalia/summary/SummaryExtractor.java)
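As a rough illustration of the naive heuristics described above (and not the actual SummaryExtractor), a sketch along these lines picks the first sufficiently long block of text, whether it comes from semantic HTML or from older tag soup; the selectors and length threshold are invented for the example.

```java
// Illustrative only -- not the actual SummaryExtractor. A crude heuristic of the
// kind described above: pick the first reasonably long paragraph-ish block of text,
// whether it comes from <p> tags or from old-school table/font markup.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

class SummarySketch {
    static String summarize(String html, int maxLength) {
        Document doc = Jsoup.parse(html);

        // Try semantic paragraphs first, then fall back to other block-level text
        for (String selector : new String[] { "p", "td, font, center, div" }) {
            for (Element el : doc.select(selector)) {
                String text = el.ownText().trim();
                // Skip navigation crumbs, bylines and other short fragments
                if (text.length() >= 100) {
                    return text.length() > maxLength
                            ? text.substring(0, maxLength) + "..."
                            : text;
                }
            }
        }
        return "";
    }
}
```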
@ -1,34 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
|
||||
id "de.undercouch.download" version "5.1.0"
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':third-party:porterstemmer')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.notnull
|
||||
implementation libs.jsoup
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
@ -1,4 +0,0 @@
# Topic Detection

This is an experiment in using hand-crafted naive Bayesian filters to detect the topic of a website.
It's noteworthy that it detects recipes very well.
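A hand-crafted filter of the kind described above can be sketched as a weighted cue-word score, where the weights play the role of per-word log-likelihood ratios in a naive Bayes model. The word list, weights, and threshold below are invented for illustration and are not the project's actual filters.

```java
// Hypothetical sketch of a hand-crafted naive Bayes style topic filter: sum the
// weights of cue words found in the document (the weights stand in for per-word
// log-likelihood ratios) and flag the document if the total passes a threshold.
import java.util.List;
import java.util.Map;

class RecipeDetectorSketch {
    private static final Map<String, Double> CUE_WORDS = Map.of(
            "tablespoon", 2.0,
            "teaspoon", 2.0,
            "preheat", 1.5,
            "simmer", 1.2,
            "ingredients", 1.0
    );
    private static final double THRESHOLD = 3.0;

    static boolean looksLikeRecipe(List<String> documentWords) {
        double score = 0;
        for (String word : documentWords) {
            score += CUE_WORDS.getOrDefault(word.toLowerCase(), 0.0);
        }
        return score >= THRESHOLD;
    }
}
```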
@ -1,8 +0,0 @@
# Crawl Features

These are bits of search-engine related code that are relatively isolated pieces of business logic
that benefit from the clarity of being kept separate from the rest of the crawling code.

* [content-type](content-type/) - Content Type identification
* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists
* [link-parser](link-parser/) - Code for parsing and normalizing links
@ -2,6 +2,11 @@ package nu.marginalia.api.math;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.math.MathProtobufCodec.DictionaryLookup;
|
||||
import nu.marginalia.api.math.MathProtobufCodec.EvalMath;
|
||||
import nu.marginalia.api.math.MathProtobufCodec.SpellCheck;
|
||||
import nu.marginalia.api.math.MathProtobufCodec.UnitConversion;
|
||||
import nu.marginalia.api.math.model.DictionaryResponse;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -9,14 +14,11 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import nu.marginalia.api.math.model.*;
|
||||
import nu.marginalia.api.math.MathProtobufCodec.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
|
||||
@Singleton
|
||||
@ -49,24 +51,14 @@ public class MathClient {
|
||||
.thenApply(SpellCheck::convertResponse);
|
||||
}
|
||||
|
||||
public Map<String, List<String>> spellCheck(List<String> words, Duration timeout) throws InterruptedException {
|
||||
// This looks a bit different because we need to spell check multiple words, and we want to do it in parallel
|
||||
public Future<Map<String, List<String>>> spellCheck(List<String> words) throws InterruptedException {
|
||||
List<RpcSpellCheckRequest> requests = words.stream().map(SpellCheck::createRequest).toList();
|
||||
|
||||
var future = channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck)
|
||||
return channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck)
|
||||
.async(executor)
|
||||
.runFor(requests);
|
||||
|
||||
try {
|
||||
var results = future.get();
|
||||
Map<String, List<String>> map = new HashMap<>();
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
map.put(words.get(i), SpellCheck.convertResponse(results.get(i)));
|
||||
}
|
||||
return map;
|
||||
}
|
||||
catch (ExecutionException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
.runFor(requests)
|
||||
.thenApply(rsp -> SpellCheck.convertResponses(words, rsp));
|
||||
}
|
||||
|
||||
public Future<String> unitConversion(String value, String from, String to) {
|
||||
|
@ -3,7 +3,9 @@ package nu.marginalia.api.math;
|
||||
import nu.marginalia.api.math.model.DictionaryEntry;
|
||||
import nu.marginalia.api.math.model.DictionaryResponse;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class MathProtobufCodec {
|
||||
|
||||
@ -35,6 +37,15 @@ public class MathProtobufCodec {
|
||||
public static List<String> convertResponse(RpcSpellCheckResponse rsp) {
|
||||
return rsp.getSuggestionsList();
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, List<String>> convertResponses(List<String> words, List<RpcSpellCheckResponse> responses) {
|
||||
var map = new HashMap<String, List<String>>();
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
map.put(words.get(i), responses.get(i).getSuggestionsList());
|
||||
}
|
||||
return map;
|
||||
}
|
||||
}
|
||||
|
||||
public static class UnitConversion {
|
||||
|
@ -23,6 +23,7 @@ dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -1,11 +1,9 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
@ -48,11 +46,22 @@ public class IndexProtobufCodec {
|
||||
}
|
||||
|
||||
public static SearchQuery convertRpcQuery(RpcQuery query) {
|
||||
List<List<String>> coherences = new ArrayList<>();
|
||||
List<SearchPhraseConstraint> phraeConstraints = new ArrayList<>();
|
||||
|
||||
for (int j = 0; j < query.getCoherencesCount(); j++) {
|
||||
var coh = query.getCoherences(j);
|
||||
coherences.add(new ArrayList<>(coh.getCoherencesList()));
|
||||
for (int j = 0; j < query.getPhrasesCount(); j++) {
|
||||
var coh = query.getPhrases(j);
|
||||
if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
|
||||
}
|
||||
}
|
||||
|
||||
return new SearchQuery(
|
||||
@ -61,7 +70,7 @@ public class IndexProtobufCodec {
|
||||
query.getExcludeList(),
|
||||
query.getAdviceList(),
|
||||
query.getPriorityList(),
|
||||
coherences
|
||||
phraeConstraints
|
||||
);
|
||||
}
|
||||
|
||||
@ -74,8 +83,21 @@ public class IndexProtobufCodec {
|
||||
.addAllExclude(searchQuery.getSearchTermsExclude())
|
||||
.addAllPriority(searchQuery.getSearchTermsPriority());
|
||||
|
||||
for (var coherences : searchQuery.searchTermCoherences) {
|
||||
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
|
||||
for (var constraint : searchQuery.phraseConstraints) {
|
||||
switch (constraint) {
|
||||
case SearchPhraseConstraint.Optional(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.OPTIONAL);
|
||||
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.MANDATORY);
|
||||
case SearchPhraseConstraint.Full(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.FULL);
|
||||
}
|
||||
}
|
||||
|
||||
return subqueryBuilder.build();
|
||||
@ -86,19 +108,17 @@ public class IndexProtobufCodec {
|
||||
return ResultRankingParameters.sensibleDefaults();
|
||||
|
||||
return new ResultRankingParameters(
|
||||
new Bm25Parameters(params.getFullK(), params.getFullB()),
|
||||
new Bm25Parameters(params.getPrioK(), params.getPrioB()),
|
||||
new Bm25Parameters(params.getBm25K(), params.getBm25B()),
|
||||
params.getShortDocumentThreshold(),
|
||||
params.getShortDocumentPenalty(),
|
||||
params.getDomainRankBonus(),
|
||||
params.getQualityPenalty(),
|
||||
params.getShortSentenceThreshold(),
|
||||
params.getShortSentencePenalty(),
|
||||
params.getBm25FullWeight(),
|
||||
params.getBm25NgramWeight(),
|
||||
params.getBm25PrioWeight(),
|
||||
params.getTcfJaccardWeight(),
|
||||
params.getTcfOverlapWeight(),
|
||||
params.getBm25Weight(),
|
||||
params.getTcfFirstPositionWeight(),
|
||||
params.getTcfVerbatimWeight(),
|
||||
params.getTcfProximityWeight(),
|
||||
ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()),
|
||||
params.getTemporalBiasWeight(),
|
||||
params.getExportDebugData()
|
||||
@ -113,21 +133,18 @@ public class IndexProtobufCodec {
|
||||
}
|
||||
|
||||
var builder = RpcResultRankingParameters.newBuilder()
|
||||
.setFullB(rankingParams.fullParams.b())
|
||||
.setFullK(rankingParams.fullParams.k())
|
||||
.setPrioB(rankingParams.prioParams.b())
|
||||
.setPrioK(rankingParams.prioParams.k())
|
||||
.setBm25B(rankingParams.bm25Params.b())
|
||||
.setBm25K(rankingParams.bm25Params.k())
|
||||
.setShortDocumentThreshold(rankingParams.shortDocumentThreshold)
|
||||
.setShortDocumentPenalty(rankingParams.shortDocumentPenalty)
|
||||
.setDomainRankBonus(rankingParams.domainRankBonus)
|
||||
.setQualityPenalty(rankingParams.qualityPenalty)
|
||||
.setShortSentenceThreshold(rankingParams.shortSentenceThreshold)
|
||||
.setShortSentencePenalty(rankingParams.shortSentencePenalty)
|
||||
.setBm25FullWeight(rankingParams.bm25FullWeight)
|
||||
.setBm25NgramWeight(rankingParams.bm25NgramWeight)
|
||||
.setBm25PrioWeight(rankingParams.bm25PrioWeight)
|
||||
.setTcfOverlapWeight(rankingParams.tcfOverlapWeight)
|
||||
.setTcfJaccardWeight(rankingParams.tcfJaccardWeight)
|
||||
.setBm25Weight(rankingParams.bm25Weight)
|
||||
.setTcfFirstPositionWeight(rankingParams.tcfFirstPosition)
|
||||
.setTcfProximityWeight(rankingParams.tcfProximity)
|
||||
.setTcfVerbatimWeight(rankingParams.tcfVerbatim)
|
||||
.setTemporalBiasWeight(rankingParams.temporalBiasWeight)
|
||||
.setExportDebugData(rankingParams.exportDebugData);
|
||||
|
||||
@ -142,45 +159,4 @@ public class IndexProtobufCodec {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
||||
public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) {
|
||||
if (rankingDetails == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return RpcResultRankingDetails.newBuilder()
|
||||
.setInputs(convertRankingInputs(rankingDetails.inputs()))
|
||||
.setOutput(convertRankingOutput(rankingDetails.outputs()))
|
||||
.build();
|
||||
}
|
||||
|
||||
private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) {
|
||||
return RpcResultRankingOutputs.newBuilder()
|
||||
.setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty())
|
||||
.setQualityPenalty(outputs.qualityPenalty())
|
||||
.setRankingBonus(outputs.rankingBonus())
|
||||
.setTopologyBonus(outputs.topologyBonus())
|
||||
.setDocumentLengthPenalty(outputs.documentLengthPenalty())
|
||||
.setTemporalBias(outputs.temporalBias())
|
||||
.setFlagsPenalty(outputs.flagsPenalty())
|
||||
.setOverallPart(outputs.overallPart())
|
||||
.setTcfOverlap(outputs.tcfOverlap())
|
||||
.setTcfJaccard(outputs.tcfJaccard())
|
||||
.setBM25F(outputs.bM25F())
|
||||
.setBM25N(outputs.bM25N())
|
||||
.setBM25P(outputs.bM25P())
|
||||
.build();
|
||||
}
|
||||
|
||||
private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) {
|
||||
return RpcResultRankingInputs.newBuilder()
|
||||
.setRank(inputs.rank())
|
||||
.setAsl(inputs.asl())
|
||||
.setQuality(inputs.quality())
|
||||
.setSize(inputs.size())
|
||||
.setTopology(inputs.topology())
|
||||
.setYear(inputs.year())
|
||||
.addAllFlags(inputs.flags())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
@ -1,21 +1,25 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class QueryProtobufCodec {
|
||||
|
||||
@ -130,6 +134,7 @@ public class QueryProtobufCodec {
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore(),
|
||||
results.getResultsFromDomain(),
|
||||
convertRankingDetails(results.getRankingDetails())
|
||||
);
|
||||
}
|
||||
@ -137,46 +142,109 @@ public class QueryProtobufCodec {
|
||||
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
||||
if (rankingDetails == null)
|
||||
return null;
|
||||
var inputs = rankingDetails.getInputs();
|
||||
var outputs = rankingDetails.getOutput();
|
||||
|
||||
var docData = rankingDetails.getDocumentOutputs();
|
||||
var termData = rankingDetails.getTermOutputs();
|
||||
|
||||
return new ResultRankingDetails(
|
||||
convertRankingInputs(inputs),
|
||||
convertRankingOutputs(outputs)
|
||||
convertDocumentOutputs(docData),
|
||||
convertTermData(termData)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) {
|
||||
return new ResultRankingOutputs(
|
||||
outputs.getAverageSentenceLengthPenalty(),
|
||||
outputs.getQualityPenalty(),
|
||||
outputs.getRankingBonus(),
|
||||
outputs.getTopologyBonus(),
|
||||
outputs.getDocumentLengthPenalty(),
|
||||
outputs.getTemporalBias(),
|
||||
outputs.getFlagsPenalty(),
|
||||
outputs.getOverallPart(),
|
||||
outputs.getTcfOverlap(),
|
||||
outputs.getTcfJaccard(),
|
||||
outputs.getBM25F(),
|
||||
outputs.getBM25N(),
|
||||
outputs.getBM25P()
|
||||
);
|
||||
private static List<DebugTermFactorGroup> convertTermData(RpcResultTermRankingOutputs termData) {
|
||||
Map<String, Long> termIdByName = new HashMap<>();
|
||||
Map<String, List<DebugFactor>> factorsByTerm = new HashMap<>();
|
||||
|
||||
for (int i = 0; i < termData.getTermCount(); i++) {
|
||||
termIdByName.put(termData.getTerm(i), termData.getTermId(i));
|
||||
factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>())
|
||||
.add(new DebugFactor(termData.getFactor(i), termData.getValue(i)));
|
||||
}
|
||||
|
||||
Map<String, List<DebugFactorGroup>> factorGroupsByTerm = new HashMap<>();
|
||||
for (var entry : factorsByTerm.entrySet()) {
|
||||
String term = entry.getKey();
|
||||
var factorsList = entry.getValue();
|
||||
|
||||
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||
|
||||
for (var factor : factorsList) {
|
||||
String[] parts = factor.factor().split("\\.");
|
||||
|
||||
String group, name;
|
||||
|
||||
if (parts.length != 2) {
|
||||
group = "unknown";
|
||||
name = parts[0];
|
||||
} else {
|
||||
group = parts[0];
|
||||
name = parts[1];
|
||||
}
|
||||
|
||||
|
||||
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||
.add(new DebugFactor(name, factor.value()));
|
||||
}
|
||||
|
||||
factorsByGroup.forEach((groupName, groupData) -> {
|
||||
factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>())
|
||||
.add(new DebugFactorGroup(groupName, groupData));
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
List<DebugTermFactorGroup> groups = new ArrayList<>();
|
||||
|
||||
for (var entry : factorGroupsByTerm.entrySet()) {
|
||||
groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue()));
|
||||
}
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
||||
private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) {
|
||||
return new ResultRankingInputs(
|
||||
inputs.getRank(),
|
||||
inputs.getAsl(),
|
||||
inputs.getQuality(),
|
||||
inputs.getSize(),
|
||||
inputs.getTopology(),
|
||||
inputs.getYear(),
|
||||
inputs.getFlagsList()
|
||||
);
|
||||
private static List<DebugFactorGroup> convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) {
|
||||
|
||||
List<DebugFactor> unclusteredFactors = new ArrayList<>();
|
||||
for (int i = 0; i < docData.getFactorCount(); i++) {
|
||||
String factor = docData.getFactor(i);
|
||||
String value = docData.getValue(i);
|
||||
unclusteredFactors.add(new DebugFactor(factor, value));
|
||||
}
|
||||
|
||||
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||
|
||||
for (var factor : unclusteredFactors) {
|
||||
String factorName = factor.factor();
|
||||
String value = factor.value();
|
||||
|
||||
String[] parts = factorName.split("\\.");
|
||||
|
||||
String group, name;
|
||||
|
||||
if (parts.length != 2) {
|
||||
group = "unknown";
|
||||
name = factorName;
|
||||
}
|
||||
else {
|
||||
group = parts[0];
|
||||
name = parts[1];
|
||||
}
|
||||
|
||||
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||
.add(new DebugFactor(name, value));
|
||||
}
|
||||
|
||||
List<DebugFactorGroup> groups = new ArrayList<>();
|
||||
for (var entry : factorsByGroup.entrySet()) {
|
||||
groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue()));
|
||||
}
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
||||
|
||||
private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) {
|
||||
var keywordScores = new ArrayList<SearchResultKeywordScore>(rawItem.getKeywordScoresCount());
|
||||
|
||||
@ -188,8 +256,9 @@ public class QueryProtobufCodec {
|
||||
rawItem.getEncodedDocMetadata(),
|
||||
rawItem.getHtmlFeatures(),
|
||||
keywordScores,
|
||||
rawItem.getResultsFromDomain(),
|
||||
rawItem.getHasPriorityTerms(),
|
||||
0, // Not set
|
||||
null, // Not set
|
||||
Double.NaN // Not set
|
||||
);
|
||||
}
|
||||
@ -198,7 +267,8 @@ public class QueryProtobufCodec {
|
||||
return new SearchResultKeywordScore(
|
||||
keywordScores.getKeyword(),
|
||||
-1, // termId is internal to index service
|
||||
keywordScores.getEncodedWordMetadata()
|
||||
(byte) keywordScores.getFlags(),
|
||||
keywordScores.getPositions()
|
||||
);
|
||||
}
|
||||
|
||||
@ -257,6 +327,7 @@ public class QueryProtobufCodec {
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore(),
|
||||
rpcDecoratedResultItem.getResultsFromDomain(),
|
||||
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
|
||||
);
|
||||
}
|
||||
|
@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.function.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.ToIntFunction;
|
||||
import java.util.function.ToLongFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
|
||||
return new CompiledQueryLong(root, data.mapToLong(mapper));
|
||||
}
|
||||
|
||||
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
|
||||
return new CompiledQueryLong(root, data.mapToInt(mapper));
|
||||
public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
|
||||
return new CompiledQueryInt(root, data.mapToInt(mapper));
|
||||
}
|
||||
|
||||
public CqExpression root() {
|
||||
|
@ -5,8 +5,8 @@ import java.util.stream.IntStream;
|
||||
|
||||
/** A compiled index service query */
|
||||
public class CompiledQueryInt {
|
||||
private final CqExpression root;
|
||||
private final CqDataInt data;
|
||||
public final CqExpression root;
|
||||
public final CqDataInt data;
|
||||
|
||||
public CompiledQueryInt(CqExpression root, CqDataInt data) {
|
||||
this.root = root;
|
||||
@ -26,7 +26,7 @@ public class CompiledQueryInt {
|
||||
return IntStream.range(0, data.size());
|
||||
}
|
||||
|
||||
public long at(int index) {
|
||||
public int at(int index) {
|
||||
return data.get(index);
|
||||
}
|
||||
|
||||
|
@ -61,7 +61,8 @@ public class CompiledQueryParser {
|
||||
|
||||
String[] cqData = new String[wordIds.size()];
|
||||
wordIds.forEach((w, i) -> cqData[i] = w);
|
||||
return new CompiledQuery<>(root, new CqData<>(cqData));
|
||||
|
||||
return root.newQuery(cqData);
|
||||
|
||||
}
|
||||
|
||||
|
@ -33,13 +33,13 @@ public class CqData<T> {
|
||||
return new CqDataLong(newData);
|
||||
}
|
||||
|
||||
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
|
||||
long[] newData = new long[data.length];
|
||||
public CqDataInt mapToInt(ToIntFunction<T> mapper) {
|
||||
int[] newData = new int[data.length];
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
newData[i] = mapper.applyAsInt((T) data[i]);
|
||||
newData[i] = mapper.applyAsInt(data[i]);
|
||||
}
|
||||
|
||||
return new CqDataLong(newData);
|
||||
return new CqDataInt(newData);
|
||||
}
|
||||
|
||||
public T get(int i) {
|
||||
|
@ -8,6 +8,18 @@ import java.util.stream.Stream;
|
||||
*
|
||||
*/
|
||||
public sealed interface CqExpression {
|
||||
/** Create a new query for the provided data using this expression as the root */
|
||||
default <T> CompiledQuery<T> newQuery(T[] data) {
|
||||
return new CompiledQuery<>(this, data);
|
||||
}
|
||||
/** Create a new query for the provided data using this expression as the root */
|
||||
default CompiledQueryInt newQuery(int[] data) {
|
||||
return new CompiledQueryInt(this, new CqDataInt(data));
|
||||
}
|
||||
/** Create a new query for the provided data using this expression as the root */
|
||||
default CompiledQueryLong newQuery(long[] data) {
|
||||
return new CompiledQueryLong(this, new CqDataLong(data));
|
||||
}
|
||||
|
||||
Stream<Word> stream();
|
||||
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -36,7 +37,10 @@ public class CompiledQueryAggregates {
|
||||
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
|
||||
return query.root.visit(new CqIntMaxMinOperator(query, operator));
|
||||
}
|
||||
|
||||
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
|
||||
public static <T> int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) {
|
||||
return query.root.visit(new CqIntMaxMinOperator(query, operator));
|
||||
}
|
||||
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
|
||||
public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
|
||||
return query.root.visit(new CqIntMaxMinOperator(query, operator));
|
||||
@ -55,13 +59,4 @@ public class CompiledQueryAggregates {
|
||||
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
||||
}
|
||||
|
||||
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
||||
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||
return query.root().visit(new CqPositionsOperator(query, operator));
|
||||
}
|
||||
|
||||
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
||||
public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
|
||||
return query.root().visit(new CqPositionsOperator(query, operator));
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
|
||||
@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
|
||||
public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
|
||||
this.operator = idx -> operator.applyAsInt(query.at(idx));
|
||||
}
|
||||
|
||||
public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) {
|
||||
this.operator = idx -> operator.applyAsInt(query.at(idx));
|
||||
}
|
||||
@Override
|
||||
public int onAnd(List<? extends CqExpression> parts) {
|
||||
int value = parts.getFirst().visit(this);
|
||||
|
@ -1,85 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.IntToLongFunction;
|
||||
import java.util.function.LongUnaryOperator;
|
||||
import java.util.function.ToLongFunction;
|
||||
|
||||
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
|
||||
private final IntToLongFunction operator;
|
||||
|
||||
public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
||||
}
|
||||
|
||||
public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
|
||||
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onAnd(List<? extends CqExpression> parts) {
|
||||
LongSet ret = new LongArraySet();
|
||||
|
||||
for (var part : parts) {
|
||||
ret = comineSets(ret, part.visit(this));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private LongSet comineSets(LongSet a, LongSet b) {
|
||||
if (a.isEmpty())
|
||||
return b;
|
||||
if (b.isEmpty())
|
||||
return a;
|
||||
|
||||
LongSet ret = newSet(a.size() * b.size());
|
||||
|
||||
var ai = a.longIterator();
|
||||
|
||||
while (ai.hasNext()) {
|
||||
long aval = ai.nextLong();
|
||||
|
||||
var bi = b.longIterator();
|
||||
while (bi.hasNext()) {
|
||||
ret.add(aval & bi.nextLong());
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onOr(List<? extends CqExpression> parts) {
|
||||
LongSet ret = newSet(parts.size());
|
||||
|
||||
for (var part : parts) {
|
||||
ret.addAll(part.visit(this));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onLeaf(int idx) {
|
||||
var set = newSet(1);
|
||||
set.add(operator.applyAsLong(idx));
|
||||
return set;
|
||||
}
|
||||
|
||||
/** Allocate a new set suitable for a collection with the provided cardinality */
|
||||
private LongSet newSet(int cardinality) {
|
||||
if (cardinality < 8)
|
||||
return new LongArraySet(cardinality);
|
||||
else
|
||||
return new LongOpenHashSet(cardinality);
|
||||
}
|
||||
|
||||
}
|
@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
@ -10,7 +11,7 @@ public record QueryResponse(SearchSpecification specs,
|
||||
List<DecoratedSearchResultItem> results,
|
||||
List<String> searchTermsHuman,
|
||||
List<String> problems,
|
||||
String domain)
|
||||
@Nullable String domain)
|
||||
{
|
||||
public Set<String> getAllKeywords() {
|
||||
return new HashSet<>(specs.query.searchTermsInclude);
|
||||
|
@ -0,0 +1,85 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public sealed interface SearchPhraseConstraint {
|
||||
|
||||
record Mandatory(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Mandatory(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
record Optional(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Optional(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
record Full(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Full(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
List<String> terms();
|
||||
default int size() {
|
||||
return terms().size();
|
||||
}
|
||||
|
||||
static SearchPhraseConstraint mandatory(String... terms) {
|
||||
return new Mandatory(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint mandatory(List<String> terms) {
|
||||
return new Mandatory(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint optional(String... terms) {
|
||||
return new Optional(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint optional(List<String> terms) {
|
||||
return new Optional(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint full(String... terms) {
|
||||
return new Full(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint full(List<String> terms) {
|
||||
return new Full(trimStopWords(terms));
|
||||
}
|
||||
|
||||
|
||||
private static List<String> trimStopWords(List<String> terms) {
|
||||
List<String> ret = new ArrayList<>(terms.size());
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
private static List<String> trimStopWords(String... terms) {
|
||||
List<String> ret = new ArrayList<>(terms.length);
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
|
||||
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
|
||||
ret.removeFirst();
|
||||
}
|
||||
while (!ret.isEmpty() && "".equals(ret.getLast())) {
|
||||
ret.removeLast();
|
||||
}
|
||||
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
}
|
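Based on the factory methods in the new SearchPhraseConstraint file above and the SearchQuery builder changes later in this diff, usage looks roughly like the following sketch; the query and terms are made up for illustration.

```java
// Hypothetical usage sketch of the new phrase constraint API; terms are examples only.
var query = SearchQuery.builder()
        .compiledQuery("mechanical keyboard")
        .include("mechanical", "keyboard")
        .phraseConstraint(SearchPhraseConstraint.mandatory("mechanical", "keyboard"))
        .phraseConstraint(SearchPhraseConstraint.optional("cherry", "mx"))
        .build();
```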
@ -31,18 +31,22 @@ public class SearchQuery {
|
||||
public final List<String> searchTermsPriority;
|
||||
|
||||
/** Terms that we require to be in the same sentence */
|
||||
public final List<List<String>> searchTermCoherences;
|
||||
public final List<SearchPhraseConstraint> phraseConstraints;
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
private double value = 0;
|
||||
|
||||
public static SearchQueryBuilder builder() {
|
||||
return new SearchQueryBuilder();
|
||||
}
|
||||
|
||||
public SearchQuery() {
|
||||
this.compiledQuery = "";
|
||||
this.searchTermsInclude = new ArrayList<>();
|
||||
this.searchTermsExclude = new ArrayList<>();
|
||||
this.searchTermsAdvice = new ArrayList<>();
|
||||
this.searchTermsPriority = new ArrayList<>();
|
||||
this.searchTermCoherences = new ArrayList<>();
|
||||
this.phraseConstraints = new ArrayList<>();
|
||||
}
|
||||
|
||||
public SearchQuery(String compiledQuery,
|
||||
@ -50,13 +54,13 @@ public class SearchQuery {
|
||||
List<String> searchTermsExclude,
|
||||
List<String> searchTermsAdvice,
|
||||
List<String> searchTermsPriority,
|
||||
List<List<String>> searchTermCoherences) {
|
||||
List<SearchPhraseConstraint> phraseConstraints) {
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.searchTermsInclude = searchTermsInclude;
|
||||
this.searchTermsExclude = searchTermsExclude;
|
||||
this.searchTermsAdvice = searchTermsAdvice;
|
||||
this.searchTermsPriority = searchTermsPriority;
|
||||
this.searchTermCoherences = searchTermCoherences;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
}
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
@ -76,10 +80,62 @@ public class SearchQuery {
|
||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static class SearchQueryBuilder {
|
||||
private String compiledQuery;
|
||||
public final List<String> searchTermsInclude = new ArrayList<>();
|
||||
public final List<String> searchTermsExclude = new ArrayList<>();
|
||||
public final List<String> searchTermsAdvice = new ArrayList<>();
|
||||
public final List<String> searchTermsPriority = new ArrayList<>();
|
||||
public final List<SearchPhraseConstraint> searchPhraseConstraints = new ArrayList<>();
|
||||
|
||||
private SearchQueryBuilder() {
|
||||
}
|
||||
|
||||
public SearchQueryBuilder compiledQuery(String query) {
|
||||
this.compiledQuery = query;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder include(String... terms) {
|
||||
searchTermsInclude.addAll(List.of(terms));
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder exclude(String... terms) {
|
||||
searchTermsExclude.addAll(List.of(terms));
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder advice(String... terms) {
|
||||
searchTermsAdvice.addAll(List.of(terms));
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder priority(String... terms) {
|
||||
searchTermsPriority.addAll(List.of(terms));
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) {
|
||||
searchPhraseConstraints.add(constraint);
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQuery build() {
|
||||
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
|
||||
}
|
||||
|
||||
/** If there are no ranking terms, promote the advice terms to ranking terms */
|
||||
public void promoteNonRankingTerms() {
|
||||
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
|
||||
searchTermsInclude.addAll(searchTermsAdvice);
|
||||
searchTermsAdvice.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -19,10 +19,14 @@ public class SearchSpecification {
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
public final SpecificationLimit quality;
|
||||
public final SpecificationLimit year;
|
||||
public final SpecificationLimit size;
|
||||
public final SpecificationLimit rank;
|
||||
@Builder.Default
|
||||
public final SpecificationLimit quality = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit year = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit size = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit rank = SpecificationLimit.none();
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
|
||||
|
@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public final long bestPositions;
|
||||
public final double rankingScore;
|
||||
|
||||
public final int resultsFromDomain;
|
||||
|
||||
@Nullable
|
||||
public ResultRankingDetails rankingDetails;
|
||||
|
||||
@ -43,9 +45,6 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public int domainId() {
|
||||
return rawIndexResult.getDomainId();
|
||||
}
|
||||
public int resultsFromDomain() {
|
||||
return rawIndexResult.getResultsFromDomain();
|
||||
}
|
||||
|
||||
public List<SearchResultKeywordScore> keywordScores() {
|
||||
return rawIndexResult.getKeywordScores();
|
||||
@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
int wordsTotal,
|
||||
long bestPositions,
|
||||
double rankingScore,
|
||||
int resultsFromDomain,
|
||||
@Nullable
|
||||
ResultRankingDetails rankingDetails
|
||||
)
|
||||
@ -88,6 +88,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
this.wordsTotal = wordsTotal;
|
||||
this.bestPositions = bestPositions;
|
||||
this.rankingScore = rankingScore;
|
||||
this.resultsFromDomain = resultsFromDomain;
|
||||
this.rankingDetails = rankingDetails;
|
||||
}
|
||||
|
||||
|
@ -10,9 +10,7 @@ import lombok.*;
public class ResultRankingParameters {

/** Tuning for BM25 when applied to full document matches */
public final Bm25Parameters fullParams;
/** Tuning for BM25 when applied to priority matches, terms with relevance signal indicators */
public final Bm25Parameters prioParams;
public final Bm25Parameters bm25Params;

/** Documents below this length are penalized */
public int shortDocumentThreshold;
@ -32,11 +30,10 @@ public class ResultRankingParameters {
/** Magnitude of penalty for documents with low average sentence length */
public double shortSentencePenalty;

public double bm25FullWeight;
public double bm25NgramWeight;
public double bm25PrioWeight;
public double tcfJaccardWeight;
public double tcfOverlapWeight;
public double bm25Weight;
public double tcfFirstPosition;
public double tcfVerbatim;
public double tcfProximity;

public TemporalBias temporalBias;
public double temporalBiasWeight;
@ -45,19 +42,17 @@ public class ResultRankingParameters {

public static ResultRankingParameters sensibleDefaults() {
return builder()
.fullParams(new Bm25Parameters(1.2, 0.5))
.prioParams(new Bm25Parameters(1.5, 0))
.bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000)
.shortDocumentPenalty(2.)
.domainRankBonus(1/25.)
.qualityPenalty(1/15.)
.shortSentenceThreshold(2)
.shortSentencePenalty(5)
.bm25FullWeight(1.)
.bm25NgramWeight(.25)
.bm25PrioWeight(1.)
.tcfOverlapWeight(3.)
.tcfJaccardWeight(1)
.bm25Weight(1.)
.tcfVerbatim(2.)
.tcfProximity(2.)
.tcfFirstPosition(25)
.temporalBias(TemporalBias.NONE)
.temporalBiasWeight(1. / (5.))
.exportDebugData(false)

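The hunk above collapses the separate full/ngram/prio BM25 knobs into a single bm25Params/bm25Weight pair, and replaces the term-coherence overlap/Jaccard weights with verbatim, proximity, and first-position factors. Since the weight fields are plain public doubles, a caller can start from the defaults and adjust only what it needs; a hedged sketch (the numeric values are purely illustrative, not recommendations):

    import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;

    class RankingParametersExample {
        static ResultRankingParameters tweakedDefaults() {
            // Start from the defaults shown above; bm25Weight now carries what the
            // separate full/ngram/prio weights used to.
            ResultRankingParameters params = ResultRankingParameters.sensibleDefaults();
            params.bm25Weight = 0.8;     // illustrative value
            params.tcfProximity = 3.0;   // illustrative value
            return params;
        }
    }
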
@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@ -25,20 +26,23 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
/** How did the subqueries match against the document ? */
|
||||
public final List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
/** How many other potential results existed in the same domain */
|
||||
public int resultsFromDomain;
|
||||
|
||||
public boolean hasPrioTerm;
|
||||
|
||||
public long bestPositions;
|
||||
|
||||
public DebugRankingFactors debugRankingFactors;
|
||||
|
||||
public SearchResultItem(long combinedId,
|
||||
long encodedDocMetadata,
|
||||
int htmlFeatures,
|
||||
boolean hasPrioTerm) {
|
||||
double score,
|
||||
long bestPositions) {
|
||||
this.combinedId = combinedId;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
this.bestPositions = bestPositions;
|
||||
this.keywordScores = new ArrayList<>();
|
||||
this.htmlFeatures = htmlFeatures;
|
||||
this.hasPrioTerm = hasPrioTerm;
|
||||
this.scoreValue = score;
|
||||
}
|
||||
|
||||
|
||||
@ -84,7 +88,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull SearchResultItem o) {
|
||||
// this looks like a bug, but we actually want this in a reversed order
|
||||
int diff = Double.compare(getScore(), o.getScore());
|
||||
if (diff != 0)
|
||||
return diff;
|
||||
|
@ -1,40 +1,32 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public final class SearchResultKeywordScore {
|
||||
public final long termId;
|
||||
public final String keyword;
|
||||
private final long encodedWordMetadata;
|
||||
public byte flags;
|
||||
public int positionCount;
|
||||
|
||||
public SearchResultKeywordScore(String keyword,
|
||||
long termId,
|
||||
long encodedWordMetadata) {
|
||||
byte flags,
|
||||
int positionCount) {
|
||||
this.termId = termId;
|
||||
this.keyword = keyword;
|
||||
this.encodedWordMetadata = encodedWordMetadata;
|
||||
}
|
||||
|
||||
public boolean hasTermFlag(WordFlags flag) {
|
||||
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
||||
return (flags & flag.asBit()) != 0;
|
||||
}
|
||||
|
||||
|
||||
public long positions() {
|
||||
return WordMetadata.decodePositions(encodedWordMetadata);
|
||||
}
|
||||
|
||||
public boolean isKeywordSpecial() {
|
||||
return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
|
||||
}
|
||||
|
||||
public long encodedWordMetadata() {
|
||||
return encodedWordMetadata;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) return true;
|
||||
@ -51,8 +43,7 @@ public final class SearchResultKeywordScore {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SearchResultKeywordScore[" +
|
||||
"keyword=" + keyword + ", " +
|
||||
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']';
|
||||
"keyword=" + keyword + ']';
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,22 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class SearchResultSet {
|
||||
public SearchResultSet() {
|
||||
results = new ArrayList<>();
|
||||
}
|
||||
|
||||
public List<DecoratedSearchResultItem> results;
|
||||
public int size() {
|
||||
return results.size();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,4 @@
package nu.marginalia.api.searchquery.model.results.debug;

public record DebugFactor(String factor, String value) {
}
@ -0,0 +1,5 @@
package nu.marginalia.api.searchquery.model.results.debug;

import java.util.List;

public record DebugFactorGroup(String name, List<DebugFactor> factors) {}
@ -0,0 +1,39 @@
package nu.marginalia.api.searchquery.model.results.debug;

import it.unimi.dsi.fastutil.ints.IntIterator;

import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;

/** Utility for capturing debug information about ranking factors */
public class DebugRankingFactors {
private final List<DebugFactor> documentFactors = new ArrayList<>();
private final List<DebugTermFactor> termFactors = new ArrayList<>();

public DebugRankingFactors() {}

public void addDocumentFactor(String factor, String value) {
documentFactors.add(new DebugFactor(factor, value));
}

public void addTermFactor(long termId, String factor, String value) {
termFactors.add(new DebugTermFactor(termId, null, factor, value));
}
public void addTermFactor(long termId, String factor, IntIterator sequenceIter) {
if (!sequenceIter.hasNext()) return;

StringJoiner joiner = new StringJoiner(",");
while (sequenceIter.hasNext()) {
joiner.add(String.valueOf(sequenceIter.nextInt()));
}
termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString()));
}

public List<DebugFactor> getDocumentFactors() {
return documentFactors;
}
public List<DebugTermFactor> getTermFactors() {
return termFactors;
}
}
@ -0,0 +1,4 @@
package nu.marginalia.api.searchquery.model.results.debug;

public record DebugTermFactor(long termId, String term, String factor, String value) {
}
@ -0,0 +1,6 @@
package nu.marginalia.api.searchquery.model.results.debug;

import java.util.List;

public record DebugTermFactorGroup(String term, long termId, List<DebugFactorGroup> factorList) {
}
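To make the intent of these new debug types concrete, a minimal hedged sketch of how a ranking pass might record factors and how a consumer could read them back; the factor names, values, and term id are invented for illustration:

    import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;

    class DebugFactorsExample {
        static void example() {
            var debugFactors = new DebugRankingFactors();

            // Document-level factors: one name/value pair per observation.
            debugFactors.addDocumentFactor("documentLengthPenalty", "0.85");

            // Term-level factors are keyed by term id; the term string itself is
            // left null at this point and filled in later.
            long termId = 12345L;   // illustrative id
            debugFactors.addTermFactor(termId, "tcfProximity", "2.0");

            for (var factor : debugFactors.getDocumentFactors()) {
                System.out.println(factor.factor() + " = " + factor.value());
            }
            for (var termFactor : debugFactors.getTermFactors()) {
                System.out.println(termFactor.termId() + "/" + termFactor.factor() + " = " + termFactor.value());
            }
        }
    }
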
@ -1,6 +1,9 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs)
|
||||
import java.util.List;
|
||||
|
||||
public record ResultRankingDetails(List<DebugFactorGroup> docFactorGroups,
|
||||
List<DebugTermFactorGroup> termFactorGroups)
|
||||
{
|
||||
|
||||
}
|
||||
|
@ -1,5 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List<String> flags) {}
|
@ -1,17 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
|
||||
double qualityPenalty,
|
||||
double rankingBonus,
|
||||
double topologyBonus,
|
||||
double documentLengthPenalty,
|
||||
double temporalBias,
|
||||
double flagsPenalty,
|
||||
double overallPart,
|
||||
double tcfOverlap,
|
||||
double tcfJaccard,
|
||||
double bM25F,
|
||||
double bM25N,
|
||||
double bM25P)
|
||||
{
|
||||
}
|
@ -93,22 +93,30 @@ message RpcDecoratedResultItem {
|
||||
double rankingScore = 11; // The ranking score of this search result item, lower is better
|
||||
int64 bestPositions = 12;
|
||||
RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters
|
||||
int32 resultsFromDomain = 14;
|
||||
}
|
||||
|
||||
/** A raw index-service view of a search result */
|
||||
message RpcRawResultItem {
|
||||
int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
|
||||
int32 resultsFromDomain = 2; // number of other results from the same domain
|
||||
int64 encodedDocMetadata = 3; // bit encoded document metadata
|
||||
int32 htmlFeatures = 4; // bitmask encoding features of the document
|
||||
repeated RpcResultKeywordScore keywordScores = 5;
|
||||
bool hasPriorityTerms = 6; // true if this word is important to the document
|
||||
MATCH_TYPE matchType = 7; // the type of match this result represents
|
||||
|
||||
enum MATCH_TYPE {
|
||||
FLAGS = 0;
|
||||
PROXIMITY = 1;
|
||||
PHRASE = 2;
|
||||
};
|
||||
}
|
||||
|
||||
/* Information about how well a keyword matches a query */
|
||||
message RpcResultKeywordScore {
|
||||
string keyword = 1; // the keyword
|
||||
int64 encodedWordMetadata = 2; // bit encoded word metadata
|
||||
int32 flags = 2;
|
||||
int32 positions = 3;
|
||||
}
|
||||
|
||||
/* Query execution parameters */
|
||||
@ -119,30 +127,32 @@ message RpcQueryLimits {
|
||||
int32 fetchSize = 4; // Size of the fetch buffer in the index service
|
||||
}
|
||||
|
||||
/** Parameters for the result ranking function */
|
||||
message RpcResultRankingParameters {
|
||||
double fullK = 1; // BM25 parameter
|
||||
double fullB = 2; // BM25 parameter
|
||||
double prioK = 3; // BM25 parameter
|
||||
double prioB = 4; // BM25 parameter
|
||||
double bm25K = 1; // BM25 parameter
|
||||
double bm25B = 2; // BM25 parameter
|
||||
|
||||
int32 shortDocumentThreshold = 5;
|
||||
double shortDocumentPenalty = 6;
|
||||
double domainRankBonus = 7;
|
||||
double qualityPenalty = 8;
|
||||
int32 shortSentenceThreshold = 9;
|
||||
double shortSentencePenalty = 10;
|
||||
double bm25FullWeight = 11;
|
||||
double bm25NgramWeight = 12;
|
||||
double bm25PrioWeight = 13;
|
||||
double tcfOverlapWeight = 14;
|
||||
double tcfJaccardWeight = 15;
|
||||
double bm25Weight = 11;
|
||||
// -- 12 unused --
|
||||
double tcfFirstPositionWeight = 13;
|
||||
double tcfVerbatimWeight = 14;
|
||||
double tcfProximityWeight = 15;
|
||||
RpcTemporalBias temporalBias = 16;
|
||||
double temporalBiasWeight = 17;
|
||||
|
||||
bool exportDebugData = 18;
|
||||
|
||||
}
|
||||
|
||||
message RpcResultRankingDetails {
|
||||
RpcResultRankingInputs inputs = 1;
|
||||
RpcResultRankingOutputs output = 2;
|
||||
RpcResultDocumentRankingOutputs documentOutputs = 1;
|
||||
RpcResultTermRankingOutputs termOutputs = 2;
|
||||
}
|
||||
|
||||
message RpcResultRankingInputs {
|
||||
@ -155,20 +165,17 @@ message RpcResultRankingInputs {
|
||||
repeated string flags = 7;
|
||||
}
|
||||
|
||||
message RpcResultRankingOutputs {
|
||||
double averageSentenceLengthPenalty = 1;
|
||||
double qualityPenalty = 2;
|
||||
double rankingBonus = 3;
|
||||
double topologyBonus = 4;
|
||||
double documentLengthPenalty = 5;
|
||||
double temporalBias = 6;
|
||||
double flagsPenalty = 7;
|
||||
double overallPart = 8;
|
||||
double tcfOverlap = 9;
|
||||
double tcfJaccard = 10;
|
||||
double bM25F = 11;
|
||||
double bM25N = 12;
|
||||
double bM25P = 13;
|
||||
/** Summary of the output of the ranking function */
|
||||
message RpcResultDocumentRankingOutputs {
|
||||
repeated string factor = 1;
|
||||
repeated string value = 2;
|
||||
}
|
||||
|
||||
message RpcResultTermRankingOutputs {
|
||||
repeated int64 termId = 1;
|
||||
repeated string term = 2;
|
||||
repeated string factor = 3;
|
||||
repeated string value = 4;
|
||||
}
|
||||
|
||||
/* Defines a single subquery */
@ -177,11 +184,18 @@ message RpcQuery {
repeated string exclude = 2; // These terms must be absent
repeated string advice = 3; // These terms must be present, but do not affect ranking
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other
string compiledQuery = 6; // Compiled query in infix notation
}

/* Defines a group of search terms that must exist in close proximity within the document */
message RpcCoherences {
repeated string coherences = 1;
/* Defines a group of search terms that must exist in the specified order within the document */
message RpcPhrases {
repeated string terms = 1;
TYPE type = 2;

enum TYPE {
OPTIONAL = 0;
MANDATORY = 1;
FULL = 2;
};
}

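The three TYPE values mirror the factory methods on SearchPhraseConstraint that appear elsewhere in this change set. A hedged sketch of the correspondence, with made-up terms:

    import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;

    import java.util.List;

    class PhraseConstraintExample {
        static void example() {
            // OPTIONAL: rank documents higher when the terms appear in this order, without requiring it.
            var optional = SearchPhraseConstraint.optional(List.of("term", "positions"));

            // MANDATORY: only accept documents where the terms appear in this order.
            var mandatory = SearchPhraseConstraint.mandatory(List.of("term", "positions"));

            // FULL: a pseudo-constraint covering the entire query (see QueryFactory below).
            var full = SearchPhraseConstraint.full(List.of("improve", "term", "positions", "accuracy"));
        }
    }
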
@ -1,10 +1,11 @@
|
||||
package nu.marginalia.api.searchquery.model.compiled;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class CompiledQueryParserTest {
|
||||
|
||||
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
|
||||
assertEquals(w(q, "foo"), q.root);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCohen() {
|
||||
CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");
|
||||
int val = CompiledQueryAggregates.intMaxMinAggregate(q, s ->
|
||||
switch (s) {
|
||||
case "brief" -> 3;
|
||||
case "tube" -> 2;
|
||||
case "of" -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
assertEquals(0, val);
|
||||
|
||||
System.out.println(q.stream().toList());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAndTwoWords() {
|
||||
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.index.client;
|
||||
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
@ -10,7 +11,7 @@ import org.junit.jupiter.api.Test;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class IndexProtobufCodecTest {
|
||||
@Test
|
||||
@ -41,7 +42,9 @@ class IndexProtobufCodecTest {
|
||||
List.of("c", "d"),
|
||||
List.of("e", "f"),
|
||||
List.of("g", "h"),
|
||||
List.of(List.of("i", "j"), List.of("k"))
|
||||
List.of(
|
||||
SearchPhraseConstraint.mandatory(List.of("i", "j")),
|
||||
SearchPhraseConstraint.optional(List.of("k")))
|
||||
),
|
||||
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
|
||||
);
|
||||
|
@ -31,7 +31,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:features-convert:keyword-extraction')
|
||||
implementation project(':code:processes:converting-process:ft-keyword-extraction')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@ -1,18 +1,15 @@
|
||||
package nu.marginalia.functions.searchquery.svc;
|
||||
package nu.marginalia.functions.searchquery;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.*;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -56,11 +53,7 @@ public class QueryFactory {
basicQuery.clear();
}

List<String> searchTermsExclude = new ArrayList<>();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<List<String>> searchTermCoherences = new ArrayList<>();
SearchQuery.SearchQueryBuilder queryBuilder = SearchQuery.builder();

SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none();
@ -78,58 +71,50 @@ public class QueryFactory {

String[] parts = StringUtils.split(str, '_');

// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.
if (parts.length > 1) {
// Require that the terms appear in sequence
queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));

// Construct a regular query from the parts in the quoted string
queryBuilder.include(parts);

if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(str);

// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));

// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts));
queryBuilder.priority(str);
}
else {
searchTermsInclude.add(str);
// If the quoted word is a single word, we don't need to do more than include it in the search
queryBuilder.include(str);
}
}

case QueryToken.LiteralTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));

searchTermsInclude.add(str);
queryBuilder.include(str);
}


case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
case QueryToken.ExcludeTerm(String str, String displayStr) -> queryBuilder.exclude(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> queryBuilder.priority(str);
case QueryToken.AdviceTerm(String str, String displayStr) -> {
searchTermsAdvice.add(str);
queryBuilder.advice(str);

if (str.toLowerCase().startsWith("site:")) {
domain = str.substring("site:".length());
}
}

case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str);
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str);
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str);
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str);
case QueryToken.YearTerm(SpecificationLimit limit, String displayStr) -> year = limit;
case QueryToken.SizeTerm(SpecificationLimit limit, String displayStr) -> size = limit;
case QueryToken.RankTerm(SpecificationLimit limit, String displayStr) -> rank = limit;
case QueryToken.QualityTerm(SpecificationLimit limit, String displayStr) -> qualityLimit = limit;
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);

default -> {}
}
}

if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
queryBuilder.promoteNonRankingTerms();

List<Integer> domainIds = params.domainIds();

@ -139,20 +124,21 @@ public class QueryFactory {
limits = limits.forSingleDomain();
}

var expansion = queryExpansion.expandQuery(searchTermsInclude);
searchTermCoherences.addAll(expansion.extraCoherences());
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);

var searchQuery = new SearchQuery(
expansion.compiledQuery(),
searchTermsInclude,
searchTermsExclude,
searchTermsAdvice,
searchTermsPriority,
searchTermCoherences
);
// Query expansion may produce suggestions for phrase constraints,
// add these to the query
for (var coh : expansion.optionalPharseConstraints()) {
queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh));
}

// add a pseudo-constraint for the full query
queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint()));

queryBuilder.compiledQuery(expansion.compiledQuery());

var specsBuilder = SearchSpecification.builder()
.query(searchQuery)
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
@ -183,20 +169,7 @@ public class QueryFactory {
|
||||
problems.add("Search term \"" + displayStr + "\" too long");
|
||||
}
|
||||
}
|
||||
private SpecificationLimit parseSpecificationLimit(String str) {
|
||||
int startChar = str.charAt(0);
|
||||
|
||||
int val = Integer.parseInt(str.substring(1));
|
||||
if (startChar == '=') {
|
||||
return SpecificationLimit.equals(val);
|
||||
} else if (startChar == '<') {
|
||||
return SpecificationLimit.lessThan(val);
|
||||
} else if (startChar == '>') {
|
||||
return SpecificationLimit.greaterThan(val);
|
||||
} else {
|
||||
return SpecificationLimit.none();
|
||||
}
|
||||
}
|
||||
|
||||
private QueryStrategy parseQueryStrategy(String str) {
|
||||
return switch (str.toUpperCase()) {
|
||||
@ -211,14 +184,4 @@ public class QueryFactory {
|
||||
default -> QueryStrategy.AUTO;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private boolean anyPartIsStopWord(String[] parts) {
|
||||
for (String part : parts) {
|
||||
if (WordPatterns.isStopWord(part)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
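Tracing the quoted-term branch above end to end, using the "tde shining" example from the removed testQuotedStopwords case further down: the builder ends up with roughly the state below. This is a hedged reconstruction from the code above, not output captured from the system, and the compiled query string is illustrative:

    import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
    import nu.marginalia.api.searchquery.model.query.SearchQuery;

    class QuotedTermExample {
        // Roughly what the QueryToken.Quot branch does for the quoted query "tde shining",
        // which reaches it underscore-joined as "tde_shining".
        static SearchQuery example() {
            String str = "tde_shining";
            String[] parts = {"tde", "shining"};

            var queryBuilder = SearchQuery.builder();

            // Require that the terms appear in sequence
            queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));

            // Each part must also be present as an ordinary include term
            queryBuilder.include(parts);

            // Neither part is a stop word, so prefer documents containing the n-gram itself
            queryBuilder.priority(str);

            return queryBuilder.compiledQuery("( shining tde | tde_shining )").build();
        }
    }
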
@ -1,19 +1,16 @@
|
||||
package nu.marginalia.functions.searchquery;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import io.prometheus.client.Histogram;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.*;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.index.api.IndexClient;
|
||||
import nu.marginalia.functions.searchquery.svc.QueryFactory;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -33,18 +30,18 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
|
||||
|
||||
|
||||
private final QueryFactory queryFactory;
|
||||
private final DomainBlacklist blacklist;
|
||||
private final IndexClient indexClient;
|
||||
|
||||
@Inject
|
||||
public QueryGRPCService(QueryFactory queryFactory,
|
||||
DomainBlacklist blacklist,
|
||||
IndexClient indexClient)
|
||||
{
|
||||
this.queryFactory = queryFactory;
|
||||
this.blacklist = blacklist;
|
||||
this.indexClient = indexClient;
|
||||
}
|
||||
|
||||
/** GRPC endpoint that parses a query, delegates it to the index partitions, and then collects the results.
|
||||
*/
|
||||
public void query(RpcQsQuery request, StreamObserver<RpcQsResponse> responseObserver)
|
||||
{
|
||||
try {
|
||||
@ -55,16 +52,20 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
|
||||
var params = QueryProtobufCodec.convertRequest(request);
|
||||
var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults());
|
||||
|
||||
RpcIndexQuery indexRequest = QueryProtobufCodec.convertQuery(request, query);
|
||||
List<RpcDecoratedResultItem> bestItems = executeQueries(indexRequest, request.getQueryLimits().getResultsTotal());
|
||||
var indexRequest = QueryProtobufCodec.convertQuery(request, query);
|
||||
|
||||
// Execute the query on the index partitions
|
||||
List<RpcDecoratedResultItem> bestItems = indexClient.executeQueries(indexRequest);
|
||||
|
||||
// Convert results to response and send it back
|
||||
var responseBuilder = RpcQsResponse.newBuilder()
|
||||
.addAllResults(bestItems)
|
||||
.setSpecs(indexRequest)
|
||||
.addAllSearchTermsHuman(query.searchTermsHuman);
|
||||
|
||||
if (query.domain != null)
|
||||
if (query.domain != null) {
|
||||
responseBuilder.setDomain(query.domain);
|
||||
}
|
||||
|
||||
responseObserver.onNext(responseBuilder.build());
|
||||
responseObserver.onCompleted();
|
||||
@ -75,44 +76,19 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase {
|
||||
}
|
||||
}
|
||||
|
||||
private static final Comparator<RpcDecoratedResultItem> comparator =
|
||||
Comparator.comparing(RpcDecoratedResultItem::getRankingScore);
|
||||
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item) {
|
||||
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
|
||||
}
|
||||
public record DetailedDirectResult(ProcessedQuery processedQuery,
|
||||
List<DecoratedSearchResultItem> result) {}
|
||||
|
||||
/** Local query execution, without GRPC. */
|
||||
public DetailedDirectResult executeDirect(
|
||||
String originalQuery,
|
||||
QueryParams params,
|
||||
ResultRankingParameters rankingParameters,
|
||||
int count) {
|
||||
ResultRankingParameters rankingParameters) {
|
||||
|
||||
var query = queryFactory.createQuery(params, rankingParameters);
|
||||
var items = indexClient.executeQueries(QueryProtobufCodec.convertQuery(originalQuery, query));
|
||||
|
||||
var items = executeQueries(
|
||||
QueryProtobufCodec.convertQuery(originalQuery, query),
|
||||
count)
|
||||
.stream().map(QueryProtobufCodec::convertQueryResult)
|
||||
.toList();
|
||||
|
||||
return new DetailedDirectResult(query, items);
|
||||
}
|
||||
|
||||
public record DetailedDirectResult(ProcessedQuery processedQuery,
|
||||
List<DecoratedSearchResultItem> result) {}
|
||||
|
||||
@SneakyThrows
|
||||
List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest, int totalSize) {
|
||||
var results = indexClient.executeQueries(indexRequest);
|
||||
|
||||
results.sort(comparator);
|
||||
results.removeIf(this::isBlacklisted);
|
||||
if (results.size() > totalSize) {
|
||||
results = results.subList(0, totalSize);
|
||||
}
|
||||
return results;
|
||||
return new DetailedDirectResult(query, Lists.transform(items, QueryProtobufCodec::convertQueryResult));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,7 +5,6 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.functions.searchquery.query_parser.model.QWord;
|
||||
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
|
||||
import nu.marginalia.functions.searchquery.query_parser.model.QWordPathsRenderer;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -45,11 +44,17 @@ public class QueryExpansion {
|
||||
strategy.expand(graph);
|
||||
}
|
||||
|
||||
List<List<String>> coherences = createSegments(graph);
|
||||
List<List<String>> optionalPhraseConstraints = createSegments(graph);
|
||||
|
||||
// also create a segmentation that is just the entire query
|
||||
List<String> fullPhraseConstraint = new ArrayList<> ();
|
||||
for (var qw : graph) {
|
||||
fullPhraseConstraint.add(qw.word());
|
||||
}
|
||||
|
||||
var compiled = QWordPathsRenderer.render(graph);
|
||||
|
||||
return new Expansion(compiled, coherences);
|
||||
return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint);
|
||||
}
|
||||
|
||||
private static final Pattern dashPattern = Pattern.compile("-");
|
||||
@ -131,6 +136,10 @@ public class QueryExpansion {
|
||||
nodes.add(qw);
|
||||
}
|
||||
|
||||
if (nodes.size() <= 1) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new);
|
||||
|
||||
// Grab all segments
|
||||
@ -141,15 +150,11 @@ public class QueryExpansion {
|
||||
}
|
||||
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
|
||||
|
||||
if (allSegments.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
Set<List<String>> constraints = new HashSet<>();
|
||||
|
||||
Set<NgramLexicon.SentenceSegment> bestSegmentation =
|
||||
findBestSegmentation(allSegments);
|
||||
|
||||
List<List<String>> coherences = new ArrayList<>();
|
||||
|
||||
for (var segment : bestSegmentation) {
|
||||
|
||||
int start = segment.start();
|
||||
@ -159,14 +164,14 @@ public class QueryExpansion {
|
||||
for (int i = start; i < end; i++) {
|
||||
components.add(nodes.get(i).word());
|
||||
}
|
||||
coherences.add(components);
|
||||
constraints.add(components);
|
||||
|
||||
// Create an n-gram search term for the segment
|
||||
String word = String.join("_", components);
|
||||
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
||||
}
|
||||
|
||||
return coherences;
|
||||
return new ArrayList<>(constraints);
|
||||
}
|
||||
|
||||
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
|
||||
@ -209,5 +214,5 @@ public class QueryExpansion {
|
||||
void expand(QWordGraph graph);
|
||||
}
|
||||
|
||||
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {}
|
||||
public record Expansion(String compiledQuery, List<List<String>> optionalPharseConstraints, List<String> fullPhraseConstraint) {}
|
||||
}
|
||||
|
@ -1,22 +1,24 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
import nu.marginalia.util.transform_list.TransformList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class QueryParser {
|
||||
|
||||
private final QueryTokenizer tokenizer = new QueryTokenizer();
|
||||
|
||||
public List<QueryToken> parse(String query) {
|
||||
List<QueryToken> basicTokens = tokenizer.tokenizeQuery(query);
|
||||
List<QueryToken> basicTokens = tokenizeQuery(query);
|
||||
|
||||
TransformList<QueryToken> list = new TransformList<>(basicTokens);
|
||||
|
||||
list.transformEach(QueryParser::handleQuoteTokens);
|
||||
list.transformEach(QueryParser::trimLiterals);
|
||||
list.transformEach(QueryParser::handleQuoteTokens);
|
||||
list.transformEachPair(QueryParser::createNegatedTerms);
|
||||
list.transformEachPair(QueryParser::createPriorityTerms);
|
||||
list.transformEach(QueryParser::handleSpecialOperations);
|
||||
@ -26,6 +28,96 @@ public class QueryParser {
|
||||
return list.getBackingList();
|
||||
}
|
||||
|
||||
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
|
||||
|
||||
public List<QueryToken> tokenizeQuery(String rawQuery) {
|
||||
List<QueryToken> tokens = new ArrayList<>();
|
||||
|
||||
String query = AsciiFlattener.flattenUnicode(rawQuery);
|
||||
query = noisePattern.matcher(query).replaceAll(" ");
|
||||
|
||||
int chr = -1;
|
||||
int parenDepth = 0;
|
||||
for (int i = 0; i < query.length(); i++) {
|
||||
chr = query.charAt(i);
|
||||
|
||||
if ('(' == chr) {
|
||||
parenDepth++;
|
||||
tokens.add(new QueryToken.LParen());
|
||||
}
|
||||
else if (')' == chr) {
|
||||
parenDepth--;
|
||||
tokens.add(new QueryToken.RParen());
|
||||
}
|
||||
else if ('"' == chr) {
|
||||
int end = query.indexOf('"', i+1);
|
||||
|
||||
if (end == -1) {
|
||||
end = query.length();
|
||||
}
|
||||
|
||||
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
|
||||
|
||||
i = end;
|
||||
}
|
||||
else if ('-' == chr) {
|
||||
tokens.add(new QueryToken.Minus());
|
||||
}
|
||||
else if ('?' == chr) {
|
||||
tokens.add(new QueryToken.QMark());
|
||||
}
|
||||
else if (!Character.isSpaceChar(chr)) {
|
||||
|
||||
// search for the end of the term
|
||||
int end = i+1;
|
||||
int prevC = -1;
|
||||
int c = -1;
|
||||
for (; end < query.length(); end++) {
|
||||
prevC = c;
|
||||
c = query.charAt(end);
|
||||
|
||||
if (prevC == '\\')
|
||||
continue;
|
||||
if (c == ' ')
|
||||
break;
|
||||
|
||||
// special case to deal with possible RPAREN token at the end,
|
||||
// but we don't want to break if it's likely part of the search term
|
||||
if (c == '(' && prevC != ')' && parenDepth > 0)
|
||||
break;
|
||||
}
|
||||
|
||||
String displayStr = query.substring(i, end);
|
||||
String str = trimEscape(displayStr.toLowerCase());
|
||||
|
||||
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
|
||||
|
||||
i = end-1;
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private String trimEscape(String str) {
|
||||
if (!str.contains("\\")) {
|
||||
return str;
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder(str.length());
|
||||
for (int j = 0; j < str.length(); j++) {
|
||||
char c = str.charAt(j);
|
||||
if (c == '\\') {
|
||||
if (j + 1 < str.length()) {
|
||||
sb.append(str.charAt(j + 1));
|
||||
j++;
|
||||
}
|
||||
} else {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static void normalizeDomainName(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
|
||||
@ -60,10 +152,22 @@ public class QueryParser {
if (str.isBlank())
return;

if (str.endsWith(":") || str.endsWith(".")) {
// Remove trailing punctuation
int lastChar = str.charAt(str.length() - 1);
if (":.,!?$'".indexOf(lastChar) >= 0)
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
}

// Remove term elements that aren't indexed by the search engine
if (str.endsWith("'s"))
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
if (str.endsWith("()"))
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
if (str.startsWith("$"))
entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr()));

if (entity.isBlank()) {
entity.remove();
}
}
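A few concrete before/after pairs for the trimming rules above. The helper below is a simplified standalone re-statement of the string manipulation, not the project's method, and the inputs are illustrative; the "strlen" case matches what QueryFactoryTest asserts later in this diff:

    class TrimLiteralsExample {
        public static void main(String[] args) {
            System.out.println(trim("strlen()"));     // "strlen"     (trailing "()" stripped)
            System.out.println(trim("marginalia's")); // "marginalia" ("'s" stripped)
            System.out.println(trim("$100"));         // "100"        (leading "$" stripped)
            System.out.println(trim("search:"));      // "search"     (trailing punctuation stripped)
        }

        static String trim(String str) {
            // Mirrors trimLiterals: strip trailing punctuation, possessive 's, "()" and leading "$"
            if (str.endsWith(":") || str.endsWith(".")) {
                str = str.substring(0, str.length() - 1);
            }
            if (str.endsWith("'s")) str = str.substring(0, str.length() - 2);
            if (str.endsWith("()")) str = str.substring(0, str.length() - 2);
            if (str.startsWith("$")) str = str.substring(1);
            return str;
        }
    }
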
|
||||
private static void createNegatedTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
|
||||
@ -104,15 +208,19 @@ public class QueryParser {
String str = t.str();

if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
entity.replace(new QueryToken.QualityTerm(str.substring(1)));
var limit = parseSpecificationLimit(str.substring(1));
entity.replace(new QueryToken.QualityTerm(limit, str));
} else if (str.startsWith("near:")) {
entity.replace(new QueryToken.NearTerm(str.substring(5)));
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
entity.replace(new QueryToken.YearTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.YearTerm(limit, str));
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
entity.replace(new QueryToken.SizeTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.SizeTerm(limit, str));
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
entity.replace(new QueryToken.RankTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
@ -120,6 +228,21 @@ public class QueryParser {
}
}

private static SpecificationLimit parseSpecificationLimit(String str) {
int startChar = str.charAt(0);

int val = Integer.parseInt(str.substring(1));
if (startChar == '=') {
return SpecificationLimit.equals(val);
} else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
} else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
} else {
return SpecificationLimit.none();
}
}

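For reference, the operator prefix maps onto SpecificationLimit as follows; a small hedged sketch using the parsing rules just shown, with illustrative inputs:

    import nu.marginalia.index.query.limit.SpecificationLimit;

    class SpecificationLimitExample {
        static void example() {
            // "year>2010" reaches parseSpecificationLimit as ">2010"
            SpecificationLimit after2010 = SpecificationLimit.greaterThan(2010);

            // "size=2000" reaches it as "=2000" (QueryFactoryTest later in this diff asserts value() == 2000)
            SpecificationLimit exactly2000 = SpecificationLimit.equals(2000);

            // "q<10" reaches it as "<10"
            SpecificationLimit below10 = SpecificationLimit.lessThan(10);
        }
    }
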
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (t instanceof QueryToken.LParen) {
|
||||
|
@ -1,69 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
import nu.marginalia.language.sentence.SentenceExtractorStringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class QueryTokenizer {
|
||||
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
|
||||
|
||||
public List<QueryToken> tokenizeQuery(String rawQuery) {
|
||||
List<QueryToken> tokens = new ArrayList<>();
|
||||
|
||||
String query = AsciiFlattener.flattenUnicode(rawQuery);
|
||||
query = noisePattern.matcher(query).replaceAll(" ");
|
||||
|
||||
for (int i = 0; i < query.length(); i++) {
|
||||
int chr = query.charAt(i);
|
||||
|
||||
if ('(' == chr) {
|
||||
tokens.add(new QueryToken.LParen());
|
||||
}
|
||||
else if (')' == chr) {
|
||||
tokens.add(new QueryToken.RParen());
|
||||
}
|
||||
else if ('"' == chr) {
|
||||
int end = query.indexOf('"', i+1);
|
||||
|
||||
if (end == -1) {
|
||||
end = query.length();
|
||||
}
|
||||
|
||||
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
|
||||
|
||||
i = end;
|
||||
}
|
||||
else if ('-' == chr) {
|
||||
tokens.add(new QueryToken.Minus());
|
||||
}
|
||||
else if ('?' == chr) {
|
||||
tokens.add(new QueryToken.QMark());
|
||||
}
|
||||
else if (Character.isSpaceChar(chr)) {
|
||||
//
|
||||
}
|
||||
else {
|
||||
|
||||
int end = i+1;
|
||||
for (; end < query.length(); end++) {
|
||||
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
|
||||
break;
|
||||
}
|
||||
|
||||
String displayStr = query.substring(i, end);
|
||||
String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr);
|
||||
|
||||
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
|
||||
|
||||
i = end-1;
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -248,16 +248,29 @@ public class QWordGraph implements Iterable<QWord> {
|
||||
@Override
|
||||
public Iterator<QWord> iterator() {
|
||||
return new Iterator<>() {
|
||||
QWord next = null;
|
||||
QWord pos = QWord.beg();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return !pos.isEnd();
|
||||
if (next == null) {
|
||||
if (pos.isEnd()) {
|
||||
return false;
|
||||
}
|
||||
next = getNextOriginal(pos).getFirst();
|
||||
}
|
||||
|
||||
return !next.isEnd();
|
||||
}
|
||||
|
||||
@Override
|
||||
public QWord next() {
|
||||
pos = getNextOriginal(pos).getFirst();
|
||||
if (!hasNext()) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
|
||||
pos = next;
|
||||
next = null;
|
||||
return pos;
|
||||
}
|
||||
};
|
||||
|
@ -1,6 +1,8 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.token;
|
||||
|
||||
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
|
||||
public sealed interface QueryToken {
|
||||
String str();
|
||||
String displayStr();
|
||||
@ -11,25 +13,18 @@ public sealed interface QueryToken {
|
||||
record AdviceTerm(String str, String displayStr) implements QueryToken {}
|
||||
record PriorityTerm(String str, String displayStr) implements QueryToken {}
|
||||
|
||||
record QualityTerm(String str) implements QueryToken {
|
||||
public String displayStr() {
|
||||
return "q" + str;
|
||||
}
|
||||
record QualityTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
|
||||
public String str() { return displayStr; }
|
||||
|
||||
}
|
||||
record YearTerm(String str) implements QueryToken {
|
||||
public String displayStr() {
|
||||
return "year" + str;
|
||||
}
|
||||
record YearTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
|
||||
public String str() { return displayStr; }
|
||||
}
|
||||
record SizeTerm(String str) implements QueryToken {
|
||||
public String displayStr() {
|
||||
return "size" + str;
|
||||
}
|
||||
record SizeTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
|
||||
public String str() { return displayStr; }
|
||||
}
|
||||
record RankTerm(String str) implements QueryToken {
|
||||
public String displayStr() {
|
||||
return "rank" + str;
|
||||
}
|
||||
record RankTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
|
||||
public String str() { return displayStr; }
|
||||
}
|
||||
record NearTerm(String str) implements QueryToken {
|
||||
public String displayStr() {
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.util.transform_list;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Consumer;
|
||||
@ -30,7 +32,7 @@ import java.util.function.Predicate;
|
||||
* </pre>
|
||||
* </code>
|
||||
*/
|
||||
public class TransformList<T> {
|
||||
public class TransformList<T extends QueryToken> {
|
||||
private final List<T> backingList;
|
||||
|
||||
public TransformList(List<T> backingList) {
|
||||
@ -138,6 +140,10 @@ public class TransformList<T> {
|
||||
value = newValue;
|
||||
}
|
||||
|
||||
public boolean isBlank() {
|
||||
return value == null || value.str().isBlank();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
action = Action.REMOVE;
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Comparator;
|
||||
@ -100,7 +101,8 @@ class QWordGraphTest {
|
||||
assertEquals("q b ( c | d )", graph.compileToQuery());
|
||||
}
|
||||
|
||||
@Test // this test is a bit flaky, the order of the variants is not guaranteed
|
||||
@Disabled // flaky, the order of the variants is not guaranteed
|
||||
@Test
|
||||
void testCompile5() {
|
||||
// Construct a graph like
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
package nu.marginalia.query.svc;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.functions.searchquery.svc.QueryFactory;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -57,7 +57,12 @@ public class QueryFactoryTest {
|
||||
|
||||
@Test
|
||||
void qsec10() {
|
||||
try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) {
|
||||
Path webis = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");
|
||||
|
||||
if (!Files.exists(webis))
|
||||
return;
|
||||
|
||||
try (var lines = Files.lines(webis)) {
|
||||
lines.limit(1000).forEach(line -> {
|
||||
String[] parts = line.split("\t");
|
||||
if (parts.length == 2) {
|
||||
@ -124,24 +129,6 @@ public class QueryFactoryTest {
|
||||
assertEquals(2000, size.value());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuotedStopwords() {
|
||||
{
|
||||
// the is a stopword, so it should generate an ngram search term
|
||||
var specs = parseAndGetSpecs("\"the shining\"");
|
||||
assertEquals("the_shining", specs.query.compiledQuery);
|
||||
}
|
||||
|
||||
{
|
||||
// tde isn't a stopword, so we should get the normal behavior
|
||||
var specs = parseAndGetSpecs("\"tde shining\"");
|
||||
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
|
||||
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
|
||||
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testParseQualityEq() {
|
||||
var quality = parseAndGetSpecs("q=2000").quality;
|
||||
@ -212,12 +199,38 @@ public class QueryFactoryTest {
|
||||
var subquery = parseAndGetSpecs("The");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
} @Test
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpansion6() {
|
||||
long start = System.currentTimeMillis();
|
||||
var subquery = parseAndGetSpecs("burning the nerves in the neck");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpansion7() {
|
||||
long start = System.currentTimeMillis();
|
||||
var subquery = parseAndGetSpecs("amazing work being done");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpansion8() {
|
||||
long start = System.currentTimeMillis();
|
||||
var subquery = parseAndGetSpecs("success often consists of");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParsing() {
|
||||
long start = System.currentTimeMillis();
|
||||
var subquery = parseAndGetSpecs("strlen()");
|
||||
assertEquals("strlen", subquery.query.compiledQuery);
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
}
|
||||
}
|
@ -15,6 +15,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
|
@ -6,6 +6,8 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -14,6 +16,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
@ -22,21 +25,34 @@ import java.util.concurrent.Executors;
|
||||
public class IndexClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||
private final DomainBlacklistImpl blacklist;
|
||||
private static final ExecutorService executor = Executors.newFixedThreadPool(32);
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory) {
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
||||
this.channelPool = channelPoolFactory.createMulti(
|
||||
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
|
||||
IndexApiGrpc::newBlockingStub);
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
private static final Comparator<RpcDecoratedResultItem> comparator =
Comparator.comparing(RpcDecoratedResultItem::getRankingScore);


/** Execute a query on the index partitions and return the combined results. */
@SneakyThrows
public List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest) {
var futures =
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
.async(executor)
.runEach(indexRequest);
List<RpcDecoratedResultItem> results = new ArrayList<>();

final int resultsTotal = indexRequest.getQueryLimits().getResultsTotal();
final int resultsUpperBound = resultsTotal * channelPool.getNumNodes();

List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);

for (var future : futures) {
try {
future.get().forEachRemaining(results::add);
@ -46,7 +62,20 @@ public class IndexClient {
}
}

// Sort the results by ranking score and remove blacklisted domains
results.sort(comparator);
results.removeIf(this::isBlacklisted);

// Keep only as many results as were requested
if (results.size() > resultsTotal) {
results = results.subList(0, resultsTotal);
}

return results;
}

private boolean isBlacklisted(RpcDecoratedResultItem item) {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
}

}

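To summarize the new flow: each partition can return up to resultsTotal items, so with for instance resultsTotal = 100 and two index nodes the merged list is pre-sized for 200 candidates, sorted by ranking score, filtered against the domain blacklist, and truncated back to 100. A hedged usage sketch; the client instance and request are assumed to come from the surrounding service code, as in QueryGRPCService above:

    import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
    import nu.marginalia.api.searchquery.RpcIndexQuery;
    import nu.marginalia.index.api.IndexClient;

    import java.util.List;

    class IndexClientExample {
        static List<RpcDecoratedResultItem> run(IndexClient indexClient, RpcIndexQuery indexRequest) {
            // Fan out to all partitions, merge, sort, de-blacklist, truncate -- all inside executeQueries().
            return indexClient.executeQueries(indexRequest);
        }
    }
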
|
@ -5,7 +5,5 @@ public class IndexMqEndpoints {
|
||||
public static final String INDEX_RERANK = "INDEX-RERANK";
|
||||
public static final String INDEX_REPARTITION = "INDEX-REPARTITION";
|
||||
public static final String SWITCH_INDEX = "SWITCH-INDEX";
|
||||
|
||||
public static final String SWITCH_LINKDB = "SWITCH_LINKDB";
|
||||
|
||||
}
|
||||
|
@ -15,12 +15,15 @@ dependencies {
|
||||
implementation 'org.jgrapht:jgrapht-core:1.5.2'
|
||||
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:parquet-floor')
|
||||
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:config')
|
||||
@ -28,14 +31,16 @@ dependencies {
|
||||
implementation project(':code:common:linkdb')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:processes:converting-process:model')
|
||||
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:index:index-forward')
|
||||
implementation project(':code:index:index-reverse')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-journal')
|
||||
|
||||
|
||||
implementation libs.slop
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.prometheus
|
||||
@ -66,9 +71,11 @@ dependencies {
|
||||
testImplementation project(':code:libraries:array')
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
testImplementation project(':code:libraries:term-frequency-dict')
|
||||
testImplementation project(':code:libraries:braille-block-punch-cards')
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
}
|
||||
|
@ -15,10 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:process')
implementation project(':code:processes:converting-process:model')

implementation libs.bundles.slf4j

@ -26,7 +29,9 @@ dependencies {
implementation libs.roaringbitmap
implementation libs.fastutil
implementation libs.trove
implementation libs.slop

testImplementation project(':code:libraries:test-helpers')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
@ -1,127 +0,0 @@
package nu.marginalia.index.forward;

import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.array.LongArray;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class ForwardIndexConverter {

private final ProcessHeartbeat heartbeat;

private final Logger logger = LoggerFactory.getLogger(getClass());

private final IndexJournalReader journalReader;
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;


public ForwardIndexConverter(ProcessHeartbeat heartbeat,
IndexJournalReader journalReader,
Path outputFileDocsId,
Path outputFileDocsData,
DomainRankings domainRankings
) {
this.heartbeat = heartbeat;
this.journalReader = journalReader;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.domainRankings = domainRankings;
}

public enum TaskSteps {
GET_DOC_IDS,
GATHER_OFFSETS,
SUPPLEMENTAL_INDEXES,
FORCE,
FINISHED
}

public void convert() throws IOException {
deleteOldFiles();

logger.info("Domain Rankings size = {}", domainRankings.size());

try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
progress.progress(TaskSteps.GET_DOC_IDS);

LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);

progress.progress(TaskSteps.GATHER_OFFSETS);

// doc ids -> sorted list of ids

Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size());
docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos));

progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES);

// docIdToIdx -> file offset for id

LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());

var pointer = journalReader.newPointer();
while (pointer.nextDocument()) {
long docId = pointer.documentId();
int domainId = UrlIdCodec.getDomainId(docId);

long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId);

int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);

docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures());
}

progress.progress(TaskSteps.FORCE);

docFileData.force();
docsFileId.force();

docFileData.close();
docsFileId.close();

progress.progress(TaskSteps.FINISHED);
} catch (IOException ex) {
logger.error("Failed to convert", ex);
throw ex;
}
}

private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);

LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() {
int offset;
@Override
public void accept(long value) {
ret.set(offset++, value);
}
});

return ret;
}

private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileDocsId);
Files.deleteIfExists(outputFileDocsData);
}

}
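The deleted converter above builds the forward index in two passes: it first collects every document id from the journal into a sorted set and assigns each id a dense slot number, then writes a fixed-size entry (a metadata word and a features word) at slot * ENTRY_SIZE. Below is a simplified, self-contained sketch of that slot-assignment idea using only JDK types rather than the project's LongArray and Roaring64Bitmap; the entry layout constants are illustrative assumptions, not the actual ForwardIndexParameters values.

import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;

class ForwardIndexLayoutSketch {
    // Illustrative layout: two longs per document (metadata, features)
    static final int ENTRY_SIZE = 2;
    static final int METADATA_OFFSET = 0;
    static final int FEATURES_OFFSET = 1;

    static long[] build(long[] journalDocIds,
                        Map<Long, Long> metaByDoc,
                        Map<Long, Long> featuresByDoc) {
        // Pass 1: deduplicate and sort the document ids, then assign dense slot numbers
        TreeSet<Long> sortedIds = new TreeSet<>();
        for (long id : journalDocIds) {
            sortedIds.add(id);
        }
        Map<Long, Integer> docIdToSlot = new HashMap<>();
        int slot = 0;
        for (long id : sortedIds) {
            docIdToSlot.put(id, slot++);
        }

        // Pass 2: write a fixed-size entry for each document at slot * ENTRY_SIZE
        long[] data = new long[ENTRY_SIZE * sortedIds.size()];
        for (long id : sortedIds) {
            int base = ENTRY_SIZE * docIdToSlot.get(id);
            data[base + METADATA_OFFSET] = metaByDoc.getOrDefault(id, 0L);
            data[base + FEATURES_OFFSET] = featuresByDoc.getOrDefault(id, 0L);
        }
        return data;
    }
}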
Some files were not shown because too many files have changed in this diff.