From 71d789aab06d1dda05ebaecff822dd0990cb1fe3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 14:53:53 +0100 Subject: [PATCH 01/14] (index) Tweak result valuation renormalization --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 2a856258..2d564e5b 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -221,7 +221,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)) + Math.sqrt(penalty); + return Math.sqrt((1.0 + scalingFactor + penalty) / (1.0 + value)); } } From 396299c1db028dedbc5b69d7cf41945714807d9c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:20:18 +0100 Subject: [PATCH 02/14] (index) Reduce the value of site and site-adjacent in BM25P calculations --- .../ranking/factors/Bm25Factor.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java index 13c99ecc..a11281db 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -53,9 +53,11 @@ public class Bm25Factor { } private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { + int pcount = keyword.positionCount(); + double qcount = 0.; if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 
0) - qcount += 2.; + qcount += 0.5; if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) qcount += 0.5; if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) @@ -66,12 +68,16 @@ public class Bm25Factor { qcount += 2.5; if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) qcount += 1.5; - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; + + if (pcount > 2) { + if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + return qcount; } From 1694b4d6ef722c97cbf3a537f1eb0d5e557116f6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:20:38 +0100 Subject: [PATCH 03/14] (valuation) Increase the penalty for adtech a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 2d564e5b..1fa2a133 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -155,7 +155,7 @@ public class ResultValuator { } if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) - penalty += 5.0 * largeSiteFactor; + penalty += 7.5 * largeSiteFactor; if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) penalty += 5.0 * 
largeSiteFactor; From 0e970b803773a464ffe0e09aac124153c69e9ebb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 15:32:33 +0100 Subject: [PATCH 04/14] (valuation) Tweaking penalties a bit --- .../src/main/java/nu/marginalia/ranking/ResultValuator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index 1fa2a133..961a9e81 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -160,6 +160,9 @@ public class ResultValuator { if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) penalty += 5.0 * largeSiteFactor; + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit())) + penalty += 2.5 * largeSiteFactor; + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) penalty += 2.5 * largeSiteFactor; @@ -221,7 +224,7 @@ public class ResultValuator { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor + penalty) / (1.0 + value)); + return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); } } From 5c2561d05d30e68ec6fa7964d18745214adbad9a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:23:00 +0100 Subject: [PATCH 05/14] (search) Add query strategy requiring link --- .../java/nu/marginalia/index/query/limit/QueryStrategy.java | 1 + .../java/nu/marginalia/index/results/IndexResultValuator.java | 4 +++- .../java/nu/marginalia/query/svc/QueryLimitsAccumulator.java | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java 
b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java index c15ab6ea..024828f9 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/query/limit/QueryStrategy.java @@ -9,6 +9,7 @@ public enum QueryStrategy { REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_URL, REQUIRE_FIELD_DOMAIN, + REQUIRE_FIELD_LINK, AUTO } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index e19d3809..1e51fbd6 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -172,7 +172,9 @@ public class IndexResultValuator { else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); } - + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + } return true; } diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java index 663d4cfc..f1f17bed 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/svc/QueryLimitsAccumulator.java @@ -45,6 +45,7 @@ public class QueryLimitsAccumulator implements TokenVisitor { case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; case "RF_DOMAIN" -> 
QueryStrategy.REQUIRE_FIELD_DOMAIN; + case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; case "SENTENCE" -> QueryStrategy.SENTENCE; case "TOPIC" -> QueryStrategy.TOPIC; default -> QueryStrategy.AUTO; From 109bec372cddd52afd6784c60777630f0db60fde Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:30:46 +0100 Subject: [PATCH 06/14] (index) Adjust BM25 parameters --- .../ranking/factors/Bm25Factor.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java index a11281db..43a63ab6 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -56,16 +56,27 @@ public class Bm25Factor { int pcount = keyword.positionCount(); double qcount = 0.; + if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) qcount += 0.5; if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) + + if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { qcount += 2.5; + + if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + } + else { + if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) + qcount += 1.5; + } + if ((keyword.encodedWordMetadata() & 
WordFlags.Title.asBit()) != 0) qcount += 1.5; From e53bb70bef7dc833c88f689d6fbf052f45c9f3cb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:51:26 +0100 Subject: [PATCH 07/14] (converter) Penalize chatgpt content farm spam --- .../processor/logic/DocumentValuator.java | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 218f16b8..af080a3a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -11,6 +11,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeVisitor; +import java.util.List; import java.util.Set; public class DocumentValuator { @@ -21,6 +22,7 @@ public class DocumentValuator { int textLength) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); + double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument); int rawLength = crawledDocument.documentBody.length(); @@ -30,7 +32,36 @@ public class DocumentValuator { return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - - scriptPenalty; + - scriptPenalty + - chatGptPenalty; + } + + private double getChatGptContentFarmPenalty(Document parsedDocument) { + // easily 90% of modern AI-authored content farm spam has these nonsense headers + + boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false; + + outer: + for (String tagName : List.of("h1", "h2", "h3")) { + for (var elem : parsedDocument.getElementsByTag(tagName)) { + if (benefitsOf && keyBenefits && keyTakeaways) + break outer; + + String text = 
elem.text().toLowerCase(); + + benefitsOf = benefitsOf || text.startsWith("benefits of"); + keyBenefits = keyBenefits || text.startsWith("key benefits"); + keyTakeaways = keyTakeaways || text.startsWith("key takeaways"); + } + } + + double penalty = 0; + + if (benefitsOf) penalty += 10; + if (keyBenefits) penalty += 5; + if (keyTakeaways) penalty += 5; + + return penalty; } From b3c8fa74cc5cefcd6479e5444de0ae884be3a1d9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:21:12 +0100 Subject: [PATCH 08/14] (feature) Add another doubleclick variant to the adtech trackers --- .../marginalia/converting/processor/logic/FeatureExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 741b6740..83b06bd5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -39,6 +39,7 @@ public class FeatureExtractor { "googlesyndication.com", "smartadserver.com", "doubleclick.com", + "doubleclick.net", "2mdn.com", "dmtry.com", "amazon-adsystem.com", From 33c2188c87111ad99c2b5e629037790b3508004a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:27:25 +0100 Subject: [PATCH 09/14] (feature) More trackers --- .../converting/processor/logic/FeatureExtractor.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index 83b06bd5..c38f63f9 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -58,7 +58,10 @@ public class FeatureExtractor { "personalized-ads-consent", "_taboola", "nativeads", - "skimlinks" + "skimlinks", + "moapt", + "juicyads.com", + "counter.yadro.ru" ); private final AdblockSimulator adblockSimulator; From fdec565b34747ea3167fca9d7c37250c248106f3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 23:14:03 +0100 Subject: [PATCH 10/14] (converter) Add upper 128KB limit to how much HTML we'll parse --- .../processor/plugin/HtmlDocumentProcessorPlugin.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 7d973909..44da6008 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -107,6 +107,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.LANGUAGE); } + if (documentBody.length() > 128_000) { // 128kb + documentBody = documentBody.substring(0, 128_000); + } + Document doc = Jsoup.parse(documentBody); if (!metaRobotsTag.allowIndexingByMetaTag(doc)) { From 9e3386dbbb70a3c52216ede3bd4ba35027d19696 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Jan 2024 13:18:07 +0100 Subject: [PATCH 11/14] (search) Fetch fewer results per page This is a test to evaluate how this impacts load times. 
--- .../main/java/nu/marginalia/search/SearchQueryParamFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java index 6b913402..95439273 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -31,7 +31,7 @@ public class SearchQueryParamFactory { SpecificationLimit.none(), SpecificationLimit.none(), List.of(), - new QueryLimits(1, 100, 200, 8192), + new QueryLimits(1, 25, 200, 8192), profile.searchSetIdentifier ); From aca217cf9ae63d199afcf852eb53d9cfab115df1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Jan 2024 13:27:14 +0100 Subject: [PATCH 12/14] (qs) Better metrics for QS --- .../src/main/java/nu/marginalia/query/QueryGRPCService.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java index 2322c1ee..9e14ef15 100644 --- a/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java +++ b/code/services-core/query-service/src/main/java/nu/marginalia/query/QueryGRPCService.java @@ -23,6 +23,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { private static final Histogram wmsa_qs_query_time_grpc = Histogram.build() .name("wmsa_qs_query_time_grpc") + .labelNames("timeout", "count") .linearBuckets(0.05, 0.05, 15) .help("QS-side query time (GRPC endpoint)") .register(); @@ -69,7 +70,10 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { io.grpc.stub.StreamObserver responseObserver) { 
try { - wmsa_qs_query_time_grpc.time(() -> { + wmsa_qs_query_time_grpc + .labels(Integer.toString(request.getQueryLimits().getTimeoutMs()), + Integer.toString(request.getQueryLimits().getResultsTotal())) + .time(() -> { var params = QueryProtobufCodec.convertRequest(request); var query = queryFactory.createQuery(params); From ef02b712ad2ff5fb8168cee4e1c73ffd18d613d9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jan 2024 13:17:29 +0100 Subject: [PATCH 13/14] (build) Remove false dependency between icp and index-service This dependency causes the executor service docker image to change when the index service docker image changes. --- code/processes/index-constructor-process/build.gradle | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index d3b81107..e92db1b6 100644 --- a/code/processes/index-constructor-process/build.gradle +++ b/code/processes/index-constructor-process/build.gradle @@ -32,8 +32,6 @@ dependencies { implementation project(':code:features-index:index-journal') implementation project(':code:features-index:domain-ranking') - implementation project(':code:services-core:index-service') - implementation libs.bundles.slf4j implementation libs.guice implementation libs.bundles.mariadb From 302c53a8e70c9e37c03e0d2650d4083f370bf647 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jan 2024 13:19:59 +0100 Subject: [PATCH 14/14] (build) Enable reproducible builds in build.gradle Settings for enabling reproducible builds for all subprojects were added to improve build consistency. This includes discarding file timestamps and ordering files reproducibly. This is primarily of help for docker, since it uses hashes to determine if a file or image layer has changed. 
--- build.gradle | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index d7f74837..4bda69de 100644 --- a/build.gradle +++ b/build.gradle @@ -11,8 +11,8 @@ version 'SNAPSHOT' compileJava.options.encoding = "UTF-8" compileTestJava.options.encoding = "UTF-8" -// Enable preview features for the entire project subprojects.forEach {it -> + // Enable preview features for the entire project it.tasks.withType(JavaCompile).configureEach { options.compilerArgs += ['--enable-preview'] } @@ -22,6 +22,12 @@ subprojects.forEach {it -> it.tasks.withType(Test).configureEach { jvmArgs += ['--enable-preview'] } + + // Enable reproducible builds for the entire project + it.tasks.withType(AbstractArchiveTask).configureEach { + preserveFileTimestamps = false + reproducibleFileOrder = true + } } allprojects {