From fa9b4e4352254396e0e76a6373e9161c3736ce00 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 12 Feb 2023 10:57:07 +0100 Subject: [PATCH] A tiny release between crawls (#138) Bringing online new ranking changes Co-authored-by: Viktor Lofgren Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/138 --- marginalia_nu/build.gradle | 4 +- .../util/BrailleBlockPunchCards.java | 4 +- .../java/nu/marginalia/util/ParallelPipe.java | 2 +- .../nu/marginalia/util/RandomWriteFunnel.java | 1 - .../java/nu/marginalia/util/StringPool.java | 54 +- .../util/array/algo/IntArrayBase.java | 13 +- .../util/array/algo/LongArrayBase.java | 1 + .../util/array/delegate/ShiftedIntArray.java | 17 + .../util/array/delegate/ShiftedLongArray.java | 19 + .../util/array/page/PagingIntArray.java | 22 + .../util/array/page/PagingLongArray.java | 33 + .../marginalia/util/bigstring/BigString.java | 17 + .../util/bigstring/CompressedBigString.java | 39 + .../util/bigstring/PlainBigString.java | 26 + .../marginalia/util/dict/DictionaryData.java | 4 - .../marginalia/util/dict/DictionaryMap.java | 11 + ...Map.java => OffHeapDictionaryHashMap.java} | 7 +- .../util/dict/OnHeapDictionaryMap.java | 23 + .../util/gregex/GuardedRegexFactory.java | 12 - .../util/language/DocumentDebugger.java | 5 +- .../util/language/WordPatterns.java | 18 +- .../processing/DocumentKeywordExtractor.java | 23 +- .../language/processing/KeywordCounter.java | 4 +- .../language/processing/KeywordExtractor.java | 106 +- .../util/language/processing/NameCounter.java | 2 +- .../language/processing/SubjectCounter.java | 40 +- .../model/DocumentLanguageData.java | 3 +- .../processing/model/KeywordMetadata.java | 6 +- .../language/processing/model/WordRep.java | 2 +- .../{ => sentence}/SentenceExtractor.java | 314 +--- .../SentenceExtractorStringUtils.java | 93 + .../sentence/SentenceSegmentSplitter.java | 72 + .../util/ranking/BuggyReversePageRank.java | 39 - .../util/ranking/BuggyStandardPageRank.java | 45 - 
.../util/ranking/tool/DedupTool.java | 89 - .../marginalia/wmsa/client/GsonFactory.java | 3 + .../wmsa/configuration/ServiceDescriptor.java | 2 - .../assistant/dict/TermFrequencyDict.java | 43 +- .../wmsa/edge/converting/ConverterMain.java | 7 +- .../converting/LinkKeywordExtractorMain.java | 2 +- .../processor/DocumentProcessor.java | 8 +- .../converting/processor/DomainProcessor.java | 2 +- .../edge/converting/processor/SiteWords.java | 3 +- .../logic/CommonKeywordExtractor.java | 2 +- .../processor/logic/LinkParser.java | 3 + .../edge/crawling/CrawledDomainReader.java | 87 +- .../edge/crawling/blocklist/UrlBlocklist.java | 29 +- .../edge/crawling/model/CrawledDocument.java | 5 +- .../crawling/retreival/CrawlerRetreiver.java | 4 +- .../edge/crawling/retreival/HttpFetcher.java | 5 +- .../wmsa/edge/index/EdgeIndexControl.java | 7 +- .../wmsa/edge/index/EdgeIndexModule.java | 7 - .../wmsa/edge/index/IndexServicesFactory.java | 40 +- .../index/client/EdgeIndexLocalService.java | 12 +- .../edge/index/lexicon/KeywordLexicon.java | 3 +- .../model/EdgePageDocumentsMetadata.java | 25 +- .../wmsa/edge/index/model/QueryLimits.java | 4 + .../wmsa/edge/index/model/QueryStrategy.java | 5 + .../edge/index/postings/DomainRankings.java | 43 + .../index/postings/IndexResultValuator.java | 37 +- .../wmsa/edge/index/postings/SearchIndex.java | 3 +- .../index/postings/SearchIndexControl.java | 6 +- .../forward/ForwardIndexConverter.java | 19 +- .../postings/forward/ForwardIndexReader.java | 15 +- .../forward/ParamMatchingQueryFilter.java | 17 +- .../reverse/ReverseIndexConverter.java | 31 +- .../reverse/ReverseIndexPrioReader.java | 12 - .../postings/reverse/ReverseIndexReader.java | 5 + .../edge/index/query/IndexQueryParams.java | 1 + .../edge/index}/ranking/RankingAlgorithm.java | 139 +- .../edge/index/ranking/ReversePageRank.java} | 10 +- .../edge/index/ranking/StandardPageRank.java} | 11 +- .../accumulator/RankingResultAccumulator.java | 6 + 
.../RankingResultBitSetAccumulator.java | 17 + .../RankingResultHashMapAccumulator.java | 21 + .../RankingResultListAccumulator.java | 24 + .../ranking/data}/RankingDomainData.java | 4 +- .../ranking/data}/RankingDomainFetcher.java | 25 +- ...RankingDomainFetcherForSimilarityData.java | 103 ++ .../ranking/old/OldReversePageRankV2.java | 2 +- .../index}/ranking/old/StandardPageRank.java | 5 +- .../tool/CreateBrowseDomainRanksTool.java} | 54 +- .../index}/ranking/tool/PerusePageRankV2.java | 10 +- .../ranking/tool/PrintDomainRanksTool.java | 67 + .../ranking/tool/UpdateDomainRanksTool.java} | 26 +- .../svc/EdgeIndexDomainQueryService.java | 4 +- .../index/svc/EdgeIndexLexiconService.java | 6 +- .../edge/index/svc/EdgeIndexQueryService.java | 17 +- .../index/svc/EdgeIndexSearchSetsService.java | 202 +- .../index/svc/searchset/RankingSearchSet.java | 33 +- .../svc/searchset/SearchSetIdentifier.java | 7 + .../index/svc/searchset/SmallSearchSet.java | 4 +- .../StackOverflowPostProcessor.java | 2 +- .../wikipedia/WikipediaProcessor.java | 2 +- .../wmsa/edge/model/EdgeDomain.java | 35 +- .../search/EdgeSearchResultKeywordScore.java | 5 + .../model/search/EdgeSearchSpecification.java | 10 +- .../wmsa/edge/search/query/QueryFactory.java | 47 +- .../wmsa/edge/search/query/QueryParser.java | 7 + .../wmsa/edge/search/query/QueryVariants.java | 12 +- .../svc/EdgeSearchQueryIndexService.java | 14 +- .../wmsa/edge/tools/AdblockTesterTool.java | 2 +- .../edge/tools/ConverterLogicTestTool.java | 4 +- .../edge/tools/CrawlDataExtractorTool.java | 2 +- .../wmsa/renderer/RendererService.java | 9 - .../wmsa/renderer/SmhiRendererService.java | 82 - .../wmsa/renderer/client/RendererClient.java | 15 - .../request/smhi/RenderSmhiIndexReq.java | 13 - .../request/smhi/RenderSmhiPrognosReq.java | 11 - .../wmsa/smhi/SmhiScraperService.java | 79 - .../marginalia/wmsa/smhi/model/Parameter.java | 9 - .../nu/marginalia/wmsa/smhi/model/Plats.java | 42 - .../marginalia/wmsa/smhi/model/Platser.java | 
16 - .../wmsa/smhi/model/PrognosData.java | 41 - .../marginalia/wmsa/smhi/model/Tidpunkt.java | 75 - .../wmsa/smhi/model/dyn/Dygnsdata.java | 40 - .../wmsa/smhi/model/index/IndexPlats.java | 13 - .../wmsa/smhi/model/index/IndexPlatser.java | 28 - .../wmsa/smhi/scraper/PlatsReader.java | 44 - .../wmsa/smhi/scraper/SmhiScraperMain.java | 32 - .../wmsa/smhi/scraper/SmhiScraperModule.java | 12 - .../smhi/scraper/crawler/SmhiBackendApi.java | 88 - .../smhi/scraper/crawler/SmhiCrawler.java | 106 -- .../crawler/entity/SmhiEntityStore.java | 62 - .../templates/edge/parts/search-footer.hdb | 3 + .../wmsa/edge/crawling/LinkParserTest.java | 1 + .../edge/crawling/SentenceExtractorTest.java | 26 +- .../model/EdgePageDocumentsMetadataTest.java | 26 +- .../forward/ForwardIndexConverterTest.java | 14 +- .../reverse/ReverseIndexConverterTest.java | 13 +- .../reverse/ReverseIndexConverterTest2.java | 9 +- .../service/EdgeIndexIntegrationTest.java | 25 +- .../EdgeIndexIntegrationTestModule.java | 7 +- .../svc/searchset/RankingSearchSetTest.java | 37 + .../integration/arxiv/ArxivParserTest.java | 2 +- .../stackoverflow/StackOverflowPostsTest.java | 2 +- .../integration/wikipedia/WikipediaTest.java | 2 +- .../edge/search/query/QueryVariantsTest.java | 2 +- .../scraper/crawler/SmhiBackendApiTest.java | 31 - third_party/README.md | 8 +- .../com/github/datquocnguyen/FWObject.java | 9 + .../github/datquocnguyen/RDRPOSTagger.java | 4 +- .../java/com/github/datquocnguyen/Utils.java | 8 +- .../com/google/gson/stream/JsonReader.java | 1637 +++++++++++++++++ .../jdkoverride/LargeLineBufferedReader.java | 559 ++++++ 145 files changed, 3896 insertions(+), 1952 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java rename 
marginalia_nu/src/main/java/nu/marginalia/util/dict/{DictionaryHashMap.java => OffHeapDictionaryHashMap.java} (96%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java rename marginalia_nu/src/main/java/nu/marginalia/util/language/processing/{ => sentence}/SentenceExtractor.java (54%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/RankingAlgorithm.java (67%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/BetterReversePageRank.java => wmsa/edge/index/ranking/ReversePageRank.java} (79%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/BetterStandardPageRank.java => wmsa/edge/index/ranking/StandardPageRank.java} (77%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java create mode 100644 
marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking => wmsa/edge/index/ranking/data}/RankingDomainData.java (87%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking => wmsa/edge/index/ranking/data}/RankingDomainFetcher.java (84%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/old/OldReversePageRankV2.java (99%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/old/StandardPageRank.java (98%) rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/tool/UpdateDomainRanksTool.java => wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java} (54%) rename marginalia_nu/src/main/java/nu/marginalia/{util => wmsa/edge/index}/ranking/tool/PerusePageRankV2.java (97%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java rename marginalia_nu/src/main/java/nu/marginalia/{util/ranking/tool/UpdateDomainRanksTool2.java => wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java} (75%) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/SmhiRendererService.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiIndexReq.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiPrognosReq.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/SmhiScraperService.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Parameter.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Plats.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Platser.java delete mode 
100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/PrognosData.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Tidpunkt.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/dyn/Dygnsdata.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlats.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlatser.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/PlatsReader.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperMain.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperModule.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApi.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiCrawler.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/entity/SmhiEntityStore.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApiTest.java create mode 100644 third_party/src/main/java/com/google/gson/stream/JsonReader.java create mode 100644 third_party/src/main/java/jdkoverride/LargeLineBufferedReader.java diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index e1c96de0..638c1e30 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -106,8 +106,9 @@ dependencies { implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30' - implementation 'com.syncthemall:boilerpipe:1.2.2' implementation 'com.github.luben:zstd-jni:1.5.2-2' + implementation 'org.lz4:lz4-java:1.8.0' + implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0' implementation 
'de.rototor.jeuclid:jeuclid-core:3.1.14' @@ -126,7 +127,6 @@ dependencies { implementation 'org.roaringbitmap:RoaringBitmap:0.9.32' implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29' - implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0' testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java b/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java index e0a3c9db..ebf139d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java @@ -2,11 +2,13 @@ package nu.marginalia.util; public class BrailleBlockPunchCards { + private static final char brailleBlockBase = '\u2800'; + public static String printBits(int val, int bits) { StringBuilder builder = new StringBuilder(); for (int b = 0; b < bits; b+=8, val>>>=8) { - builder.append((char)('\u2800'+bin2brail(val))); + builder.append((char)(brailleBlockBase + bin2brail(val))); } return builder.toString(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java b/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java index fd1ae119..853af8fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/ParallelPipe.java @@ -42,7 +42,7 @@ public abstract class ParallelPipe { @SneakyThrows private void runProcessThread() { while (expectingInput || !inputs.isEmpty()) { - var in = inputs.poll(1, TimeUnit.SECONDS); + var in = inputs.poll(10, TimeUnit.SECONDS); if (in != null) { try { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java index a52f9e63..4e21c76e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/RandomWriteFunnel.java @@ -108,7 +108,6 @@ public class RandomWriteFunnel implements AutoCloseable { private void eval(ByteBuffer dest) throws IOException { flushBuffer(); - channel.force(false); channel.position(0); buffer.clear(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java index 5911a497..7ab9b651 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java @@ -1,20 +1,33 @@ package nu.marginalia.util; +import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; + +import java.util.Arrays; import java.util.HashMap; public class StringPool { - private final HashMap words; - public StringPool() { - this.words = new HashMap<>(1000); + private final HashMap words; + private final Object2LongOpenHashMap ages; + private final int maxCap; + + long idx; + + private StringPool(int capacity, int maxCap) { + this.ages = new Object2LongOpenHashMap<>(capacity); + this.words = new HashMap<>(capacity); + this.maxCap = maxCap; } - public StringPool(int capacity) { - words = new HashMap<>(capacity); + public static StringPool create(int capacity) { + return new StringPool(capacity, capacity * 10); } public String internalize(String str) { + prune(); + final String ret = words.putIfAbsent(str, str); + ages.put(ret, idx++); if (null == ret) return str; @@ -22,6 +35,37 @@ public class StringPool { return ret; } + public String[] internalize(String[] str) { + + for (int i = 0; i < str.length; i++) { + str[i] = internalize(str[i]); + } + + return str; + } + + public void prune() { + + if (words.size() < maxCap) + return; + + long[] ageValues = ages.values().toLongArray(); + Arrays.sort(ageValues); + + long cutoff = ageValues[ageValues.length - maxCap / 10]; + + words.clear(); + ages.forEach((word, cnt) -> { + if (cnt >= cutoff) { + words.put(word, word); + 
} + }); + ages.clear(); + words.forEach((w,w2) -> { + ages.put(w, idx); + }); + } + public void flush() { words.clear(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java index bf5249a6..94e462b7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/IntArrayBase.java @@ -18,22 +18,14 @@ public interface IntArrayBase extends BulkTransferArray { } } - default void increment(long pos) { - set(pos, get(pos) + 1); - } - default void swap(long pos1, long pos2) { int tmp = get(pos1); set(pos1, get(pos2)); set(pos2, tmp); } - default void swapn(int n, long pos1, long pos2) { - for (int i = 0; i < n; i++) { - int tmp = get(pos1+i); - set(pos1+i, get(pos2+i)); - set(pos2+i, tmp); - } + default void increment(long pos) { + set(pos, get(pos) + 1); } default int getAndIncrement(long pos) { @@ -47,6 +39,7 @@ public interface IntArrayBase extends BulkTransferArray { set(start+i, buffer.get(i + bufferStart)); } } + default void get(long start, long end, IntBuffer buffer, int bufferStart) { for (int i = 0; i < (end-start); i++) { buffer.put(i + bufferStart, get(start + i)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java index 508fdf9a..216e089b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java @@ -28,6 +28,7 @@ public interface LongArrayBase extends BulkTransferArray { set(pos2, tmp); } + /** Behavior not defined for overlapping ranges */ default void swapn(int n, long pos1, long pos2) { for (int i = 0; i < n; i++) { long tmp = get(pos1+i); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java index b7dc343e..a920c99a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedIntArray.java @@ -2,6 +2,7 @@ package nu.marginalia.util.array.delegate; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.algo.SortingContext; import nu.marginalia.util.array.buffer.IntQueryBuffer; import nu.marginalia.util.array.functional.IntBinaryIOOperation; import nu.marginalia.util.array.functional.IntIOTransformer; @@ -61,6 +62,16 @@ public class ShiftedIntArray implements IntArray { delegate.get(shift+start, shift+end, buffer); } + @Override + public int getAndIncrement(long pos) { + return delegate.getAndIncrement(shift + pos); + } + + @Override + public void fill(long start, long end, int val) { + delegate.fill(start + shift, end + shift, val); + } + @Override public long size() { return size; @@ -97,6 +108,12 @@ public class ShiftedIntArray implements IntArray { return delegate.isSorted(shift + start, shift + end); } + + public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException { + delegate.sortLargeSpan(ctx, start, end); + } + + public long search(int key) { if (size < 128) { return linearSearch(key); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java index cc6386d6..53a4f89b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/delegate/ShiftedLongArray.java @@ -3,6 +3,7 @@ package nu.marginalia.util.array.delegate; import com.upserve.uppend.blobs.NativeIO; import 
nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.algo.LongArraySearch; +import nu.marginalia.util.array.algo.SortingContext; import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.functional.LongBinaryIOOperation; import nu.marginalia.util.array.functional.LongIOTransformer; @@ -62,6 +63,16 @@ public class ShiftedLongArray implements LongArray { delegate.get(shift+start, shift+end, buffer); } + @Override + public long getAndIncrement(long pos) { + return delegate.getAndIncrement(shift + pos); + } + + @Override + public void fill(long start, long end, long val) { + delegate.fill(start + shift, end + shift, val); + } + @Override public long size() { return size; @@ -106,6 +117,14 @@ public class ShiftedLongArray implements LongArray { return delegate.isSortedN(sz, shift + start, shift + end); } + public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException { + delegate.sortLargeSpanN(ctx, sz, start, end); + } + + public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException { + delegate.sortLargeSpan(ctx, start, end); + } + public long searchN(int sz, long key) { if (size < 128) { return linearSearchN(sz, key); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java index 9fdbd21d..6b44fecb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingIntArray.java @@ -2,6 +2,7 @@ package nu.marginalia.util.array.page; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.array.IntArray; +import nu.marginalia.util.array.algo.SortingContext; import nu.marginalia.util.array.buffer.IntQueryBuffer; import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate; import nu.marginalia.util.array.functional.IntBinaryIOOperation; @@ 
-113,6 +114,11 @@ public class PagingIntArray extends AbstractPagingArray } } + @Override + public int getAndIncrement(long pos) { + return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos)); + } + @Override public void get(long start, long end, int[] buffer) { if (partitioningScheme.isSamePage(start, end)) { @@ -272,6 +278,22 @@ public class PagingIntArray extends AbstractPagingArray } } + + public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff); + } + } + else { + defaults.sortLargeSpan(ctx, start, end); + } + } + + public void write(Path fileName) throws IOException { try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { for (int i = 0; i < pages.length; i++) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java index e7f0a983..597979ef 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/PagingLongArray.java @@ -2,6 +2,7 @@ package nu.marginalia.util.array.page; import com.upserve.uppend.blobs.NativeIO; import nu.marginalia.util.array.LongArray; +import nu.marginalia.util.array.algo.SortingContext; import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate; import nu.marginalia.util.array.functional.LongBinaryIOOperation; @@ -118,6 +119,11 @@ public class PagingLongArray extends AbstractPagingArray sOff) { + pages[partitioningScheme.getPage(start)].sortLargeSpanN(ctx, sz, sOff, eOff); + 
} + } + else { + defaults.sortLargeSpanN(ctx, sz, start, end); + } + } + + public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException { + if (partitioningScheme.isSamePage(start, end)) { + int sOff = partitioningScheme.getOffset(start); + int eOff = partitioningScheme.getEndOffset(start, end); + + if (eOff > sOff) { + pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff); + } + } + else { + defaults.sortLargeSpan(ctx, start, end); + } + } public void write(Path fileName) throws IOException { try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java new file mode 100644 index 00000000..48c4c053 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/BigString.java @@ -0,0 +1,17 @@ +package nu.marginalia.util.bigstring; + +public interface BigString { + static BigString encode(String stringValue) { + if (stringValue.length() > 64) { + return new CompressedBigString(stringValue); + } + else { + return new PlainBigString(stringValue); + } + } + String decode(); + + byte[] getBytes(); + + int length(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java new file mode 100644 index 00000000..1b84e576 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/CompressedBigString.java @@ -0,0 +1,39 @@ +package nu.marginalia.util.bigstring; + +import net.jpountz.lz4.LZ4Compressor; +import net.jpountz.lz4.LZ4Factory; +import net.jpountz.lz4.LZ4FastDecompressor; + +import java.nio.charset.StandardCharsets; + +public class CompressedBigString implements BigString { + private final int originalSize; + private final int length; + private final byte[] 
encoded; + + private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();; + private static final LZ4Compressor compressor = lz4Factory.fastCompressor(); + private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor(); + + public CompressedBigString(String stringValue) { + byte[] byteValue = stringValue.getBytes(StandardCharsets.UTF_16); + originalSize = byteValue.length; + encoded = compressor.compress(byteValue); + length = stringValue.length(); + } + + @Override + public String decode() { + return new String(getBytes(), StandardCharsets.UTF_16); + } + + @Override + public byte[] getBytes() { + return decompressor.decompress(encoded, originalSize); + } + + @Override + public int length() { + return length; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java new file mode 100644 index 00000000..5af3a5c8 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/bigstring/PlainBigString.java @@ -0,0 +1,26 @@ +package nu.marginalia.util.bigstring; + +import java.nio.charset.StandardCharsets; + +public class PlainBigString implements BigString { + private final String value; + + public PlainBigString(String value) { + this.value = value; + } + + @Override + public String decode() { + return value; + } + + @Override + public byte[] getBytes() { + return value.getBytes(StandardCharsets.UTF_8); + } + + @Override + public int length() { + return value.length(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 9e89b730..492417a0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -1,8 +1,5 @@ package nu.marginalia.util.dict; -import org.slf4j.Logger; -import 
org.slf4j.LoggerFactory; - import java.nio.ByteBuffer; import java.nio.LongBuffer; import java.util.ArrayList; @@ -10,7 +7,6 @@ import java.util.ArrayList; public class DictionaryData { private final int DICTIONARY_BANK_SIZE; - private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class); private final ArrayList banks = new ArrayList<>(100); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java index fad45130..fb13893e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java @@ -1,6 +1,17 @@ package nu.marginalia.util.dict; public interface DictionaryMap { + int NO_VALUE = Integer.MIN_VALUE; + + static DictionaryMap create() { + if (Boolean.getBoolean("small-ram")) { + return new OnHeapDictionaryMap(); + } + else { + return new OffHeapDictionaryHashMap(1L << 31); + } + } + int size(); int put(long key); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java rename to marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java index f66599d3..f906c45a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java @@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize; * Spiritually influenced by GNU Trove's hash maps * LGPL 2.1 */ -public class DictionaryHashMap implements DictionaryMap { - private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class); +public class OffHeapDictionaryHashMap implements DictionaryMap { + private static final 
Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class); private static final Gauge probe_count_metrics = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count") .register(); private final int bufferCount; private final IntBuffer[] buffers; - public static final int NO_VALUE = Integer.MIN_VALUE; private final DictionaryData dictionaryData; @@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap { private final AtomicInteger sz = new AtomicInteger(0); - public DictionaryHashMap(long sizeMemory) { + public OffHeapDictionaryHashMap(long sizeMemory) { final int intSize = 4; bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java new file mode 100644 index 00000000..a9f4063f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java @@ -0,0 +1,23 @@ +package nu.marginalia.util.dict; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; + +public class OnHeapDictionaryMap implements DictionaryMap { + private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f); + + @Override + public int size() { + return entries.size(); + } + + @Override + public int put(long key) { + entries.putIfAbsent(key, entries.size()); + return get(key); + } + + @Override + public int get(long key) { + return entries.getOrDefault(key, NO_VALUE); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java index 16dd6e59..800fc621 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/gregex/GuardedRegexFactory.java @@ -19,9 +19,6 @@ public class GuardedRegexFactory { public static GuardedRegex contains(String 
substring, @Language("RegExp") String regex) { return new GuardedRegexContains(substring, regex); } - public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) { - return new GuardedRegexMinLength(minLength, regex); - } private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex { public GuardedRegexContains(String contains, String pattern) { @@ -32,15 +29,6 @@ public class GuardedRegexFactory { return s.contains(contains) && pattern.matcher(s).find(); } } - private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex { - public GuardedRegexMinLength(int minLength, String pattern) { - this(minLength, Pattern.compile(pattern)); - } - - public boolean test(String s) { - return s.length() >= minLength && pattern.matcher(s).find(); - } - } private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex { public GuardedRegexStartsWith(String start, String pattern) { this(start, Pattern.compile(pattern)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java index fb081c95..a693dcdc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java @@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.KeywordCounter; import nu.marginalia.util.language.processing.KeywordExtractor; import nu.marginalia.util.language.processing.NameCounter; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordRep; import 
nu.marginalia.util.language.processing.model.tag.WordSeparator; @@ -68,9 +68,6 @@ public class DocumentDebugger { Set reps = new HashSet<>(); -// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); -// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed)); - try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) { for (var sent : languageData.titleSentences) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index 4766706e..c2fc0045 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -1,5 +1,7 @@ package nu.marginalia.util.language; +import org.apache.commons.lang3.StringUtils; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -123,14 +125,24 @@ public class WordPatterns { if (!filter(s)) { return true; } - if (isTopWord(s)) { + + String sLc; + if (StringUtils.isAllLowerCase(s)) { + sLc = s; + } + else { + sLc = s.toLowerCase(); + } + + if (isTopWord(sLc)) { return true; } + return false; } - public static boolean isTopWord(String s) { - return topWords.contains(s.toLowerCase()); + public static boolean isTopWord(String strLowerCase) { + return topWords.contains(strLowerCase); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index fddd7e28..557e8d58 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -35,9 +35,7 @@ public class DocumentKeywordExtractor { List titleWords = extractTitleWords(documentLanguageData); List wordsNamesAll 
= nameCounter.count(documentLanguageData, 2); - List subjects = subjectCounter.count(documentLanguageData); - - tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); + List subjects = subjectCounter.count(keywordMetadata, documentLanguageData); for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); @@ -59,11 +57,12 @@ public class DocumentKeywordExtractor { getWordPositions(keywordMetadata, documentLanguageData); - List wordsNamesAll = nameCounter.count(documentLanguageData, 2); - List subjects = subjectCounter.count(documentLanguageData); - List wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); + List wordsNamesAll = nameCounter.count(documentLanguageData, 2); + List subjects = subjectCounter.count(keywordMetadata, documentLanguageData); + + for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed); @@ -94,7 +93,7 @@ public class DocumentKeywordExtractor { ret.merge(word.stemmed(), posBit, this::bitwiseOr); } - for (var span : keywordExtractor.getNames(sent)) { + for (var span : keywordExtractor.getProperNames(sent)) { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } } @@ -108,7 +107,7 @@ public class DocumentKeywordExtractor { ret.merge(word.stemmed(), posBit, this::bitwiseOr); } - for (var span : keywordExtractor.getNames(sent)) { + for (var span : keywordExtractor.getProperNames(sent)) { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } @@ -155,16 +154,16 @@ public class DocumentKeywordExtractor { if (!word.isStopWord()) { String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); if (WordPatterns.singleWordQualitiesPredicate.test(w)) { - wordsBuilder.add(w, metadata.forWord(flagsTemplate, 
word.stemmed())); + wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed())); } } } - for (var names : keywordExtractor.getNames(sent)) { + for (var names : keywordExtractor.getProperNames(sent)) { var rep = new WordRep(sent, names); String w = AsciiFlattener.flattenUnicode(rep.word); - wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed)); + wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed)); } } @@ -218,7 +217,7 @@ public class DocumentKeywordExtractor { continue; } - wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta); + wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index fa24cbcd..2ee90f6b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -43,8 +43,8 @@ public class KeywordCounter { counts.mergeInt(rep.stemmed, 1, Integer::sum); - var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500)); - if (instanceSet.size() < 250) { + var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16)); + if (instanceSet.size() < 4) { instanceSet.add(rep); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java index 7e56830e..8673ac4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java @@ -7,14 +7,12 @@ import 
nu.marginalia.util.language.processing.model.tag.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Set; public class KeywordExtractor { - public WordSpan[] getNames(DocumentSentence sentence) { - List spans = new ArrayList<>(sentence.length()); + public WordSpan[] getProperNames(DocumentSentence sentence) { + List spans = new ArrayList<>(2 * sentence.length()); for (int i = 0; i < sentence.length(); i++) { if (isProperNoun(i, sentence)) @@ -57,27 +55,73 @@ public class KeywordExtractor { return spans.toArray(WordSpan[]::new); } - public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { - if (sentence.keywords != null) { - return sentence.keywords.get(); - } - List spans = new ArrayList<>(sentence.length()); - Set topWords = Collections.emptySet(); + /* Collects noun spans of one to four words: every span ends in a noun and, when longer than one word, starts with a noun or a "JJ"-tagged word; candidates containing a comma separator are skipped. */ public WordSpan[] getNouns(DocumentSentence sentence) { + List spans = new ArrayList<>(2 * sentence.length()); for (int i = 0; i < sentence.length(); i++) { - if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords)) + if (isNoun(i, sentence)) spans.add(new WordSpan(i, i+1)); } for (int i = 1; i < sentence.length(); i++) { if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords)) { - if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) + if (isNoun(i, sentence) + && (isNoun(i-1, sentence) || "JJ".equals(sentence.posTags[i-1]))) { /* the word at i must itself be a noun; the "JJ" alternative applies only to the preceding word, as in the four-word case below */ + spans.add(new WordSpan(i - 1, i + 1)); + } + } + + for (int i = 2; i < sentence.length(); i++) { + if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + + if ((isNoun(i, sentence)) + && (isJoiner(sentence, i-1) || isNoun(i-1, sentence)) + && (isNoun(i-2, sentence) || "JJ".equals(sentence.posTags[i-2]))) /* all three conditions must hold; the "JJ" alternative applies only to the first word of the span */ + spans.add(new WordSpan(i-2, i+1)); + } + + for (int i = 3; i < sentence.length(); i++) { + if
(sentence.separators[i-3] == WordSeparator.COMMA) { continue; } + if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } + if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + + if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) { + if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence)) + spans.add(new WordSpan(i-3, i+1)); + else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) + spans.add(new WordSpan(i-3, i+1)); + else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence)) + && (isJoiner(sentence, i-2)||isNoun(i-2, sentence))) + spans.add(new WordSpan(i-3, i+1)); + } + } + + return spans.toArray(WordSpan[]::new); + } + + + public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { + if (sentence.keywords != null) { + return sentence.keywords.get(); + } + List spans = new ArrayList<>(2 * sentence.length()); + + for (int i = 0; i < sentence.length(); i++) { + if (isName(i, sentence) || isTopAdj(i, sentence)) + spans.add(new WordSpan(i, i+1)); + } + + for (int i = 1; i < sentence.length(); i++) { + if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + + if (isName(i, sentence)) { + if (isName(i - 1, sentence) || isTopAdj(i-1, sentence)) spans.add(new WordSpan(i - 1, i + 1)); } - if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) { + if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) { spans.add(new WordSpan(i - 1, i + 1)); } } @@ -86,16 +130,16 @@ public class KeywordExtractor { if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords)) { - if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) - && (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) { + if (isName(i, sentence)) { + if ((isName(i-1, sentence) || isTopAdj(i-1, sentence)) + && (isName(i-2,
sentence) || isTopAdj(i-2, sentence))) { spans.add(new WordSpan(i - 2, i + 1)); } else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) { spans.add(new WordSpan(i - 2, i + 1)); } } - else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) { + else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) { spans.add(new WordSpan(i - 2, i + 1)); } } @@ -105,10 +149,10 @@ public class KeywordExtractor { if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords) && - (isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) && - (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) && - (isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) { + if (isName(i, sentence) && + (isName(i-1, sentence) || isTopAdj(i-1, sentence)) && + (isName(i-2, sentence) || isTopAdj(i-2, sentence)) && + (isName(i-3, sentence) || isTopAdj(i-3, sentence))) { spans.add(new WordSpan(i - 3, i + 1)); } else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { @@ -134,7 +178,9 @@ public class KeywordExtractor { public boolean isProperNoun(int i, DocumentSentence sent) { return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]); } - + public boolean isNoun(int i, DocumentSentence sent) { + return sent.posTags[i].startsWith("NN"); + } public boolean isJoiner(DocumentSentence sent, int i) { if(sent.posTags[i].equals("IN")) { return true; @@ -183,21 +229,13 @@ public class KeywordExtractor { return true; } - private boolean isName(int i, DocumentSentence sentence, Set topWords) { - if (!topWords.isEmpty()) { - String posTag = sentence.posTags[i]; - String word = sentence.stemmedWords[i]; - - return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && 
!sentence.isStopWord(i)); - } - - + private boolean isName(int i, DocumentSentence sentence) { String posTag = sentence.posTags[i]; - return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i); + return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i); } - private boolean isTopAdj(int i, DocumentSentence sentence, Set topWords) { + private boolean isTopAdj(int i, DocumentSentence sentence) { String posTag = sentence.posTags[i]; return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG")); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java index 476b7b5d..221790d6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java @@ -20,7 +20,7 @@ public class NameCounter { for (int i = 0; i < dld.sentences.length; i++) { DocumentSentence sent = dld.sentences[i]; - var keywords = keywordExtractor.getNames(sent); + var keywords = keywordExtractor.getProperNames(sent); for (var span : keywords) { if (span.size() <= 1) continue; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java index af774898..b0f46f30 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java @@ -1,9 +1,11 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import 
nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import org.apache.commons.lang3.StringUtils; import java.util.*; import java.util.stream.Collectors; @@ -23,13 +25,13 @@ public class SubjectCounter { // Greeks bearing gifts -> Greeks // Steve McQueen drove fast | cars -> Steve McQueen - public List count(DocumentLanguageData dld) { + public List count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { Map counts = new HashMap<>(); Map> instances = new HashMap<>(); for (var sentence : dld.sentences) { - for (WordSpan kw : keywordExtractor.getNames(sentence)) { + for (WordSpan kw : keywordExtractor.getNouns(sentence)) { if (kw.end + 2 >= sentence.length()) { continue; } @@ -46,20 +48,46 @@ public class SubjectCounter { String stemmed = rep.stemmed; - counts.merge(stemmed, -1, Integer::sum); instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep); } } } - int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0); + Map scores = new HashMap<>(instances.size()); + for (String stemmed : instances.keySet()) { + scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed)); + } - return counts.entrySet().stream().sorted(Map.Entry.comparingByValue()) - .filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75) + return scores.entrySet().stream() + .filter(e -> e.getValue() >= 150) .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream()) .collect(Collectors.toList()); } + private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) { + if (stemmed.contains("_")) { + int sum = 0; + String[] parts = StringUtils.split(stemmed, '_'); + + if (parts.length == 0) { + return 0; + } + + for (String part : parts) { + sum += getTermTfIdf(keywordMetadata, part); + } + + return sum / parts.length; + } + + var meta = keywordMetadata.wordsTfIdf().get(stemmed); + if (meta != null) { + return meta.tfIdfNormalized(); + } + + return 0; + } + 
private boolean isDetOrAdverbOrVerb(String posTag) { return "DT".equals(posTag) // determinant || "RB".equals(posTag) // adverb diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java index 7d829fd6..89b95fd0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java @@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.AllArgsConstructor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import java.util.Arrays; import java.util.stream.Stream; /** - * @see nu.marginalia.util.language.processing.SentenceExtractor + * @see SentenceExtractor */ @AllArgsConstructor public class DocumentLanguageData { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java index 1018b5cf..58e53551 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/KeywordMetadata.java @@ -17,9 +17,6 @@ public record KeywordMetadata(HashSet titleKeywords, ) { - private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0); - private static final int TF_IDF_HIGH_LIMIT = 64; - public KeywordMetadata(EnumSet flags) { this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50), new HashMap<>(15_000), @@ -31,7 +28,8 @@ public record KeywordMetadata(HashSet titleKeywords, this(EnumSet.noneOf(EdgePageWordFlags.class)); } - public long forWord(EnumSet flagsTemplate, String stemmed) 
{ + private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0); + public long getMetadataForWord(EnumSet flagsTemplate, String stemmed) { KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); EnumSet flags = flagsTemplate.clone(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java index c59e13f8..5f87894f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/WordRep.java @@ -24,7 +24,7 @@ public class WordRep implements Comparable { @Override public int compareTo(@NotNull WordRep o) { - return stemmed.compareTo(o.stemmed); + return word.compareTo(o.word); } @Override diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java rename to marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java index 08886928..2957eaa9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java @@ -1,16 +1,14 @@ -package nu.marginalia.util.language.processing; +package nu.marginalia.util.language.processing.sentence; import com.github.datquocnguyen.RDRPOSTagger; import com.github.jknack.handlebars.internal.lang3.StringUtils; -import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TObjectIntHashMap; -import lombok.AllArgsConstructor; -import lombok.Getter; import lombok.SneakyThrows; +import 
nu.marginalia.util.StringPool; import nu.marginalia.util.language.conf.LanguageModels; +import nu.marginalia.util.language.processing.HtmlTagCleaner; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.DocumentSentence; -import nu.marginalia.util.language.processing.model.tag.WordSeparator; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; @@ -24,25 +22,22 @@ import javax.inject.Inject; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; +import java.util.*; import java.util.regex.Pattern; -import static nu.marginalia.util.language.WordPatterns.*; - public class SentenceExtractor { private SentenceDetectorME sentenceDetector; private final RDRPOSTagger rdrposTagger; private final PorterStemmer porterStemmer = new PorterStemmer(); - private boolean legacyMode = false; private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner(); + private final ThreadLocal stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000)); + + @SneakyThrows @Inject public SentenceExtractor(LanguageModels models) { try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) { @@ -66,6 +61,22 @@ public class SentenceExtractor { final String text = asText(doc); final DocumentSentence[] textSentences = extractSentencesFromString(text); + String title = getTitle(doc, textSentences); + + TObjectIntHashMap counts = calculateWordCounts(textSentences); + var titleSentences = extractSentencesFromString(title.toLowerCase()); + return new DocumentLanguageData(textSentences, titleSentences, counts); + } + + public DocumentLanguageData extractSentences(String 
text, String title) { + final DocumentSentence[] textSentences = extractSentencesFromString(text); + + TObjectIntHashMap counts = calculateWordCounts(textSentences); + + return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts); + } + + private String getTitle(Document doc, DocumentSentence[] textSentences) { String title = doc.getElementsByTag("title").text() + " . " + Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse(""); @@ -82,34 +93,7 @@ public class SentenceExtractor { } } - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts); - } - - public DocumentLanguageData extractSentences(String text) { - final DocumentSentence[] textSentences = extractSentencesFromString(text); - - String title = ""; - for (DocumentSentence textSentence : textSentences) { - if (textSentence.length() > 0) { - title = textSentence.originalSentence.toLowerCase(); - break; - } - } - - TObjectIntHashMap counts = calculateWordCounts(textSentences); - - return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts); - } - - - public DocumentLanguageData extractSentences(String text, String title) { - final DocumentSentence[] textSentences = extractSentencesFromString(text); - - TObjectIntHashMap counts = calculateWordCounts(textSentences); - - return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts); + return title; } @@ -125,79 +109,95 @@ public class SentenceExtractor { return counts; } - private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); - -// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))"); - - private boolean isBadChar(char c) { - if (c >= 'a' && 
c <= 'z') return false; - if (c >= 'A' && c <= 'Z') return false; - if (c >= '0' && c <= '9') return false; - if ("_#@.".indexOf(c) >= 0) return false; - if (c >= '\u00C0' && c <= '\u00D6') return false; - if (c >= '\u00D8' && c <= '\u00F6') return false; - if (c >= '\u00F8' && c <= '\u00FF') return false; - - return true; - } - private String sanitizeString(String s) { - char[] newChars = new char[s.length()]; - int pi = 0; - - for (int i = 0; i < newChars.length; i++) { - char c = s.charAt(i); - if (!isBadChar(c)) { - newChars[pi++] = c; - } - else { - newChars[pi++] = ' '; - } - } - - s = new String(newChars, 0, pi); - - if (s.startsWith(".")) { - s = s.substring(1); - if (s.isBlank()) - return ""; - } - return s; - - } - public DocumentSentence extractSentence(String text) { - var wordsAndSeps = splitSegment(text); + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text); var words = wordsAndSeps.words; var seps = wordsAndSeps.separators; - var lc = toLc(wordsAndSeps.words); + var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); return new DocumentSentence( - sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) + SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) ); } - public String normalizeSpaces(String s) { - if (s.indexOf('\t') >= 0) { - s = s.replace('\t', ' '); - } - if (s.indexOf('\n') >= 0) { - s = s.replace('\n', ' '); - } - return s; - } - public DocumentSentence[] extractSentencesFromString(String text) { String[] sentences; - String textNormalizedSpaces = normalizeSpaces(text); + String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); try { sentences = sentenceDetector.sentDetect(textNormalizedSpaces); } catch (Exception ex) { + // shitty fallback logic sentences = StringUtils.split(textNormalizedSpaces, '.'); } + sentences = preCleanSentences(sentences); + + final 
String[][] tokens = new String[sentences.length][]; + final int[][] separators = new int[sentences.length][]; + final String[][] posTags = new String[sentences.length][]; + final String[][] tokensLc = new String[sentences.length][]; + final String[][] stemmedWords = new String[sentences.length][]; + + for (int i = 0; i < tokens.length; i++) { + + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]); + tokens[i] = wordsAndSeps.words; + separators[i] = wordsAndSeps.separators; + if (tokens[i].length > 250) { + tokens[i] = Arrays.copyOf(tokens[i], 250); + separators[i] = Arrays.copyOf(separators[i], 250); + } + for (int j = 0; j < tokens[i].length; j++) { + while (tokens[i][j].endsWith(".")) { + tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); + } + } + } + + var sPool = stringPool.get(); + + for (int i = 0; i < tokens.length; i++) { + tokens[i] = sPool.internalize(tokens[i]); + } + + for (int i = 0; i < tokens.length; i++) { + posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]); + // don't need to internalize this + } + + for (int i = 0; i < tokens.length; i++) { + tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]); + tokensLc[i] = sPool.internalize(tokensLc[i]); + } + + for (int i = 0; i < tokens.length; i++) { + stemmedWords[i] = stemSentence(tokensLc[i]); + stemmedWords[i] = sPool.internalize(stemmedWords[i]); + } + + DocumentSentence[] ret = new DocumentSentence[sentences.length]; + for (int i = 0; i < ret.length; i++) { + String fullString; + + if (i == 0) { + fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]); + } + else { + fullString = ""; + } + + ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); + } + return ret; + } + + private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); + + private String[] preCleanSentences(String[] sentences) { + if (sentences.length > 250) { sentences = 
Arrays.copyOf(sentences, 250); } @@ -212,53 +212,13 @@ public class SentenceExtractor { sentenceList.add(s); } } - sentences = sentenceList.toArray(String[]::new); - - final String[][] tokens = new String[sentences.length][]; - final int[][] separators = new int[sentences.length][]; - final String[][] posTags = new String[sentences.length][]; - final String[][] tokensLc = new String[sentences.length][]; - final String[][] stemmedWords = new String[sentences.length][]; - - for (int i = 0; i < tokens.length; i++) { - - var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]); - tokens[i] = wordsAndSeps.words; - separators[i] = wordsAndSeps.separators; - if (tokens[i].length > 250) { - tokens[i] = Arrays.copyOf(tokens[i], 250); - separators[i] = Arrays.copyOf(separators[i], 250); - } - for (int j = 0; j < tokens[i].length; j++) { - while (tokens[i][j].endsWith(".")) { - tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); - } - } - } - - for (int i = 0; i < tokens.length; i++) { - posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - tokensLc[i] = toLc(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - stemmedWords[i] = stemSentence(tokensLc[i]); - } - - DocumentSentence[] ret = new DocumentSentence[sentences.length]; - for (int i = 0; i < ret.length; i++) { - ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); - } - return ret; + return sentenceList.toArray(String[]::new); } private String[] stemSentence(String[] strings) { String[] stemmed = new String[strings.length]; for (int i = 0; i < stemmed.length; i++) { - var sent = cleanPossessive(strings[i]); + var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]); try { stemmed[i] = porterStemmer.stem(sent); } @@ -269,27 +229,6 @@ public class SentenceExtractor { return stemmed; } - private String cleanPossessive(String s) { - int end = 
s.length(); - - if (s.endsWith("\'")) { - return s.substring(0, end-1); - } else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) { - return s.substring(0, end-2).toLowerCase(); - } - else { - return s; - } - } - - private String[] toLc(String[] words) { - String[] lower = new String[words.length]; - for (int i = 0; i < lower.length; i++) { - lower[i] = cleanPossessive(words[i]).toLowerCase(); - } - return lower; - } - public String asText(Document dc) { tagCleaner.clean(dc); @@ -299,67 +238,6 @@ public class SentenceExtractor { return text.substring(0, (int) (text.length()*0.95)); } - @AllArgsConstructor @Getter - private static class WordsAndSeparators { - String[] words; - int[] separators; - } - private WordsAndSeparators splitSegment(String segment) { - var matcher = wordBreakPattern.matcher(segment); - - List words = new ArrayList<>(segment.length()/6); - TIntArrayList separators = new TIntArrayList(segment.length()/6); - - int start = 0; - int wordStart = 0; - while (wordStart <= segment.length()) { - if (!matcher.find(wordStart)) { - words.add(segment.substring(wordStart)); - separators.add(WordSeparator.SPACE); - break; - } - - if (wordStart != matcher.start()) { - words.add(segment.substring(wordStart, matcher.start())); - separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? 
WordSeparator.SPACE : WordSeparator.COMMA); - } - wordStart = matcher.end(); - } - - String[] parts = words.toArray(String[]::new); - int length = 0; - for (int i = 0; i < parts.length; i++) { - if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) { - parts[i] = null; - } - else { - length++; - } - } - - String[] ret = new String[length]; - int[] seps = new int[length]; - for (int i = 0, j=0; i < parts.length; i++) { - if (parts[i] != null) { - seps[j] = separators.getQuick(i); - ret[j++] = parts[i]; - } - } - - for (int i = 0; i < ret.length; i++) { - if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); } - if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); } - } - return new WordsAndSeparators(ret, seps); - } - - - public boolean isLegacyMode() { - return legacyMode; - } - public void setLegacyMode(boolean legacyMode) { - this.legacyMode = legacyMode; - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java new file mode 100644 index 00000000..08a1605c --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java @@ -0,0 +1,93 @@ +package nu.marginalia.util.language.processing.sentence; + +import java.util.Arrays; +import java.util.Objects; + +public class SentenceExtractorStringUtils { + + public static String sanitizeString(String s) { + char[] newChars = new char[s.length()]; + int pi = 0; + boolean changed = false; + for (int i = 0; i < newChars.length; i++) { + char c = s.charAt(i); + if (!isBadChar(c)) { + newChars[pi++] = c; + } + else { + changed = true; + newChars[pi++] = ' '; + } + } + + if (changed) { + s = new String(newChars, 0, pi); + } + + if (s.startsWith(".")) { + s = 
s.substring(1); + } + + if (s.isBlank()) { + return ""; + } + + return s; + + } + + private static boolean isBadChar(char c) { + if (c >= 'a' && c <= 'z') return false; + if (c >= 'A' && c <= 'Z') return false; + if (c >= '0' && c <= '9') return false; + if ("_#@.".indexOf(c) >= 0) return false; + if (c >= '\u00C0' && c <= '\u00D6') return false; + if (c >= '\u00D8' && c <= '\u00F6') return false; + if (c >= '\u00F8' && c <= '\u00FF') return false; + + return true; + } + + public static String normalizeSpaces(String s) { + if (s.indexOf('\t') >= 0) { + s = s.replace('\t', ' '); + } + if (s.indexOf('\n') >= 0) { + s = s.replace('\n', ' '); + } + return s; + } + + + public static String toLowerCaseStripPossessive(String word) { + String val = stripPossessive(word).toLowerCase(); + + if (Objects.equals(val, word)) { + return word; + } + + return val; + } + + public static String[] toLowerCaseStripPossessive(String[] words) { + String[] lc = new String[words.length]; + Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i])); + return lc; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java new file mode 100644 index 00000000..6a4516cf --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java @@ -0,0 +1,72 @@ +package nu.marginalia.util.language.processing.sentence; + +import gnu.trove.list.array.TIntArrayList; +import lombok.AllArgsConstructor; +import lombok.Getter; +import nu.marginalia.util.language.processing.model.tag.WordSeparator; + +import 
java.util.ArrayList; +import java.util.List; + +import static nu.marginalia.util.language.WordPatterns.*; + +public class SentenceSegmentSplitter { + + + @AllArgsConstructor + @Getter + public static class SeparatedSentence { + String[] words; + int[] separators; + } + + public static SeparatedSentence splitSegment(String segment) { + var matcher = wordBreakPattern.matcher(segment); + + List words = new ArrayList<>(segment.length()/6); + TIntArrayList separators = new TIntArrayList(segment.length()/6); + + int wordStart = 0; + while (wordStart <= segment.length()) { + if (!matcher.find(wordStart)) { + words.add(segment.substring(wordStart)); + separators.add(WordSeparator.SPACE); + break; + } + + if (wordStart != matcher.start()) { + words.add(segment.substring(wordStart, matcher.start())); + separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA); + } + wordStart = matcher.end(); + } + + String[] parts = words.toArray(String[]::new); + int length = 0; + for (int i = 0; i < parts.length; i++) { + if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) { + parts[i] = null; + } + else { + length++; + } + } + + String[] ret = new String[length]; + int[] seps = new int[length]; + for (int i = 0, j=0; i < parts.length; i++) { + if (parts[i] != null) { + seps[j] = separators.getQuick(i); + ret[j++] = parts[i]; + } + } + + for (int i = 0; i < ret.length; i++) { + if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); } + if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); } + } + return new SeparatedSentence(ret, seps); + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java deleted file mode 100644 index 485ba353..00000000 --- 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyReversePageRank.java +++ /dev/null @@ -1,39 +0,0 @@ -package nu.marginalia.util.ranking; - - -public class BuggyReversePageRank extends RankingAlgorithm { - - - public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) { - super(domains, origins); - } - - @Override - RankVector createNewRankVector(RankVector rank) { - - double rankNorm = rank.norm(); - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - - if (links != null && links.size() > 0) { - double newRankValue = 0; - - for (int j = 0; j < links.size(); j++) { - newRankValue += rank.get(links.getQuick(j)) / links.size(); - } - - newRank.set(domainId, 0.85*newRankValue/rankNorm); - } - } - return newRank; - } - - @Override - void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm)); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java deleted file mode 100644 index 836bcdfe..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BuggyStandardPageRank.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.util.ranking; - - -public class BuggyStandardPageRank extends RankingAlgorithm { - - public BuggyStandardPageRank(RankingDomainFetcher domains, String... 
origins) { - super(domains, origins); - } - - @Override - RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) { - RankVector newRank = new RankVector(0); - - for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) { - - var links = linkDataSrc2Dest[domainId]; - double newRankValue = 0; - - if (links != null && links.size() > 0) { - for (int j = 0; j < links.size(); j++) { - int linkedDomain = links.getQuick(j); - - int linkSize = 1; - var bl = linkDataSrc2Dest[linkedDomain]; - if (bl != null) { - linkSize = bl.size(); - } - - newRankValue += rank.get(linkedDomain) / linkSize; - - } - } - - newRank.set(domainId, 0.85 * newRankValue); - } - return newRank; - } - - @Override - void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size())); - vector.incrementAll(0.14*dNorm/vector.size()); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java b/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java deleted file mode 100644 index d6f95f51..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/DedupTool.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.util.ranking.tool; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.SneakyThrows; -import lombok.ToString; -import nu.marginalia.wmsa.configuration.module.DatabaseModule; -import org.mariadb.jdbc.Driver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.*; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class DedupTool { - - private static final Logger logger = LoggerFactory.getLogger(DedupTool.class); - - public Set originDomains = new HashSet<>(); - public Set originDomainIds = new HashSet<>(); - public final long 
domainIdMax = -1; - public int domainCount; - private volatile static int rankMax; - - public int maxId() { - return (int) domainIdMax; - } - public int domainCount() { - return domainCount; - } - - static LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); - volatile static boolean running = true; - - @AllArgsConstructor @ToString @Getter - static class Data { - String url; - int id; - String domain; - } - - @SneakyThrows - public static void main(String... args) { - Driver driver = new Driver(); - var ds = new DatabaseModule().provideConnection(); - - Map>> domainToHashToUrl = new HashMap<>(); - - try (var conn = ds.getConnection(); - var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL"); - var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?") - - ) { - fetchStmt.setFetchSize(10_000); - var rsp = fetchStmt.executeQuery(); - while (rsp.next()) { - domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>()) - .computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5))); - } - - - List updateIds = new ArrayList<>(); - - domainToHashToUrl.forEach((domain, hashes) -> { - hashes.forEach((hash, urls) -> { - if (urls.size() > 1) { - Comparator c = Comparator.comparing(d -> d.domain.length()); - var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length())) - .collect(Collectors.partitioningBy(d -> d.url.endsWith("/"))); - - Stream - .concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1) - .map(Data::getId) - .forEach(updateIds::add); - } - }); - }); - - for (int id : updateIds) { - updateStmt.setInt(1, id); - updateStmt.executeUpdate(); - } - } - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java index c0af0c12..393b2ea5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client; import com.google.gson.*; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; +import nu.marginalia.util.bigstring.BigString; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; @@ -24,6 +25,8 @@ public class GsonFactory { .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer) (json, typeOfT, context) -> new EdgeDomain(json.getAsString())) .registerTypeAdapter(EdgeId.class, (JsonDeserializer>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt())) .registerTypeAdapter(EdgeId.class, (JsonSerializer>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id())) + .registerTypeAdapter(BigString.class, (JsonDeserializer) (json, typeOfT, context) -> BigString.encode(json.getAsString())) + .registerTypeAdapter(BigString.class, (JsonSerializer) (src, typeOfT, context) -> new JsonPrimitive(src.decode())) .serializeSpecialFloatingPointValues() .create(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index 62d57aea..6acbaea6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -13,7 +13,6 @@ import nu.marginalia.wmsa.memex.MemexMain; import nu.marginalia.wmsa.podcasts.PodcastScraperMain; import nu.marginalia.wmsa.renderer.RendererMain; import nu.marginalia.wmsa.resource_store.ResourceStoreMain; -import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain; import 
org.apache.logging.log4j.core.lookup.MainMapLookup; import java.util.Map; @@ -26,7 +25,6 @@ public enum ServiceDescriptor { AUTH("auth", 5003, AuthMain.class), API("api", 5004, ApiMain.class), - SMHI_SCRAPER("smhi-scraper",5012, SmhiScraperMain.class), PODCST_SCRAPER("podcast-scraper", 5013, PodcastScraperMain.class), EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java index 2a29e2a4..4d87ec96 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java @@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict; import ca.rmen.porterstemmer.PorterStemmer; import gnu.trove.map.hash.TLongIntHashMap; +import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.util.language.LanguageFilter; import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter; @@ -18,11 +19,10 @@ import javax.annotation.Nullable; import javax.inject.Inject; import javax.inject.Singleton; import java.io.*; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -101,12 +101,15 @@ public class TermFrequencyDict { fjp.execute(() -> { + TLongHashSet words = new TLongHashSet(10_000); + for 
(var doc : domain.doc) { + if (doc.documentBody == null) continue; docCount.incrementAndGet(); - Document parsed = Jsoup.parse(doc.documentBody); + Document parsed = Jsoup.parse(doc.documentBody.decode()); parsed.body().filter(new DomPruningFilter(0.5)); DocumentLanguageData dld = se.get().extractSentences(parsed); @@ -115,28 +118,30 @@ public class TermFrequencyDict { return; } - Set words = new HashSet<>(10_000); - for (var sent : dld.sentences) { for (var word : sent) { - words.add(word.stemmed()); + words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } } - fjp.execute(() -> { - synchronized (counts) { - for (var word : words) { - counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1); - } - } - }); + synchronized (counts) { + words.forEach(w -> { + counts.adjustOrPutValue(w, 1, 1); + return true; + }); + } + words.clear(); } + + System.out.println(domain.domain + "\t" + counts.size()); }); + + } fjp.shutdown(); - fjp.awaitTermination(10, TimeUnit.SECONDS); + fjp.awaitTermination(10, TimeUnit.DAYS); try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) { synchronized (counts) { @@ -155,14 +160,6 @@ public class TermFrequencyDict { } System.out.println(docCount.get()); -// -// counts.forEachEntry((w,c) -> { -// if (c > 3L) { -// System.out.println(w + ":" + c); -// } -// return true; -// }); - } public static long getStringHash(String s) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java index b0bb7a9e..8b941d92 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterMain.java @@ -46,17 +46,12 @@ public class ConverterMain { InstructionsCompiler compiler, Gson gson ) throws Exception { - - ; - - - logger.info("Starting pipe"); try (WorkLog processLog = 
plan.createProcessWorkLog(); ConversionLog log = new ConversionLog(plan.process.getDir())) { instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson); - var pipe = new ParallelPipe("Crawler", 20, 4, 2) { + var pipe = new ParallelPipe("Crawler", 16, 4, 2) { @Override protected ProcessingInstructions onProcess(CrawledDomain domainData) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index f8de6c0c..f9557c97 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain { for (var doc : crawledDomain.doc) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { - anchorTextExtractor.processDocument(doc.url, doc.documentBody); + anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode()); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 339389fe..71ac3945 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex; import nu.marginalia.util.gregex.GuardedRegexFactory; import nu.marginalia.util.language.LanguageFilter; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import 
nu.marginalia.util.language.processing.model.DocumentLanguageData; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; @@ -178,11 +178,13 @@ public class DocumentProcessor { private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { - if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) { + String documentBody = crawledDocument.documentBody.decode(); + + if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); } - Document doc = Jsoup.parse(crawledDocument.documentBody); + Document doc = Jsoup.parse(documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { // I've never encountered a website where this hasn't been a severe indicator diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index d9ff7ef1..e5ed00e5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -42,7 +42,7 @@ public class DomainProcessor { fixBadCanonicalTags(crawledDomain.doc); - StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size()); + StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size()); for (var doc : crawledDomain.doc) { var processedDoc = documentProcessor.process(doc, crawledDomain); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java index b5a5191f..87e8c931 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/SiteWords.java @@ -33,8 +33,7 @@ public class SiteWords { Set commonSiteWords = new HashSet<>(10); commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, - EdgePageWordFlags.Subjects, - EdgePageWordFlags.TfIdfHigh)); + EdgePageWordFlags.Subjects)); commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, EdgePageWordFlags.Title)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java index bcd2d505..cabb6454 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/CommonKeywordExtractor.java @@ -11,7 +11,7 @@ public class CommonKeywordExtractor { private static final int MIN_REQUIRED_DOCUMENTS = 25; - private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100; + private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15; private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25; private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java index 5e893725..45611e08 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/LinkParser.java @@ -126,6 +126,9 @@ public class LinkParser { if (doesUrlStringHaveProtocol(s)) { return s; } + else if (s.startsWith("//")) { // 
scheme-relative URL + return baseUrl.proto + ":" + s; + } String[] parts = paramSeparatorPattern.split(s, 2); String path = parts[0]; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java index b23ee5dc..cb7aa8f9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawledDomainReader.java @@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.crawling; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; +import jdkoverride.LargeLineBufferedReader; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; -import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; @@ -19,61 +19,41 @@ import java.util.concurrent.TimeUnit; public class CrawledDomainReader { private final Gson gson = GsonFactory.get(); - private final ForkJoinPool pool = new ForkJoinPool(4); + private final ForkJoinPool pool = new ForkJoinPool(6); public CrawledDomainReader() { } public CrawledDomain read(Path path) throws IOException { - List docs = new ArrayList<>(); - CrawledDomain domain = null; + DomainDataAssembler domainData = new DomainDataAssembler(); + try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("//")) { + String identifier = line; + String data = br.readLine(); - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) { - br.mark(2); - boolean legacy = '{' == br.read(); - br.reset(); - - if (legacy) { - domain = gson.fromJson(br, 
CrawledDomain.class); - } - else { - String line; - while ((line = br.readLine()) != null) { - if (line.startsWith("//")) { - String nextLine = br.readLine(); - if (nextLine == null) break; - - if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - domain = gson.fromJson(nextLine, CrawledDomain.class); - } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - pool.execute(() -> { - var doc = gson.fromJson(nextLine, CrawledDocument.class); - synchronized (docs) { - docs.add(doc); - } - }); - } - } else if (line.charAt(0) == '{') { - domain = gson.fromJson(line, CrawledDomain.class); - } + pool.execute(() -> deserializeLine(identifier, data, domainData)); } } } - pool.awaitQuiescence(10, TimeUnit.SECONDS); + while (!pool.awaitQuiescence(1, TimeUnit.SECONDS)); - if (domain == null) { - return null; + return domainData.assemble(); + } + + + private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) { + if (null == data) { + return; } - - if (!docs.isEmpty()) { - if (domain.doc == null) - domain.doc = new ArrayList<>(); - - domain.doc.addAll(docs); + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class)); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class)); } - return domain; } public CrawledDomain readRuntimeExcept(Path path) { @@ -84,4 +64,27 @@ public class CrawledDomainReader { throw new RuntimeException(ex); } } + + private static class DomainDataAssembler { + private CrawledDomain domainPrototype; + private final List docs = new ArrayList<>(); + + public synchronized void acceptDomain(CrawledDomain domain) { + this.domainPrototype = domain; + } + + public synchronized void acceptDoc(CrawledDocument doc) { + docs.add(doc); + } + + public synchronized CrawledDomain assemble() { + if (!docs.isEmpty()) { + if (domainPrototype.doc == null) + domainPrototype.doc = new 
ArrayList<>(); + + domainPrototype.doc.addAll(docs); + } + return domainPrototype; + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index c8bfbb11..1a00e161 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -24,7 +24,7 @@ public class UrlBlocklist { patterns.add(s -> s.contains("-download-free")); // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling - patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)")); + patterns.add(this::hashTest); // link farms &c patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")); @@ -38,6 +38,33 @@ public class UrlBlocklist { } + public boolean hashTest(String path) { + // look for strings might be a git hash (i.e. 
long hexadecimal strings) + // there is no good guard for a regular expression for this so hand-rolling this + // is necessary + + int runLength = 0; + int minLength = 32; + + if (path.length() <= minLength + 2) + return false; + + for (int i = 0; i < path.length(); i++) { + int c = path.charAt(i); + + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) { + runLength++; + } + else if (runLength >= minLength) { + return true; + } + else { + runLength = 0; + } + } + return runLength >= minLength; + } + public boolean isUrlBlocked(EdgeUrl url) { try { if (badDomains.contains(url.domain.domain)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java index d43315a0..497f323f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java @@ -1,6 +1,8 @@ package nu.marginalia.wmsa.edge.crawling.model; import lombok.Builder; +import nu.marginalia.util.bigstring.BigString; +import nu.marginalia.util.bigstring.CompressedBigString; @Builder public class CrawledDocument implements SerializableCrawlData { @@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData { public String crawlerStatusDesc; public String headers; - public String documentBody; - + public BigString documentBody; public String documentBodyHash; public String canonicalUrl; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 5e60ec3a..f950e831 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -206,7 +206,7 @@ 
public class CrawlerRetreiver { if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody); + doc.documentBodyHash = createHash(doc.documentBody.decode()); Optional parsedDoc = parseDoc(doc); EdgeUrl url = new EdgeUrl(doc.url); @@ -251,7 +251,7 @@ public class CrawlerRetreiver { private Optional parseDoc(CrawledDocument doc) { if (doc.documentBody == null) return Optional.empty(); - return Optional.of(Jsoup.parse(doc.documentBody)); + return Optional.of(Jsoup.parse(doc.documentBody.decode())); } public boolean isSameDomain(EdgeUrl url) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index d215d66e..4532156f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.ToString; +import nu.marginalia.util.bigstring.BigString; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; @@ -271,7 +272,7 @@ public class HttpFetcher { .canonicalUrl(canonical) .httpStatus(rsp.code()) .url(responseUrl.toString()) - .documentBody(strData) + .documentBody(BigString.encode(strData)) .build(); } @@ -325,7 +326,7 @@ public class HttpFetcher { private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { return robotsParser.parseContent(doc.url, - doc.documentBody.getBytes(StandardCharsets.UTF_8), + doc.documentBody.getBytes(), doc.contentType, userAgent); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index 980d0d32..87f65926 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import java.io.IOException; @@ -9,14 +10,16 @@ import java.io.IOException; public class EdgeIndexControl { private final IndexServicesFactory servicesFactory; + private final EdgeIndexSearchSetsService searchSetsService; @Inject - public EdgeIndexControl(IndexServicesFactory servicesFactory) { + public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) { this.servicesFactory = servicesFactory; + this.searchSetsService = searchSetsService; } public void regenerateIndex() throws IOException { - servicesFactory.convertIndex(); + servicesFactory.convertIndex(searchSetsService.getDomainRankings()); System.gc(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java index 986f1874..361a7d47 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java @@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule { public void configure() { - if (Boolean.getBoolean("small-ram")) { - bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27); - } - else { - bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); - } - } @Provides diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index d069ec05..dcc00e34 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,11 +5,11 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.DictionaryHashMap; -import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.SearchIndex; import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader; import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter; @@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters; import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +34,6 @@ import java.util.concurrent.Callable; @Singleton public class IndexServicesFactory { private final Path tmpFileDir; - private final EdgeDomainBlacklist domainBlacklist; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -48,7 +48,6 @@ public class IndexServicesFactory { private final PartitionedDataFile revPrioIndexWords; private volatile static KeywordLexicon keywordLexicon; - private final Long 
dictionaryHashMapSize; private final Path searchSetsBase; @@ -59,14 +58,10 @@ public class IndexServicesFactory { public IndexServicesFactory( @Named("tmp-file-dir") Path tmpFileDir, @Named("partition-root-slow") Path partitionRootSlow, - @Named("partition-root-fast") Path partitionRootFast, - @Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize, - EdgeDomainBlacklist domainBlacklist + @Named("partition-root-fast") Path partitionRootFast ) throws IOException { this.tmpFileDir = tmpFileDir; - this.dictionaryHashMapSize = dictionaryHashMapSize; - this.domainBlacklist = domainBlacklist; this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat"); @@ -98,7 +93,7 @@ public class IndexServicesFactory { public KeywordLexicon getKeywordLexicon() { if (keywordLexicon == null) { final var journal = new KeywordLexiconJournal(keywordLexiconFile.get()); - keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize)); + keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create()); } return keywordLexicon; } @@ -109,15 +104,15 @@ public class IndexServicesFactory { } - public void convertIndex() throws IOException { - convertForwardIndex(); - convertFullReverseIndex(); - convertPriorityReverseIndex(); + public void convertIndex(DomainRankings domainRankings) throws IOException { + convertForwardIndex(domainRankings); + convertFullReverseIndex(domainRankings); + convertPriorityReverseIndex(domainRankings); } - private void convertFullReverseIndex() throws IOException { + private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException { logger.info("Converting full reverse index"); @@ -125,6 +120,7 @@ public class IndexServicesFactory { var journalReader = new SearchIndexJournalReaderSingleFile(longArray); var converter = new ReverseIndexConverter(tmpFileDir, journalReader, + domainRankings, 
revIndexWords.get(NEXT_PART).toPath(), revIndexDoc.get(NEXT_PART).toPath()); @@ -133,7 +129,7 @@ public class IndexServicesFactory { tryGc(); } - private void convertPriorityReverseIndex() throws IOException { + private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException { logger.info("Converting priority reverse index"); @@ -143,6 +139,7 @@ public class IndexServicesFactory { var converter = new ReverseIndexConverter(tmpFileDir, journalReader, + domainRankings, revPrioIndexWords.get(NEXT_PART).toPath(), revPrioIndexDoc.get(NEXT_PART).toPath()); @@ -151,13 +148,14 @@ public class IndexServicesFactory { tryGc(); } - private void convertForwardIndex() throws IOException { + private void convertForwardIndex(DomainRankings domainRankings) throws IOException { logger.info("Converting forward index data"); - new ForwardIndexConverter(tmpFileDir, + new ForwardIndexConverter( writerIndexFile.get(0), fwdIndexDocId.get(NEXT_PART).toPath(), - fwdIndexDocData.get(NEXT_PART).toPath()) + fwdIndexDocData.get(NEXT_PART).toPath(), + domainRankings) .convert(); tryGc(); @@ -215,8 +213,8 @@ public class IndexServicesFactory { } } - public SearchIndex createIndexBucket() { - return new SearchIndex(this, new EdgeIndexControl(this)); + public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) { + return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService)); } public SearchIndexReader getSearchIndexReader() throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java index 00e518e3..585c9a14 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java @@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client; import 
com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; @@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { @Inject public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException { - long hashMapSize = 1L << 31; - - if (Boolean.getBoolean("small-ram")) { - hashMapSize = 1L << 27; - } var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); - lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize)); + lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create()); indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile()); } @@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { String word = words[i]; long id = lexicon.getOrInsert(word); - if (id != DictionaryHashMap.NO_VALUE) { + if (id != OffHeapDictionaryHashMap.NO_VALUE) { ids[putIdx++] = id; ids[putIdx++] = meta[i]; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 27514b58..5f02cb98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; -import 
nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import org.slf4j.Logger; @@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable { private int getOrInsert(byte[] bytes) { if (bytes.length >= Byte.MAX_VALUE) { logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length); - return DictionaryHashMap.NO_VALUE; + return DictionaryMap.NO_VALUE; } final long key = hashFunction.hashBytes(bytes).padToLong(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java index 4847d9fc..4331131f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadata.java @@ -8,7 +8,8 @@ import java.util.Set; import static java.lang.Math.max; import static java.lang.Math.min; -public record EdgePageDocumentsMetadata(int encSize, +public record EdgePageDocumentsMetadata(int rank, + int encSize, int topology, int year, int sets, @@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize, byte flags) { + public static final long RANK_MASK = 0xFFL; + public static final int RANK_SHIFT = 48; + public static final long ENCSIZE_MASK = 0xFFL; - public static final int ENCSIZE_SHIFT = 48; + public static final int ENCSIZE_SHIFT = 40; public static final int ENCSIZE_MULTIPLIER = 50; + public static final long TOPOLOGY_MASK = 0xFFL; public static final int TOPOLOGY_SHIFT = 32; @@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize, this(defaultValue()); } public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet flags) { - this(0, topology, year, sets, quality, encodeFlags(flags)); + this(0, 0, topology, year, 
sets, quality, encodeFlags(flags)); } public EdgePageDocumentsMetadata withSize(int size) { @@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize, final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER)); - return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags); + return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags); } private static byte encodeFlags(Set flags) { @@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize, } public EdgePageDocumentsMetadata(long value) { - this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK), + this( (int) ((value >>> RANK_SHIFT) & RANK_MASK), + (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK), (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK), (int) ((value >>> YEAR_SHIFT) & YEAR_MASK), (int) ((value >>> SETS_SHIFT) & SETS_MASK), @@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize, ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT; ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT; ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT; + ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT; return ret; } public boolean isEmpty() { - return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0; + return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0; } public static int decodeQuality(long encoded) { @@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize, return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK); } + public static int decodeRank(long encoded) { + return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK); + } + public static long encodeRank(long encoded, int rank) { + return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java new file mode 100644 index 00000000..dce78343 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java @@ -0,0 +1,4 @@ +package nu.marginalia.wmsa.edge.index.model; + +public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java index 7f3632c1..d8682a61 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java @@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model; public enum QueryStrategy { SENTENCE, TOPIC, + + REQUIRE_FIELD_SITE, + REQUIRE_FIELD_TITLE, + REQUIRE_FIELD_SUBJECT, + AUTO } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java new file mode 100644 index 00000000..d6ddcd62 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/DomainRankings.java @@ -0,0 +1,43 @@ +package nu.marginalia.wmsa.edge.index.postings; + +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public class DomainRankings { + private final Int2ShortOpenHashMap rankings; + + private final int MAX_MEANINGFUL_RANK = 50_000; + private final int MAX_RANK_VALUE = 255; + private final int MIN_RANK_VALUE = 1; + private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK; + + public DomainRankings() { + rankings = new 
Int2ShortOpenHashMap(); + } + public DomainRankings(Int2IntOpenHashMap values) { + rankings = new Int2ShortOpenHashMap(values.size()); + values.forEach(this::putRanking); + } + + private void putRanking(int domainId, int value) { + rankings.put(domainId, scaleRank(value)); + } + + private short scaleRank(int value) { + double rankScaled = RANK_SCALING_FACTOR * value; + return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled)); + } + + public int getRanking(int domainId) { + return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE); + } + + public int size() { + return rankings.size(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java index d94661e8..da2e92f8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java @@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; @@ -17,6 +19,7 @@ import java.util.Objects; public class IndexResultValuator { private final IndexMetadataService metadataService; private final List> searchTermVariants; + private final IndexQueryParams queryParams; private final int[] termIdsAll; private final TLongHashSet resultsWithPriorityTerms; @@ -24,9 +27,10 @@ public class IndexResultValuator { private final 
TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); private final TermMetadata termMetadata; - public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries) { + public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries, IndexQueryParams queryParams) { this.metadataService = new IndexMetadataService(indexes); this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + this.queryParams = queryParams; var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader()); IntArrayList termIdsList = new IntArrayList(); @@ -114,10 +118,15 @@ public class IndexResultValuator { docMetadata, resultsWithPriorityTerms.contains(searchResult.combinedId) ); + searchResult.scores.add(score); setScore += score.termValue(); + if (!filterRequired(metadata, queryParams.queryStrategy())) { + setScore += 1000; + } + if (termIdx == 0) { setScore += score.documentValue(); } @@ -130,6 +139,19 @@ public class IndexResultValuator { return setScore/setSize; } + private boolean filterRequired(long metadata, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { + return EdgePageWordFlags.Site.isPresent(metadata); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { + return EdgePageWordFlags.Subjects.isPresent(metadata); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { + return EdgePageWordFlags.Title.isPresent(metadata); + } + return true; + } + private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { long maskDirectGenerous = ~0; long maskDirectRaw = ~0; @@ -139,6 +161,9 @@ public class IndexResultValuator { | EdgePageWordFlags.Subjects.asBit() | EdgePageWordFlags.Synthetic.asBit(); + int termCount = 0; + double tfIdfSum = 1.; + for (String term : termList) { var meta = termMetadata.getTermMetadata(termToId.get(term), urlId); long positions; @@ -156,18 
+181,22 @@ public class IndexResultValuator { maskDirectGenerous &= positions; } + termCount++; + tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta); } + double avgTfIdf = termCount / tfIdfSum; + if (maskAdjacent == 0) { - return 40; + return Math.max(-2, 40 - 0.5 * avgTfIdf); } if (maskDirectGenerous == 0) { - return 20; + return Math.max(-1, 20 - 0.3 * avgTfIdf); } if (maskDirectRaw == 0) { - return 2; + return Math.max(-1, 15 - 0.2 * avgTfIdf); } return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java index 6d70fab6..72c0e13f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java @@ -92,7 +92,8 @@ public class SearchIndex { SearchIndexReader.IndexQueryBuilder query = switch(params.queryStrategy()) { case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes); - case TOPIC -> indexReader.findWordAsTopic(orderedIncludes); + case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT + -> indexReader.findWordAsTopic(orderedIncludes); case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes); }; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java index a1475af3..42e7e32f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndexControl.java @@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import 
nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,13 +27,14 @@ public class SearchIndexControl { @Inject public SearchIndexControl(IndexServicesFactory servicesFactory, - EdgeOpsLockService opsLockService) { + EdgeOpsLockService opsLockService, + EdgeIndexSearchSetsService searchSetsService) { this.servicesFactory = servicesFactory; this.primaryIndexWriter = servicesFactory.getIndexWriter(0); this.secondaryIndexWriter = servicesFactory.getIndexWriter(1); - index = servicesFactory.createIndexBucket(); + index = servicesFactory.createIndexBucket(searchSetsService); this.opsLockService = opsLockService; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java index e066f734..8d821c88 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverter.java @@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward; import com.upserve.uppend.blobs.NativeIO; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; import org.roaringbitmap.IntConsumer; @@ -18,26 +20,26 @@ import java.nio.file.Path; 
import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*; public class ForwardIndexConverter { - private static final int RWF_BIN_SIZE = 10_000_000; - private final Path tmpFileDir; private final File inputFile; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Path outputFileDocsId; private final Path outputFileDocsData; + private final DomainRankings domainRankings; - public ForwardIndexConverter(Path tmpFileDir, + public ForwardIndexConverter( File inputFile, Path outputFileDocsId, - Path outputFileDocsData + Path outputFileDocsData, + DomainRankings domainRankings ) { - this.tmpFileDir = tmpFileDir; this.inputFile = inputFile; this.outputFileDocsId = outputFileDocsId; this.outputFileDocsData = outputFileDocsData; + this.domainRankings = domainRankings; } public void convert() throws IOException { @@ -50,6 +52,8 @@ public class ForwardIndexConverter { logger.info("Converting {} {}",inputFile, journalReader.fileHeader); + logger.info("Domain Rankings size = {}", domainRankings.size()); + try { LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); @@ -68,7 +72,10 @@ public class ForwardIndexConverter { journalReader.forEach(entry -> { long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId()); - docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta()); + int ranking = domainRankings.getRanking(entry.domainId()); + long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking); + + docFileData.set(entryOffset + METADATA_OFFSET, meta); docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId()); }); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java index 7b4c66ff..c4080574 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexReader.java @@ -29,20 +29,30 @@ public class ForwardIndexReader { logger.info("Switching forward index"); + ids = loadIds(idsFile); + data = loadData(dataFile); + } + + private static TLongIntHashMap loadIds(Path idsFile) throws IOException { var idsArray = LongArray.mmapRead(idsFile); idsArray.advice(NativeIO.Advice.Sequential); - ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1); + var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1); // This hash table should be of the same size as the number of documents, so typically less than 1 Gb idsArray.forEach(0, idsArray.size(), (pos, val) -> { ids.put(val, (int) pos); }); - data = LongArray.mmapRead(dataFile); + return ids; + } + private static LongArray loadData(Path dataFile) throws IOException { + var data = LongArray.mmapRead(dataFile); data.advice(NativeIO.Advice.Random); + + return data; } private int idxForDoc(long docId) { @@ -55,6 +65,7 @@ public class ForwardIndexReader { return data.get(ENTRY_SIZE * offset + METADATA_OFFSET); } + public int getDomainId(long docId) { long offset = idxForDoc(docId); if (offset < 0) return 0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java index a3c30bab..81f671c5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/forward/ParamMatchingQueryFilter.java @@ -16,7 +16,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { @Override public boolean test(long docId) { - var post = forwardIndexReader.docPost(docId); + var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL); if (!validateDomain(post)) { return false; @@ -33,6 +33,11 @@ 
public class ParamMatchingQueryFilter implements QueryFilterStepIf { if (!validateSize(post)) { return false; } + + if (!validateRank(post)) { + return false; + } + return true; } @@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { return limit.test(quality); } + private boolean validateYear(ForwardIndexReader.DocPost post) { if (params.year().type() == SpecificationLimitType.NONE) return true; @@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { return params.size().test(postVal); } + private boolean validateRank(ForwardIndexReader.DocPost post) { + if (params.rank().type() == SpecificationLimitType.NONE) + return true; + + int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta()); + + return params.rank().test(postVal); + } + @Override public double cost() { return 32; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java index f53a1a90..2cf79112 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverter.java @@ -8,6 +8,7 @@ import nu.marginalia.util.array.functional.LongBinaryIOOperation; import nu.marginalia.util.array.functional.LongIOTransformer; import nu.marginalia.util.array.functional.LongTransformer; import nu.marginalia.util.btree.BTreeWriter; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader; @@ -32,18 +33,22 @@ public class ReverseIndexConverter { private final Logger logger = 
LoggerFactory.getLogger(getClass()); private final SearchIndexJournalReaderSingleFile journalReader; + private final DomainRankings domainRankings; private final Path outputFileWords; private final Path outputFileDocs; - + private final SortingContext sortingContext; public ReverseIndexConverter(Path tmpFileDir, SearchIndexJournalReaderSingleFile journalReader, + DomainRankings domainRankings, Path outputFileWords, Path outputFileDocs) { this.tmpFileDir = tmpFileDir; this.journalReader = journalReader; + this.domainRankings = domainRankings; this.outputFileWords = outputFileWords; this.outputFileDocs = outputFileDocs; + this.sortingContext = new SortingContext(tmpFileDir, 64_000); } public void convert() throws IOException { @@ -56,7 +61,7 @@ public class ReverseIndexConverter { final SearchIndexJournalStatistics statistics = journalReader.getStatistics(); final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); - SortingContext sortingContext = new SortingContext(tmpFileDir, 64_000); + try { final long wordsFileSize = statistics.highestWord() + 1; @@ -187,7 +192,7 @@ public class ReverseIndexConverter { } } - private static class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer, AutoCloseable { + private class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer, AutoCloseable { private final LongArray wordRangeEnds; private final IntArray wordRangeOffset; @@ -205,12 +210,26 @@ public class ReverseIndexConverter { @Override public void accept(long docId, SearchIndexJournalEntry.Record record) { - final long urlId = docId & 0xFFFF_FFFFL; - final int wordId = record.wordId(); + /* Encode the ID as + * + * 32 bits 32 bits + * [ ranking | url-id ] + * + * in order to get low-ranking documents to be considered first + * when sorting the items. 
+ */ + + int domainId = (int) (docId >>> 32); + long rankingId = (long) domainRankings.getRanking(domainId) << 32; + + int urlId = (int) (docId & 0xFFFF_FFFFL); + long rankEncodedId = rankingId | urlId; + + final int wordId = record.wordId(); long offset = startOfRange(wordId); - documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), urlId); + documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId); documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java index 26a39bcf..b2ce74dc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexPrioReader.java @@ -47,18 +47,6 @@ public class ReverseIndexPrioReader { return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER); } - public int numDocuments(int wordId) { - if (wordId < 0) - return 0; - - long offset = words.get(wordId); - - if (offset < 0) - return 0; - - return createReaderNew(offset).numEntries(); - } - private BTreeReader createReaderNew(long offset) { return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java index 5679c5be..6f4475e7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexReader.java @@ -53,6 +53,11 @@ public class ReverseIndexReader 
{ } public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) { + if (null == words) { + logger.warn("Reverse index is not ready, dropping query"); + return new EmptyEntrySource(); + } + if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource(); long offset = words.get(wordId); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java index 298e6c01..031410fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/query/IndexQueryParams.java @@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; public record IndexQueryParams(SpecificationLimit qualityLimit, SpecificationLimit year, SpecificationLimit size, + SpecificationLimit rank, SearchSet searchSet, QueryStrategy queryStrategy ) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java similarity index 67% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java index ca6f7b62..2e8589e4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingAlgorithm.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/RankingAlgorithm.java @@ -1,21 +1,21 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.IntArrays; -import it.unimi.dsi.fastutil.ints.IntComparator; -import org.roaringbitmap.RoaringBitmap; 
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Arrays; -import java.util.Comparator; import java.util.HashSet; import java.util.Set; -import java.util.function.IntToDoubleFunction; -import java.util.stream.IntStream; +import java.util.function.Supplier; + +import static java.lang.Math.min; public abstract class RankingAlgorithm { protected final TIntObjectHashMap domainsById = new TIntObjectHashMap<>(); @@ -85,6 +85,10 @@ public abstract class RankingAlgorithm { logger.info("Origin Domains: {}", originDomainIds.size()); } + public RankingDomainData getDomainData(int id) { + return domainsById.get(id); + } + public void addPeripheralNodes() { int newNodesIdxCutoff = domainIdToIndex.size(); @@ -133,29 +137,7 @@ public abstract class RankingAlgorithm { return domainsById.size(); } - - public RankVector pageRankVector() { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - return rank; - } - - - public RoaringBitmap pageRank(int resultCount) { + public T pageRank(int resultCount, Supplier> accumulatorP) { RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -174,10 +156,10 @@ public abstract class RankingAlgorithm { } - return rank.getRanking(resultCount); + return rank.getRanking(resultCount, accumulatorP).get(); } - public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) { + public T pageRankWithPeripheralNodes(int resultCount, Supplier> accumulatorP) { 
RankVector rank = new RankVector(1.d / domainsById.size()); int iter_max = 100; @@ -201,32 +183,11 @@ public abstract class RankingAlgorithm { logger.info("PRWPN iteration done"); - return rank.getRanking(resultCount); + return rank.getRanking(resultCount, accumulatorP).get(); } abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm); - public TIntList pageRank(IntToDoubleFunction weight, int resultCount) { - RankVector rank = new RankVector(1.d / domainsById.size()); - - int iter_max = 100; - for (int i = 0; i < iter_max; i++) { - RankVector newRank = createNewRankVector(rank); - - double oldNorm = rank.norm(); - double newNorm = newRank.norm(); - double dNorm = oldNorm - newNorm ; - - if (i < iter_max-1) { - adjustRankVector(newRank, dNorm, oldNorm); - } - - rank = newRank; - } - - return rank.getRanking(weight, resultCount); - } - abstract RankVector createNewRankVector(RankVector rank); public boolean includeInRanking(RankingDomainData data) { @@ -245,9 +206,9 @@ public abstract class RankingAlgorithm { public void setMaxKnownUrls(int maxKnownUrls) { this.maxKnownUrls = maxKnownUrls; } - public class RankVector { private final double[] rank; + public RankVector(double defaultValue) { rank = new double[domainIndexToId.size()]; if (defaultValue != 0.) 
{ @@ -271,9 +232,8 @@ public abstract class RankingAlgorithm { public double norm() { double v = 0.; - for (int i = 0; i < rank.length; i++) { - if (rank[i] > 0) { v+=rank[i]; } - else { v -= rank[i]; } + for (double value : rank) { + v += Math.abs(value); } return v; } @@ -281,74 +241,39 @@ public abstract class RankingAlgorithm { public double norm(RankVector other) { double v = 0.; for (int i = 0; i < rank.length; i++) { - double dv = rank[i] - other.get(i); - - if (dv > 0) { v+=dv; } - else { v -= dv; } + v += Math.abs(rank[i] - other.get(i)); } return v; } - public TIntList getRanking(IntToDoubleFunction other, int numResults) { - TIntArrayList list = new TIntArrayList(numResults); + public RankingResultAccumulator getRanking(int numResults, Supplier> accumulatorP) { - Comparator comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i])); - - IntStream.range(0, rank.length) - .boxed() - .sorted(comparator.reversed()) - .map(domainIndexToId::get) - .limit(numResults) - .forEach(list::add); - - return list; - } - - public RoaringBitmap getRanking(int numResults) { if (numResults < 0) { numResults = domainIdToIndex.size(); } - if (numResults >= rank.length) { - numResults = rank.length; - } + numResults = min(numResults, min(domainIdToIndex.size(), rank.length)); - RoaringBitmap list = new RoaringBitmap(); + int[] nodes = sortOrder(rank); + var accumulator = accumulatorP.get(); - int[] nodes = new int[rank.length]; - Arrays.setAll(nodes, i->i); - IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]); - IntArrays.quickSort(nodes, comp); - - int i; - - for (i = 0; i < numResults; i++) { + for (int i = 0; i < numResults; i++) { int id = domainIndexToId.get(nodes[i]); if (includeInRanking(domainsById.get(id))) - list.add(id); + accumulator.add(id, i); } - for (; i < nodes.length && domainsById.size() < numResults; i++) { - int id = domainIndexToId.get(nodes[i]); + return accumulator; + } + private static 
int[] sortOrder(double[] values) { - if (includeInRanking(domainsById.get(id))) - list.add(id); - } + int[] ret = new int[values.length]; + Arrays.setAll(ret, i->i); + IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i])); - - return list; + return ret; } - - public void incrementAll(double v) { - for (int i = 0; i < rank.length; i++) { - rank[i]+=v; - } - } - - int size() { - return domainsById.size(); - } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java similarity index 79% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java index 7d3b17c4..0c202958 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterReversePageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/ReversePageRank.java @@ -1,10 +1,12 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -public class BetterReversePageRank extends RankingAlgorithm { +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; + +public class ReversePageRank extends RankingAlgorithm { - public BetterReversePageRank(RankingDomainFetcher domains, String... origins) { + public ReversePageRank(RankingDomainFetcher domains, String... 
origins) { super(domains, origins); } @@ -20,8 +22,6 @@ public class BetterReversePageRank extends RankingAlgorithm { double newRankValue = 0; if (links != null && links.size() > 0) { - - for (int j = 0; j < links.size(); j++) { var revLinks = linkDataDest2Src[links.getQuick(j)]; newRankValue += rank.get(links.getQuick(j)) / revLinks.size(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java index f1f9b0b1..d9302fd6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/BetterStandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/StandardPageRank.java @@ -1,9 +1,11 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking; -public class BetterStandardPageRank extends RankingAlgorithm { +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; - public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) { +public class StandardPageRank extends RankingAlgorithm { + + public StandardPageRank(RankingDomainFetcher domains, String... 
origins) { super(domains, origins); } @@ -38,8 +40,7 @@ public class BetterStandardPageRank extends RankingAlgorithm { @Override void adjustRankVector(RankVector vector, double dNorm, double oldNorm) { - originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ )); -// vector.incrementAll(0.14*dNorm/vector.size()); + originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() )); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java new file mode 100644 index 00000000..fea37b00 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultAccumulator.java @@ -0,0 +1,6 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +public interface RankingResultAccumulator { + void add(int domainId, int rank); + T get(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java new file mode 100644 index 00000000..26e72522 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultBitSetAccumulator.java @@ -0,0 +1,17 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import org.roaringbitmap.RoaringBitmap; + +public class RankingResultBitSetAccumulator implements RankingResultAccumulator { + private final RoaringBitmap result = new RoaringBitmap(); + + @Override + public void add(int domainId, int rank) { + result.add(domainId); + } + + @Override + public RoaringBitmap get() { + return result; + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java new file mode 100644 index 00000000..653806ed --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultHashMapAccumulator.java @@ -0,0 +1,21 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; + +public class RankingResultHashMapAccumulator implements RankingResultAccumulator { + private final Int2IntOpenHashMap result; + + public RankingResultHashMapAccumulator(int size) { + result = new Int2IntOpenHashMap(size); + } + + @Override + public void add(int domainId, int rank) { + result.put(domainId, rank); + } + + @Override + public Int2IntOpenHashMap get() { + return result; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java new file mode 100644 index 00000000..663483e4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/accumulator/RankingResultListAccumulator.java @@ -0,0 +1,24 @@ +package nu.marginalia.wmsa.edge.index.ranking.accumulator; + +import gnu.trove.list.array.TIntArrayList; + +public class RankingResultListAccumulator implements RankingResultAccumulator { + private final TIntArrayList result; + + public RankingResultListAccumulator(int size) { + result = new TIntArrayList(size); + } + public RankingResultListAccumulator() { + result = new TIntArrayList(10_000); + } + + @Override + public void add(int domainId, int rank) { + result.add(domainId); + } + + @Override + public TIntArrayList get() { + return result; + } +} diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java similarity index 87% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java index 2a4b0f65..4a59daf4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainData.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking.data; import lombok.AllArgsConstructor; import lombok.Data; @@ -10,7 +10,7 @@ public class RankingDomainData { public final int id; public final String name; private int alias; - private EdgeDomainIndexingState state; + public EdgeDomainIndexingState state; public final int knownUrls; public int resolveAlias() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java similarity index 84% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java index 1c2e6849..ff2b7e18 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/RankingDomainFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcher.java @@ -1,6 +1,7 @@ -package nu.marginalia.util.ranking; +package nu.marginalia.wmsa.edge.index.ranking.data; import com.google.inject.Inject; +import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import 
nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; @@ -11,12 +12,13 @@ import java.sql.SQLException; import java.util.function.Consumer; import java.util.function.IntConsumer; +@Singleton public class RankingDomainFetcher { - private final HikariDataSource dataSource; - private final EdgeDomainBlacklistImpl blacklist; - private final Logger logger = LoggerFactory.getLogger(getClass()); + protected final HikariDataSource dataSource; + protected final EdgeDomainBlacklistImpl blacklist; + protected final Logger logger = LoggerFactory.getLogger(getClass()); - private final boolean getNames = false; + protected boolean getNames = false; @Inject public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { @@ -24,6 +26,10 @@ public class RankingDomainFetcher { this.blacklist = blacklist; } + public void retainNames() { + this.getNames = true; + } + public void getDomains(Consumer consumer) { String query; if (getNames) { @@ -49,14 +55,19 @@ public class RankingDomainFetcher { getDomains(query, consumer); } - private void getDomains(String query, Consumer consumer) { + protected void getDomains(String query, Consumer consumer) { try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) { stmt.setFetchSize(10000); var rsp = stmt.executeQuery(); while (rsp.next()) { int id = rsp.getInt(1); if (!blacklist.isBlacklisted(id)) { - consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5))); + consumer.accept( + new RankingDomainData(id, + rsp.getString(2), + rsp.getInt(3), + EdgeDomainIndexingState.valueOf(rsp.getString(4)), + rsp.getInt(5))); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java new file mode 100644 index 
00000000..dddaeebb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/data/RankingDomainFetcherForSimilarityData.java @@ -0,0 +1,103 @@ +package nu.marginalia.wmsa.edge.index.ranking.data; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.function.Consumer; + +@Singleton +public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher { + final boolean hasData; + + @Inject + public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) { + super(dataSource, blacklist); + + hasData = isDomainNeighborTablePopulated(dataSource); + } + + private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) { + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement(); + var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) { + + return rs.next(); + } + catch (SQLException ex) { + LoggerFactory + .getLogger(RankingDomainFetcherForSimilarityData.class) + .error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex); + return false; + } + } + public boolean hasData() { + return hasData; + } + + public void eachDomainLink(DomainLinkConsumer consumer) { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2")) + { + stmt.setFetchSize(10000); + + var rsp = stmt.executeQuery(); + + while (rsp.next()) { + int src = rsp.getInt(1); + int dst = rsp.getInt(2); + + // these "links" are bidi + consumer.accept(src, dst); + consumer.accept(dst, src); + } + } + catch (SQLException ex) { + logger.error("Failed to fetch domain links", ex); + } + } + + public void getDomains(Consumer consumer) { +// String query = +// """ 
+// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) +// FROM EC_DOMAIN +// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID +// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID +// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID +// GROUP BY EC_DOMAIN.ID +// HAVING COUNT(SOURCE_DOMAIN_ID)>5 +// """; + + String query; + if (getNames) { + query = + """ + SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + GROUP BY EC_DOMAIN.ID + """; + } + else { + query = + """ + SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0) + FROM EC_DOMAIN + INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID + GROUP BY EC_DOMAIN.ID + """; + } + + getDomains(query, consumer); + } + + + public void getPeripheralDomains(Consumer consumer) { + // This is not relevant for this variant of pagerank since it is bidirectional + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java similarity index 99% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java index 02823563..59fcda0d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/OldReversePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/OldReversePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.old; +package nu.marginalia.wmsa.edge.index.ranking.old; import com.zaxxer.hikari.HikariDataSource; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java similarity index 98% rename from 
marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java index 74bef70a..cd58f7be 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/old/StandardPageRank.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/old/StandardPageRank.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.old; +package nu.marginalia.wmsa.edge.index.ranking.old; import com.zaxxer.hikari.HikariDataSource; @@ -125,7 +125,6 @@ public class StandardPageRank { final TIntArrayList empty = new TIntArrayList(); - double rankNorm = rank.norm(); RankVector newRank = new RankVector(0); for (DomainData domain : domains.valueCollection()) { @@ -176,8 +175,6 @@ public class StandardPageRank { } }); } - - TIntHashSet deadEnds = new TIntHashSet(domains.size()); } private class RankVector { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java similarity index 54% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java index e251092f..f4cb6197 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/CreateBrowseDomainRanksTool.java @@ -1,43 +1,30 @@ -package nu.marginalia.util.ranking.tool; +package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BuggyStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import 
nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.util.HashSet; -import java.util.Set; import java.util.concurrent.LinkedBlockingQueue; -public class UpdateDomainRanksTool { +public class CreateBrowseDomainRanksTool { - private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class); + private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class); - public Set originDomains = new HashSet<>(); - public Set originDomainIds = new HashSet<>(); - public final long domainIdMax = -1; - public int domainCount; - private volatile static int rankMax; - - public int maxId() { - return (int) domainIdMax; - } - public int domainCount() { - return domainCount; - } static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); volatile static boolean running = true; @SneakyThrows public static void main(String... 
args) { - org.mariadb.jdbc.Driver driver = new Driver(); + Driver driver = new Driver(); var conn = new DatabaseModule().provideConnection(); long start = System.currentTimeMillis(); @@ -45,20 +32,21 @@ public class UpdateDomainRanksTool { logger.info("Ranking"); var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu"); + var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new StandardPageRank(domains, args); - rankMax = spr.size()*2; uploader.start(); - var rankData = spr.pageRankWithPeripheralNodes(rankMax); - for (int i : rankData) { + var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new); + + rankData.forEach(i -> { try { uploadQueue.put(i); } catch (InterruptedException e) { e.printStackTrace(); } - } + return true; + }); long end = System.currentTimeMillis(); running = false; @@ -68,24 +56,14 @@ public class UpdateDomainRanksTool { } public static void uploadThread(HikariDataSource dataSource) { - int i = 0; - try (var conn = dataSource.getConnection()) { - logger.info("Resetting rank"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) { - stmt.executeUpdate(); - } - - logger.info("Updating ranks"); - try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? 
WHERE ID=?")) { + try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) { while (running || (!running && !uploadQueue.isEmpty())) { var job = uploadQueue.take(); - stmt.setDouble(1, i++ / (double) rankMax); - stmt.setInt(2, job); + stmt.setInt(1, job); stmt.executeUpdate(); } } - } catch (SQLException | InterruptedException throwables) { throwables.printStackTrace(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java similarity index 97% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java index 89c1dfb9..4fbdd08b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/PerusePageRankV2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PerusePageRankV2.java @@ -1,4 +1,4 @@ -package nu.marginalia.util.ranking.tool; +package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; @@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays; import it.unimi.dsi.fastutil.ints.IntComparator; import lombok.AllArgsConstructor; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.RankingAlgorithm; -import nu.marginalia.util.ranking.RankingDomainData; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; import org.jetbrains.annotations.NotNull; @@ -33,8 +33,6 @@ public class PerusePageRankV2 { TIntArrayList[] 
linkDataSrc2Dest; TIntArrayList[] linkDataDest2Src; - private static final boolean getNames = true; - private final Logger logger = LoggerFactory.getLogger(getClass()); static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java new file mode 100644 index 00000000..60f12008 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/PrintDomainRanksTool.java @@ -0,0 +1,67 @@ +package nu.marginalia.wmsa.edge.index.ranking.tool; + +import lombok.SneakyThrows; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; +import org.mariadb.jdbc.Driver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; + +public class PrintDomainRanksTool { + + private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class); + + private volatile static int rankMax; + + static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); + volatile static boolean running = true; + + @SneakyThrows + public static void main(String... 
args) { + Driver driver = new Driver(); + var conn = new DatabaseModule().provideConnection(); + + long start = System.currentTimeMillis(); + + logger.info("Ranking"); + var ds = new DatabaseModule().provideConnection(); + + RankingDomainFetcher domains; + if (Boolean.getBoolean("use-link-data")) { + domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); + domains.retainNames(); + } + else { + domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + domains.retainNames(); + } + + var rpr = new StandardPageRank(domains, args); + + rankMax = rpr.size(); + + var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); + + AtomicInteger cnt = new AtomicInteger(); + rankData.forEach(i -> { + + var data = rpr.getDomainData(i); + + System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state); + return true; + }); + + long end = System.currentTimeMillis(); + running = false; + + logger.info("Done in {}", (end - start)/1000.0); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java similarity index 75% rename from marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java index 55f16a5a..714e3028 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/ranking/tool/UpdateDomainRanksTool2.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/ranking/tool/UpdateDomainRanksTool.java @@ -1,11 +1,12 @@ -package nu.marginalia.util.ranking.tool; +package nu.marginalia.wmsa.edge.index.ranking.tool; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BetterReversePageRank; -import 
nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; import org.mariadb.jdbc.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,12 +14,10 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.concurrent.LinkedBlockingQueue; -public class UpdateDomainRanksTool2 { +public class UpdateDomainRanksTool { - private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class); + private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class); - public final long domainIdMax = -1; - public int domainCount; private volatile static int rankMax; static final LinkedBlockingQueue uploadQueue = new LinkedBlockingQueue<>(10); @@ -34,21 +33,22 @@ public class UpdateDomainRanksTool2 { logger.info("Ranking"); var ds = new DatabaseModule().provideConnection(); - var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds)); - var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org"); + var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds)); + var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com"); - var rankVector = rpr.pageRankVector(); rankMax = rpr.size(); uploader.start(); - var rankData = rpr.pageRankWithPeripheralNodes(rankMax); - for (int i : rankData) { + var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new); + + rankData.forEach(i -> { try { uploadQueue.put(i); } catch 
(InterruptedException e) { e.printStackTrace(); } - } + return true; + }); long end = System.currentTimeMillis(); running = false; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java index a51352c1..90ac84a4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import io.prometheus.client.Histogram; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; @@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService { private OptionalInt lookUpWord(String s) { int ret = indexes.getLexiconReader().get(s); - if (ret == DictionaryHashMap.NO_VALUE) { + if (ret == OffHeapDictionaryHashMap.NO_VALUE) { return OptionalInt.empty(); } return OptionalInt.of(ret); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java index 7aa33038..40f7cf64 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.protobuf.InvalidProtocolBufferException; -import 
nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; @@ -51,7 +51,7 @@ public class EdgeIndexLexiconService { final int wordId = lr.get(word); - if (DictionaryHashMap.NO_VALUE == wordId) { + if (OffHeapDictionaryHashMap.NO_VALUE == wordId) { response.status(404); return ""; } @@ -110,7 +110,7 @@ public class EdgeIndexLexiconService { String word = words[i]; long id = keywordLexicon.getOrInsert(word); - if (id != DictionaryHashMap.NO_VALUE) { + if (id != OffHeapDictionaryHashMap.NO_VALUE) { ids[putIdx++] = id; ids[putIdx++] = meta[i]; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index 0b8c08f4..5988df3b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -12,7 +12,7 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms; import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator; @@ -115,11 +115,13 @@ public class EdgeIndexQueryService { TLongHashSet consideredUrlIds; public SearchQuery(EdgeSearchSpecification specsSet) { - this.fetchSize = specsSet.fetchSize; - this.budget = new IndexSearchBudget(specsSet.timeoutMs); + var limits = 
specsSet.queryLimits; + + this.fetchSize = limits.fetchSize(); + this.budget = new IndexSearchBudget(limits.timeoutMs()); this.subqueries = specsSet.subqueries; - this.limitByDomain = specsSet.limitByDomain; - this.limitTotal = specsSet.limitTotal; + this.limitByDomain = limits.resultsByDomain(); + this.limitTotal = limits.resultsTotal(); this.consideredUrlIds = new TLongHashSet(fetchSize * 4); @@ -127,6 +129,7 @@ public class EdgeIndexQueryService { specsSet.quality, specsSet.year, specsSet.size, + specsSet.rank, getSearchSet(specsSet), specsSet.queryStrategy); } @@ -151,7 +154,7 @@ public class EdgeIndexQueryService { } } - final var evaluator = new IndexResultValuator(indexes, results, subqueries); + final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams); ArrayList items = new ArrayList<>(results.size()); ArrayList refusedItems = new ArrayList<>(results.size()); @@ -293,7 +296,7 @@ public class EdgeIndexQueryService { private OptionalInt lookUpWord(String s) { int ret = indexes.getLexiconReader().get(s); - if (ret == DictionaryHashMap.NO_VALUE) { + if (ret == OffHeapDictionaryHashMap.NO_VALUE) { return OptionalInt.empty(); } return OptionalInt.of(ret); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java index a09047eb..834caf67 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexSearchSetsService.java @@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import com.zaxxer.hikari.HikariDataSource; -import gnu.trove.list.TIntList; -import gnu.trove.list.array.TIntArrayList; import lombok.SneakyThrows; -import nu.marginalia.util.ranking.BetterReversePageRank; -import 
nu.marginalia.util.ranking.BetterStandardPageRank; -import nu.marginalia.util.ranking.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank; +import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher; +import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.model.RankingSettings; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; +import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData; import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; -import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,137 +23,47 @@ import java.io.IOException; @Singleton public class EdgeIndexSearchSetsService { - private final HikariDataSource dataSource; - private RankingDomainFetcher rankingDomains; - private final RankingSettings rankingSettings; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final RankingDomainFetcher rankingDomains; + private final RankingDomainFetcher similarityDomains; + private final RankingSettings rankingSettings; - private final SearchSet anySet = new SearchSetAny(); + + // Below are binary indices that are used to constrain a search private volatile RankingSearchSet retroSet; private volatile RankingSearchSet smallWebSet; private volatile RankingSearchSet academiaSet; + private final SearchSet anySet = new SearchSetAny(); + + // The ranking value of the domains used in sorting the domains + private volatile 
DomainRankings domainRankings = new DomainRankings(); @Inject - public EdgeIndexSearchSetsService(HikariDataSource dataSource, - RankingDomainFetcher rankingDomains, + public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains, + RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, IndexServicesFactory servicesFactory) throws IOException { - this.dataSource = dataSource; + this.rankingDomains = rankingDomains; + + if (similarityDomains.hasData()) { + this.similarityDomains = similarityDomains; + } + else { + // on test environments the cosine similarity graph may not be present + logger.info("Domain similarity is not present, falling back on link graph"); + this.similarityDomains = rankingDomains; + } + this.rankingSettings = rankingSettings; smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat")); academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat")); retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat")); - - logger.info("SearchIndexDao ranking settings = {}", rankingSettings); } - public void recalculateAll() { - updateAcademiaDomains(); - updateRetroDomains(); - updateSmallWebDomains(); - } - - @SneakyThrows - public RoaringBitmap goodUrls() { - RoaringBitmap domains = new RoaringBitmap(); - RoaringBitmap urls = new RoaringBitmap(); - - try (var connection = dataSource.getConnection()) { - try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) { - stmt.setFetchSize(10_000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - domains.add(rsp.getInt(1)); - } - } - - // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL - try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED 
AND EC_URL.STATE='OK'")) { - stmt.setFetchSize(10_000); - var rsp = stmt.executeQuery(); - while (rsp.next()) { - if (domains.contains(rsp.getInt(2))) { - urls.add(rsp.getInt(1)); - } - } - } - - } - - return urls; - } - - @SneakyThrows - public void updateRetroDomains() { - var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size() / 2); - - synchronized (this) { - retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); - retroSet.write(); - } - } - - @SneakyThrows - public void updateSmallWebDomains() { - var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new)); - rpr.setMaxKnownUrls(750); - var data = rpr.pageRankWithPeripheralNodes(rpr.size()); - - synchronized (this) { - smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); - smallWebSet.write(); - } - } - - @SneakyThrows - public void updateAcademiaDomains() { - var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new)); - var data = spr.pageRankWithPeripheralNodes(spr.size()/2); - - synchronized (this) { - academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); - academiaSet.write(); - } - } - - @SneakyThrows - public TIntList getStandardDomains() { - TIntArrayList results = new TIntArrayList(); - - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement( - """ - SELECT ID FROM EC_DOMAIN - WHERE INDEXED>0 - AND STATE='ACTIVE' - AND DOMAIN_ALIAS IS NULL - ORDER BY ID ASC - """); - ) { - var rs = stmt.executeQuery(); - while (rs.next()) { - results.add(rs.getInt(1)); - } - } - return results; - - } - - @SneakyThrows - public TIntList getSpecialDomains() { - TIntArrayList results = new TIntArrayList(); - try (var connection = dataSource.getConnection(); - var stmt = connection.prepareStatement("SELECT 
ID FROM EC_DOMAIN WHERE STATE='SPECIAL'") - ) { - var rs = stmt.executeQuery(); - while (rs.next()) { - results.add(rs.getInt(1)); - } - } - return results; + public DomainRankings getDomainRankings() { + return domainRankings; } public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) { @@ -167,4 +77,54 @@ public class EdgeIndexSearchSetsService { case SMALLWEB -> smallWebSet; }; } + + public void recalculateAll() { + updateAcademiaDomainsSet(); + updateRetroDomainsSet(); + updateSmallWebDomainsSet(); + updateDomainRankings(); + } + + private void updateDomainRankings() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new)); + + var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000)); + synchronized (this) { + domainRankings = new DomainRankings(ranks); + } + } + + @SneakyThrows + public void updateRetroDomainsSet() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new); + + synchronized (this) { + retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data); + retroSet.write(); + } + } + + @SneakyThrows + public void updateSmallWebDomainsSet() { + var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new)); + rpr.setMaxKnownUrls(750); + var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new); + + synchronized (this) { + smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data); + smallWebSet.write(); + } + } + + @SneakyThrows + public void updateAcademiaDomainsSet() { + var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new)); + var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new); + + synchronized 
(this) { + academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data); + academiaSet.write(); + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java index ceba0d71..7ce90c73 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSet.java @@ -9,21 +9,37 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +/** A serializable bit map of domains + * + * @see SearchSetIdentifier + * + * */ public class RankingSearchSet implements SearchSet { private final RoaringBitmap set; public final SearchSetIdentifier identifier; public final Path source; + public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) { + this.identifier = identifier; + this.source = source; + this.set = set; + } + public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException { this.identifier = identifier; this.source = source; - set = new RoaringBitmap(); if (!Files.exists(source)) { - return; + set = new RoaringBitmap(); } + else { + set = load(source); + } + } + private static RoaringBitmap load(Path source) throws IOException { + var set = new RoaringBitmap(); try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) { for (;;) { try { @@ -32,12 +48,7 @@ public class RankingSearchSet implements SearchSet { catch (IOException ex) { break; } } } - } - - public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) { - this.identifier = identifier; - this.source = source; - this.set = set; + return set; } @Override @@ -46,7 +57,11 @@ public class RankingSearchSet implements SearchSet { } public void write() throws 
IOException { - try (var ds = new DataOutputStream(Files.newOutputStream(source, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) { + try (var ds = new DataOutputStream(Files.newOutputStream(source, + StandardOpenOption.WRITE, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING))) + { for (var iter = set.getIntIterator(); iter.hasNext();) { ds.writeInt(iter.next()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java index 59ffcad4..040cd32c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SearchSetIdentifier.java @@ -1,5 +1,12 @@ package nu.marginalia.wmsa.edge.index.svc.searchset; +import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile; + +/** Identifies a RankingSearchSet, associated with an EdgeSearchProfile + * + * @see RankingSearchSet + * @see EdgeSearchProfile + * */ public enum SearchSetIdentifier { NONE, RETRO, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java index 0af63fa5..8f1e8e9a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/searchset/SmallSearchSet.java @@ -13,8 +13,8 @@ public class SmallSearchSet implements SearchSet { } @Override - public boolean contains(int urlId) { - return entries.contains(urlId); + public boolean contains(int domainId) { + return entries.contains(domainId); } public String toString() { diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java index 141a3904..3c8bda78 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow; import com.google.inject.Inject; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java index 5f4d206c..67c51751 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java @@ -1,7 +1,7 @@ package nu.marginalia.wmsa.edge.integration.wikipedia; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java index e5b0526c..79e65476 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeDomain.java @@ -11,8 +11,6 @@ import java.util.regex.Pattern; @Getter @Setter @Builder public class EdgeDomain { - private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); - private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); @Nonnull public final String subDomain; @@ -27,7 +25,7 @@ public class EdgeDomain { var dot = host.lastIndexOf('.'); - if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.> + if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> subDomain = ""; domain = host; } @@ -38,7 +36,7 @@ public class EdgeDomain { domain = host; } else { - if (govListTest.test(host)) + if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk int dot3 = host.substring(0, dot2).lastIndexOf('.'); if (dot3 >= 0) { @@ -59,6 +57,35 @@ public class EdgeDomain { } } + private static final Predicate govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); + private boolean looksLikeGovTld(String host) { + if (host.length() < 8) + return false; + int cnt = 0; + for (int i = host.length() - 7; i < host.length(); i++) { + if (host.charAt(i) == '.') + cnt++; + } + return cnt >= 2 && govListTest.test(host); + } + + + private static final Predicate ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate(); + + private boolean looksLikeAnIp(String host) { + if (host.length() < 7) + return false; + + char firstChar = host.charAt(0); + int lastChar = host.charAt(host.length() - 1); + + return Character.isDigit(firstChar) + && 
Character.isDigit(lastChar) + && ipPatternTest.test(host); + } + + + public EdgeUrl toRootUrl() { // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http return new EdgeUrl("http", this, null, "/", null); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java index 98bf9444..6d97192c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultKeywordScore.java @@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set, sum += 20; } + int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13; + if (rank < 0) + sum += rank / 2; + else + sum += rank / 4; return sum; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index a1289a42..84f133d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; @@ -9,23 +10,18 @@ import java.util.List; @ToString @Getter @Builder @With @AllArgsConstructor public class EdgeSearchSpecification { - public List subqueries; public List domains; public SearchSetIdentifier searchSetIdentifier; - public final int limitByDomain; - 
public final int limitTotal; - public final String humanQuery; - public final int timeoutMs; - public final int fetchSize; - public final SpecificationLimit quality; public final SpecificationLimit year; public final SpecificationLimit size; + public final SpecificationLimit rank; + public final QueryLimits queryLimits; public final QueryStrategy queryStrategy; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 6cf8a050..952e0fb2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; @@ -84,6 +85,8 @@ public class QueryFactory { List problems = new ArrayList<>(); String domain = null; + QueryStrategy queryStrategy = QueryStrategy.AUTO; + var basicQuery = queryParser.parse(query); if (basicQuery.size() >= 8) { @@ -94,6 +97,7 @@ public class QueryFactory { SpecificationLimit qualityLimit = profile.getQualityLimit(); SpecificationLimit year = profile.getYearLimit(); SpecificationLimit size = profile.getSizeLimit(); + SpecificationLimit rank = SpecificationLimit.none(); for (Token t : basicQuery) { if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { @@ -113,6 +117,12 @@ public class QueryFactory { if (t.type == TokenType.SIZE_TERM) { size = parseSpecificationLimit(t.str); } + if (t.type == 
TokenType.RANK_TERM) { + rank = parseSpecificationLimit(t.str); + } + if (t.type == TokenType.QS_TERM) { + queryStrategy = parseQueryStrategy(t.str); + } } var queryPermutations = queryParser.permuteQueriesNew(basicQuery); @@ -148,6 +158,8 @@ public class QueryFactory { case QUALITY_TERM: case YEAR_TERM: case SIZE_TERM: + case RANK_TERM: + case QS_TERM: break; // case NEAR_TERM: near = t.str; @@ -179,25 +191,25 @@ public class QueryFactory { } } + int domainLimit; + if (domain != null) { + domainLimit = 100; + } else { + domainLimit = 2; + } + EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder() .subqueries(subqueries) - .limitTotal(100) + .queryLimits(new QueryLimits(domainLimit, 100, 250, 4096)) .humanQuery(query) - .timeoutMs(250) - .fetchSize(4096) .quality(qualityLimit) .year(year) .size(size) + .rank(rank) .domains(domains) - .queryStrategy(QueryStrategy.AUTO) + .queryStrategy(queryStrategy) .searchSetIdentifier(profile.searchSetIdentifier); - if (domain != null) { - specsBuilder = specsBuilder.limitByDomain(100); - } else { - specsBuilder = specsBuilder.limitByDomain(2); - } - EdgeSearchSpecification specs = specsBuilder.build(); return new EdgeSearchQuery(specs, searchTermsHuman, domain); @@ -210,10 +222,10 @@ public class QueryFactory { if (startChar == '=') { return SpecificationLimit.equals(val); } - else if (startChar == '<'){ + else if (startChar == '<') { return SpecificationLimit.lessThan(val); } - else if (startChar == '>'){ + else if (startChar == '>') { return SpecificationLimit.greaterThan(val); } else { @@ -221,6 +233,17 @@ public class QueryFactory { } } + private QueryStrategy parseQueryStrategy(String str) { + return switch (str.toUpperCase()) { + case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; + case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; + case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; + case "SENTENCE" -> QueryStrategy.SENTENCE; + case "TOPIC" -> 
QueryStrategy.TOPIC; + default -> QueryStrategy.AUTO; + }; + } + private String normalizeDomainName(String str) { return str.toLowerCase(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 354ba0ce..5551a67a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -93,6 +93,10 @@ public class QueryParser { entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { + entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("qs=")) { + entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); } else if (t.str.contains(":")) { entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); } @@ -506,8 +510,11 @@ enum TokenType implements Predicate { QUALITY_TERM, YEAR_TERM, SIZE_TERM, + RANK_TERM, NEAR_TERM, + QS_TERM, + QUOT, MINUS, QMARK, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java index f3e75f91..6d42b599 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java @@ -8,7 +8,7 @@ import lombok.Getter; import lombok.ToString; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; 
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; @@ -25,12 +25,12 @@ public class QueryVariants { private final Logger logger = LoggerFactory.getLogger(getClass()); private final KeywordExtractor keywordExtractor; - private final SentenceExtractor sentenceExtractor; private final TermFrequencyDict dict; private final PorterStemmer ps = new PorterStemmer(); private final NGramBloomFilter nGramBloomFilter; private final EnglishDictionary englishDictionary; + private final ThreadLocal sentenceExtractor; @Inject public QueryVariants(LanguageModels lm, @@ -40,7 +40,7 @@ public class QueryVariants { this.nGramBloomFilter = nGramBloomFilter; this.englishDictionary = englishDictionary; this.keywordExtractor = new KeywordExtractor(); - this.sentenceExtractor = new SentenceExtractor(lm); + this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); this.dict = dict; } @@ -78,10 +78,8 @@ public class QueryVariants { final TreeMap> byStart = new TreeMap<>(); - logger.debug("Q: {}", query); - logger.debug("QAS: {}", joinedQuery); - - var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery); + var se = sentenceExtractor.get(); + var sentence = se.extractSentence(joinedQuery.joinedQuery); for (int i = 0; i < sentence.posTags.length; i++) { if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java index 0c4cffc2..c9a63bc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; @@ -47,11 +48,8 @@ public class EdgeSearchQueryIndexService { .subqueries(sqs) .domains(Collections.emptyList()) .searchSetIdentifier(profile.searchSetIdentifier) - .limitByDomain(limitPerDomain) - .limitTotal(limitTotal) + .queryLimits(new QueryLimits(limitPerDomain, limitTotal, 150, 2048)) .humanQuery("") - .timeoutMs(150) - .fetchSize(2048) .year(SpecificationLimit.none()) .size(SpecificationLimit.none()) .quality(SpecificationLimit.none()) @@ -76,11 +74,13 @@ public class EdgeSearchQueryIndexService { resultList.sort(resultListComparator); - UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); - List retList = new ArrayList<>(processedQuery.specs.limitTotal); + var limits = processedQuery.specs.queryLimits; + + UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); + List retList = new ArrayList<>(limits.resultsTotal()); for (var item : resultList) { - if (retList.size() >= processedQuery.specs.limitTotal) + if (retList.size() >= limits.resultsTotal()) break; if (!deduplicator.shouldRemove(item)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java index bd3c0429..b97fc27b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java @@ 
-48,7 +48,7 @@ public class AdblockTesterTool { private static void processDocument(CrawledDocument doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody); + Document parsedDocument = Jsoup.parse(doc.documentBody.decode()); if (simulator.hasAds(parsedDocument)) { System.out.println(doc.url); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java index 5288eac1..78d90ccb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.tools; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.converting.ConverterModule; import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor; @@ -63,7 +63,7 @@ public class ConverterLogicTestTool { if (doc.documentBody == null) continue; Runnable task = () -> { - var parsed = Jsoup.parse(doc.documentBody); + var parsed = Jsoup.parse(doc.documentBody.decode()); parsed.body().filter(new DomPruningFilter(0.5)); var dld = se.extractSentences(parsed); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java index b0e86d7f..cbe59e60 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java @@ -84,7 +84,7 @@ public class CrawlDataExtractorTool { private static void processDocument(CrawledDocument 
doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody); + Document parsedDocument = Jsoup.parse(doc.documentBody.decode()); if (abs.hasAds(parsedDocument)) { System.out.println(doc.url); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java index 004a677d..fb3d0e9e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/RendererService.java @@ -1,22 +1,15 @@ package nu.marginalia.wmsa.renderer; -import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.resource_store.ResourceStoreClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class RendererService extends Service { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Gson gson = GsonFactory.get(); - private final ResourceStoreClient resourceStoreClient; @@ -24,7 +17,6 @@ public class RendererService extends Service { public RendererService(ResourceStoreClient resourceStoreClient, @Named("service-host") String ip, @Named("service-port") Integer port, - SmhiRendererService smhiRendererService, PodcastRendererService podcastRendererService, StatusRendererService statusRendererService, Initialization initialization, @@ -34,7 +26,6 @@ public class RendererService extends Service { this.resourceStoreClient = resourceStoreClient; - smhiRendererService.start(); podcastRendererService.start(); statusRendererService.start(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/SmhiRendererService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/SmhiRendererService.java deleted file mode 100644 index 56e4ff07..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/SmhiRendererService.java +++ /dev/null @@ -1,82 +0,0 @@ -package nu.marginalia.wmsa.renderer; - -import com.google.gson.Gson; -import com.google.inject.Inject; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.client.GsonFactory; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; -import nu.marginalia.wmsa.renderer.mustache.RendererFactory; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiIndexReq; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiPrognosReq; -import nu.marginalia.wmsa.resource_store.ResourceStoreClient; -import nu.marginalia.wmsa.resource_store.model.RenderedResource; -import nu.marginalia.wmsa.smhi.model.PrognosData; -import nu.marginalia.wmsa.smhi.model.index.IndexPlatser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.Request; -import spark.Response; -import spark.Spark; - -import java.time.LocalDateTime; -import java.util.concurrent.TimeUnit; - -public class SmhiRendererService { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Gson gson = GsonFactory.get(); - - private final RendererFactory rendererFactory = new RendererFactory(); - - private final MustacheRenderer indexRenderer; - private final MustacheRenderer prognosRenderer; - - private final ResourceStoreClient resourceStoreClient; - - - @Inject @SneakyThrows - public SmhiRendererService(ResourceStoreClient resourceStoreClient) { - this.resourceStoreClient = resourceStoreClient; - indexRenderer = rendererFactory.renderer( "smhi/index"); - prognosRenderer = rendererFactory.renderer( "smhi/prognos"); - } - - public void start() { - Spark.post("/render/smhi/index", this::renderSmhiIndex); - Spark.post("/render/smhi/prognos", 
this::renderSmhiPrognos); - } - - - private Object renderSmhiIndex(Request request, Response response) { - var requestText = request.body(); - var req = gson.fromJson(requestText, RenderSmhiIndexReq.class); - - logger.info("renderSmhiIndex()"); - var resource = new RenderedResource("index.html", - LocalDateTime.MAX, - indexRenderer.render(new IndexPlatser(req.platser))); - - resourceStoreClient.putResource(Context.fromRequest(request), "smhi", resource) - .timeout(10, TimeUnit.SECONDS) - .blockingSubscribe(); - - return ""; - } - - private Object renderSmhiPrognos(Request request, Response response) { - var requestText = request.body(); - var req = gson.fromJson(requestText, RenderSmhiPrognosReq.class); - - logger.info("renderSmhiPrognos({})", req.data.plats.namn); - var resource = new RenderedResource(req.data.plats.getUrl(), - LocalDateTime.now().plusHours(3), - prognosRenderer.render(req.data)); - - resourceStoreClient.putResource(Context.fromRequest(request), "smhi", resource) - .timeout(10, TimeUnit.SECONDS) - .blockingSubscribe(); - - return ""; - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java index e398f8b7..63537e3b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/client/RendererClient.java @@ -11,8 +11,6 @@ import nu.marginalia.wmsa.podcasts.model.Podcast; import nu.marginalia.wmsa.podcasts.model.PodcastEpisode; import nu.marginalia.wmsa.podcasts.model.PodcastListing; import nu.marginalia.wmsa.podcasts.model.PodcastNewEpisodes; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiIndexReq; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiPrognosReq; import javax.inject.Inject; import java.util.concurrent.TimeUnit; @@ -24,19 +22,6 @@ public class RendererClient extends AbstractDynamicClient{ 
super(ServiceDescriptor.RENDERER); } - @SneakyThrows - public Observable render(Context ctx, RenderSmhiPrognosReq req) { - return post(ctx, "/render/smhi/prognos", req) - .timeout(5, TimeUnit.SECONDS, Observable.error(new TimeoutException("RendererClient.renderSmhiPrognos()"))); - } - - - @SneakyThrows - public Observable render(Context ctx, RenderSmhiIndexReq req) { - return post(ctx, "/render/smhi/index", req) - .timeout(5, TimeUnit.SECONDS, Observable.error(new TimeoutException("RendererClient.renderSmhiIndex()"))); - } - @SneakyThrows public Observable render(Context ctx, PodcastNewEpisodes req) { return post(ctx, "/render/podcast/new", req) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiIndexReq.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiIndexReq.java deleted file mode 100644 index d585d56f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiIndexReq.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.renderer.request.smhi; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import nu.marginalia.wmsa.smhi.model.Plats; - -import java.util.List; - -@NoArgsConstructor @AllArgsConstructor @Getter -public class RenderSmhiIndexReq { - public List platser; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiPrognosReq.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiPrognosReq.java deleted file mode 100644 index ba1746db..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/renderer/request/smhi/RenderSmhiPrognosReq.java +++ /dev/null @@ -1,11 +0,0 @@ -package nu.marginalia.wmsa.renderer.request.smhi; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import nu.marginalia.wmsa.smhi.model.PrognosData; - -@NoArgsConstructor @AllArgsConstructor @Getter 
-public class RenderSmhiPrognosReq { - public PrognosData data; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/SmhiScraperService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/SmhiScraperService.java deleted file mode 100644 index 2b074efb..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/SmhiScraperService.java +++ /dev/null @@ -1,79 +0,0 @@ -package nu.marginalia.wmsa.smhi; - -import com.google.inject.Inject; -import com.google.inject.name.Named; -import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.configuration.server.MetricsServer; -import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.renderer.client.RendererClient; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiIndexReq; -import nu.marginalia.wmsa.renderer.request.smhi.RenderSmhiPrognosReq; -import nu.marginalia.wmsa.smhi.model.Plats; -import nu.marginalia.wmsa.smhi.model.PrognosData; -import nu.marginalia.wmsa.smhi.scraper.crawler.SmhiCrawler; -import nu.marginalia.wmsa.smhi.scraper.crawler.entity.SmhiEntityStore; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.Spark; - -import java.util.Comparator; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -public class SmhiScraperService extends Service { - - private final SmhiCrawler crawler; - private final SmhiEntityStore entityStore; - private final RendererClient rendererClient; - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Initialization initialization; - @Inject - public SmhiScraperService(@Named("service-host") String ip, - @Named("service-port") Integer port, - SmhiCrawler crawler, - SmhiEntityStore entityStore, - RendererClient rendererClient, - Initialization initialization, - MetricsServer metricsServer) { - super(ip, port, 
initialization, metricsServer); - this.crawler = crawler; - this.entityStore = entityStore; - this.rendererClient = rendererClient; - this.initialization = initialization; - - Spark.awaitInitialization(); - - Schedulers.newThread().scheduleDirect(this::start); - } - - private void start() { - initialization.waitReady(); - rendererClient.waitReady(); - - entityStore.platser.debounce(6, TimeUnit.SECONDS) - .subscribe(this::updateIndex); - entityStore.prognosdata.subscribe(this::updatePrognos); - - crawler.start(); - } - - private void updatePrognos(PrognosData prognosData) { - rendererClient - .render(Context.internal(), new RenderSmhiPrognosReq(prognosData)) - .timeout(30, TimeUnit.SECONDS) - .blockingSubscribe(); - } - - private void updateIndex(Plats unused) { - var platser = entityStore.platser().stream() - .sorted(Comparator.comparing(plats -> plats.namn)) - .collect(Collectors.toList()); - - rendererClient - .render(Context.internal(), new RenderSmhiIndexReq(platser)) - .timeout(30, TimeUnit.SECONDS) - .blockingSubscribe(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Parameter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Parameter.java deleted file mode 100644 index 012e9c24..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Parameter.java +++ /dev/null @@ -1,9 +0,0 @@ -package nu.marginalia.wmsa.smhi.model; - -public class Parameter { - public String name; - public String levelType; - public String level; - public String unit; - public String[] values; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Plats.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Plats.java deleted file mode 100644 index 7ae39675..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Plats.java +++ /dev/null @@ -1,42 +0,0 @@ -package nu.marginalia.wmsa.smhi.model; - -import lombok.Getter; -import org.apache.commons.lang3.builder.EqualsBuilder; -import 
org.apache.commons.lang3.builder.HashCodeBuilder; - -@Getter -public class Plats { - public final String namn; - public final double latitud; - public final double longitud; - - public String getUrl() { - return namn.toLowerCase()+".html"; - } - - public Plats(String namn, String latitud, String longitud) { - this.namn = namn; - this.longitud = Double.parseDouble(longitud); - this.latitud = Double.parseDouble(latitud); - } - - public String toString() { - return String.format("Plats[%s %s %s]", namn, longitud, latitud); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - - if (o == null || getClass() != o.getClass()) return false; - - Plats plats = (Plats) o; - - return new EqualsBuilder().append(latitud, plats.latitud).append(longitud, plats.longitud).append(namn, plats.namn).isEquals(); - } - - @Override - public int hashCode() { - return new HashCodeBuilder(17, 37).append(namn).append(latitud).append(longitud).toHashCode(); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Platser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Platser.java deleted file mode 100644 index c0f7c15f..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Platser.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.wmsa.smhi.model; - - -import java.util.List; - -public class Platser { - private final List platser; - - public Platser(List platser) { - this.platser = platser; - } - - public List getPlatser() { - return platser; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/PrognosData.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/PrognosData.java deleted file mode 100644 index 1bb1be02..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/PrognosData.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.wmsa.smhi.model; - -import nu.marginalia.wmsa.smhi.model.dyn.Dygnsdata; - -import java.time.LocalDateTime; 
-import java.time.ZoneId; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; - -public class PrognosData { - - public final String crawlTime = LocalDateTime.now().toString(); - - public String approvedTime; - public String referenceTime; - public String expires; - - public Plats plats; - - public final List timeSeries = new ArrayList<>(); - - public String getBastFore() { - return LocalDateTime.parse(crawlTime).atZone(ZoneId.of("Europe/Stockholm")) - .plusHours(3) - .format(DateTimeFormatter.ISO_TIME); - } - public Plats getPlats() { - return plats; - } - - public List getTidpunkter() { - return timeSeries; - } - public List getDygn() { - return timeSeries.stream().map(Tidpunkt::getDate).distinct() - .map(datum -> new Dygnsdata(datum, this)) - .collect(Collectors.toList()); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Tidpunkt.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Tidpunkt.java deleted file mode 100644 index a4616df2..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/Tidpunkt.java +++ /dev/null @@ -1,75 +0,0 @@ -package nu.marginalia.wmsa.smhi.model; - -import java.time.ZoneId; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeFormatterBuilder; -import java.time.temporal.ChronoField; -import java.util.ArrayList; -import java.util.List; - -public class Tidpunkt { - - private static final ZoneId serverZoneId = ZoneId.of("GMT"); - private static final ZoneId localZoneId = ZoneId.of("Europe/Stockholm"); - private static final DateTimeFormatter timeFormatter = (new DateTimeFormatterBuilder()) - .appendValue(ChronoField.HOUR_OF_DAY, 2) - .appendLiteral(':') - .appendValue(ChronoField.MINUTE_OF_HOUR, 2) - .toFormatter(); - - public String validTime; - - public final List parameters = new ArrayList<>(); - - - private String getParam(String name) { - 
var data = parameters.stream().filter(p -> name.equals(p.name)).map(p->p.values).findFirst().orElseGet(() -> new String[0]); - if (data.length > 0) { - return data[0]; - } - return null; - } - public String getDate() { - return ZonedDateTime.parse(validTime).toLocalDateTime().atZone(serverZoneId).toOffsetDateTime().atZoneSameInstant(localZoneId).format(DateTimeFormatter.ISO_LOCAL_DATE); - } - - public String getTime() { - return ZonedDateTime.parse(validTime).toLocalDateTime().atZone(serverZoneId).toOffsetDateTime().atZoneSameInstant(localZoneId).format(timeFormatter); - } - - public String getTemp() { - return getParam("t"); - } - public String getMoln() { - return getParam("tcc_mean"); - } - public String getVind() { - return getParam("ws"); - } - public String getByvind() { - return getParam("gust"); - } - public String getNederbord() { - return getParam("pmedian"); - } - public String getNederbordTyp() { - switch(getParam("pcat")) { - case "1": return "S"; - case "2": return "SB"; - case "3": return "R"; - case "4": return "D"; - case "5": return "UKR"; - case "6": return "UKD"; - default: - return ""; - - } - } - public String getVindRiktning() { - return getParam("wd"); - } - public String toString() { - return String.format("Tidpunkt[%s %s]", validTime, getTemp()); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/dyn/Dygnsdata.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/dyn/Dygnsdata.java deleted file mode 100644 index 05f2246a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/dyn/Dygnsdata.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.wmsa.smhi.model.dyn; - -import nu.marginalia.wmsa.smhi.model.PrognosData; -import nu.marginalia.wmsa.smhi.model.Tidpunkt; - -import java.time.LocalDate; -import java.time.format.DateTimeFormatter; -import java.util.List; -import java.util.stream.Collectors; - -public class Dygnsdata { - public final String date; - private final PrognosData 
data; - - public Dygnsdata(String date, PrognosData data) { - this.date = date; - this.data = data; - } - - public String getDate() { - return date; - } - public List getData() { - String d = getDate(); - return data.timeSeries.stream().filter(p -> d.equals(p.getDate())).collect(Collectors.toList()); - } - - public String getVeckodag() { - switch (LocalDate.parse(date, DateTimeFormatter.ISO_LOCAL_DATE).getDayOfWeek()) { - case MONDAY: return "Måndag"; - case TUESDAY: return "Tisdag"; - case WEDNESDAY: return "Onsdag"; - case THURSDAY: return "Torsdag"; - case FRIDAY: return "Fredag"; - case SATURDAY: return "Lördag"; - case SUNDAY: return "Söndag"; - } - return "Annandag"; - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlats.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlats.java deleted file mode 100644 index 5e3f3a19..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlats.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.wmsa.smhi.model.index; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import nu.marginalia.wmsa.smhi.model.Plats; - -import java.util.List; - -@Getter @AllArgsConstructor -public class IndexPlats { - String nyckel; - List platser; -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlatser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlatser.java deleted file mode 100644 index 7300bcc5..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/model/index/IndexPlatser.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.wmsa.smhi.model.index; - -import lombok.Getter; -import nu.marginalia.wmsa.smhi.model.Plats; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -@Getter -public class IndexPlatser { - final List platserPerNyckel = new ArrayList<>(); - - public 
IndexPlatser(List platser) { - var platsMap = kategoriseraEfterNyckel(platser); - - platsMap.keySet().stream().sorted() - .forEach(p -> platserPerNyckel.add(new IndexPlats(p, platsMap.get(p)))); - } - - private Map> kategoriseraEfterNyckel(List platser) { - return platser.stream().collect( - Collectors.groupingBy(p -> - p.namn.substring(0, 1) - .toUpperCase())); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/PlatsReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/PlatsReader.java deleted file mode 100644 index 3ea0d8cc..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/PlatsReader.java +++ /dev/null @@ -1,44 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.google.inject.name.Named; -import com.opencsv.CSVReader; -import nu.marginalia.wmsa.smhi.model.Plats; - -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; - -@Singleton -public class PlatsReader { - private final String fileName; - - @Inject - public PlatsReader(@Named("plats-csv-file") String fileName) { - this.fileName = fileName; - } - - public List readPlatser() throws Exception { - List platser = new ArrayList<>(); - - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(fileName), - "Kunde inte ladda " + fileName); - try (var reader = new CSVReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) { - for (;;) { - String[] strings = reader.readNext(); - if (strings == null) { - return platser; - } - platser.add(skapaPlats(strings)); - } - } - - } - - private Plats skapaPlats(String[] strings) { - return new Plats(strings[0], strings[1], strings[2]); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperMain.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperMain.java deleted file mode 100644 index d3edbf14..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperMain.java +++ /dev/null @@ -1,32 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper; - -import com.google.inject.Guice; -import com.google.inject.Inject; -import com.google.inject.Injector; -import nu.marginalia.wmsa.configuration.MainClass; -import nu.marginalia.wmsa.configuration.ServiceDescriptor; -import nu.marginalia.wmsa.configuration.module.ConfigurationModule; -import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.smhi.SmhiScraperService; - -import java.io.IOException; - -public class SmhiScraperMain extends MainClass { - private final SmhiScraperService service; - - @Inject - public SmhiScraperMain(SmhiScraperService service) { - this.service = service; - } - - public static void main(String... args) { - init(ServiceDescriptor.SMHI_SCRAPER, args); - - Injector injector = Guice.createInjector( - new SmhiScraperModule(), - new ConfigurationModule()); - injector.getInstance(SmhiScraperMain.class); - injector.getInstance(Initialization.class).setReady(); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperModule.java deleted file mode 100644 index ffb1793a..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/SmhiScraperModule.java +++ /dev/null @@ -1,12 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper; - -import com.google.inject.AbstractModule; -import com.google.inject.name.Names; - -public class SmhiScraperModule extends AbstractModule { - public void configure() { - bind(String.class).annotatedWith(Names.named("plats-csv-file")).toInstance("data/smhi/stader.csv"); - bind(String.class).annotatedWith(Names.named("smhi-user-agent")).toInstance("kontakt@marginalia.nu"); - } 
- -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApi.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApi.java deleted file mode 100644 index 9880e317..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApi.java +++ /dev/null @@ -1,88 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper.crawler; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import com.google.inject.name.Named; -import nu.marginalia.wmsa.smhi.model.Plats; -import org.apache.http.Header; -import org.apache.http.HttpHost; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.conn.routing.HttpRoute; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.Arrays; -import java.util.Locale; - -@Singleton -public class SmhiBackendApi { - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final String server = "https://opendata-download-metfcst.smhi.se/api"; - private final PoolingHttpClientConnectionManager connectionManager; - private final String userAgent; - - @Inject - public SmhiBackendApi(@Named("smhi-user-agent") String userAgent) { - this.userAgent = userAgent; - - connectionManager = new PoolingHttpClientConnectionManager(); - connectionManager.setMaxTotal(200); - connectionManager.setDefaultMaxPerRoute(20); - HttpHost host = new HttpHost("https://opendata-download-metfcst.smhi.se"); - connectionManager.setMaxPerRoute(new HttpRoute(host), 50); - } - - public SmhiApiRespons hamtaData(Plats plats) throws Exception { - var client = HttpClients.custom() - .setConnectionManager(connectionManager) - .build(); - - String url = String.format(Locale.US, 
"%s/category/pmp3g/version/2/geotype/point/lon/%f/lat/%f/data.json", - server, plats.longitud, plats.latitud); - - Thread.sleep(100); - - logger.info("Fetching {} - {}", plats, url); - - HttpGet get = new HttpGet(url); - get.addHeader("User-Agent", userAgent); - - try (var rsp = client.execute(get)) { - var entity = rsp.getEntity(); - String content = new String(entity.getContent().readAllBytes()); - int statusCode = rsp.getStatusLine().getStatusCode(); - - var expires = - Arrays.stream(rsp.getHeaders("Expires")) - .map(Header::getValue) - .map(DateTimeFormatter.RFC_1123_DATE_TIME::parse) - .map(LocalDateTime::from) - .findFirst().map(Object::toString).orElse(""); - - - if (statusCode == 200) { - return new SmhiApiRespons(content, expires, plats); - } - throw new IllegalStateException("Fel i backend " + statusCode + " " + content); - } - - } - -} - -class SmhiApiRespons { - public final String jsonContent; - public final String expiryDate; - public final Plats plats; - - SmhiApiRespons(String jsonContent, String expiryDate, Plats plats) { - this.jsonContent = jsonContent; - this.expiryDate = expiryDate; - this.plats = plats; - } -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiCrawler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiCrawler.java deleted file mode 100644 index c97da68d..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiCrawler.java +++ /dev/null @@ -1,106 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper.crawler; - -import com.google.gson.*; -import com.google.inject.Inject; -import io.reactivex.rxjava3.core.Maybe; -import io.reactivex.rxjava3.core.Observable; -import io.reactivex.rxjava3.disposables.Disposable; -import io.reactivex.rxjava3.schedulers.Schedulers; -import lombok.SneakyThrows; -import nu.marginalia.wmsa.smhi.model.Plats; -import nu.marginalia.wmsa.smhi.model.PrognosData; -import 
nu.marginalia.wmsa.smhi.scraper.PlatsReader; -import nu.marginalia.wmsa.smhi.scraper.crawler.entity.SmhiEntityStore; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.lang.reflect.Type; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.TimeUnit; - -public class SmhiCrawler { - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final Gson gson; - private final SmhiBackendApi api; - private final SmhiEntityStore store; - private final List platser; - private Disposable job; - - @Inject @SneakyThrows - public SmhiCrawler(SmhiBackendApi backendApi, SmhiEntityStore store, PlatsReader platsReader) { - this.api = backendApi; - this.store = store; - this.platser = platsReader.readPlatser(); - - class LocalDateAdapter implements JsonDeserializer { - @Override - public LocalDateTime deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException { - return LocalDateTime - .parse(json.getAsString(), DateTimeFormatter.ISO_ZONED_DATE_TIME); - } - } - - gson = new GsonBuilder() - .registerTypeAdapter(LocalDateTime.class, new LocalDateAdapter()) - .create(); - } - - public void start() { - job = Observable - .fromIterable(new ArrayList<>(platser)) - .subscribeOn(Schedulers.io()) - .filter(this::isNeedsUpdate) - .take(5) - .flatMapMaybe(this::hamtaData) - .repeatWhen(this::repeatDelay) - .doOnError(this::handleError) - .subscribe(store::offer); - } - public void stop() { - Optional.ofNullable(job).ifPresent(Disposable::dispose); - } - - private Observable repeatDelay(Observable completed) { - return completed.delay(1, TimeUnit.SECONDS); - } - - protected void handleError(Throwable throwable) { - logger.error("Caught error", throwable); - } - - public Maybe hamtaData(Plats plats) { - try { - var data = api.hamtaData(plats); - - PrognosData model = 
gson.fromJson(data.jsonContent, PrognosData.class); - - model.expires = data.expiryDate; - model.plats = plats; - - return Maybe.just(model); - } - catch (Exception ex) { - logger.error("Failed to fetch data", ex); - return Maybe.empty(); - } - } - - - boolean isNeedsUpdate(Plats plats) { - var prognos = store.prognos(plats); - - if (null == prognos) { - return true; - } - - LocalDateTime crawlTime = LocalDateTime.parse(prognos.crawlTime); - return crawlTime.plusHours(1).isBefore(LocalDateTime.now()); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/entity/SmhiEntityStore.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/entity/SmhiEntityStore.java deleted file mode 100644 index d2f608aa..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/smhi/scraper/crawler/entity/SmhiEntityStore.java +++ /dev/null @@ -1,62 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper.crawler.entity; - -import com.google.inject.Singleton; -import io.reactivex.rxjava3.subjects.PublishSubject; -import nu.marginalia.wmsa.smhi.model.Plats; -import nu.marginalia.wmsa.smhi.model.PrognosData; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -@Singleton -public class SmhiEntityStore { - private final ReadWriteLock rwl = new ReentrantReadWriteLock(); - private final Map data = new HashMap<>(); - - public final PublishSubject platser = PublishSubject.create(); - public final PublishSubject prognosdata = PublishSubject.create(); - Logger logger = LoggerFactory.getLogger(getClass()); - public boolean offer(PrognosData modell) { - Lock lock = this.rwl.writeLock(); - try { - lock.lock(); - if (data.put(modell.plats, modell) == null) { - platser.onNext(modell.plats); - 
} - prognosdata.onNext(modell); - } - finally { - lock.unlock(); - } - return true; - } - - public List platser() { - Lock lock = this.rwl.readLock(); - try { - lock.lock(); - return new ArrayList<>(data.keySet()); - } - finally { - lock.unlock(); - } - } - - public PrognosData prognos(Plats plats) { - Lock lock = this.rwl.readLock(); - try { - lock.lock(); - return data.get(plats); - } - finally { - lock.unlock(); - } - } -} diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb index 0126478b..dc0d7157 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb @@ -56,6 +56,9 @@ year=2005(beta) The document was ostensibly published in 2005 year<2005(beta) The document was ostensibly published in or before 2005 + rank>50(beta) The ranking of the website is at least 50 in a span of 1 - 255 + rank<50(beta) The ranking of the website is at most 50 in a span of 1 - 255 + format:html5Filter documents using the HTML5 standard. This is typically modern websites. format:xhtmlFilter documents using the XHTML standard format:html123Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. 
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java index d4a7e428..065310f7 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LinkParserTest.java @@ -45,6 +45,7 @@ class LinkParserTest { @Test void testRelative() throws URISyntaxException { + assertEquals("http://search.marginalia.nu/", parseLink("//search.marginalia.nu", "/")); assertEquals("http://www.marginalia.nu/test", parseLink("../test", "/")); assertEquals("http://www.marginalia.nu/test", parseLink("test", "/")); assertEquals("http://www.marginalia.nu/foo/test", parseLink("test", "/foo/index.html")); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index 08dcef4c..344973e4 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -6,15 +6,18 @@ import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.KeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import 
nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; +import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; +import org.apache.commons.lang3.tuple.Pair; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -26,6 +29,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.regex.Pattern; +import java.util.stream.IntStream; @Tag("slow") class SentenceExtractorTest { @@ -38,7 +42,6 @@ class SentenceExtractorTest { newSe = new SentenceExtractor(lm); legacySe = new SentenceExtractor(lm); - legacySe.setLegacyMode(true); } @@ -83,7 +86,7 @@ class SentenceExtractorTest { var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); Map counts = new HashMap<>(); for (var sentence : dld.sentences) { - for (WordSpan kw : keywordExtractor.getNames(sentence)) { + for (WordSpan kw : keywordExtractor.getProperNames(sentence)) { if (kw.end + 2 >= sentence.length()) { continue; } @@ -145,7 +148,22 @@ class SentenceExtractorTest { for (var file : Objects.requireNonNull(data.toFile().listFiles())) { var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath()))); var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata()); - System.out.println(newRes); + + var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i)))) + .sorted(Comparator.comparing(e -> -e.getValue().tfIdf())) + .limit(100) + .map(Pair::getKey) + .toArray(String[]::new); + System.out.println(Arrays.toString(terms)); + + var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i)))) + .sorted(Comparator.comparing(e -> -e.getValue().tfIdf())) + .filter(e -> 
e.getValue().hasFlag(EdgePageWordFlags.Subjects)) + .limit(100) + .map(Pair::getKey) + .toArray(String[]::new); + System.out.println(Arrays.toString(terms2)); + System.out.println("--"); } System.out.println(System.currentTimeMillis() - st); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java index 9c0d9beb..a3552a85 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/model/EdgePageDocumentsMetadataTest.java @@ -3,13 +3,15 @@ package nu.marginalia.wmsa.edge.index.model; import org.junit.jupiter.api.Test; +import java.util.EnumSet; + import static org.junit.jupiter.api.Assertions.assertEquals; class EdgePageDocumentsMetadataTest { @Test public void codecYear() { - var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 192, 0, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(192, decoded.year()); @@ -17,7 +19,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecTopology() { - var meta = new EdgePageDocumentsMetadata(0, 192, 0, 0, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 192, 0, 0, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(192, decoded.topology()); @@ -25,7 +27,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecSets() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 14, 0, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 14, 0, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(14, decoded.sets()); @@ -33,7 +35,7 @@ class EdgePageDocumentsMetadataTest { @Test public 
void codecQuality() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 9, (byte) 0); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 9, (byte) 0); long encoded = meta.encode(); var decoded = new EdgePageDocumentsMetadata(encoded); assertEquals(9, decoded.quality()); @@ -41,7 +43,7 @@ class EdgePageDocumentsMetadataTest { @Test public void codecFlags() { - var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, (byte) 255); + var meta = new EdgePageDocumentsMetadata(0, 0, 0, 0, 0, 0, (byte) 255); long encoded = meta.encode(); System.out.println(Long.toHexString(encoded)); var decoded = new EdgePageDocumentsMetadata(encoded); @@ -57,7 +59,17 @@ class EdgePageDocumentsMetadataTest { assertEquals(50, new EdgePageDocumentsMetadata(0).withSize(4).size()); assertEquals(50, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(4).encode())); - assertEquals(50*255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode())); - assertEquals(50*255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size()); + assertEquals(50 * 255, EdgePageDocumentsMetadata.decodeSize(new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).encode())); + assertEquals(50 * 255, new EdgePageDocumentsMetadata(0).withSize(Integer.MAX_VALUE).size()); + } + + @Test + public void encRank() { + var meta = new EdgePageDocumentsMetadata(5, 22, 3, 8, EnumSet.noneOf(EdgePageDocumentFlags.class)) + .withSize(0xffffffff).encode(); + var enc2 = EdgePageDocumentsMetadata.encodeRank(meta, 83); + + assertEquals(83, EdgePageDocumentsMetadata.decodeRank(enc2)); + assertEquals(5, EdgePageDocumentsMetadata.decodeTopology(enc2)); } } \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java index a4b97a7a..e5652faa 
100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java @@ -2,10 +2,11 @@ package nu.marginalia.wmsa.edge.index.postings.forward; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; @@ -36,7 +37,6 @@ class ForwardIndexConverterTest { private final Logger logger = LoggerFactory.getLogger(getClass()); Path dataDir; - private Path wordsFile; private Path docsFileId; private Path docsFileData; @@ -47,7 +47,7 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -71,7 +71,6 @@ class ForwardIndexConverterTest { var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); - wordsFile = dataDir.resolve("words.dat"); docsFileId = dataDir.resolve("docs-i.dat"); docsFileData = dataDir.resolve("docs-d.dat"); } @@ -104,18 +103,15 @@ class 
ForwardIndexConverterTest { @Test void testForwardIndex() throws IOException { - Path tmpDir = Path.of("/tmp"); - - new ForwardIndexConverter(tmpDir, indexFile.toFile(), docsFileId, docsFileData).convert(); + new ForwardIndexConverter(indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); for (int i = 36; i < workSetSize; i++) { - assertEquals(i % 5, forwardReader.getDocMeta(i)); + assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i)); assertEquals(i/20, forwardReader.getDomainId(i)); } - TestUtil.clearTempDir(dataDir); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java index 4c210c54..32fcb58b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java @@ -3,11 +3,12 @@ package nu.marginalia.wmsa.edge.index.postings.reverse; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import 
nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; @@ -43,7 +44,7 @@ class ReverseIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -86,7 +87,7 @@ class ReverseIndexConverterTest { var docsFile = dataDir.resolve("docs.dat"); var journalReader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)); - new ReverseIndexConverter(tmpDir, journalReader, wordsFile, docsFile) + new ReverseIndexConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile) .convert(); var reverseIndexReader = new ReverseIndexReader(wordsFile, docsFile); @@ -104,17 +105,17 @@ class ReverseIndexConverterTest { var buffer = new LongQueryBuffer(32); reverseIndexReader.documents(keywordLexicon.getReadOnly("1"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); - assertArrayEquals(LongStream.range(1, 17).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); reverseIndexReader.documents(keywordLexicon.getReadOnly("2"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); reverseIndexReader.documents(keywordLexicon.getReadOnly("3"), ReverseIndexEntrySourceBehavior.DO_PREFER).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).toArray(), 
buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java index 6efcbbd3..2525d39b 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java @@ -3,10 +3,11 @@ package nu.marginalia.wmsa.edge.index.postings.reverse; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; @@ -50,7 +51,7 @@ class ReverseIndexConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); @@ -114,7 +115,7 @@ class 
ReverseIndexConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)), wordsFile, docsFile).convert(); + new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile)), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexReader(wordsFile, docsFile); @@ -139,7 +140,7 @@ class ReverseIndexConverterTest2 { Path tmpDir = Path.of("/tmp"); - new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile), null, ReverseIndexPriorityParameters::filterPriorityRecord), wordsFile, docsFile).convert(); + new ReverseIndexConverter(tmpDir, new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(indexFile), null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert(); var reverseReader = new ReverseIndexReader(wordsFile, docsFile); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java index a59e4fd0..0ef29e64 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java @@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.index.service; import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.model.*; import 
nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; @@ -80,14 +77,12 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .queryStrategy(QueryStrategy.SENTENCE) .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .domains(new ArrayList<>()) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new EdgeSearchSubquery( @@ -115,13 +110,11 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) .subqueries(List.of(new EdgeSearchSubquery( @@ -144,13 +137,11 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .quality(SpecificationLimit.none()) .year(SpecificationLimit.equals(1998)) .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new EdgeSearchSubquery( @@ -173,7 +164,7 @@ public class EdgeIndexIntegrationTest { long fullId = id | ((long) (32 - (id % 32)) << 32); - 
var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, id % 5, id, id % 20, (byte) 0).encode()); + var header = new SearchIndexJournalEntryHeader(factors.length, fullId, new EdgePageDocumentsMetadata(0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java index 46d8228c..aaa44c35 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java @@ -4,6 +4,7 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.IndexServicesFactory; +import nu.marginalia.wmsa.edge.index.postings.DomainRankings; import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; @@ -39,15 +40,15 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule { @Override protected void configure() { + System.setProperty("small-ram", "true"); try { bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"), - slowDir, fastDir, - 1L<<24, - null + slowDir, fastDir )); EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class); when(setsServiceMock.getSearchSetByName(SearchSetIdentifier.NONE)).thenReturn(new SearchSetAny()); + when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings()); bind(EdgeIndexSearchSetsService.class).toInstance(setsServiceMock); diff --git 
a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java new file mode 100644 index 00000000..effa7a1f --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/searchset/RankingSearchSetTest.java @@ -0,0 +1,37 @@ +package nu.marginalia.wmsa.edge.index.svc.searchset; + +import org.junit.jupiter.api.Test; +import org.roaringbitmap.RoaringBitmap; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class RankingSearchSetTest { + + @Test + public void testSerDes() throws IOException { + Path p = Files.createTempFile(getClass().getSimpleName(), ".dat"); + + var bm = new RoaringBitmap(); + bm.add(1); + bm.add(5); + bm.add(7); + bm.add(9); + + RankingSearchSet set = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p, bm); + set.write(); + + RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p); + assertTrue(set2.contains(1)); + assertTrue(set2.contains(5)); + assertTrue(set2.contains(7)); + assertTrue(set2.contains(9)); + + Files.delete(p); + + } + +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java index 97826605..e6e146ee 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/arxiv/ArxivParserTest.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.integration.arxiv; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import 
nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.arxiv.model.ArxivMetadata; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java index b0c98dc9..6971d9b7 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java @@ -4,7 +4,7 @@ import nu.marginalia.util.ParallelPipe; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java index a2b60163..40a58e93 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java @@ -6,7 +6,7 @@ import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.DocumentDebugger; import 
nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java index f6d74999..b2477ca8 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.util.language.conf.LanguageModels; -import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.util.language.processing.sentence.SentenceExtractor; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApiTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApiTest.java deleted file mode 100644 index d8b252f9..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/smhi/scraper/crawler/SmhiBackendApiTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.wmsa.smhi.scraper.crawler; - -import nu.marginalia.wmsa.smhi.model.Plats; -import org.junit.jupiter.api.Test; - -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.format.DateTimeFormatter; - -class 
SmhiBackendApiTest { - - @Test - void hamtaData() throws Exception { - var api = new SmhiBackendApi("nu.marginalia"); - - - System.out.println(api.hamtaData(new Plats("Ystad", "55.42966", "13.82041")) - .jsonContent - ); - } - - @Test - public void testDatum() { - System.out.println(LocalDateTime.parse("2021-05-29T14:06:48Z", - DateTimeFormatter.ISO_ZONED_DATE_TIME) - .atZone(ZoneId.of("GMT")) - .toOffsetDateTime() - .atZoneSameInstant(ZoneId.of("Europe/Stockholm")) - ); - } -} \ No newline at end of file diff --git a/third_party/README.md b/third_party/README.md index b72dec53..bd5f5c85 100644 --- a/third_party/README.md +++ b/third_party/README.md @@ -1,11 +1,15 @@ # Third Party Code This is a mix of code from other projects, that has either been aggressively modified to suite the needs of the project, -or lack an artifact. +or lack an artifact, or to override some default that is inappropriate for the type of data Marginalia throws at the library. ## Sources and Licenses + * [RDRPosTagger](https://github.com/datquocnguyen/RDRPOSTagger) - GPL3 * [PorterStemmer](https://github.com/caarmen/porter-stemmer) - LGPL3 * [Uppend](https://github.com/upserve/uppend) - MIT * [OpenZIM](https://github.com/openzim/libzim) - GPL-2.0 -* [XZ for Java](https://tukaani.org/xz/) - Public Domain \ No newline at end of file +* [XZ for Java](https://tukaani.org/xz/) - Public Domain +* [GSON](https://github.com/google/gson) - Apache-2.0 +* Stanford OpenNLP - Apache-2.0 +* OpenJDK - GPL-2.0 (packaged under jdkoverride) \ No newline at end of file diff --git a/third_party/src/main/java/com/github/datquocnguyen/FWObject.java b/third_party/src/main/java/com/github/datquocnguyen/FWObject.java index 4d89465d..9017f23a 100644 --- a/third_party/src/main/java/com/github/datquocnguyen/FWObject.java +++ b/third_party/src/main/java/com/github/datquocnguyen/FWObject.java @@ -36,4 +36,13 @@ public class FWObject context = new String[13]; } } + + public void reset(boolean check) { + if (check) { + 
System.arraycopy(contextPrototype, 0, context, 0, 13); + } + else { + Arrays.fill(context, null); + } + } } diff --git a/third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java b/third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java index e51f44ec..a0bea5b2 100644 --- a/third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java +++ b/third_party/src/main/java/com/github/datquocnguyen/RDRPOSTagger.java @@ -103,8 +103,10 @@ public class RDRPOSTagger var initialTags = InitialTagger.EnInitTagger4Sentence(FREQDICT, sentence); String[] tags = new String[initialTags.length]; + FWObject object = new FWObject(true); + for (int i = 0; i < initialTags.length; i++) { - FWObject object = Utils.getObject(sentence, initialTags, initialTags.length, i); + Utils.getObject(object, sentence, initialTags, initialTags.length, i); tags[i] = findFiredNode(object).conclusion; } diff --git a/third_party/src/main/java/com/github/datquocnguyen/Utils.java b/third_party/src/main/java/com/github/datquocnguyen/Utils.java index e422b5ad..4cd91d58 100644 --- a/third_party/src/main/java/com/github/datquocnguyen/Utils.java +++ b/third_party/src/main/java/com/github/datquocnguyen/Utils.java @@ -123,9 +123,9 @@ public class Utils return condition; } - public static FWObject getObject(String[] words, String[] tags, int size, int index) + public static FWObject getObject(FWObject object, String[] words, String[] tags, int size, int index) { - FWObject object = new FWObject(true); + object.reset(true); if (index > 1) { object.context[4] = words[index-2]; @@ -175,9 +175,7 @@ public class Utils else return ""; } - String conclusion = str.substring(str.indexOf("\"") + 1, - str.length() - 1); - return conclusion; + return str.substring(str.indexOf("\"") + 1, str.length() - 1); } public static void main(String[] args) diff --git a/third_party/src/main/java/com/google/gson/stream/JsonReader.java b/third_party/src/main/java/com/google/gson/stream/JsonReader.java new file 
mode 100644 index 00000000..213feffa --- /dev/null +++ b/third_party/src/main/java/com/google/gson/stream/JsonReader.java @@ -0,0 +1,1637 @@ +/* + * Copyright (C) 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.gson.stream; + +import com.google.gson.internal.JsonReaderInternalAccess; +import com.google.gson.internal.bind.JsonTreeReader; +import java.io.Closeable; +import java.io.EOFException; +import java.io.IOException; +import java.io.Reader; +import java.util.Arrays; + +/** + * Reads a JSON (RFC 7159) + * encoded value as a stream of tokens. This stream includes both literal + * values (strings, numbers, booleans, and nulls) as well as the begin and + * end delimiters of objects and arrays. The tokens are traversed in + * depth-first order, the same order that they appear in the JSON document. + * Within JSON objects, name/value pairs are represented by a single token. + * + *

Parsing JSON

+ * To create a recursive descent parser for your own JSON streams, first create + * an entry point method that creates a {@code JsonReader}. + * + *

Next, create handler methods for each structure in your JSON text. You'll + * need a method for each object type and for each array type. + *

    + *
  • Within array handling methods, first call {@link + * #beginArray} to consume the array's opening bracket. Then create a + * while loop that accumulates values, terminating when {@link #hasNext} + * is false. Finally, read the array's closing bracket by calling {@link + * #endArray}. + *
  • Within object handling methods, first call {@link + * #beginObject} to consume the object's opening brace. Then create a + * while loop that assigns values to local variables based on their name. + * This loop should terminate when {@link #hasNext} is false. Finally, + * read the object's closing brace by calling {@link #endObject}. + *
+ *

When a nested object or array is encountered, delegate to the + * corresponding handler method. + * + *

When an unknown name is encountered, strict parsers should fail with an + * exception. Lenient parsers should call {@link #skipValue()} to recursively + * skip the value's nested tokens, which may otherwise conflict. + * + *

If a value may be null, you should first check using {@link #peek()}. + * Null literals can be consumed using either {@link #nextNull()} or {@link + * #skipValue()}. + * + *

Example

+ * Suppose we'd like to parse a stream of messages such as the following:
 {@code
+ * [
+ *   {
+ *     "id": 912345678901,
+ *     "text": "How do I read a JSON stream in Java?",
+ *     "geo": null,
+ *     "user": {
+ *       "name": "json_newb",
+ *       "followers_count": 41
+ *      }
+ *   },
+ *   {
+ *     "id": 912345678902,
+ *     "text": "@json_newb just use JsonReader!",
+ *     "geo": [50.454722, -104.606667],
+ *     "user": {
+ *       "name": "jesse",
+ *       "followers_count": 2
+ *     }
+ *   }
+ * ]}
+ * This code implements the parser for the above structure:
   {@code
+ *
+ *   public List readJsonStream(InputStream in) throws IOException {
+ *     JsonReader reader = new JsonReader(new InputStreamReader(in, "UTF-8"));
+ *     try {
+ *       return readMessagesArray(reader);
+ *     } finally {
+ *       reader.close();
+ *     }
+ *   }
+ *
+ *   public List readMessagesArray(JsonReader reader) throws IOException {
+ *     List messages = new ArrayList();
+ *
+ *     reader.beginArray();
+ *     while (reader.hasNext()) {
+ *       messages.add(readMessage(reader));
+ *     }
+ *     reader.endArray();
+ *     return messages;
+ *   }
+ *
+ *   public Message readMessage(JsonReader reader) throws IOException {
+ *     long id = -1;
+ *     String text = null;
+ *     User user = null;
+ *     List geo = null;
+ *
+ *     reader.beginObject();
+ *     while (reader.hasNext()) {
+ *       String name = reader.nextName();
+ *       if (name.equals("id")) {
+ *         id = reader.nextLong();
+ *       } else if (name.equals("text")) {
+ *         text = reader.nextString();
+ *       } else if (name.equals("geo") && reader.peek() != JsonToken.NULL) {
+ *         geo = readDoublesArray(reader);
+ *       } else if (name.equals("user")) {
+ *         user = readUser(reader);
+ *       } else {
+ *         reader.skipValue();
+ *       }
+ *     }
+ *     reader.endObject();
+ *     return new Message(id, text, user, geo);
+ *   }
+ *
+ *   public List readDoublesArray(JsonReader reader) throws IOException {
+ *     List doubles = new ArrayList();
+ *
+ *     reader.beginArray();
+ *     while (reader.hasNext()) {
+ *       doubles.add(reader.nextDouble());
+ *     }
+ *     reader.endArray();
+ *     return doubles;
+ *   }
+ *
+ *   public User readUser(JsonReader reader) throws IOException {
+ *     String username = null;
+ *     int followersCount = -1;
+ *
+ *     reader.beginObject();
+ *     while (reader.hasNext()) {
+ *       String name = reader.nextName();
+ *       if (name.equals("name")) {
+ *         username = reader.nextString();
+ *       } else if (name.equals("followers_count")) {
+ *         followersCount = reader.nextInt();
+ *       } else {
+ *         reader.skipValue();
+ *       }
+ *     }
+ *     reader.endObject();
+ *     return new User(username, followersCount);
+ *   }}
+ * + *

Number Handling

+ * This reader permits numeric values to be read as strings and string values to + * be read as numbers. For example, both elements of the JSON array {@code + * [1, "1"]} may be read using either {@link #nextInt} or {@link #nextString}. + * This behavior is intended to prevent lossy numeric conversions: double is + * JavaScript's only numeric type and very large values like {@code + * 9007199254740993} cannot be represented exactly on that platform. To minimize + * precision loss, extremely large values should be written and read as strings + * in JSON. + * + *

Non-Execute Prefix

+ * Web servers that serve private data using JSON may be vulnerable to Cross-site + * request forgery attacks. In such an attack, a malicious site gains access + * to a private JSON file by executing it with an HTML {@code