From 467bf566a994e71737c2ee6f8fbd06c24d1259de Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Jan 2023 19:48:03 +0100 Subject: [PATCH] Hotfixes for 2023-01 release (#137) Co-authored-by: vlofgren Co-authored-by: vlofgren Co-authored-by: Viktor Lofgren Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/137 --- .../util/array/algo/BulkTransferArray.java | 2 + .../util/array/algo/LongArrayBase.java | 1 + .../util/array/page/AbstractPagingArray.java | 19 +++++ .../index/postings/IndexResultValuator.java | 18 +++-- .../reader/SearchIndexJournalCleaner.java | 71 +++++++++++++++++++ .../reader/SearchIndexJournalReadEntry.java | 21 ++++-- .../reader/SearchIndexJournalReader.java | 3 + .../SearchIndexJournalReaderSingleFile.java | 4 ++ .../edge/search/model/EdgeSearchProfile.java | 8 +++ .../wmsa/edge/search/query/QueryFactory.java | 6 +- .../svc/EdgeSearchQueryIndexService.java | 9 --- .../valuation/SearchResultValuator.java | 58 +++++++++------ .../StripSimpleJournalEntriesToolMain.java | 28 ++++++++ .../templates/edge/parts/search-footer.hdb | 6 +- .../templates/edge/parts/search-form.hdb | 1 + 15 files changed, 206 insertions(+), 49 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java index d01d3716..bf0df57d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java @@ -3,4 +3,6 @@ package nu.marginalia.util.array.algo; public interface BulkTransferArray { void set(long start, long end, BufferType buffer, int bufferStart); + + void get(long start, long end, BufferType buffer, int bufferStart); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java index 03f18bcc..508fdf9a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java @@ -47,6 +47,7 @@ public interface LongArrayBase extends BulkTransferArray { set(start+i, buffer.get(i + bufferStart)); } } + default void get(long start, long end, LongBuffer buffer, int bufferStart) { for (int i = 0; i < (end-start); i++) { buffer.put(i + bufferStart, get(start + i)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java index 43a48c16..c772d43e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java @@ -85,4 +85,23 @@ public class AbstractPagingArray, B> { bufferStart += eOff - sOff; } } + + public void get(long start, long end, B buffer, int bufferStart) { + assert end >= start; + + int page = partitioningScheme.getPage(start); + + long endPos; + + for (long pos = start; pos < end; pos = endPos) { + endPos = partitioningScheme.getPageEnd(pos, end); + + int sOff = partitioningScheme.getOffset(pos); + int eOff = partitioningScheme.getEndOffset(start, endPos); + + pages[page++].get(sOff, eOff, buffer, bufferStart); + + bufferStart += eOff - sOff; + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java index 462e4c7a..d94661e8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java @@ -131,7 +131,8 @@ public class IndexResultValuator { } private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { - long maskDirect = ~0; + long maskDirectGenerous = ~0; + long maskDirectRaw = ~0; long maskAdjacent = ~0; final int flagBitMask = EdgePageWordFlags.Title.asBit() @@ -148,21 +149,28 @@ public class IndexResultValuator { positions = EdgePageWordMetadata.decodePositions(meta); - if (!EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { - maskDirect &= positions; + maskDirectRaw &= positions; + + if (positions != 0 && !EdgePageWordMetadata.hasAnyFlags(meta, flagBitMask)) { maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + maskDirectGenerous &= positions; } + } if (maskAdjacent == 0) { return 40; } - if (maskDirect == 0) { + if (maskDirectGenerous == 0) { return 20; } - return Long.numberOfTrailingZeros(maskDirect)/5. - Long.bitCount(maskDirect); + if (maskDirectRaw == 0) { + return 2; + } + + return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java new file mode 100644 index 00000000..8e685a2a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java @@ -0,0 +1,71 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.reader; + +import nu.marginalia.util.array.LongArray; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.LongBuffer; +import java.nio.file.Path; +import java.util.function.Predicate; + +public class SearchIndexJournalCleaner { + private final SearchIndexJournalReader reader; + + public SearchIndexJournalCleaner(SearchIndexJournalReader reader) { + this.reader = reader; + } + + private long dryRunForNewSize(Predicate entryPredicate) { + long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; + + var pt = new ProgressTracker(); + + for (var entry : reader) { + if (entryPredicate.test(entry)) { + pos += entry.totalEntrySizeLongs(); + pt.update(pos); + } + } + + return pos; + } + + public void clean(Path outFile, Predicate entryPredicate) throws IOException { + + System.out.println("Dry run"); + long size = dryRunForNewSize(entryPredicate); + + System.out.println("Copying"); + LongArray outputArray = LongArray.mmapForWriting(outFile, size); + + long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; + var pt = new ProgressTracker(); + + LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer(); + + for (var entry : reader) { + if (entryPredicate.test(entry)) { + pos += entry.copyTo(pos, adequateBuffer, outputArray); + pt.update(pos); + } + } + + outputArray.set(0, pos*8); + outputArray.set(1, reader.fileHeader().wordCount()); + + outputArray.force(); + } +} + +class ProgressTracker { + long stepSize = 100*1024*1024; + long pos = 0; + + public void update(long pos) { + if (this.pos / stepSize != pos / stepSize) { + System.out.printf("%d Mb\n", (800*pos)/stepSize); + } + this.pos = pos; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java index 40c2a433..97cd9e98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEn import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import java.nio.ByteBuffer; +import java.nio.LongBuffer; public class SearchIndexJournalReadEntry { private final long offset; @@ -15,6 +16,7 @@ public class SearchIndexJournalReadEntry { SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) { this.map = map; this.committedSize = committedSize; + final long sizeBlock = this.map.get(offset); final long docId = this.map.get(offset + 1); final long meta = this.map.get(offset + 2); @@ -74,18 +76,23 @@ public class SearchIndexJournalReadEntry { } public long nextId() { - return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize(); + return offset + totalEntrySizeLongs(); } public SearchIndexJournalReadEntry next() { return new SearchIndexJournalReadEntry(nextId(), map, committedSize); } - public void copyToBuffer(ByteBuffer buffer) { - var dest = buffer.asLongBuffer(); - dest.position(buffer.position() * 8); - dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS); - map.get(offset, dest); - buffer.position(dest.limit() * 8); + public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) { + long size = totalEntrySizeLongs(); + + map.get(offset, offset + size, adequateBuffer, 0); + destArray.set(pos, pos + size, adequateBuffer, 0); + + return size; + } + + public long totalEntrySizeLongs() { + return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java index 71811772..a8751f85 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.index.postings.journal.reader; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; import org.jetbrains.annotations.NotNull; @@ -15,6 +16,8 @@ public interface SearchIndexJournalReader extends Iterable resultList = new ArrayList<>(results.size()); - long badQCount = 0; for (var details : resultDecorator.getAllUrlDetails(results)) { - if (details.getUrlQuality() <= -100) { - badQCount++; - continue; - } - details = details.withUrlQualityAdjustment( adjustScoreBasedOnQuery(details, processedQuery.specs)); @@ -85,9 +79,6 @@ public class EdgeSearchQueryIndexService { UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); List retList = new ArrayList<>(processedQuery.specs.limitTotal); - if (badQCount > 0) { - System.out.println(badQCount); - } for (var item : resultList) { if (retList.size() >= processedQuery.specs.limitTotal) break; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java index 7ea78619..3395b019 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/valuation/SearchResultValuator.java @@ -86,7 +86,7 @@ public class SearchResultValuator { } - return bestScore * (0.3 + 0.7 * bestAllTermsFactor) * priorityTermBonus; + return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus; } private boolean hasPriorityTerm(List rawScores) { @@ -145,17 +145,41 @@ public class SearchResultValuator { totalFactor *= getAllTermsFactor(keyword, totalWeight, titleLength); } - totalFactor = calculateTermCoherencePenalty(set, totalFactor); + if (set.keywords.length > 1) { + totalFactor = calculateTermCoherencePenalty(set, totalFactor); + } + else { + totalFactor = calculateSingleTermBonus(set, totalFactor); + } return totalFactor; } + private double calculateSingleTermBonus(SearchResultsKeywordSet set, double totalFactor) { + var theKeyword = set.iterator().next(); + + if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Title)) { + return totalFactor * 0.5; + } + else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Subjects)) { + return totalFactor * 0.6; + } + else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.SiteAdjacent)) { + return totalFactor * 0.65; + } + else if (theKeyword.wordMetadata.hasFlag(EdgePageWordFlags.Site)) { + return totalFactor * 0.7; + } + return totalFactor; + } + private double calculateTermCoherencePenalty(SearchResultsKeywordSet keywordSet, double f) { long maskDirect = ~0; long maskAdjacent = ~0; + byte excludeMask = (byte) (EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Subjects.asBit() | EdgePageWordFlags.Synthetic.asBit()); - for (var keyword : keywordSet.keywords) { + for (var keyword : keywordSet) { var meta = keyword.wordMetadata; long positions; @@ -163,28 +187,28 @@ public class SearchResultValuator { return f; } + positions = meta.positions(); - if (!EdgePageWordMetadata.hasAnyFlags(meta.flags(), excludeMask)) - { + maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); + if (positions != 0 && !EdgePageWordMetadata.hasAnyFlags(meta.flags(), excludeMask)) + { maskDirect &= positions; - maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); } } if (maskAdjacent == 0) { - return 1.2 * f; + return 2 * f; } if (maskDirect == 0) { - return 1.1 * f; + return 1.25 * f; } - if (maskDirect != ~0L) { - double locationFactor = 0.65 + Math.max(0., - 0.35 * Long.numberOfTrailingZeros(maskDirect) / 16. - - Math.sqrt(Long.bitCount(maskDirect) - 1) / 5. + double locationFactor = 0.5 + Math.max(0., + 0.5 * Long.numberOfTrailingZeros(maskDirect) / 16. + - Math.sqrt(Long.bitCount(maskDirect) - 1) / 3. ); return f * locationFactor; @@ -237,16 +261,6 @@ public class SearchResultValuator { return f; } - private double getLengthPenalty(int length) { - if (length < MIN_LENGTH) { - length = MIN_LENGTH; - } - if (length > AVG_LENGTH) { - length = AVG_LENGTH; - } - return (0.5 + 0.5 * length / AVG_LENGTH); - } - private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { double[] weights = new double[scores.length]; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java new file mode 100644 index 00000000..c3d4a1e6 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java @@ -0,0 +1,28 @@ +package nu.marginalia.wmsa.edge.tools; + +import nu.marginalia.util.array.LongArray; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReadEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile; + +import java.io.IOException; +import java.nio.file.Path; + +import static nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags.Simple; + +public class StripSimpleJournalEntriesToolMain { + + public static void main(String[] args) throws IOException { + Path input = Path.of(args[0]); + Path output = Path.of(args[1]); + + new SearchIndexJournalCleaner(new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(input))) + .clean(output, StripSimpleJournalEntriesToolMain::retainEntry); + + System.out.println("All done!"); + } + + private static boolean retainEntry(SearchIndexJournalReadEntry entry) { + return (entry.header.documentMeta() & Simple.asBit()) == 0; + } +} diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb index fa858789..0126478b 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-footer.hdb @@ -52,9 +52,9 @@ q>5The amount of javascript and modern features is at least 5 (on a scale 0 to 25) q<5The amount of javascript and modern features is at most 5 (on a scale 0 to 25) - year>2005The document was ostensibly published in or after 2005 - year=2005The document was ostensibly published in 2005 - year<2005The document was ostensibly published in or before 2005 + year>2005(beta) The document was ostensibly published in or after 2005 + year=2005(beta) The document was ostensibly published in 2005 + year<2005(beta) The document was ostensibly published in or before 2005 format:html5Filter documents using the HTML5 standard. This is typically modern websites. format:xhtmlFilter documents using the XHTML standard diff --git a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb index 75e7f6c7..2a252e56 100644 --- a/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb +++ b/marginalia_nu/src/main/resources/templates/edge/parts/search-form.hdb @@ -11,6 +11,7 @@ +