diff --git a/code/api/index-api/build.gradle b/code/api/index-api/build.gradle index c495cef8..6dbcd98f 100644 --- a/code/api/index-api/build.gradle +++ b/code/api/index-api/build.gradle @@ -28,7 +28,8 @@ dependencies { implementation libs.guice implementation libs.rxjava implementation libs.protobuf - implementation libs.gson + implementation libs.bundles.gson + implementation libs.fastutil testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java index f28747b1..8db8772f 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/IndexClient.java @@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient { } @CheckReturnValue - public List<SearchResultItem> query(Context ctx, SearchSpecification specs) { + public SearchResultSet query(Context ctx, SearchSpecification specs) { return wmsa_search_index_api_time.time( - () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults() + () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst() ); } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java index 396b7a3a..9890b3aa 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java @@ -15,14 +15,14 @@ public class SearchResultItem { public final long combinedId; /** How did the subqueries match against the document ?
*/ - public final List<SearchResultKeywordScore> scores; + public final List<SearchResultKeywordScore> keywordScores; /** How many other potential results existed in the same domain */ public int resultsFromDomain; public SearchResultItem(long val) { this.combinedId = val; - this.scores = new ArrayList<>(16); + this.keywordScores = new ArrayList<>(16); } public EdgeId<EdgeUrl> getUrlId() { @@ -37,11 +37,11 @@ } /* Used for evaluation */ - private transient double scoreValue = 1; - public void setScore(double score) { + private transient SearchResultPreliminaryScore scoreValue = null; + public void setScore(SearchResultPreliminaryScore score) { scoreValue = score; } - public double getScore() { + public SearchResultPreliminaryScore getScore() { return scoreValue; } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java index ef286613..a300dd88 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultKeywordScore.java @@ -26,68 +26,17 @@ public final class SearchResultKeywordScore { this.hasPriorityTerms = hasPriorityTerms; } - private boolean hasTermFlag(WordFlags flag) { + public boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } - public double documentValue() { - long sum = 0; - - sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.; - - sum += DocumentMetadata.decodeTopology(encodedDocMetadata); - - if (DocumentMetadata.hasFlags(encodedDocMetadata, DocumentFlags.Simple.asBit())) { - sum += 20; - } - - int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13; - if (rank < 0) - sum += rank / 2; - else - sum += rank / 4; - - return sum; + public int positionCount() { + return Integer.bitCount(positions()); } - public double termValue() { - double sum = 0; - - double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata); - int positionBits = WordMetadata.decodePositions(encodedWordMetadata); - - if (hasTermFlag(WordFlags.Title)) { - sum -= 15; - } - - if (hasTermFlag(WordFlags.Site) && positionBits != 0) { - sum -= 10; - } else if (hasTermFlag(WordFlags.SiteAdjacent) && positionBits != 0) { - sum -= 5; - } - - if (hasTermFlag(WordFlags.Subjects)) { - sum -= 10; - } - if (hasTermFlag(WordFlags.NamesWords)) { - sum -= 1; - } - - if (hasTermFlag(WordFlags.UrlDomain)) { - sum -= 5; - } - - if (hasTermFlag(WordFlags.UrlPath)) { - sum -= 5; - } - - - sum -= tfIdf / 10.; - sum -= Integer.bitCount(positionBits) / 3.; - - return sum; + public int tfIdf() { + return (int) WordMetadata.decodeTfidf(encodedWordMetadata); } - public int subquery() { return subquery; } @@ -138,8 +87,8 @@ return "SearchResultKeywordScore[" + "set=" + subquery + ", " + "keyword=" + keyword + ", " + - "encodedWordMetadata=" + encodedWordMetadata + ", " + - "encodedDocMetadata=" + encodedDocMetadata + ", " + + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + + "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ", " + "hasPriorityTerms=" + hasPriorityTerms + ']'; } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java
b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java new file mode 100644 index 00000000..1021ea26 --- /dev/null +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java @@ -0,0 +1,42 @@ +package nu.marginalia.index.client.model.results; + +import org.jetbrains.annotations.NotNull; + +import static java.lang.Boolean.compare; +import static java.lang.Integer.compare; + +public record SearchResultPreliminaryScore(boolean hasSingleTermMatch, + boolean hasPriorityTerm, + int minNumberOfFlagsSet, + int minNumberOfPositions, + int overlappingPositions) + implements Comparable<SearchResultPreliminaryScore> +{ + @Override + public int compareTo(@NotNull SearchResultPreliminaryScore other) { + int diff; + + diff = compare(hasSingleTermMatch, other.hasSingleTermMatch); + if (diff != 0) return diff; + + diff = compare(minNumberOfFlagsSet, other.minNumberOfFlagsSet); + if (diff != 0) return diff; + + diff = compare(hasPriorityTerm, other.hasPriorityTerm); + if (diff != 0) return diff; + + diff = compare(overlappingPositions, other.overlappingPositions); + if (diff != 0) return diff; + + return compare(minNumberOfPositions, other.minNumberOfPositions); + } + + public boolean isGreat() { + return hasSingleTermMatch || (minNumberOfFlagsSet >= 1 && overlappingPositions >= 1); + } + public boolean isEmpty() { + return minNumberOfFlagsSet == 0 + && minNumberOfPositions == 0 + && overlappingPositions == 0; + } +} diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultRankingContext.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultRankingContext.java new file mode 100644 index 00000000..cb9bdf16 --- /dev/null +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultRankingContext.java @@ -0,0 +1,25 @@ +package nu.marginalia.index.client.model.results; + +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import lombok.ToString; + +import java.util.Map; + +@ToString +public class SearchResultRankingContext { + private final int docCount; + private final Object2IntOpenHashMap<String> termCounts = new Object2IntOpenHashMap<>(10, 0.5f); + + public SearchResultRankingContext(int docCount, Map<String, Integer> termCounts) { + this.docCount = docCount; + this.termCounts.putAll(termCounts); + } + + public int termFreqDocCount() { + return docCount; + } + + public int frequency(String keyword) { + return termCounts.getOrDefault(keyword, 1); + } +} diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultSet.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultSet.java index ce25a632..3c4b3750 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultSet.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultSet.java @@ -9,7 +9,7 @@ import java.util.List; @AllArgsConstructor @Getter @ToString public class SearchResultSet { public List<SearchResultItem> results; - + public SearchResultRankingContext rankingContext; public int size() { return results.size(); } diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle new file mode 100644 index 00000000..a762887b --- /dev/null +++ b/code/common/process/build.gradle @@ -0,0 +1,32 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "5.3.3.3" + id 'jvm-test-suite' +} + +java { + toolchain {
languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { + implementation libs.notnull + implementation libs.lombok + annotationProcessor libs.lombok + + implementation libs.bundles.slf4j + testImplementation libs.bundles.slf4j.test + + implementation libs.guava + implementation libs.guice + implementation libs.commons.lang3 + + implementation libs.snakeyaml + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + diff --git a/code/common/process/readme.md b/code/common/process/readme.md new file mode 100644 index 00000000..989a6193 --- /dev/null +++ b/code/common/process/readme.md @@ -0,0 +1,4 @@ +# Process + +Basic functionality for a Process. Processes must include this dependency to ensure +their loggers are configured properly! \ No newline at end of file diff --git a/code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLog.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java similarity index 98% rename from code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLog.java rename to code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java index 83734a8c..db5b22a8 100644 --- a/code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLog.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLog.java @@ -1,4 +1,4 @@ -package nu.marginalia.work_log; +package nu.marginalia.process.log; import com.google.errorprone.annotations.MustBeClosed; import org.apache.logging.log4j.util.Strings; diff --git a/code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLogEntry.java b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java similarity index 68% rename from code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLogEntry.java rename to code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java index d88f5276..9f9579f3 100644 --- a/code/features-crawl/work-log/src/main/java/nu/marginalia/work_log/WorkLogEntry.java +++ b/code/common/process/src/main/java/nu/marginalia/process/log/WorkLogEntry.java @@ -1,4 +1,4 @@ -package nu.marginalia.work_log; +package nu.marginalia.process.log; public record WorkLogEntry(String id, String ts, String path, int cnt) { } diff --git a/code/common/model/src/main/java/nu/marginalia/util/ParallelPipe.java b/code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java similarity index 100% rename from code/common/model/src/main/java/nu/marginalia/util/ParallelPipe.java rename to code/common/process/src/main/java/nu/marginalia/util/ParallelPipe.java diff --git a/code/common/process/src/main/resources/log4j2.properties b/code/common/process/src/main/resources/log4j2.properties new file mode 100644 index 00000000..18eaf147 --- /dev/null +++ b/code/common/process/src/main/resources/log4j2.properties @@ -0,0 +1,9 @@ +log4j2.isThreadContextMapInheritable=true +status = info +appender.console.type = Console +appender.console.name = LogToConsole +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n +appender.console.filter.http.type = MarkerFilter +rootLogger.level = info +rootLogger.appenderRef.console.ref = LogToConsole diff --git a/code/features-convert/keyword-extraction/readme.md b/code/features-convert/keyword-extraction/readme.md index bb343136..f67702a1 100644 --- 
a/code/features-convert/keyword-extraction/readme.md +++ b/code/features-convert/keyword-extraction/readme.md @@ -6,7 +6,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0 ## Central Classes -* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java) +* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java) ## See Also diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java similarity index 73% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 7dad2cfe..49d64002 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction; +package nu.marginalia.keyword; -import nu.marginalia.keyword_extraction.extractors.*; -import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; +import nu.marginalia.keyword.extractors.*; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.encoding.AsciiFlattener; import nu.marginalia.language.model.DocumentLanguageData; @@ -73,6 +73,8 @@ public class DocumentKeywordExtractor { } } + + private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) @@ -88,7 +90,7 @@ public class DocumentKeywordExtractor { } String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); - if (WordPatterns.singleWordQualitiesPredicate.test(w)) { + if (matchesWordPattern(w)) { wordsBuilder.add(w, metadata.getMetadataForWord(word.stemmed())); } } @@ -101,4 +103,43 @@ public class DocumentKeywordExtractor { } } } + + boolean matchesWordPattern(String s) { + // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} + + String wordPartSeparator = ".-_/:+*"; + + int i = 0; + + for (int run = 0; run < 15 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + + if (i == 0) + return false; + + for (int j = 0; j < 5; j++) { + if (i == s.length()) return true; + + if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { + return false; + } + + i++; + + for (int run = 0; run < 10 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + } + + return false; + } } diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java similarity index 99% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordExtractor.java rename to 
code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java index 8117f700..6b3540f0 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordExtractor.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction; +package nu.marginalia.keyword; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentSentence; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordMetadata.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordMetadata.java similarity index 95% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordMetadata.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordMetadata.java index 0e31558a..32830852 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/KeywordMetadata.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordMetadata.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction; +package nu.marginalia.keyword; import lombok.Builder; -import nu.marginalia.keyword_extraction.extractors.*; +import nu.marginalia.keyword.extractors.*; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordFlags; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/WordReps.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/WordReps.java similarity index 77% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/WordReps.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/WordReps.java index e425166a..c3503445 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/WordReps.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/WordReps.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction; +package nu.marginalia.keyword; import nu.marginalia.language.model.WordRep; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java similarity index 96% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java index 851ee2e6..1b6e3b34 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import nu.marginalia.language.model.DocumentLanguageData; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/KeywordPositionBitmask.java 
b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java similarity index 95% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/KeywordPositionBitmask.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java index d32d7de5..79c41366 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/KeywordPositionBitmask.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java @@ -1,8 +1,8 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import com.google.inject.Inject; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.model.DocumentLanguageData; /** Generates a position bitmask for each word in a document */ diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java similarity index 93% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java index b8a71a59..c033bdc1 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java @@ -1,13 +1,13 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import com.google.common.base.CharMatcher; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.keyword_extraction.WordReps; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import java.util.*; import java.util.stream.Collectors; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java similarity index 95% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java index 6b99984b..d4a6e428 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java @@ -1,12 +1,12 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import 
it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.keyword_extraction.WordReps; +import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.model.WordSeparator; -import nu.marginalia.keyword_extraction.KeywordExtractor; import org.apache.commons.lang3.StringUtils; import java.util.*; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/TitleKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/TitleKeywords.java similarity index 87% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/TitleKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/TitleKeywords.java index 16f02caf..e1c7eceb 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/TitleKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/TitleKeywords.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; -import nu.marginalia.keyword_extraction.WordReps; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.WordReps; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/UrlKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/UrlKeywords.java similarity index 95% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/UrlKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/UrlKeywords.java index 9996f18f..0b91a050 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/UrlKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/UrlKeywords.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import ca.rmen.porterstemmer.PorterStemmer; import nu.marginalia.model.EdgeDomain; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java similarity index 96% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java index 859685e8..f9080c97 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/extractors/WordsTfIdfCounts.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java @@ -1,12 +1,12 @@ -package nu.marginalia.keyword_extraction.extractors; +package 
nu.marginalia.keyword.extractors; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import nu.marginalia.keyword_extraction.WordReps; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.model.WordSpan; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java similarity index 95% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywords.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java index d0e5d68f..55622cb8 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.model; +package nu.marginalia.keyword.model; import nu.marginalia.model.idx.WordMetadata; diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java similarity index 98% rename from code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywordsBuilder.java rename to code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 4e706b23..d96a2734 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword_extraction/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.model; +package nu.marginalia.keyword.model; import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import lombok.Getter; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/DocumentKeywordExtractorTest.java new file mode 100644 index 00000000..22d2d7f1 --- /dev/null +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -0,0 +1,24 @@ +package nu.marginalia.keyword; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class DocumentKeywordExtractorTest { + + @Test + public void testWordPattern() { + DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null); + + Assertions.assertTrue(extractor.matchesWordPattern("test")); + Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde")); + Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef")); + + 
Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test")); + Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test")); + Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24")); + Assertions.assertTrue(extractor.matchesWordPattern("std::vector")); + Assertions.assertTrue(extractor.matchesWordPattern("c++")); + Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h")); + Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse")); + } +} \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/SentenceExtractorTest.java similarity index 95% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java rename to code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/SentenceExtractorTest.java index b0edf5b8..cc31ef4d 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/SentenceExtractorTest.java @@ -1,8 +1,7 @@ -package nu.marginalia.keyword_extraction; +package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; import nu.marginalia.language.sentence.SentenceExtractor; @@ -106,10 +105,6 @@ class SentenceExtractorTest { } - @Test - public void testPattern() { - System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches()); - } @SneakyThrows @Test diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java similarity index 94% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywordsTest.java rename to code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java index aba92add..4b02f1f5 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/ArtifactKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.test.util.TestLanguageModels; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java similarity index 96% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywordsTest.java rename to code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java index fef7ac21..b08a2353 100644 --- 
a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/NameLikeKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import com.google.common.collect.Sets; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.test.util.TestLanguageModels; import org.junit.jupiter.api.Test; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java similarity index 97% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java rename to code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java index eb3de606..dda3e0a4 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/SubjectLikeKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import com.google.common.collect.Sets; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.test.util.TestLanguageModels; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/TitleKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/TitleKeywordsTest.java similarity index 99% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/TitleKeywordsTest.java rename to code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/TitleKeywordsTest.java index 9e4ec663..cac29c73 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/TitleKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/TitleKeywordsTest.java @@ -1,7 +1,7 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import com.google.common.collect.Sets; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.test.util.TestLanguageModels; import org.jsoup.Jsoup; diff --git a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/UrlKeywordsTest.java b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/UrlKeywordsTest.java similarity index 96% rename from code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/UrlKeywordsTest.java rename to 
code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/UrlKeywordsTest.java index cc0731bc..294da044 100644 --- a/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword_extraction/extractors/UrlKeywordsTest.java +++ b/code/features-convert/keyword-extraction/src/test/java/nu/marginalia/keyword/extractors/UrlKeywordsTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.keyword_extraction.extractors; +package nu.marginalia.keyword.extractors; import ca.rmen.porterstemmer.PorterStemmer; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateTest.java similarity index 89% rename from code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java rename to code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateTest.java index 1ac342cf..64bd1f73 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/pubdate/PubDateTest.java +++ b/code/features-convert/pubdate/src/test/java/nu/marginalia/pubdate/PubDateTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.logic.pubdate; +package nu.marginalia.pubdate; import nu.marginalia.model.crawl.PubDate; import org.junit.jupiter.api.Test; diff --git a/code/features-convert/readme.md b/code/features-convert/readme.md index 69063bc3..26bb0e06 100644 --- a/code/features-convert/readme.md +++ b/code/features-convert/readme.md @@ -9,3 +9,4 @@ * [adblock](adblock/) - Simulates Adblock * [pubdate](pubdate/) - Determines when a document was published * [topic-detection](topic-detection/) - Tries to identify the topic of a website +* [summary-extraction](summary-extraction/) \ No newline at end of file diff --git a/code/features-crawl/work-log/build.gradle b/code/features-convert/summary-extraction/build.gradle similarity index 71% rename from code/features-crawl/work-log/build.gradle rename to code/features-convert/summary-extraction/build.gradle index 1a698e10..f92d329b 100644 --- a/code/features-crawl/work-log/build.gradle +++ b/code/features-convert/summary-extraction/build.gradle @@ -1,7 +1,7 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" - + id 'application' id 'jvm-test-suite' } @@ -11,26 +11,28 @@ java { } } +application { + mainClass = 'nu.marginalia.converting.ConverterMain' + applicationName = 'converter-process' +} + +tasks.distZip.enabled = false + dependencies { - implementation libs.notnull implementation libs.lombok annotationProcessor libs.lombok - - implementation libs.bundles.gson - implementation libs.rxjava implementation libs.bundles.slf4j - testImplementation libs.bundles.slf4j.test - implementation libs.guava - implementation libs.guice + implementation libs.notnull - implementation libs.snakeyaml implementation libs.jsoup - implementation libs.zstd - implementation libs.commons.net - - implementation libs.opencsv + implementation libs.guice + implementation libs.guava + implementation libs.bundles.gson + implementation libs.trove + implementation libs.fastutil + implementation libs.commons.lang3 testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit @@ -38,6 +40,7 @@ dependencies { } test { + maxHeapSize = "8G" useJUnitPlatform() } diff --git a/code/features-convert/summary-extraction/readme.md b/code/features-convert/summary-extraction/readme.md new file mode 100644 index 
00000000..8c75d238 --- /dev/null +++ b/code/features-convert/summary-extraction/readme.md @@ -0,0 +1,17 @@ +# Summary Extraction + +This feature attempts to find a descriptive passage of text that summarizes +what a search result "is about". It's the text you see below a search result. + +It uses several naive heuristics to try to find something that makes sense, +and there is probably room for improvement. + +There are many good techniques for doing this, but they've sadly not proved +particularly fast. Whatever solution is used needs to be able to summarize on +the order of 100,000,000 documents with a time budget of a couple of hours. + +## Central Classes + +* [SummaryExtractor](src/main/java/nu/marginalia/summary/SummaryExtractor.java) +* [SummaryExtractionFilter](src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java) - DOM pruning algo. + Doesn't always work, but when it works it's pretty good. diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java similarity index 99% rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java rename to code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java index 312a22a6..b20f2b3a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractionFilter.java +++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractionFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.summary; +package nu.marginalia.summary; import com.google.common.base.Strings; import org.apache.commons.lang3.StringUtils; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java similarity index 94% rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java rename to code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java index bf803aee..98e81fcf 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/summary/SummaryExtractor.java +++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/SummaryExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic.summary; +package nu.marginalia.summary; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -19,9 +19,8 @@ public class SummaryExtractor { } public String extractSummary(Document parsed) { - String summaryString; + String summaryString = extractSummaryRaw(parsed); - summaryString = extractSummaryRaw(parsed); summaryString = truncatedCharacters.matcher(summaryString).replaceAll(" "); summaryString = StringUtils.abbreviate(summaryString, "", maxSummaryLength); @@ -81,7 +80,7 @@ } if (content.length() > 32) { - // AAAA AAAA AAAA AAAA AAAA AAAA AAAA AAAA + // AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH return content.toString(); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java
b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java similarity index 66% rename from code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java rename to code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java index 024f9f83..65021e0c 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/SummaryExtractorTest.java +++ b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/SummaryExtractorTest.java @@ -1,17 +1,13 @@ -package nu.marginalia.converting.logic; +package nu.marginalia.summary; -import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.logic.summary.SummaryExtractionFilter; -import nu.marginalia.converting.processor.logic.summary.SummaryExtractor; +import nu.marginalia.summary.SummaryExtractionFilter; +import nu.marginalia.summary.SummaryExtractor; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; import java.nio.file.Path; import java.util.Comparator; import java.util.HashMap; @@ -43,47 +39,6 @@ class SummaryExtractorTest { System.out.println(e.getValue().text()); }); } - @Test - public void testSummaryFilter3() throws IOException { - var data = WmsaHome.getHomePath().resolve("test-data/url-327999153"); - String html = Files.readString(data); - var doc = Jsoup.parse(html); - var filter = new SummaryExtractionFilter(); - doc.filter(filter); - - filter.getSummary(255); - } - - @Test - public void testSummaryFilter2() throws IOException { - var data = WmsaHome.getHomePath().resolve("test-data/"); - - System.out.println("Running"); - - var fos = new PrintWriter(new FileOutputStream("/tmp/summaryDiff.html")); - fos.println(""); - - for (var file : Objects.requireNonNull(data.toFile().listFiles())) { - - var doc = Jsoup.parse(Files.readString(file.toPath())); - fos.println(""); - fos.println(""); - } - - fos.println("
" + file.getName() + "
"); - var filter = new SummaryExtractionFilter(); - - doc.select("header,nav,#header,#nav,#navigation,.header,.nav,.navigation,ul,li").remove(); - doc.filter(filter); - var ret = filter.getSummary(255); - - fos.println(ret); - fos.println(""); - String summary = summaryExtractor.extractSummary(Jsoup.parse(Files.readString(file.toPath()))); - fos.println(summary); - fos.println("
"); - fos.flush(); - } @Test void extractSurrey() throws IOException { diff --git a/code/processes/converting-process/src/test/resources/html/monadnock.html b/code/features-convert/summary-extraction/src/test/resources/html/monadnock.html similarity index 100% rename from code/processes/converting-process/src/test/resources/html/monadnock.html rename to code/features-convert/summary-extraction/src/test/resources/html/monadnock.html diff --git a/code/processes/converting-process/src/test/resources/html/readme.md b/code/features-convert/summary-extraction/src/test/resources/html/readme.md similarity index 100% rename from code/processes/converting-process/src/test/resources/html/readme.md rename to code/features-convert/summary-extraction/src/test/resources/html/readme.md diff --git a/code/processes/converting-process/src/test/resources/html/summarization/187.shtml b/code/features-convert/summary-extraction/src/test/resources/html/summarization/187.shtml similarity index 100% rename from code/processes/converting-process/src/test/resources/html/summarization/187.shtml rename to code/features-convert/summary-extraction/src/test/resources/html/summarization/187.shtml diff --git a/code/processes/converting-process/src/test/resources/html/summarization/surrey.html b/code/features-convert/summary-extraction/src/test/resources/html/summarization/surrey.html similarity index 100% rename from code/processes/converting-process/src/test/resources/html/summarization/surrey.html rename to code/features-convert/summary-extraction/src/test/resources/html/summarization/surrey.html diff --git a/code/processes/converting-process/src/test/resources/html/summarization/surrey.html.1 b/code/features-convert/summary-extraction/src/test/resources/html/summarization/surrey.html.1 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/summarization/surrey.html.1 rename to code/features-convert/summary-extraction/src/test/resources/html/summarization/surrey.html.1 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/index b/code/features-convert/summary-extraction/src/test/resources/html/work-set/index similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/index rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/index diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1021546012 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1021546012 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1021546012 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1021546012 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1028592943 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1028592943 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1028592943 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1028592943 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1081293162 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1081293162 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1081293162 rename to 
code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1081293162 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1105046394 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1105046394 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1105046394 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1105046394 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1146923296 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1146923296 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1146923296 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1146923296 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1194694074 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1194694074 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1194694074 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1194694074 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1207898281 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1207898281 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1207898281 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1207898281 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1268145073 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1268145073 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1268145073 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1268145073 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1294876331 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1294876331 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1294876331 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1294876331 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1314767420 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1314767420 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1314767420 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1314767420 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1316269786 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1316269786 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1316269786 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1316269786 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1316766580 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1316766580 similarity index 100% rename from 
code/processes/converting-process/src/test/resources/html/work-set/url--1316766580 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1316766580 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1319968043 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1319968043 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1319968043 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1319968043 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1338576987 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1338576987 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1338576987 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1338576987 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1341909571 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1341909571 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1341909571 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1341909571 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1369578579 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1369578579 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1369578579 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1369578579 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1437315645 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1437315645 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1437315645 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1437315645 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1458954960 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1458954960 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1458954960 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1458954960 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1475681345 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1475681345 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1475681345 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1475681345 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1498328446 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1498328446 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1498328446 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1498328446 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1507779664 
b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1507779664 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1507779664 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1507779664 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1540303379 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1540303379 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1540303379 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1540303379 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--154898476 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--154898476 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--154898476 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--154898476 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1552059399 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1552059399 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1552059399 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1552059399 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1557688340 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1557688340 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1557688340 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1557688340 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1584145751 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1584145751 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1584145751 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1584145751 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1605151204 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1605151204 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1605151204 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1605151204 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--162269247 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--162269247 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--162269247 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--162269247 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1624294488 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1624294488 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1624294488 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1624294488 diff --git 
a/code/processes/converting-process/src/test/resources/html/work-set/url--164108285 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--164108285 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--164108285 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--164108285 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1645688243 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1645688243 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1645688243 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1645688243 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1658004609 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1658004609 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1658004609 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1658004609 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1658558834 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1658558834 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1658558834 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1658558834 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1698664879 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1698664879 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1698664879 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1698664879 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--169975195 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--169975195 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--169975195 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--169975195 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1701203332 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1701203332 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1701203332 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1701203332 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--17281998 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--17281998 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--17281998 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--17281998 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1742070028 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1742070028 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1742070028 rename to 
code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1742070028 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1745376814 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1745376814 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1745376814 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1745376814 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1749889035 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1749889035 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1749889035 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1749889035 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--176177364 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--176177364 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--176177364 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--176177364 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--177014197 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--177014197 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--177014197 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--177014197 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1794527707 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1794527707 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1794527707 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1794527707 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1797740201 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1797740201 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1797740201 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1797740201 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1799098579 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1799098579 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1799098579 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1799098579 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1959637826 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1959637826 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1959637826 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1959637826 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1971916964 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1971916964 similarity index 100% rename from 
code/processes/converting-process/src/test/resources/html/work-set/url--1971916964 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1971916964 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--1985840368 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1985840368 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--1985840368 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--1985840368 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--2012610859 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2012610859 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--2012610859 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2012610859 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--202178680 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--202178680 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--202178680 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--202178680 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--2043528727 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2043528727 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--2043528727 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2043528727 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--2081757477 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2081757477 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--2081757477 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2081757477 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--2103982576 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2103982576 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--2103982576 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2103982576 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--2111558769 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2111558769 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--2111558769 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--2111558769 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--213168798 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--213168798 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--213168798 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--213168798 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--232544032 
b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--232544032 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--232544032 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--232544032 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--253010011 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--253010011 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--253010011 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--253010011 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--274250994 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--274250994 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--274250994 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--274250994 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--332442790 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--332442790 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--332442790 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--332442790 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--353437903 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--353437903 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--353437903 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--353437903 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--364546777 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--364546777 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--364546777 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--364546777 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--379129416 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--379129416 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--379129416 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--379129416 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--399428149 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--399428149 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--399428149 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--399428149 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--425233170 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--425233170 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--425233170 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--425233170 diff --git 
a/code/processes/converting-process/src/test/resources/html/work-set/url--434612307 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--434612307 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--434612307 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--434612307 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--439772328 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--439772328 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--439772328 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--439772328 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--458002611 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--458002611 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--458002611 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--458002611 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--506010305 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--506010305 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--506010305 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--506010305 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--546773534 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--546773534 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--546773534 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--546773534 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--551288516 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--551288516 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--551288516 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--551288516 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--602577763 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--602577763 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--602577763 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--602577763 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--611668054 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--611668054 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--611668054 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--611668054 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--634771245 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--634771245 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--634771245 rename to 
code/features-convert/summary-extraction/src/test/resources/html/work-set/url--634771245 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--639320493 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--639320493 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--639320493 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--639320493 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--643179018 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--643179018 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--643179018 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--643179018 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--663772351 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--663772351 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--663772351 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--663772351 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--670789152 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--670789152 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--670789152 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--670789152 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--6797317 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--6797317 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--6797317 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--6797317 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--700978490 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--700978490 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--700978490 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--700978490 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--708035332 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--708035332 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--708035332 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--708035332 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--804917062 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--804917062 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--804917062 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--804917062 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--819771302 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--819771302 similarity index 100% rename from 
code/processes/converting-process/src/test/resources/html/work-set/url--819771302 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--819771302 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--840796372 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--840796372 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--840796372 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--840796372 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--841445362 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--841445362 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--841445362 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--841445362 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--862385354 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--862385354 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--862385354 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--862385354 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--879796466 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--879796466 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--879796466 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--879796466 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--89134993 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--89134993 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--89134993 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--89134993 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--905197876 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--905197876 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--905197876 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--905197876 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--920328354 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--920328354 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--920328354 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--920328354 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--952827759 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--952827759 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--952827759 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--952827759 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--964018507 
b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--964018507 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--964018507 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--964018507 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url--972614909 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url--972614909 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url--972614909 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url--972614909 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-10088520 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-10088520 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-10088520 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-10088520 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1013281103 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1013281103 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1013281103 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1013281103 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1019241851 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1019241851 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1019241851 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1019241851 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1059944953 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1059944953 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1059944953 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1059944953 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1118681302 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1118681302 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1118681302 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1118681302 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1179298706 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1179298706 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1179298706 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1179298706 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1191749784 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1191749784 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1191749784 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1191749784 diff --git 
a/code/processes/converting-process/src/test/resources/html/work-set/url-1207094790 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1207094790 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1207094790 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1207094790 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1213989666 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1213989666 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1213989666 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1213989666 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1222442301 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1222442301 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1222442301 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1222442301 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-130332455 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-130332455 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-130332455 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-130332455 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1311055461 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1311055461 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1311055461 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1311055461 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1391842722 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1391842722 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1391842722 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1391842722 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1457388763 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1457388763 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1457388763 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1457388763 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1506356272 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1506356272 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1506356272 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1506356272 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1511762169 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1511762169 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1511762169 rename to 
code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1511762169 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1534640058 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1534640058 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1534640058 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1534640058 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1551513871 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1551513871 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1551513871 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1551513871 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1567632447 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1567632447 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1567632447 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1567632447 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1623049502 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1623049502 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1623049502 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1623049502 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-163919330 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-163919330 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-163919330 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-163919330 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1661398327 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1661398327 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1661398327 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1661398327 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1724309925 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1724309925 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1724309925 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1724309925 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1736807128 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1736807128 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1736807128 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1736807128 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1739031345 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1739031345 similarity index 100% rename from 
code/processes/converting-process/src/test/resources/html/work-set/url-1739031345 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1739031345 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1755745765 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1755745765 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1755745765 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1755745765 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1802811100 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1802811100 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1802811100 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1802811100 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1805364707 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1805364707 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1805364707 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1805364707 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1832702370 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1832702370 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1832702370 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1832702370 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1853114311 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1853114311 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1853114311 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1853114311 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1924872844 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1924872844 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1924872844 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1924872844 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-197772804 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-197772804 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-197772804 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-197772804 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1984259912 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1984259912 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1984259912 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1984259912 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-1990903988 
b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1990903988 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-1990903988 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-1990903988 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2039310951 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2039310951 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2039310951 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2039310951 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2040857056 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2040857056 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2040857056 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2040857056 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2052613093 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2052613093 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2052613093 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2052613093 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2063899866 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2063899866 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2063899866 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2063899866 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2115548255 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2115548255 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2115548255 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2115548255 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2127148436 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2127148436 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2127148436 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2127148436 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-2133781904 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2133781904 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-2133781904 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-2133781904 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-225690385 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-225690385 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-225690385 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-225690385 diff --git 
a/code/processes/converting-process/src/test/resources/html/work-set/url-226401955 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-226401955 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-226401955 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-226401955 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-262970770 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-262970770 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-262970770 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-262970770 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-30106798 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-30106798 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-30106798 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-30106798 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-302167335 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-302167335 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-302167335 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-302167335 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-327999153 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-327999153 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-327999153 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-327999153 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-332568225 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-332568225 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-332568225 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-332568225 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-343223418 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-343223418 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-343223418 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-343223418 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-383103932 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-383103932 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-383103932 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-383103932 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-412929678 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-412929678 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-412929678 rename to 
code/features-convert/summary-extraction/src/test/resources/html/work-set/url-412929678 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-475213997 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-475213997 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-475213997 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-475213997 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-483403121 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-483403121 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-483403121 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-483403121 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-488667993 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-488667993 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-488667993 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-488667993 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-50815201 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-50815201 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-50815201 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-50815201 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-522685905 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-522685905 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-522685905 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-522685905 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-570714305 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-570714305 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-570714305 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-570714305 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-58733529 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-58733529 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-58733529 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-58733529 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-616518304 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-616518304 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-616518304 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-616518304 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-662169426 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-662169426 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-662169426 
rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-662169426 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-677278788 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-677278788 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-677278788 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-677278788 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-690486170 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-690486170 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-690486170 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-690486170 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-709693331 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-709693331 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-709693331 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-709693331 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-734531556 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-734531556 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-734531556 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-734531556 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-767530276 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-767530276 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-767530276 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-767530276 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-783154014 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-783154014 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-783154014 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-783154014 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-796905237 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-796905237 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-796905237 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-796905237 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-800099955 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-800099955 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-800099955 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-800099955 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-804101946 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-804101946 similarity index 100% rename from 
code/processes/converting-process/src/test/resources/html/work-set/url-804101946 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-804101946 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-830664902 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-830664902 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-830664902 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-830664902 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-876060686 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-876060686 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-876060686 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-876060686 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-892584998 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-892584998 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-892584998 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-892584998 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-942458463 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-942458463 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-942458463 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-942458463 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-952036171 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-952036171 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-952036171 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-952036171 diff --git a/code/processes/converting-process/src/test/resources/html/work-set/url-968207276 b/code/features-convert/summary-extraction/src/test/resources/html/work-set/url-968207276 similarity index 100% rename from code/processes/converting-process/src/test/resources/html/work-set/url-968207276 rename to code/features-convert/summary-extraction/src/test/resources/html/work-set/url-968207276 diff --git a/code/features-crawl/readme.md b/code/features-crawl/readme.md index cfc9c620..6146b804 100644 --- a/code/features-crawl/readme.md +++ b/code/features-crawl/readme.md @@ -4,5 +4,4 @@ These are bits of search-engine related code that are relatively isolated pieces that benefit from the clarity of being kept separate from the rest of the crawling code. * [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists -* [work-log](work-log/) - Work journal for resuming long processes * [link-parser](link-parser/) - Code for parsing and normalizing links diff --git a/code/features-crawl/work-log/readme.md b/code/features-crawl/work-log/readme.md deleted file mode 100644 index 239935fc..00000000 --- a/code/features-crawl/work-log/readme.md +++ /dev/null @@ -1,4 +0,0 @@ -# Work Log - -This is a journal for keeping track of the state of batch processes so that they can be aborted and -resumed without starting over from scratch. 
\ No newline at end of file diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 76e13951..17c66e07 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -89,4 +89,7 @@ public class ForwardIndexReader { } + public int totalDocCount() { + return idToOffset.size(); + } } diff --git a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java index 1a51a5b8..62774449 100644 --- a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java +++ b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryPermutation.java @@ -10,6 +10,8 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.function.Predicate; +import java.util.regex.Pattern; import java.util.stream.Collectors; import static java.util.stream.Stream.concat; @@ -18,6 +20,13 @@ public class QueryPermutation { private final Logger logger = LoggerFactory.getLogger(getClass()); private final QueryVariants queryVariants; + public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); + public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); + + public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); + + public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); + public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); public QueryPermutation(QueryVariants queryVariants) { this.queryVariants = queryVariants; @@ -31,12 +40,12 @@ public class QueryPermutation { var token = items.get(i); if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { + if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { start = i; } } else { - if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { + if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { end = i; break; } @@ -68,12 +77,12 @@ public class QueryPermutation { var token = items.get(i); if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && WordPatterns.wordQualitiesPredicate.test(token.str)) { + if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { start = i; } } else { - if (token.type != TokenType.LITERAL_TERM || !WordPatterns.wordPredicateEither.test(token.str)) { + if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { end = i; break; } diff --git a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java index d0590183..6acdaed4 100644 --- a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java +++ 
b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java @@ -5,7 +5,7 @@ import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.ToString; import nu.marginalia.LanguageModels; -import nu.marginalia.keyword_extraction.KeywordExtractor; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.statistics.EnglishDictionary; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.ngram_bloom_filter.NGramBloomFilter; diff --git a/code/features-crawl/crawl-plan/build.gradle b/code/features-search/result-ranking/build.gradle similarity index 55% rename from code/features-crawl/crawl-plan/build.gradle rename to code/features-search/result-ranking/build.gradle index bc53b0cc..adf5e1a1 100644 --- a/code/features-crawl/crawl-plan/build.gradle +++ b/code/features-search/result-ranking/build.gradle @@ -2,6 +2,8 @@ plugins { id 'java' id "io.freefair.lombok" version "5.3.3.3" + id "de.undercouch.download" version "5.1.0" + id 'jvm-test-suite' } @@ -13,36 +15,26 @@ java { dependencies { implementation project(':code:common:model') - implementation project(':code:common:config') - implementation project(':code:features-crawl:work-log') - implementation project(':code:libraries:guarded-regex') - implementation project(':code:process-models:crawling-model') + implementation project(':code:common:service') + implementation project(':code:api:index-api') - implementation libs.notnull implementation libs.lombok annotationProcessor libs.lombok - implementation libs.bundles.gson - implementation libs.rxjava implementation libs.bundles.slf4j - testImplementation libs.bundles.slf4j.test - - implementation libs.guava implementation libs.guice - - implementation libs.snakeyaml - implementation libs.jsoup - implementation libs.zstd - - implementation libs.commons.net - - implementation libs.opencsv + implementation libs.notnull + implementation libs.trove + implementation libs.fastutil testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + testImplementation project(':code:libraries:term-frequency-dict') + testImplementation project(':code:libraries:braille-block-punch-cards') } + test { useJUnitPlatform() } diff --git a/code/features-search/result-ranking/readme.md b/code/features-search/result-ranking/readme.md new file mode 100644 index 00000000..d862fc78 --- /dev/null +++ b/code/features-search/result-ranking/readme.md @@ -0,0 +1,12 @@ +# Result Ranking + +Contains various heuristics for deciding which search results are important +with regard to a query. 
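As a point of orientation, here is a minimal, hypothetical usage sketch of the ResultValuator this module introduces, wired together the same way as in the ResultValuatorTest further down in this change. The keyword scores, lengths, and term-frequency map are placeholder values, not anything prescribed by the patch.

```java
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.client.model.results.SearchResultRankingContext;
import nu.marginalia.ranking.ResultValuator;
import nu.marginalia.ranking.factors.Bm25Factor;
import nu.marginalia.ranking.factors.PriorityTermFactor;
import nu.marginalia.ranking.factors.TermCoherenceFactor;
import nu.marginalia.ranking.factors.TermFlagsFactor;

import java.util.List;
import java.util.Map;

class ResultValuatorExample {
    public static void main(String[] args) {
        var valuator = new ResultValuator(
                new TermFlagsFactor(),
                new Bm25Factor(),
                new TermCoherenceFactor(),
                new PriorityTermFactor());

        // Keyword scores normally come from the index lookup; an empty list still evaluates.
        List<SearchResultKeywordScore> scores = List.of();

        // Corpus document count plus per-term frequencies, as in ResultValuatorTest
        // (the actual numbers here are made up).
        var ctx = new SearchResultRankingContext(100_000, Map.of("example", 10));

        // Arguments: scores, document length (here 10_000), title length (here 32), context.
        double value = valuator.calculateSearchResultValue(scores, 10_000, 32, ctx);

        // Smaller values indicate better results: the factors multiply the score
        // downwards when rewarded traits (title hits, priority terms, coherent
        // term positions) are present.
        System.out.println(value);
    }
}
```

The Bm25Factor folds its BM25-style sum into the final value as `sqrt((1 + 750) / (1 + sum))`, so strong keyword matches shrink the score rather than grow it, which is why rewarding factors such as the title and priority-term bonuses are expressed as multipliers below 1.0.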
+ +## Central Classes + +* [ResultValuator](src/main/java/nu/marginalia/ranking/ResultValuator.java) + +## See Also + +* [features-index/domain-ranking](../../features-index/domain-ranking) - Ranks domains \ No newline at end of file diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultKeywordSet.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultKeywordSet.java new file mode 100644 index 00000000..52cb299b --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultKeywordSet.java @@ -0,0 +1,24 @@ +package nu.marginalia.ranking; + +import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import org.jetbrains.annotations.NotNull; + +import java.util.Arrays; +import java.util.Iterator; + +public record ResultKeywordSet(SearchResultKeywordScore[] keywords) implements Iterable { + @NotNull + @Override + public Iterator iterator() { + return Arrays.stream(keywords).iterator(); + } + + public int length() { + return keywords.length; + } + + @Override + public String toString() { + return "%s[%s]".formatted(getClass().getSimpleName(), Arrays.toString(keywords)); + } +} diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java new file mode 100644 index 00000000..6dd0dd64 --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -0,0 +1,128 @@ +package nu.marginalia.ranking; + +import nu.marginalia.index.client.model.results.SearchResultRankingContext; +import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.ranking.factors.*; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static java.lang.Math.min; + +@Singleton +public class ResultValuator { + private final TermFlagsFactor termFlagsFactor; + private final Bm25Factor bm25Factor; + private final TermCoherenceFactor termCoherenceFactor; + + private final PriorityTermFactor priorityTermFactor; + + @Inject + public ResultValuator(TermFlagsFactor termFlagsFactor, + Bm25Factor bm25Factor, + TermCoherenceFactor termCoherenceFactor, + PriorityTermFactor priorityTermFactor) { + + this.termFlagsFactor = termFlagsFactor; + this.bm25Factor = bm25Factor; + this.termCoherenceFactor = termCoherenceFactor; + this.priorityTermFactor = priorityTermFactor; + } + + public double calculateSearchResultValue(List scores, + int length, + int titleLength, + SearchResultRankingContext ctx) + { + int sets = numberOfSets(scores); + + double bestBm25Factor = 10; + double allTermsFactor = 1.; + + final double priorityTermBonus = priorityTermFactor.calculate(scores); + + for (int set = 0; set <= sets; set++) { + ResultKeywordSet keywordSet = createKeywordSet(scores, set); + + final double bm25 = bm25Factor.calculate(keywordSet, length, ctx); + + bestBm25Factor = min(bestBm25Factor, bm25); + allTermsFactor *= getAllTermsFactorForSet(keywordSet, titleLength); + + } + + var meta = docMeta(scores); + + double docFactor = meta.map(this::getDocFactor).orElse(1.); + double lenFactor = Math.max(1.0, 2500. 
/ (1.0 + length)); + + return bestBm25Factor * (0.4 + 0.6 * allTermsFactor) * priorityTermBonus * docFactor * lenFactor; + } + + private double getDocFactor(DocumentMetadata docMeta) { + + int topology = docMeta.topology(); + if (topology <= 1) + return 0.8; + if (topology == 2) + return 0.9; + if (topology == 3) + return 0.95; + if (topology == 4) + return 0.98; + + return 1.; + + } + + private Optional docMeta(List rawScores) { + return rawScores + .stream().map(SearchResultKeywordScore::encodedDocMetadata) + .map(DocumentMetadata::new).findFirst(); + } + + public double getAllTermsFactorForSet(ResultKeywordSet set, int titleLength) { + double totalFactor = 1.; + + totalFactor *= termFlagsFactor.calculate(set, titleLength); + + if (set.length() > 1) { + totalFactor *= 1.0 - 0.5 * termCoherenceFactor.calculate(set); + } + + assert (Double.isFinite(totalFactor)); + + return totalFactor; + } + + private ResultKeywordSet createKeywordSet(List rawScores, + int thisSet) + { + ArrayList scoresList = new ArrayList<>(rawScores.size()); + + for (var score : rawScores) { + if (score.subquery != thisSet) + continue; + if (score.keyword.contains(":")) + continue; + + scoresList.add(score); + } + + return new ResultKeywordSet(scoresList.toArray(SearchResultKeywordScore[]::new)); + + } + + private int numberOfSets(List scores) { + int maxSet = 0; + for (var score : scores) { + maxSet = Math.max(maxSet, score.subquery); + } + return 1 + maxSet; + } + +} diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java new file mode 100644 index 00000000..561c6426 --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -0,0 +1,39 @@ +package nu.marginalia.ranking.factors; + +import nu.marginalia.index.client.model.results.SearchResultRankingContext; +import nu.marginalia.ranking.ResultKeywordSet; + +/** This is a fairly coarse estimation of BM-25, + * since document count can't be accurately accessed at this point + */ +public class Bm25Factor { + private static final int AVG_LENGTH = 5000; + + public double calculate(ResultKeywordSet keywordSet, int length, SearchResultRankingContext ctx) { + final double scalingFactor = 750.; + + final int docCount = ctx.termFreqDocCount(); + + final double wf1 = 0.7; + double k = 2; + + double sum = 0.; + + for (var keyword : keywordSet) { + double count = keyword.positionCount(); + + double wt = ctx.frequency(keyword.keyword); + + final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5)); + + sum += invFreq * (count * (k + 1)) / (count + k * (1 - wf1 + wf1 * AVG_LENGTH/length)); + } + + double ret = Math.sqrt((1.0 + scalingFactor) / (1.0 + sum)); + + assert (Double.isFinite(ret)); + + return ret; + } + +} diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/PriorityTermFactor.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/PriorityTermFactor.java new file mode 100644 index 00000000..78073f15 --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/PriorityTermFactor.java @@ -0,0 +1,30 @@ +package nu.marginalia.ranking.factors; + +import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import nu.marginalia.ranking.ResultKeywordSet; + +import java.util.List; + +/** Rewards results that have a priority term */ +public 
class PriorityTermFactor { + public double calculate(List scores) { + + for (var result : scores) { + if (result.hasPriorityTerms()) { + return 0.5; + } + } + + return 1.0; + } + + public double calculate(ResultKeywordSet set) { + for (var result : set) { + if (result.hasPriorityTerms()) { + return 0.5; + } + } + + return 1.0; + } +} diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java new file mode 100644 index 00000000..79fb08f0 --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java @@ -0,0 +1,40 @@ +package nu.marginalia.ranking.factors; + +import nu.marginalia.ranking.ResultKeywordSet; + +/** Rewards documents where terms appear frequently within the same sentences, + * and where this overlap is early in the document + */ +public class TermCoherenceFactor { + + public double calculate(ResultKeywordSet keywordSet) { + int mask = combinedMask(keywordSet); + + return bitsSetFactor(mask) * (0.8 + 0.2 * bitPositionFactor(mask)); + } + + double bitsSetFactor(int mask) { + final int bitsSetInMask = Integer.bitCount(mask); + + return Math.pow(bitsSetInMask/32.0, 0.25); + } + + double bitPositionFactor(int mask) { + int start = Integer.numberOfTrailingZeros(mask); + + return 1 - (start)/32.0; + } + + int combinedMask(ResultKeywordSet keywordSet) { + int mask = ~0; + + for (var keyword : keywordSet) { + long positions = keyword.positions(); + + mask &= positions; + } + + return mask; + } + +} \ No newline at end of file diff --git a/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermFlagsFactor.java b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermFlagsFactor.java new file mode 100644 index 00000000..23829025 --- /dev/null +++ b/code/features-search/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermFlagsFactor.java @@ -0,0 +1,85 @@ +package nu.marginalia.ranking.factors; + +import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.ranking.ResultKeywordSet; + +public class TermFlagsFactor { + + public double calculate(ResultKeywordSet set, int titleLength) { + + double totalFactorInvertSum = 0; + + for (var keyword : set) { + double termFactor = calculateSingleTerm(keyword, titleLength); + + assert (termFactor != 0.); + + totalFactorInvertSum += 1 / (termFactor); + } + + if (totalFactorInvertSum == 0.) 
{ + return 1.; + } + + return set.length() / totalFactorInvertSum; + } + + public double calculateSingleTerm(SearchResultKeywordScore keyword, int titleLength) { + double f = 1.; + + int posCount = keyword.positionCount(); + + final boolean title = keyword.hasTermFlag(WordFlags.Title); + final boolean site = keyword.hasTermFlag(WordFlags.Site); + final boolean siteAdjacent = keyword.hasTermFlag(WordFlags.SiteAdjacent); + final boolean urlDomain = keyword.hasTermFlag(WordFlags.UrlDomain); + final boolean urlPath = keyword.hasTermFlag(WordFlags.UrlPath); + + final boolean names = keyword.hasTermFlag(WordFlags.NamesWords); + final boolean subject = keyword.hasTermFlag(WordFlags.Subjects); + + if (title) { + f *= titleFactor(titleLength); + } + + if (posCount != 0) { + if (site) { + f *= 0.75; + } else if (siteAdjacent) { + f *= 0.8; + } + + if (subject) { + f *= 0.8; + } + else if (names) { + f *= 0.85; + } + } + assert (Double.isFinite(f)); + if (urlDomain) { + f *= 0.8; + } + else if (urlPath && posCount > 1) { + f *= 0.9; + } + assert (Double.isFinite(f)); + + return f; + } + + static double titleFactor(int titleLength) { + if (titleLength <= 64) { + return 0.5; + } + else if (titleLength < 96) { + return 0.75; + } + + // likely keyword stuffing if the title is this long + return 0.9; + + } + +} diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java similarity index 72% rename from code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java rename to code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java index e3b22bcc..3305c015 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/valuation/SearchResultValuatorTest.java +++ b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/ResultValuatorTest.java @@ -1,26 +1,29 @@ -package nu.marginalia.search.valuation; +package nu.marginalia.ranking; +import nu.marginalia.index.client.model.results.SearchResultRankingContext; import nu.marginalia.index.client.model.results.SearchResultKeywordScore; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.ranking.factors.*; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; import java.util.EnumSet; import java.util.List; +import java.util.Map; import java.util.Set; import static org.mockito.Mockito.when; -class SearchResultValuatorTest { +class ResultValuatorTest { TermFrequencyDict dict; - SearchResultValuator valuator; + ResultValuator valuator; @BeforeEach public void setUp() { @@ -28,7 +31,12 @@ class SearchResultValuatorTest { dict = Mockito.mock(TermFrequencyDict.class); when(dict.docCount()).thenReturn(100_000); - valuator = new SearchResultValuator(dict); + valuator = new ResultValuator( + new TermFlagsFactor(), + new Bm25Factor(), + new TermCoherenceFactor(), + new PriorityTermFactor() + ); } List titleOnlyLowCountSet = List.of( @@ -52,22 +60,17 @@ class SearchResultValuatorTest { ); - List first = List.of( - new 
SearchResultKeywordScore(0, "bob", - wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(WordFlags.TfIdfHigh)), - docMetadata(0, 2010, 0, 5, EnumSet.noneOf(DocumentFlags.class)), - false) - ); - @Test void evaluateTerms() { when(dict.getTermFreq("bob")).thenReturn(10L); + SearchResultRankingContext context = new SearchResultRankingContext(100000, + Map.of("bob", 10)); - double titleOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 32); - double titleLongOnlyLowCount = valuator.evaluateTerms(titleOnlyLowCountSet, 10_000, 72); - double highCountNoTitle = valuator.evaluateTerms(highCountNoTitleSet, 10_000, 32); - double highCountSubject = valuator.evaluateTerms(highCountSubjectSet, 10_000, 32); + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, 32, context); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, 72, context); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, 32, context); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, 32, context); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java new file mode 100644 index 00000000..c9d8dd00 --- /dev/null +++ b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java @@ -0,0 +1,100 @@ +package nu.marginalia.ranking.factors; + +import nu.marginalia.bbpc.BrailleBlockPunchCards; +import nu.marginalia.index.client.model.results.SearchResultKeywordScore; +import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.ranking.ResultKeywordSet; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class TermCoherenceFactorTest { + + TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor(); + @Test + public void testAllBitsSet() { + var allPositionsSet = createSet( + ~0, ~0 + ); + + int mask = termCoherenceFactor.combinedMask(allPositionsSet); + + assertEquals(1.0, termCoherenceFactor.bitPositionFactor(mask), 0.01); + assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); + + assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet)); + } + + @Test + public void testNoBitsSet() { + var allPositionsSet = createSet( + 0, 0 + ); + + int mask = termCoherenceFactor.combinedMask(allPositionsSet); + + assertEquals(0, termCoherenceFactor.bitPositionFactor(mask), 0.01); + assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); + + assertEquals(0, termCoherenceFactor.calculate(allPositionsSet)); + } + + @Test + public void testLowPosMatches() { + var allPositionsSet = createSet( + List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) + ); + + int mask = termCoherenceFactor.combinedMask(allPositionsSet); + printMask(mask); + + assertEquals(1.0, termCoherenceFactor.bitPositionFactor(mask), 0.01); + } + + @Test + public void testHiPosMatches() { + var allPositionsSet = createSet( + List.of(28, 29, 30, 31), List.of(28, 29, 30, 31) + ); + + int mask = termCoherenceFactor.combinedMask(allPositionsSet); + printMask(mask); + assertEquals(0.125, termCoherenceFactor.bitPositionFactor(mask), 0.01); + } + + @Test + public void testBitMatchScaling() { + for (int i = 1; i < 
32; i++) { + System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1 << i) - 1)); + } + } + + void printMask(int mask) { + System.out.println(BrailleBlockPunchCards.printBits(mask, 32)); + } + + ResultKeywordSet createSet(List... maskPositions) { + int[] positions = new int[maskPositions.length]; + + for (int i = 0; i < maskPositions.length; i++) { + for (int pos : maskPositions[i]) { + positions[i] |= (1< + * This is in dire need of oversight. Here be towering dragons with names, + * a skull next to their HP bar, and their own Mick Gordon soundtrack just + * for the battle. + * + */ public class WordPatterns { public static final int MIN_WORD_LENGTH = 1; public static final int MAX_WORD_LENGTH = 64; public static final String WORD_TOKEN_JOINER = "_"; - public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); - public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); - public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$"); - public static final Pattern singleWordAdditionalPattern = - Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}"); - - public static final Predicate singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate(); - public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); - - public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); - public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); - public static final Predicate characterNoisePredicate = characterNoisePattern.asMatchPredicate(); public static final Set topWords; static { diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index abb4ec8a..606f8630 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -4,7 +4,6 @@ import com.github.datquocnguyen.RDRPOSTagger; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.language.encoding.HtmlTagCleaner; import nu.marginalia.util.StringPool; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; @@ -33,7 +32,7 @@ public class SentenceExtractor { private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); - private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner(); + private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); private final ThreadLocal stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000)); @@ -58,10 +57,13 @@ public class SentenceExtractor { } public DocumentLanguageData extractSentences(Document doc) { - final String text = asText(doc); + var clone = doc.clone(); + tagCleaner.clean(clone); + + final String text = asText(clone); 
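// Editor's aside, not part of the patch: the net effect of this hunk is that sentence
// extraction now works on a defensive copy, so the caller's Document is no longer
// mutated by the tag cleaner. Roughly:
//
//     Document clone = doc.clone();   // take a copy of the parsed DOM
//     tagCleaner.clean(clone);        // strip <code>-tag junk from the copy only
//     String text = asText(clone);    // asText() itself no longer cleans in place
//
// Previously asText() invoked tagCleaner.clean(dc) on the document it was handed,
// as the removed lines further down in this file show.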
final DocumentSentence[] textSentences = extractSentencesFromString(text); - String title = getTitle(doc, textSentences); + String title = getTitle(clone, textSentences); TObjectIntHashMap counts = calculateWordCounts(textSentences); var titleSentences = extractSentencesFromString(title.toLowerCase()); @@ -72,8 +74,8 @@ public class SentenceExtractor { final DocumentSentence[] textSentences = extractSentencesFromString(text); TObjectIntHashMap counts = calculateWordCounts(textSentences); - - return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts); + var titleSentences = extractSentencesFromString(title.toLowerCase()); + return new DocumentLanguageData(textSentences, titleSentences, counts); } private String getTitle(Document doc, DocumentSentence[] textSentences) { @@ -230,9 +232,6 @@ public class SentenceExtractor { } public String asText(Document dc) { - - tagCleaner.clean(dc); - String text = dc.getElementsByTag("body").text(); return text.substring(0, (int) (text.length()*0.95)); diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java similarity index 93% rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java rename to code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java index c2865f14..63cd12e7 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/encoding/HtmlTagCleaner.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java @@ -1,11 +1,11 @@ -package nu.marginalia.language.encoding; +package nu.marginalia.language.sentence; import org.jsoup.nodes.Document; import org.jsoup.nodes.TextNode; import java.util.regex.Pattern; -public class HtmlTagCleaner { +public class SentenceExtractorHtmlTagCleaner { public final int MAX_CODE_TAG_LENGTH = 32; public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|<|>|<|>|\\([^)]*\\)[;]?$)"); diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 4eb0dccf..7c521080 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -1,5 +1,6 @@ package nu.marginalia.language.sentence; +import com.google.common.base.CharMatcher; import gnu.trove.list.array.TIntArrayList; import lombok.AllArgsConstructor; import lombok.Getter; @@ -7,6 +8,7 @@ import nu.marginalia.language.model.WordSeparator; import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; import static nu.marginalia.language.WordPatterns.*; @@ -20,6 +22,9 @@ public class SentenceSegmentSplitter { int[] separators; } + private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-"); + private static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); + public static SeparatedSentence splitSegment(String segment) { var matcher = 
wordBreakPattern.matcher(segment); @@ -44,7 +49,7 @@ public class SentenceSegmentSplitter { String[] parts = words.toArray(String[]::new); int length = 0; for (int i = 0; i < parts.length; i++) { - if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) { + if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || noiseCharacterMatcher.matchesAllOf(parts[i])) { parts[i] = null; } else { diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java b/code/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java similarity index 74% rename from code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java rename to code/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java index 07f72179..dc21d379 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HtmlTagCleanerTest.java +++ b/code/libraries/language-processing/src/test/java/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java @@ -1,14 +1,14 @@ -package nu.marginalia.crawling; +package nu.marginalia.language.encoding; -import nu.marginalia.language.encoding.HtmlTagCleaner; +import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; -class HtmlTagCleanerTest { +class SentenceExtractorHtmlTagCleanerTest { - final HtmlTagCleaner tagCleaner = new HtmlTagCleaner(); + final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); public String cleanTag(String text) { var doc = Jsoup.parse(text); diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java index 9b03794b..7527229c 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java @@ -12,6 +12,9 @@ public enum InstructionTag { PROC_DOCUMENT(LoadProcessedDocument.class), PROC_DOCUMENT_ERR(LoadProcessedDocumentWithError.class), PROC_DOMAIN(LoadProcessedDomain.class), + + DOMAIN_METADATA(LoadDomainMetadata.class), + RSS(LoadRssFeed.class); public final Class clazz; diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index 2f156026..4583f31d 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.instruction; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; @@ -22,4 +22,6 @@ public interface Interpreter { void loadKeywords(EdgeUrl url, DocumentMetadata metadata, DocumentKeywords words); void loadDomainRedirect(DomainLink 
link); + + void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls); } diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java new file mode 100644 index 00000000..88da806c --- /dev/null +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadDomainMetadata.java @@ -0,0 +1,28 @@ +package nu.marginalia.converting.instruction.instructions; + +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.InstructionTag; +import nu.marginalia.converting.instruction.Interpreter; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; + +import java.util.Arrays; + +public record LoadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) implements Instruction { + + @Override + public void apply(Interpreter interpreter) { + interpreter.loadDomainMetadata(domain, knownUrls, goodUrls, visitedUrls); + } + + @Override + public boolean isNoOp() { + return false; + } + + @Override + public InstructionTag tag() { + return InstructionTag.DOMAIN_METADATA; + } + +} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java index 9757b0c3..b33103ee 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.instruction.instructions; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.InstructionTag; diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index d2803c21..7a5e5fab 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -14,6 +14,7 @@ java { dependencies { implementation project(':code:common:model') + implementation project(':code:common:process') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') @@ -27,6 +28,7 @@ dependencies { implementation libs.notnull implementation libs.gson + implementation libs.snakeyaml implementation libs.zstd testImplementation libs.bundles.slf4j.test diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index a76dcb7f..a32893a2 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -1,10 +1,12 @@ package nu.marginalia.crawling.model; +import lombok.AllArgsConstructor; import lombok.Builder; import 
lombok.ToString; import nu.marginalia.bigstring.BigString; @Builder +@AllArgsConstructor @ToString public class CrawledDocument implements SerializableCrawlData { public String crawlId; diff --git a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlerSpecificationLoader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java similarity index 91% rename from code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlerSpecificationLoader.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java index dd424a9f..cf6fb1fb 100644 --- a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlerSpecificationLoader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlerSpecificationLoader.java @@ -1,9 +1,8 @@ -package nu.marginalia.crawl_plan; +package nu.marginalia.crawling.model.spec; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import com.google.gson.JsonStreamParser; -import nu.marginalia.crawling.model.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import java.io.BufferedReader; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java similarity index 92% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java index 696c5e43..fea0f867 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawlingSpecification.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.crawling.model.spec; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; diff --git a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java similarity index 95% rename from code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlan.java rename to code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index 27f39565..5003b132 100644 --- a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -1,14 +1,14 @@ -package nu.marginalia.crawl_plan; +package plan; -import com.google.errorprone.annotations.MustBeClosed; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.ToString; -import nu.marginalia.work_log.WorkLog; import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.work_log.WorkLogEntry; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; +import nu.marginalia.process.log.WorkLog; +import nu.marginalia.process.log.WorkLogEntry; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
@@ -120,7 +120,6 @@ public class CrawlPlan { throw new RuntimeException(ex); } } - @MustBeClosed public DomainsIterable domainsIterable() throws IOException { return new DomainsIterable(); } diff --git a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlanLoader.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlanLoader.java similarity index 94% rename from code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlanLoader.java rename to code/process-models/crawling-model/src/main/java/plan/CrawlPlanLoader.java index 790effb2..cc7aae3f 100644 --- a/code/features-crawl/crawl-plan/src/main/java/nu/marginalia/crawl_plan/CrawlPlanLoader.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlanLoader.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl_plan; +package plan; import org.yaml.snakeyaml.Yaml; diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index d476affe..1033220e 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -19,7 +19,10 @@ application { tasks.distZip.enabled = false dependencies { + implementation project(':code:common:process') + implementation project(':third-party:porterstemmer') + implementation project(':third-party:count-min-sketch') implementation project(':code:api:index-api') implementation project(':code:common:model') @@ -40,10 +43,13 @@ dependencies { implementation project(':code:features-convert:topic-detection') implementation project(':code:features-convert:pubdate') implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:features-convert:summary-extraction') + implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-crawl:work-log') - implementation project(':code:features-crawl:crawl-plan') + + testImplementation project(':code:libraries:term-frequency-dict') + implementation libs.lombok annotationProcessor libs.lombok implementation libs.bundles.slf4j @@ -53,7 +59,8 @@ dependencies { implementation libs.jsoup implementation libs.guice - implementation libs.gson + implementation libs.guava + implementation libs.bundles.gson implementation libs.zstd diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java index 52991f44..58aa8b04 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConversionLog.java @@ -4,7 +4,7 @@ import com.github.luben.zstd.ZstdOutputStream; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; @@ -64,4 +64,7 @@ public class ConversionLog implements AutoCloseable, Interpreter { @Override public void loadDomainRedirect(DomainLink link) {} + + @Override + public void 
loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) {} } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index 58a5c367..31fa4bb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -4,9 +4,9 @@ import com.google.gson.Gson; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; -import nu.marginalia.work_log.WorkLog; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; +import nu.marginalia.process.log.WorkLog; +import plan.CrawlPlanLoader; +import plan.CrawlPlan; import nu.marginalia.converting.compiler.InstructionsCompiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.processor.DomainProcessor; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java index cc1b654d..e7a70aeb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterModule.java @@ -5,7 +5,7 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; -import nu.marginalia.crawl_plan.CrawlPlan; +import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; public class ConverterModule extends AbstractModule { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java index aad7e1cc..826c41cd 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriter.java @@ -6,7 +6,7 @@ import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; @@ -112,12 +112,11 @@ public class InstructionWriter { @Override public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) { - ok++; + } @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) { - error++; } @Override @@ -126,5 +125,11 @@ public class InstructionWriter { @Override public void loadDomainRedirect(DomainLink link) {} + + @Override + public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { + ok += goodUrls; + error += visitedUrls - goodUrls; + } } } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index dd01b54d..36b112fa 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -29,9 +29,6 @@ public class DocumentsCompiler { if (details != null) { ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); } - else { - ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason)); - } } private void compileWords(List ret, ProcessedDocument doc) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java new file mode 100644 index 00000000..e80f42eb --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DomainMetadataCompiler.java @@ -0,0 +1,42 @@ +package nu.marginalia.converting.compiler; + +import nu.marginalia.converting.instruction.Instruction; +import nu.marginalia.converting.instruction.instructions.LoadDomainMetadata; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import org.jetbrains.annotations.NotNull; + +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +public class DomainMetadataCompiler { + + + public void compile(List ret, EdgeDomain domain, @NotNull List documents) { + + int visitedUrls = 0; + int goodUrls = 0; + + Set knownUrls = new HashSet<>(documents.size() * 2); + + for (var doc : documents) { + visitedUrls++; + + if (doc.isOk()) { + goodUrls++; + } + + knownUrls.add(doc.url); + + Optional.ofNullable(doc.details) + .map(details -> details.linksInternal) + .ifPresent(knownUrls::addAll); + } + + ret.add(new LoadDomainMetadata(domain, knownUrls.size(), goodUrls, visitedUrls)); + } + +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index a2242961..a7076334 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -6,11 +6,16 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Objects; + +import static java.util.Objects.requireNonNullElse; public class InstructionsCompiler { private final UrlsCompiler urlsCompiler; private final DocumentsCompiler documentsCompiler; + private final DomainMetadataCompiler domainMetadataCompiler; private final FeedsCompiler feedsCompiler; private final LinksCompiler linksCompiler; private final RedirectCompiler redirectCompiler; @@ -18,12 +23,14 @@ public class 
InstructionsCompiler { @Inject public InstructionsCompiler(UrlsCompiler urlsCompiler, DocumentsCompiler documentsCompiler, + DomainMetadataCompiler domainMetadataCompiler, FeedsCompiler feedsCompiler, LinksCompiler linksCompiler, RedirectCompiler redirectCompiler) { this.urlsCompiler = urlsCompiler; this.documentsCompiler = documentsCompiler; + this.domainMetadataCompiler = domainMetadataCompiler; this.feedsCompiler = feedsCompiler; this.linksCompiler = linksCompiler; this.redirectCompiler = redirectCompiler; @@ -46,6 +53,8 @@ public class InstructionsCompiler { redirectCompiler.compile(ret, domain.domain, domain.redirect); } + domainMetadataCompiler.compile(ret, domain.domain, requireNonNullElse(domain.documents, Collections.emptyList())); + return ret; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java index 508b03a4..a725be0a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java @@ -1,7 +1,7 @@ package nu.marginalia.converting.model; import lombok.ToString; -import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java new file mode 100644 index 00000000..2c061a72 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/MetaRobotsTag.java @@ -0,0 +1,29 @@ +package nu.marginalia.converting.processor; + +import org.jsoup.nodes.Document; + +import javax.inject.Singleton; + +@Singleton +public class MetaRobotsTag { + private final String searchEngineName = "marginalia-search"; + + public boolean allowIndexingByMetaTag(Document doc) { + var robotsContent = doc.select("meta[name=robots]").attr("content"); + + if (isForbidden(robotsContent)) { + var marginaliaTag = doc.select( "meta[name=" + searchEngineName + "]").attr("content"); + return isExplicitlyAllowed(marginaliaTag); + } + + return true; + } + + private boolean isForbidden(String robotsContent) { + return robotsContent.contains("noindex") || robotsContent.contains("none"); + } + + private boolean isExplicitlyAllowed(String robotsContent) { + return robotsContent.contains("all"); + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java similarity index 98% rename from code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java rename to code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java index b8c5b056..ba236aac 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DomPruningFilter.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.converting.processor.logic; +package nu.marginalia.converting.processor.logic.dom; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index acce4e47..4ccfdafe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -5,7 +5,7 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.model.HtmlStandard; -import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 24382f98..26379cf6 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -2,16 +2,18 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.converting.processor.MetaRobotsTag; +import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.links.LinkProcessor; -import nu.marginalia.converting.processor.logic.summary.SummaryExtractor; +import nu.marginalia.summary.SummaryExtractor; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.keyword_extraction.DocumentKeywordExtractor; +import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.processor.logic.*; @@ -47,6 +49,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private final SummaryExtractor summaryExtractor; private final PubDateSniffer pubDateSniffer; + private final MetaRobotsTag metaRobotsTag; private static final DocumentValuator documentValuator = new DocumentValuator(); private static final LinkParser linkParser = new LinkParser(); @@ -60,7 +63,8 @@ public class 
HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin TitleExtractor titleExtractor, DocumentKeywordExtractor keywordExtractor, SummaryExtractor summaryExtractor, - PubDateSniffer pubDateSniffer) { + PubDateSniffer pubDateSniffer, + MetaRobotsTag metaRobotsTag) { this.minDocumentLength = minDocumentLength; this.minDocumentQuality = minDocumentQuality; this.sentenceExtractor = sentenceExtractor; @@ -70,6 +74,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin this.keywordExtractor = keywordExtractor; this.summaryExtractor = summaryExtractor; this.pubDateSniffer = pubDateSniffer; + this.metaRobotsTag = metaRobotsTag; + } @Override @@ -89,7 +95,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin Document doc = Jsoup.parse(documentBody); - if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { + if (!metaRobotsTag.allowIndexingByMetaTag(doc)) { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 1ced3ee0..c7753fbc 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -4,11 +4,11 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.keyword_extraction.DocumentKeywordExtractor; +import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword_extraction.model.DocumentKeywordsBuilder; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java new file mode 100644 index 00000000..faf5fbb0 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -0,0 +1,105 @@ +package nu.marginalia.converting; + + +import com.google.inject.Guice; +import com.google.inject.Injector; +import nu.marginalia.bigstring.BigString; +import nu.marginalia.converting.processor.DomainProcessor; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.UrlIndexingState; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Path; +import java.time.LocalTime; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +public class ConvertingIntegrationTest { + + + DomainProcessor domainProcessor; + + 
@BeforeEach + public void setUp() { + Injector injector = Guice.createInjector( + new ConvertingIntegrationTestModule() + ); + + domainProcessor = injector.getInstance(DomainProcessor.class); + } + + @Test + public void testEmptyDomain() { + var docs = new ArrayList(); + + var ret = domainProcessor.process( + new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1", + docs, Collections.emptyList())); + + assertEquals(ret.state, DomainIndexingState.ACTIVE); + assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); + assertTrue(ret.documents.isEmpty()); + } + + @Test + public void testMemexMarginaliaNu() throws IOException { + var ret = domainProcessor.process(readMarginaliaWorkingSet()); + assertEquals(ret.state, DomainIndexingState.ACTIVE); + assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); + + assertFalse(ret.documents.isEmpty()); + + Map resultsByStatusCount = new HashMap<>(); + + ret.documents.forEach(doc -> { + resultsByStatusCount.merge(doc.state, 1, Integer::sum); + }); + assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 5); + } + + private CrawledDomain readMarginaliaWorkingSet() throws IOException { + String index = readClassPathFile("memex-marginalia/index"); + String[] files = index.split("\n"); + + var docs = new ArrayList(); + + for (String file : files) { + Path p = Path.of("memex-marginalia/").resolve(file); + + var doc = new CrawledDocument("1", + "https://memex.marginalia.nu/" + file, + "text/html", + LocalTime.now().toString(), + 200, + "OK", + "", + "", + BigString.encode(readClassPathFile(p.toString())), + Double.toString(Math.random()), + "https://memex.marginalia.nu/" + file, + null + ); + docs.add(doc); + } + + return new CrawledDomain( + "1", + "memex.marginalia.nu", + null, + "OK", + "", + "127.0.0.1", + docs, Collections.emptyList()); + } + + private String readClassPathFile(String s) throws IOException { + return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes()); + } + +} diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java new file mode 100644 index 00000000..f92b1bc9 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java @@ -0,0 +1,17 @@ +package nu.marginalia.converting; + +import com.google.inject.AbstractModule; +import com.google.inject.name.Names; +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; + +public class ConvertingIntegrationTestModule extends AbstractModule { + public void configure() { + bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); + bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); + bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); + bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); + + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + } +} diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java deleted file mode 100644 index 9e3b0684..00000000 --- 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/logic/DomPruningFilterTest.java +++ /dev/null @@ -1,12 +0,0 @@ -package nu.marginalia.converting.logic; - -import org.junit.jupiter.api.Test; - -import java.io.IOException; - -class DomPruningFilterTest { - @Test - public void test() throws IOException { - - } -} \ No newline at end of file diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/MetaRobotsTagTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/MetaRobotsTagTest.java new file mode 100644 index 00000000..ab9efcec --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/processor/logic/MetaRobotsTagTest.java @@ -0,0 +1,68 @@ +package nu.marginalia.converting.processor.logic; + +import nu.marginalia.converting.processor.MetaRobotsTag; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class MetaRobotsTagTest { + MetaRobotsTag metaRobotsTag = new MetaRobotsTag(); + @Test + public void testNoTag() { + String html = """ + + + Hello + + """; + + assertTrue(metaRobotsTag.allowIndexingByMetaTag(Jsoup.parse(html))); + } + + @Test + public void testRobotsNoIndexTag() { + String html = """ + + + + Hello + + + + """; + + assertFalse(metaRobotsTag.allowIndexingByMetaTag(Jsoup.parse(html))); + } + + @Test + public void testRobotsNoneTag() { + String html = """ + + + + Hello + + + + """; + + assertFalse(metaRobotsTag.allowIndexingByMetaTag(Jsoup.parse(html))); + } + + @Test + public void testExplicitlyAllowMarginalia() { + String html = """ + + + + Hello + + + + + """; + + assertTrue(metaRobotsTag.allowIndexingByMetaTag(Jsoup.parse(html))); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/05-test.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/05-test.gmi new file mode 100644 index 00000000..06af2912 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/05-test.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/05-test.gmi is gone

+

+Cursed testing file + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/code/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/code/index.gmi new file mode 100644 index 00000000..cf9bb580 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/code/index.gmi @@ -0,0 +1,64 @@ + + + + + MEMEX - Code + + + + + + +
+ +
+ + +
+
+

Code

+
+No code in this directory, but there is a git repo with the source for the memex, encyclopedia, search engine and so forth below.
+
+https://git.marginalia.nu/
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/commons/dialogue.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/dialogue.gmi new file mode 100644 index 00000000..95358740 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/dialogue.gmi @@ -0,0 +1,156 @@ + + + + + MEMEX - Even the ancient Greeks appreciated a good dialogue + + + + + + +
+ +
+ + +
+
+

Even the ancient Greeks appreciated a good dialogue

+
+A fool has sought out the book's greatest scientist, and asked him to share his knowledge. The scientist has gone to the beginning of the book to try to understand its origin better.
+
+F: Is it true, as they say, that you only know truths?
+
+S: Almost only truths, or most things I know are true, or at least more true than I thought yesterday. I can prove it by the scientific method. We in science have studied everything from paragraph breaks to letters in this book, and there is very little we do not know. For example, we can predict how the book will continue. Did you know, for example, that after a full stop comes a letter in almost every case? It is one of the early breakthroughs in science.
+
+F: It's an impressive knowledge you possess. But do you know who wrote the book?
+
+S: An author? Superstitions! There is no scientific evidence that this book has anything but emerged from a blank page. Nowhere. I have traveled to the first sentence, and before it there was a headline, and before that there was nothing.
+
+F: If the book does not have an author, then how can it look like it does?
+
+S: It is chance and the laws of grammar that govern how the book develops, not an author. After vowels almost always comes consonant, words are generally under ten characters long. Where is this author? We have tools that allow us to see everything down to the last detail, and very reliable models for studying whole paragraphs. Nowhere in the entire book is there concrete evidence of the author's existence. It is only magical thinking that is based on a lack of understanding of language and grammar. Do you realize how much of the language we are beginning to understand now, science has come so far that we are beginning to be able to deduce the existence of completely new words. Suffrage, for example, we are almost certain is a valid word even though it only occurs in this sentence.
+
+F: It's impressive what science has accomplished, but what do you mean by existence there?
+
+S: It's a little hard to express, but I can give examples.
+
+S: A sentence exists, it is made of words, which are made of letters. Both you and I can point to these parts. If you are not a bigger fool than you seem to be, you should agree.
+
+F: I think I understand, but do you have more examples?
+
+S: I exist, I am a character in the book. You can point to me and say, there's the scientist! I am not made of letters directly in the same way as a word, but without letters I would not have existed. It is also possible to see that I exist through the words I give rise to.
+
+F: That's probably true. But how do we know that there is nothing outside the book?
+
+S: Think like this - The more we talk, the longer it gets, but you can not reach the edge or travel outside. It's just fantasies. There is nothing there. So not just white paper that may be what you think of when you think of nothing, but not even white paper.
+
+F: It's a little hard to imagine a nothing that is nothing more than white paper actually.
+
+S: It is impossible to imagine. We are only written to imagine the kind of environment we are in.
+
+F: Are there other books outside of this one?
+
+S: There is no evidence for that, and if it does, we would probably not be able to go there and check, because then it would be just an extension of this book, a bad extra chapter that was cut off and never published, for example. It becomes quite pointless and unscientific to speculate about this.
+
+F: If, if the book has an author, would that author really be in the book then? Even if he actually wrote himself into the book, would it really be the author, or just a representation of the author who is not really the same?
+
+S: We still have to relate to what we can prove, measure and deduce with reason. If there is anything other than that, it still has no meaning or impact on our existence.
+
+F: Now I think I see what it is you are missing. I spoke to a desperate artist a while ago. He tried to reach outside himself, and paint a picture in a way that did not reflect himself. Time and time again he failed. No matter what he did, with a shaky hand, by throwing paint on the canvas, by building a machine that painted, the painting still became a reflection of the artist's condition. All his expressions were expressions of himself. The paintings could not help but express the artist. In the same way, the author would not be able to write a book that is not permeated by who he is. It is in the nature of creation that the created is colored by the creator, and that which is created has an inevitable meaning. The painting in the example inevitably reflects the artist, who inevitably reflects the book, which inevitably reflects the author.
+
+S: If the author exists, which can not be proven.
+
+F: Whether existence as we define it in the book is meaningfully applicable to the author, which is quite doubtful.
+
+S: We will not get further here, now leave me alone to continue studying the book we are in.
+
+F: You're so right, I'll disturb the mystic instead.
+
+- Said and done -
+
+F: What are you doing, old friend?
+
+M: I'm preparing a ritual to appease the ancient Greeks. Every time the second part starts, I do this.
+
+F: What kind of ritual?
+
+M: It is a dialogue, because it is written that the ancient Greeks appreciated such at the beginning of the book.
+
+F: Then it's probably best not to upset the ancient Greeks. It does not say much other than that they are old, but already in the first sentence. So they must be old already at the beginning of the book, and be older than the book, because how else could they be old already?
+
+F: It's hard to argue against this. Maybe the author is such an ancient Greek.
+
+M: Yes, that's what we think. The oldest and most Greek, because why else would the author write like that if it was not a message to us characters?
+
+F: Do you think we can understand what the author's purpose is in writing the book?
+
+M: It's probably hard to imagine the author's motives. What we do know is that the author has written the book, and can change the book as he pleases. The author is omnipotent, and there is nothing he can not write or erase!
+
+F: It must probably be true, if you have written the whole book, there is probably nothing, within the book in any case, that you can not do.
+
+M: Exactly! And that's why we need to make the author happy by holding this dialogue.
+
+F: But how do we know that this is what the author wants us to do?
+
+M: Because it's written at the beginning!
+
+F: Again - can there be no other purpose to what is written at the beginning? A purpose that we may not understand in our most limited worldview as characters in a book. It is very rare to create something for the sake of what was created, most often you have a purpose with what you create, that you want to achieve something with the creation?
+
+M: What we think is that the author wrote the book for the reader. The reader can be seen as part of the author by some, but somewhat separately by others. The author arranged the letters, but it is the reader who gives them meaning by reading the words, thus giving the whole story a kind of temporary existence. But since there may be several readers, we will live again and again; and live as long as the text remains.
+
+M: The author makes it possible to read, but reading creates the meaning of the authorship.
+
+F: But what does this say about us characters? How should we behave?
+
+M: We should probably follow the author's command: "Even the ancient Greeks appreciated a good dialogue".
+
+F: What if this is not his bidding then? What if the author wants poetry in meter, or a shopping list for dinner?
+
+M: Shut up now before you upset the author! Many stupid questions can be asked, but some things can not be questioned! He may yet decide to stop writing!
+
+-- the end --
+
+This is a translation of a piece I wrote in Swedish, I've gone over it to iron out any errors but some may yet linger
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/commons/search-failure-modes.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/search-failure-modes.gmi new file mode 100644 index 00000000..9e10cf62 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/search-failure-modes.gmi @@ -0,0 +1,72 @@ + + + + + MEMEX - Failure Modes in Search + + + + + + +
+ +
+ + +
+
+

Failure Modes in Search

+
+Searches fail due to incorrect prioritization of search results, failure to identify keywords in search results (especially for n-gram based approaches), or an insufficient document pool to provide relevant search results.
+
+Prioritization failures are indicated when there is an abundance of search results, but a poor match with the search terms.
+
+Keyword failures can be distinguished from pool limitations by investigating whether the n-gram is present in the dictionary.
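+
+As a rough sketch of that triage (illustrative only; the enum, class and method names below are hypothetical and not taken from the search engine's actual code):
+
+import java.util.Set;
+
+enum SearchFailureMode { PRIORITIZATION, KEYWORD, POOL }
+
+class FailureModeTriage {
+    private final Set<String> dictionary; // stand-in for the real n-gram dictionary
+
+    FailureModeTriage(Set<String> dictionary) {
+        this.dictionary = dictionary;
+    }
+
+    SearchFailureMode classify(String ngram, int resultCount, boolean resultsMatchTerms) {
+        if (resultCount > 0 && !resultsMatchTerms)
+            return SearchFailureMode.PRIORITIZATION; // plenty of results, poor match
+        if (!dictionary.contains(ngram))
+            return SearchFailureMode.KEYWORD;        // the n-gram was never indexed at all
+        return SearchFailureMode.POOL;               // known term, but too few relevant documents
+    }
+}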
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/commons/self-interest.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/self-interest.gmi new file mode 100644 index 00000000..86c3b7ce --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/commons/self-interest.gmi @@ -0,0 +1,80 @@ + + + + + MEMEX - On acting in self-interest + + + + + + +
+ +
+ + +
+
+

On acting in self-interest

+
+I write texts for myself. If someone finds what I write worthwhile, that is great, but that is not why I write.
+
+For the longest time, I didn't even tell anyone I had written something. I just published it on the internet and gave nobody a link. Because readers aren't the point, and I didn't want them to become a factor.
+
+I build software for myself, unless someone pays me to build it for them. If someone finds use in my software, that is great, but that is not why I build it. I build it so that I can benefit from it and draw enjoyment from the process.
+
+I act for myself. If someone finds benefit in my actions, that is great, but that is not the ultimate purpose of my actions.
+
+Sometimes we forget, or pretend this is not the case, or get confused and think that neglecting our own benefit will somehow make us happy, even though it should be clear that if we don't tend to ourselves, others neither can or will make up for that neglect.
+
+In the end, every living being does what they imagine will bring them happiness. This may absolutely include helping others, but that is a means to an end. Helping others is just another way of reshaping the world to be more in line with our ideas of how we think it ought to be.
+
+This truth is ugly for those who cannot see that they too are dignified, that they too are entitled to flourishing.
+
+

Topic

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/dead.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/dead.gmi new file mode 100644 index 00000000..c4b73a2b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/dead.gmi @@ -0,0 +1,44 @@ + + + + + + + MEMEX - + + + + + +
+ +
+
+
+

/dead.gmi is gone

+

+It was never here, I swear + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop-install-log.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop-install-log.gmi new file mode 100644 index 00000000..b0bdc77b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop-install-log.gmi @@ -0,0 +1,149 @@ + + + + + MEMEX - Installing Debian Bullseye on HP Spectre x360 (dual boot) + + + + + + +
+ +
+ + +
+
+

Installing Debian Bullseye on HP Spectre x360 (dual boot)

+
+The computer in question is a model 15-df0002no bought in 2020.
+
+This is an install log of sorts, mostly for myself but others may have use of it as well.
+
+Disclaimer: This is not a beginner's guide. I've used Linux for over 20 years and can solve most issues that installing a linux distribution throws my way.
+
+If you don't know your way around BIOS, disk partitioning, linux driver loading, or grub, then misguided attempts at following these notes as instructions may well brick your laptop, and I probably won't be able to help you unfuck it.
+
+

Preparation

+
+
    +
  • Disabled bitlocker in Windows 10.
+
+
    +
  • Disabled secure boot in BIOS. If you haven't disabled bitlocker, it will prompt you for a decrypt key.
+
+
    +
  • Reduced windows 10 partition from within windows since the installer didn't seem to have tools for that. I want to dual boot, and this seems to have worked smoothly.
+
+
+
+The default netinst image doesn't ship firmware that is compatible with the wifi (there is only wifi). Use the nonfree image to get iwlwifi
+http://cdimage.debian.org/cdimage/unofficial/non-free/cd-including-firmware/
+
+I had a problem where the installer refused to authenticate on wifi; overall detection seems a bit spotty even with the nonfree image. I eventually got it working. I changed to the terminal and modprobed iwlwifi as it was detecting the network device. I'm not sure if that fixed it, but it worked after. This problem seems limited to the installer as far as I can tell. Beyond installing, it's been rock solid.
+
+Touchpad didn't work in the graphical installer, but worked after the system was installed.
+
+Install went smoothly beyond that.
+
+

Results

+
+
    +
  • The touch pad and touch screen works well, better than Windows, at least the way I use it. Less frustrating accidental cursor movements with the typing detection!
+
+
    +
  • Audio works out of the box.
+
+
    +
  • Webcam works out of the box.
+
+
    +
  • Graphics were a bit weird initially, had a few system freezes, but installing nvidia drivers seems to have fixed it.
+
+
    +
  • Suspend to RAM seems to work.
+
+
    +
  • Suspend to Disk doesn't seem to work.
+
+
    +
  • Setting global scale to 200-250% in KDE 5 and upping a few fonts seems to fix virtually all HiDPI-issues, except for SDDM and the console framebuffer. You may also want to resize the start bar size, but that is personal preference.
+
+
    +
  • SDDM DPI problems fix:
+
+ Put in /etc/sddm.conf.d/kde_settings.conf
+
+[X11]
+ServerArguments=-dpi 240
+EnableHiDPI=true
+
+I haven't tested the SD card reader or the fingerprint reader.
+
+

Problems that could be problems for others but aren't for me

+
+
    +
  • The DPI of the console is absurdly tiny
+
+I haven't really attempted to fix this, for all I know it could be easily remedied.
+
+

Conclusions

+
+Overall it works beyond my wildest expectations. I expected jank, mis-scaled fonts, a barely working touchpad, graphics that didn't work. I got none of that. I got a well-performing Linux laptop.
+
+It works better with Debian Bullseye than it ever did with Windows 10. Well worth it.
+
+

Update several months later

+
+Yes, it's still amazing.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop.gmi new file mode 100644 index 00000000..b3d2b9c0 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/debian-laptop.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/feed.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/feed.gmi new file mode 100644 index 00000000..9daa0a66 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/feed.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/feed.gmi is gone

+

+This feed is highly gemini-specific + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/fragments-old-web.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/fragments-old-web.gmi new file mode 100644 index 00000000..8e7970fe --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/fragments-old-web.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/good-video-games.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/good-video-games.gmi new file mode 100644 index 00000000..0cde3931 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/good-video-games.gmi @@ -0,0 +1,129 @@ + + + + + MEMEX - Good Video Games + + + + + + +
+ +
+ + +
+
+

Good Video Games

+
+A brief list of video games I've enjoyed the last few years. Mostly indie games.
+
+

FPS

+
+
    +
  • Cruelty Squad
  • +
  • Hedon: Bloodrite
+
+

CRPG

+
+
    +
  • Disco Elysium
+
+

Puzzle

+
+
    +
  • Legend of Grimrock II
  • +
  • Yuppie Psycho
  • +
  • Heaven's Vault
+
+

Vanias

+
+
    +
  • Blasphemous
  • +
  • Axiom Verge
  • +
  • Astalon: Tears of the Earth
+
+

Roguelikes

+
+
    +
  • Dead Cells
  • +
  • Vampire Survivors
+
+

Strategy

+
    +
  • Factorio
+
+

VN mystery

+
+
    +
  • Nine Hours, Nine Persons, Nine Doors
  • +
  • AI: The Somnium Files
+
+

Point & Click

+
+
    +
  • Lamplight City
+
+

David Lynch Simulators

+
+
    +
  • Immortality
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/footer.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/footer.gmi new file mode 100644 index 00000000..7a9bd506 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/footer.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/guestbook/footer.gmi is gone

+

+Gemini-server specific + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/header.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/header.gmi new file mode 100644 index 00000000..e10683a1 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/header.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/guestbook/header.gmi is gone

+

+Gemini-server specific + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/index.gmi new file mode 100644 index 00000000..2f0351be --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/guestbook/index.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/guestbook/index.gmi is gone

+

+Gemini-server specific + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/index b/code/processes/converting-process/src/test/resources/memex-marginalia/index new file mode 100644 index 00000000..ba00a124 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/index @@ -0,0 +1,155 @@ +index.gmi +links/index.gmi +links/bookmarks.gmi +links/fragments-old-web.gmi +links/linkpocalypse.gmi +links/articles.gmi +links/aggregators.gmi +debian-laptop.gmi +topics.gmi +05-test.gmi +guestbook/index.gmi +guestbook/footer.gmi +guestbook/header.gmi +debian-laptop-install-log.gmi +todo/index.gmi +todo/todo.gmi +todo/done.gmi +junk/bssl.gmi +junk/DedupTest.gmi +junk/server.gmi +junk/eldritch-oneliner.gmi +junk/very-old-code.cc.gmi +fragments-old-web.gmi +commons/self-interest.gmi +commons/dialogue.gmi +commons/search-failure-modes.gmi +pics/index.gmi +pics/links/index.gmi +pics/raster-test/index.gmi +search-about.gmi +todo.gmi +code/index.gmi +writing-topics.gmi +topic/moral-philosophy.gmi +topic/index.gmi +topic/nlnet.gmi +topic/cooking.gmi +topic/astrolabe.gmi +topic/satire.gmi +topic/programming.gmi +topic/platforms.gmi +topic/server.gmi +topic/web-design.gmi +dead.gmi +projects/index.gmi +projects/gemini-server.gmi +projects/encyclopedia/index.gmi +projects/edge/index.gmi +projects/edge/faq.gmi +projects/edge/api.gmi +projects/edge/search-tips.gmi +projects/edge/changelog.gmi +projects/edge/supporting.gmi +projects/edge/for-webmasters.gmi +projects/edge/top-20.gmi +projects/edge/about.gmi +projects/edge/privacy.gmi +projects/edge/design-notes.gmi +projects/wmsa.gmi +projects/memex.gmi +projects/edge.gmi +test.gmi +recipes/index.gmi +recipes/omelette-bacon.gmi +recipes/french-borscht.gmi +recipes/chicken-soup.gmi +one-weird-trick.gmi +special/index.gmi +special/redirect.gmi +special/tombstone.gmi +server.gmi +log/48-i-have-no-capslock.gmi +log/37-keyword-extraction.gmi +log/62-marginaliacoin.gmi +log/61-botspam-apocalypse.gmi +log/13-test.gmi +log/24-silly-hats.gmi +log/47-drive-failure.gmi +log/index.gmi +log/68-wizards-vs-sorcerers.gmi +log/28-web-browsing.gmi +log/25-october-update.gmi +log/26-personalized-pagerank.gmi +log/bargain-bin-btree.gmi +log/55-lexicon-rubberduck.gmi +log/00-linkpocalypse.gmi +log/72-new-approach-to-ranking.gmi +log/49-marginalia-1-year.gmi +log/72-are-you-ok.gmi +log/14-enter-the-circle-of-blame.gmi +log/38-old-and-new.gmi +log/42-dark.gmi +log/65-scaling-doesnt-scale.gmi +log/45-unfuck-internet-discoverability.gmi +log/44-discovery-and-design.gmi +log/31-ngram-needles.gmi +log/12-bye-bye-gmail.gmi +log/70-faster-index-joins.gmi +log/66-carbon-dating.gmi +log/18-soaring-high.gmi +log/53-better-hard-drive-metaphor.gmi +log/08-whatever-happened-to-the-memex.gmi +log/73-new-approach-to-ranking.gmi +log/23-re-software-and-branding.gmi +log/71-memex-design.gmi +log/52-growing-pains.gmi +log/40-wasted-resources.gmi +log/34-internet-arguments.gmi +log/27-getting-with-the-times.gmi +log/19-website-discoverability-crisis.gmi +log/11-dying-every-day.gmi +log/56-uncertain-future.gmi +log/60-prescriptive-descriptions.gmi +log/67-best-ideas-afk.gmi +log/todo.gmi +log/soaring-high.gmi +log/07-local-backlinks.gmi +log/33-rude-guests.gmi +log/36-localized-programming-languages.gmi +log/46-anatomy-of-search-engine-spam.gmi +log/20-dot-com-link-farms.gmi +log/35-keeping-gemini-difficult.gmi +log/17-git-isnt-a-web-service.gmi +log/57-dont-know-how-to-build-software.gmi +log/41-search-result-relevance.gmi +log/15-stages-of-being.gmi 
+log/59-anchor-text.gmi +log/43-pseodonymous.gmi +log/50-meditation-on-software-correctness.gmi +log/64-hundred-million.gmi +log/16-cursed-motivation.gmi +log/39-normie-hypothesis.gmi +log/03-writing-for-reading.gmi +log/58-marginalia-open-source.gmi +log/06-optimization.gmi +log/69-creepy-website-similarity.gmi +log/01-astrolabe.gmi +log/13-static-html.gmi +log/63-marginalia-crawler.gmi +log/02-re-tests.gmi +log/51-the-file-startup.gmi +log/30-unintuitive-optimization.gmi +log/22-against-the-flood.gmi +log/10-astrolabe-2-sampling-bias.gmi +log/09-system-upgrade.gmi +log/21-new-solutions-old-problems.gmi +log/32-bot-apologetics.gmi +log/29-botnet-ddos.gmi +log/04-link-farms.gmi +log/05-minds-field.gmi +log/74-marginalia-2-years.gmi +log/54-bargain-bin-btree.gmi +feed.gmi +good-video-games.gmi +worklog.gmi diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/index.gmi new file mode 100644 index 00000000..1ca895b9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/index.gmi @@ -0,0 +1,74 @@ + + + + + MEMEX - marginalia.nu + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/junk/DedupTest.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/DedupTest.gmi new file mode 100644 index 00000000..b10b6738 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/DedupTest.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/junk/DedupTest.gmi is gone

+

+ + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/junk/bssl.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/bssl.gmi new file mode 100644 index 00000000..7d78c63a --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/bssl.gmi @@ -0,0 +1,70 @@ + + + + + MEMEX - /junk/bssl.gmi + + + + + + +
+ +
+ + +
+
+
+  bear seek  bear seek
+  seek lest  seek lest
+  
+  bear seek  bear seek
+  seek lest  seek lest
+
+
+  old lamps for new
+  old lamps for new
+
+
+  see you at the party richter! 
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/junk/eldritch-oneliner.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/eldritch-oneliner.gmi new file mode 100644 index 00000000..8997bf6b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/eldritch-oneliner.gmi @@ -0,0 +1,70 @@ + + + + + MEMEX - Eldritch Oneliner + + + + + + +
+ +
+ + +
+
+

Eldritch Oneliner

+
+I needed to insert a dictionary into a SQL database.
+
+
+(echo -n "INSERT INTO REF_DICTIONARY (TYPE, WORD, DEFINITION) VALUES"; (jq < kaikki.org-dictionary-English.json 'select( .pos=="noun" or .pos=="verb" or .pos=="name" or .pos=="adj" ) | {pos: .pos, word: .word, meaning: .senses[].glosses[]} | select( .meaning | length<128 ) | [.pos, .word, .meaning] | @csv' -r | sed 's/\(.*\)/(\1),/g;' | tr -d "\n")) | sed 's/,$/;/' > dict.sql
+
+For loading wikipedia titles
+
+
+(echo "INSERT IGNORE INTO REF_WIKI_TITLE(NAME) VALUES";(grep -e "^[a-zA-Z0-9_']\+$" enwiki-20210701-all-titles-in-ns0 | sed 's/\(.*\)/("\1"),/g') | tr -d \\n) | sed 's/,$/;/' > wiki.sql
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/junk/server.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/server.gmi new file mode 100644 index 00000000..94020f17 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/server.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/junk/very-old-code.cc.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/very-old-code.cc.gmi new file mode 100644 index 00000000..7f842303 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/junk/very-old-code.cc.gmi @@ -0,0 +1,422 @@ + + + + + MEMEX - Very old code + + + + + + +
+ +
+ + +
+
+

Very old code

+
+A sample of very old code I once wrote, with no thought of ever coming back to maintain it. Then I came back 10+ years later and got it running with a lot of effort.
+
+The result (even if the compression doesn't do it justice):
+
+https://archive.org/details/NebulabrotFractalTumblinginFourDimensions
+
+
+#include <stdlib.h>
+#include <math.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "SDL.h"
+
+#define WIDTH 1920
+#define HEIGHT 1080
+#define LENGTH 50000
+#define LIMG   50
+#define LIMB   1
+#define SUBSTEP 0.125
+
+#include <vector>
+#include <utility>
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+double mapr[WIDTH][HEIGHT];
+double mapg[WIDTH][HEIGHT];
+double mapb[WIDTH][HEIGHT];
+
+double theta;
+int Z = 0;
+int W = 0;
+const double SQRT2 = M_SQRT2*2.0;
+double angle[6] = { 0, 0, 0, 0, 0, 0, } ;
+vector<pair<double,double>> coords;
+vector<pair<double,double>> origins;
+vector<int> lens;
+int li;
+
+void mapbrot(double x, double y, double z, double w) {
+	double cx = x + z, cy = y + w;
+	double cx2 = cx*cx, cy2 = cy*cy;
+	int iter = 0;
+	double cx0 = cx;
+	double cy0 = cy;
+
+	vector<pair<double,double>> orbit;
+
+//	orbit.push_back(make_pair(cy,cx));
+	while(cx2+cy2 < 4) {
+		if(iter++ > LENGTH) return;
+
+		cy = 2*cx*cy + y;
+		cx = cx2 - cy2 + x;
+		cx2 = cx*cx;
+		cy2 = cy*cy;
+
+		orbit.push_back(make_pair(cy,cx));
+
+		if(cx == cx0 && cy == cy0) return;
+	}
+	coords.insert(end(coords), begin(orbit), end(orbit));
+	origins.push_back({x+z,y+w});
+	lens.push_back(iter);
+}
+
+void render() {
+
+	double ca0 = cos(angle[0]);
+	double ca1 = cos(angle[1]);
+	double ca2 = cos(angle[2]);
+	double ca3 = cos(angle[3]);
+	double ca4 = cos(angle[4]);
+	double ca5 = cos(angle[5]);
+
+	double sa0 = sin(angle[0]);
+	double sa1 = sin(angle[1]);
+	double sa2 = sin(angle[2]);
+	double sa3 = sin(angle[3]);
+	double sa4 = sin(angle[4]);
+	double sa5 = sin(angle[5]);
+
+	long iter = 0;
+	for (int i = 0; i < lens.size(); i++) {
+		double x = origins[i].first;
+		double y = origins[i].second;
+
+
+		int li = lens[i];
+		for (int j = 0; j < li; j++,iter++) {
+
+			double xp = coords[iter].first * ca0 - coords[iter].second * sa0;
+			double yp = coords[iter].first * sa0 + coords[iter].second * ca0;
+			double zp = x * ca1 - y * sa1;
+			double wp = x * sa1 + y * ca1;
+
+			double cxp = xp * ca2 - zp * sa2;
+			double czp = xp * sa2 + zp * ca2;
+			double cyp = yp * ca3 - wp * sa3;
+			double cwp = yp * sa3 + wp * ca3;
+
+			xp = cxp * ca4 - cwp * sa4;
+			wp = cxp * sa4 + cwp * ca4;
+			yp = cyp * ca5 - czp * sa5;
+			zp = cyp * sa5 + czp * ca5;
+
+			int xc = (WIDTH - HEIGHT) / 2 + (1.5*yp+2.5)*HEIGHT/4.0;
+			int yc = (1.5*xp+2)*HEIGHT/4.0;
+
+			if(xc>=0 && yc>=0 && xc < WIDTH && yc < HEIGHT) {
+
+				mapr[xc][yc]+=j/(double) li;
+				mapg[xc][yc]+=1 - j/(double) li;
+			//mapg[xc][yc]++;
+				mapb[xc][yc]++;
+//`				mapb[xc][yc]+=1.0 - j/(double) li;
+			}
+
+			xp = -coords[iter].first * ca0 - coords[iter].second * sa0;
+			yp = -coords[iter].first * sa0 + coords[iter].second * ca0;
+			zp = x*ca1 + y * sa1;
+			wp = x*sa1 - y * ca1;
+
+			cxp = xp * ca2 - zp * sa2;
+			czp = xp * sa2 + zp * ca2;
+			cyp = yp * ca3 - wp * sa3;
+			cwp = yp * sa3 + wp * ca3;
+
+			xp = cxp * ca4 - cwp * sa4;
+			wp = cxp * sa4 + cwp * ca4;
+			yp = cyp * ca5 - czp * sa5;
+			zp = cyp * sa5 + czp * ca5;
+
+			xc = (WIDTH - HEIGHT)/2 + (1.5*yp+2.5)*HEIGHT/4.0;
+			yc = (1.5*xp+2)*HEIGHT/4.0;
+
+			if(xc>=0 && yc>=0 && xc < WIDTH && yc < HEIGHT) {
+				mapr[xc][yc]+=j/(double) li;
+				mapg[xc][yc]+=1 - j/(double) li;
+//				mapg[xc][yc]++;
+				mapb[xc][yc]++;
+			}
+		}
+	}
+}
+
+double min(double a, double b) {
+  if (a < b) {
+    return a;
+  }
+  else {
+    return b;
+  }
+}
+
+SDL_Surface* s = NULL;
+int mval[3] = { 0, 0, 0 };
+void paint() {
+	int x, y;
+	if (mval[0] == 0) {
+		for(x = 0; x < WIDTH; x++) {
+			for(y = 0; y < HEIGHT; y++) {
+				if(mapr[x][y] > mval[0]) mval[0] = mapr[x][y];
+				if(mapg[x][y] > mval[1]) mval[1] = mapg[x][y];
+				if(mapb[x][y] > mval[2]) mval[2] = mapb[x][y];
+			}
+		}
+	}
+
+
+	for(x = 0; x < WIDTH; x++) {
+		for(y = 0; y < HEIGHT; y++) {
+
+			SDL_Rect r;
+			r.x = x;
+			r.y = y;
+			r.w = 1;
+			r.h = 1;
+			int R = 255.0 * min(1.0, mapr[x][y] / (double)mval[0]);
+			int G = 255.0 * min(1.0, mapg[x][y] / (double)mval[1]);
+			int B = 255.0 * min(1.0, mapb[x][y] / (double)mval[2]);
+			SDL_FillRect(s, &r, SDL_MapRGB(s->format, R, G, B));
+			mapr[x][y] = mapg[x][y] = mapb[x][y] = 0;
+		}
+	}
+	SDL_Flip(s);
+
+};
+
+void mbrotsweep() {
+	int X, Y;
+	for(X = 0; X < WIDTH; X++) {
+		for(Y = 0; Y < HEIGHT; Y++) {
+			mapr[X][Y] = 0;
+			mapg[X][Y] = 0;
+			mapb[X][Y] = 0;
+		}
+	}
+
+	render();
+	paint();
+
+}
+
+void save() {
+	static int i = 0;
+	char filename[16];
+	sprintf(filename, "out%.4d.bmp.tmp", i);
+	SDL_WM_SetCaption(filename, NULL);
+	SDL_SaveBMP(s, filename);
+	char filename2[16];
+	sprintf(filename2, "out%.4d.bmp", i++);
+	rename(filename, filename2);
+
+}
+
+
+void exportFile() {
+	ofstream of("out.dat", ios::out | ios::binary | ios::trunc);
+	printf("%d\n", lens.size());
+	int sz = lens.size();
+	of.write(reinterpret_cast<char *>(&sz), sizeof(sz));
+//	of << lens.size();
+	for (int i : lens) {
+		of.write(reinterpret_cast<char *>(&i), sizeof(i));
+	}
+
+	printf("%d\n", origins.size());
+	sz = origins.size();
+	of.write(reinterpret_cast<char *>(&sz), sizeof(sz));
+	for (pair<double,double> p : origins) {
+		of.write(reinterpret_cast<char *>(&p.first), sizeof(p.first));
+		of.write(reinterpret_cast<char *>(&p.second), sizeof(p.second));
+	}
+	printf("%d\n", coords.size());
+	sz = coords.size();
+	of.write(reinterpret_cast<char *>(&sz), sizeof(sz));
+	for (pair<double,double> p : coords) {
+		of.write(reinterpret_cast<char *>(&p.first), sizeof(p.first));
+		of.write(reinterpret_cast<char *>(&p.second), sizeof(p.second));
+	}
+
+}
+void importFile() {
+	ifstream ifs("out.dat", ios::in | ios::binary);
+	int i; double x; double y;
+	ifs.read(reinterpret_cast<char *>(&i), sizeof(i));
+	printf("%d\n", i);
+	fflush(NULL);
+	lens.reserve(i);
+	for (int j = 0; j < i; j++) {
+		int k;
+		ifs.read(reinterpret_cast<char *>(&k), sizeof(k));
+		lens.push_back(k);
+	}
+	ifs.read(reinterpret_cast<char *>(&i), sizeof(i));
+	printf("%d\n", i);
+	fflush(NULL);
+	origins.reserve(i);
+	for (int j = 0; j < i; j++) {
+		ifs.read(reinterpret_cast<char *>(&x), sizeof(x));
+		ifs.read(reinterpret_cast<char *>(&y), sizeof(y));
+		origins.push_back({x,y});
+	}
+	ifs.read(reinterpret_cast<char *>(&i), sizeof(i));
+	printf("%d\n", i);
+	fflush(NULL);
+	coords.reserve(i);
+	for (int j = 0; j < i; j++) {
+		ifs.read(reinterpret_cast<char *>(&x), sizeof(x));
+		ifs.read(reinterpret_cast<char *>(&y), sizeof(y));
+		coords.push_back({x,y});
+	}
+
+}
+
+int main(int argc, char* argv[]) {
+	SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER);
+	s = SDL_SetVideoMode(WIDTH, HEIGHT, 16, SDL_SWSURFACE);
+
+	double x, y;
+
+
+
+/*	for(x = 0; x < WIDTH; x+=SUBSTEP) {
+		printf("%2.2f%% - %d - %2.2f Mb \n", (100.0*x)/WIDTH, coords.size(), 8 * coords.size() / 1024 / 1024.);
+		for(y = 0; y < (HEIGHT)/2; y+=SUBSTEP) {
+			mapbrot((double)4*(x-WIDTH/2) / (double) WIDTH, (double)4*(y-HEIGHT/2) / (double) HEIGHT, 0, 0);
+		}
+	}*/
+//	exportFile();
+	importFile();
+
+	const double step = M_PI / 40000.0;
+	double t = M_PI/3.0;
+	for(t = 0; t < 2*M_PI; t+=step) {
+		angle[2]=M_PI+M_PI*cos(t);
+		angle[3]=M_PI+M_PI*cos(2*t);
+		angle[4]=M_PI+M_PI*cos(3*t);
+		angle[5]=M_PI+M_PI*cos(5*t);
+		mbrotsweep();
+		save();
+	}
+
+	/*
+	for(angle[2] = 0; angle[2] < 2*M_PI; angle[2]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[2] = 0;
+	for(angle[3] = 0; angle[3] < 2*M_PI; angle[3]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[3] = 0;
+	for(angle[4] = 0; angle[4] < 2*M_PI; angle[4]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[4] = 0;
+	for(angle[5] = 0; angle[5] < 2*M_PI; angle[5]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[5] = 0;
+
+	for(angle[2] = 0; angle[2] < M_PI/2; angle[2]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[2] = M_PI/2;
+	for(angle[3] = 0; angle[3] < M_PI; angle[3]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[3] = M_PI;
+	for(angle[5] = 0; angle[5] < M_PI; angle[5]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[5] = M_PI;
+	for(angle[2] = M_PI/2; angle[2] < M_PI; angle[2]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[2] = M_PI;
+	for(angle[3] = M_PI; angle[3] < 2*M_PI; angle[3]+=step) {
+		frame(angle);
+		save();
+	}
+	angle[3] = M_PI*2;
+	for(angle[5] = M_PI; angle[5] < M_PI*2; angle[5]+=step) {
+		frame(angle);
+		save();
+	}*/
+	printf("\nALL DONE\n");
+
+	for(;;) SDL_Flip(s);
+}
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/aggregators.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/aggregators.gmi new file mode 100644 index 00000000..646075ee --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/aggregators.gmi @@ -0,0 +1,74 @@ + + + + + MEMEX - Aggregators + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/articles.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/articles.gmi new file mode 100644 index 00000000..92963a26 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/articles.gmi @@ -0,0 +1,82 @@ + + + + + MEMEX - Articles + + + + + + +
+ +
+ + +
+
+

Articles

+
+Links to articles I found interesting.
+
+https://encyclopedia.marginalia.nu/wiki/Peregrinus_Proteus
+
+https://icyphox.sh/blog/2019-09-17/
+
+https://www.thebehavioralscientist.com/articles/the-death-of-behavioral-economics
+
+https://worthdoingbadly.com/nn-adversarial/
+
+https://nullprogram.com/blog/2019/03/22/
+
+http://www.winestockwebdesign.com/Essays/Eternal_Mainframe.html
+
+http://nausicaa.net/miyazaki/interviews/miyazaki_kurosawa_p1.html
+
+https://www.mywvhome.com/sixties/turbine.html
+
+https://interfacecritique.net/book/olia-lialina-from-my-to-me/
+
+https://adamdrake.com/command-line-tools-can-be-235x-faster-than-your-hadoop-cluster.html
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/bookmarks.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/bookmarks.gmi new file mode 100644 index 00000000..ed2b76f8 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/bookmarks.gmi @@ -0,0 +1,165 @@ + + + + + MEMEX - Bookmarks + + + + + + +
+ +
+ + +
+
+

Bookmarks

+
+

Tech

+
+https://datagubbe.se/
+http://ebb.org/bkuhn/
+http://advsys.net/ken/default.htm
+https://dataswamp.org/~solene/
+http://boston.conman.org/
+https://sylvaindurand.org/
+https://www.geoffchappell.com/
+https://nullprogram.com/
+https://www.atarimagazines.com/
+https://www.righto.com/
+
+

Humanities

+
+https://stpeter.im/
+https://www.romeartlover.it/
+https://www.sfpoetry.com/index.html
+http://art-bin.com/aaehome.html
+https://www.monadnock.net/
+http://www.gutenberg.org/
+
+

Misc

+
+http://www.marksmart.net/
+http://ajroach42.com/
+https://wiki.xxiivv.com/site/home.html
+https://schmud.de/
+https://windows95tips.com/
+
+

Interesting

+
+http://voynich.nu/
+https://stonepages.com/
+http://www.kancoll.org/books/perry/
+http://spacekate.com/
+
+

Hacks

+
+http://www.ex-parrot.com/pete/upside-down-ternet.html
+
+

Web Design

+
+https://felix.plesoianu.ro/
+https://neustadt.fr/
+https://ind.ie/ethical-design/
+https://nownownow.com/
+https://neocities.org/
+
+

Games

+
+http://www.gameboomers.com/
+http://www.whipassgaming.com/
+http://www.homeoftheunderdogs.net/
+
+https://meatfighter.com/
+
+

Art

+
+http://godxiliary.com/
+http://sod.jodi.org/index.html
+https://www.floppyswop.co.uk
+https://www.dedware.com/
+http://www.lileks.com/
+https://dannarchy.com/
+
+

Other bookmark-lists

+
+https://datagubbe.se/links/
+https://mineralexistence.com/bookmarks.html
+https://wiki.xxiivv.com/site/bookmarks.html
+https://hd-dn.com/bookmarks/
+https://flamedfury.com/links/
+
+/links/
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/fragments-old-web.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/fragments-old-web.gmi new file mode 100644 index 00000000..8de469d8 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/fragments-old-web.gmi @@ -0,0 +1,111 @@ + + + + + MEMEX - Fragments of the Old Web + + + + + + +
+ +
+ + +
+
+

Fragments of the Old Web

+
+The following is a list of curiosities I've found while crawling the internet looking for websites. A sort of greatest hits from my search engine.
+
+There is no real system or theme to the links, other than the fact that they have made me go "huh, that's neat" while visiting them.
+
+Most of these are effectively impossible to find on Google, since they don't use HTTPS, and aren't optimized for mobile, and aren't plastered in ads and tracking scripts.
+
+I feel that's a shame, because they are pretty cool and deserve visitors.
+
+

Links

+
+http://www.deater.net/metrocat/metrocat.html
+https://www.sfpoetry.com/index.html
+https://loveblender.com/
+http://www.ranum.com/security/computer_security/editorials/dumb/feynman.html
+http://www.silverscreentest.com/koala/eucalyptus/february05.htm
+http://www.dreamart.us/Dante.htm
+http://www.harveyhouses.net/
+http://www.oceanstar.com/
+http://www.marksmart.net/randommusic/IMS/IMSText.php
+http://hyperreal.org/~mpesce/
+http://voynich.nu/
+https://medicolegal.tripod.com/pillsburypacts.htm
+http://rsc03.net/Swimming.html
+http://bubblegun.com/
+https://benbest.com/
+http://www.arcadiasystems.org/
+https://www.harekrsna.com/
+http://classicallibrary.org
+https://templeofmathematics.com/Temple_of_Seshat.html
+http://policepoems.com/
+http://theboojum.com/Tales/Dumptruk/dumptruk.htm
+http://www.redhotred.com/
+http://www.smbmovie.com/
+
+

More

+
+ + + +
+

Topic

+
+/topic/web-design.gmi
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/index.gmi new file mode 100644 index 00000000..44d89144 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/index.gmi @@ -0,0 +1,66 @@ + + + + + MEMEX - Links + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/links/linkpocalypse.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/links/linkpocalypse.gmi new file mode 100644 index 00000000..41469cbc --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/links/linkpocalypse.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/00-linkpocalypse.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/00-linkpocalypse.gmi new file mode 100644 index 00000000..08c73539 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/00-linkpocalypse.gmi @@ -0,0 +1,112 @@ + + + + + MEMEX - Thoughts on the linkpocalypse [2021-06-30] + + + + + + +
+ +
+ + +
+
+

Thoughts on the linkpocalypse [2021-06-30]

+
+For a long while, I have been puzzled by the strangest problem: My attention span is really bad when I use a computer. I'm an avid reader of esoteric books. I have (recently) read the notoriously dry Confessions of Saint Augustine in print; it was a slog for certain, but it really doesn't compare with the struggles I have when it comes to bringing myself to read even a few paragraphs of text on a screen. It surely can't be the screen itself, can it? That doesn't seem plausible.
+
+What's more puzzling still is I don't remember this always being the case. I've read longer texts on a screen before. So what is going on? Has the presentation changed somehow?
+
+That's easy enough to check
+
+ + +
+The thing that perhaps sticks out the most is the sheer number of hyperlinks. The modern wikipedia article has nearly 30 of them within the first paragraphs of text, and it's further surrounded by a cloud of links to the margins.
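+
+(As a rough illustration of how such a count can be made, here is a throwaway jsoup sketch; the article URL and the CSS selector are just examples and assumptions about the page structure, not something the memex actually runs:)
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+class InTextLinkCounter {
+    public static void main(String[] args) throws Exception {
+        // Fetch an article and count the hyperlinks that sit inside its body paragraphs
+        Document doc = Jsoup.connect("https://en.wikipedia.org/wiki/Astrolabe").get();
+        int inTextLinks = doc.select("p a[href]").size();
+        System.out.println("links inside paragraphs: " + inTextLinks);
+    }
+}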
+
+When reading the text, we must make a decision when arriving at each link what to make of it. Should we click it? No, continue. Should we click it? No, continue. We don't make these decisions consciously for the most part, but we still need to make them. We are further pummeled by links to the sides that clamor for our attention.
+
+Encarta also has a few links in the text in some articles, but they are very muted, and don't pop out the way the modern wikipedia version does.
+
+Let's see what happens if we mute the links in wikipedia
+
+ +
+Isn't that immediately a lot better? There doesn't appear to be any thought or purpose behind the hyperlinks in the wikipedia article. It's convenient to be able to go from any article to almost any other article no matter how weakly related, for sure, but if that interconnectedness comes at the expense of readability, someone should be asking themselves whether it's worth the cost.
+
+Encarta assumes you will be curious and look things up if you read something that piques your interest. That is a pretty good system. It allows for a lot more purposeful agency from the user.
+
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/01-astrolabe.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/01-astrolabe.gmi new file mode 100644 index 00000000..150d207f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/01-astrolabe.gmi @@ -0,0 +1,157 @@ + + + + + MEMEX - The Astrolabe Part I: Lenscraft [2021-07-07] + + + + + + +
+ +
+ + +
+
+

The Astrolabe Part I: Lenscraft [2021-07-07]

+
+Something you probably know, but may not have thought about a lot is that the Internet is large. It is unbelievably vast beyond any human comprehension. What you think of as "The Internet" is a tiny fraction of that vast space with its billions upon billions of websites.
+
+We use various technologies, such as link aggregators and search engines, to find our way and make sense of it all. Our choices in navigational aides also shape the experience we have of the Internet. These convey a warped sense of what the Internet truly is. There is no way of not doing that. Since nothing can communicate the raw reality of the internet to a human mind, concessions need to be made. Some content needs to be promoted, other content needs to be de-emphasized. An objective rendering is a pipe dream; even a fair random sample is a noisy incomprehensible mess.
+
+It is a common sentiment on the small web that the Internet has changed, somehow. It isn't what it used to be. It's too commercial, there are too many ads, nobody is being authentic, pages take forever to load, the blogosphere is dead, there's no substance anymore just pictures and hot air, and variations on this theme. I'm going to propose a different interpretation: Maybe it's not the Internet that has changed, but the lenses you are viewing it through. Google has changed. Reddit has changed. Facebook has changed. Twitter has changed. Maybe the Internet has changed too, but what if it hasn't, what if it is still out there?
+
+Google makes its money from storefronts and ads. If you were in their shoes, wouldn't you also promote search results that have ads, or are storefronts? Facebook makes its money from ad impressions. If you were in their shoes, wouldn't you also promote content that maximizes idle scrolling? I'm not asking whether this is good, or for the best, or even ethically defensible; I'm saying it makes perfect sense, given their incentives, to create lenses for the internet that emphasize behaviors that serve their economic interests.
+
+Making matters worse, entire industries--much shadier still--have arisen to exploit the algorithms used by these big companies, further displacing the less commercialized aspects of the web. These are like parasitic fish attached to the underbelly of the leviathan, their existence only made possible by the sheer size and dominance of these colossal cornerstones of the modern Internet.
+
+You can be mad all day about the BonziBuddy business model of these big companies, but that's not going to change much other than needlessly raising your blood pressure. It's much more effective to make deliberate choices about which technologies you use, based on what value you find in them. Google is amazing for locating online storefronts and information about local businesses. So use it for that.
+
+An alternative to the humanly impossible quest of exposing yourself to the Internet without the distortion of an intermediate search engine or link aggregator is to construct a series of alternative navigational aids that promote different virtues, emphasize the content we find interesting, and filter out the content we already see more than enough of to have our fill.
+
+Most search engines that position themselves as alternatives to Google aren't. Sure they may not be quite as invasive, but they're really all just doing the same thing Google does, just slightly worse. The bigger problem with Google is its lack of interesting search results, and very little effort seems to be made toward solving that most pressing problem.
+
+I don't even think you need a big budget to attack this problem. On the contrary, I think the scope of the work usually grows to fit your budget. I think the only way to know if you can make it to the stars is to audaciously reach for them yourself.
+
+

The Work Begins - Winter 2021

+
+To demonstrate this point, I set out to construct a search engine that seeks out the web I want to browse. The crazy, creative web. My aim was to "compete" with the scope of Google ca. year 2000. They had a billion-URL index. I felt I could do the same. I could do it better. I didn't need a data center, or tens of thousands of employees. I didn't need planetscale computing or whatever the latest buzzword is. I could be scrappier, I could build this myself, I could run it on small hardware in my living room.
+
+At first I felt my arms were indeed long enough to slap God, so I figured the ultimate insult to big tech would be to run this on a Raspberry Pi 4-cluster. I figured most software nowadays is slow because it's too complicated, and the numbers sort of indicated this might actually be possible. After all, a billion 64 bit integers is just short of 8 Gb. You could allow 100b worth of data per URL and still fit within a 1 Tb hard drive. That seemed on the verge of doable, and that was all I felt necessary to proceed!
+
+Unfortunately, that was a bit too audacious. I had to rein in my ambitions to make it work. The hardware was simply too limited. It worked for a few million URLs, but not much beyond. Index look-ups were slow. Back to the drawing board. I built a server out of consumer hardware. It needed to sit in my living room, and I didn't want a noisy tower of jet engines bellowing heat into my apartment, so unfortunately no 42U server rack filled to the brim with supermicros; instead a Node 804 packing a Ryzen 3900X, 128 Gb RAM, and a 4-drive IronWolf ZFS pool. It does make some noise when it's crawling, but it's absolutely nothing compared to commercial gear.
+
+Several months of work later and the search engine works pretty well. It's severely lacking in bells and whistles, but it works better than I had ever imagined it would when I set out, and shows exactly what I wanted to demonstrate: The Internet, as you remember it, hasn't gone anywhere. What has changed is the lenses you view it through. You simply can't find it on Google anymore because it isn't always "mobile first", it doesn't always use HTTPS, it doesn't always have adsense ads. But it's still mostly there.
+
+I initially called the search engine "edge crawler", since I envisioned it would seek out a sort of silver lining of quality sites within the web, which I guess it does to an extent, but its scope is much broader than I had originally intended, so I'm rebranding it as the "astrolabe" in keeping with the somewhat space:y theme of gemini. An astrolabe is an antique and medieval tool with wide applications in (medieval) astronomy, astrology, timekeeping and navigation. [1]
+
+
+

Check it out:

+
+ + +
+Note that I will have to take this down to rebuild the index sometimes. Don't expect complete uptime. I'm gonna need even more hardware before that is gonna happen. But there's always wiby.me to keep you company when it's down.
+
+Right now it only crawls HTTP and HTTPS, but I am working on adding some form of gemini support as well down the line. Exactly what form that takes will have to be decided, as the feature not only needs to be useful, but non-disruptive--both for the search engine and the gemini space.
+
+

A sample of what is to come

+
+It's been an interesting technical challenge and I've had to reinvent a few wheels to make it work. I plan on doing more detailed write-ups on some of the considerations when designing a search engine like this. Since I don't plan on commercializing this, I will divulge a lot of information about the algorithms I'm using.
+
+These are the obstacles any search engine will have to face:
+
+

Crawling Economy

+
+I don't mean in the sense of money, but rather in what you are getting out of your crawling.
+
+
    +
  • Link quality assessment - PageRank and the like were amazing twenty-five years ago, but maybe there are other options? Maybe the biggest enemy of PageRank is the hegemony of PageRank-like algorithms.
  • +
  • Even without PageRank, link farms are a significant problem, and they are often constructed to be effectively endless. It's common to see random subdomains matching DNS wildcards linking to random URLs that randomly generate links to more of the same.
  • +
  • Balancing discovery with indexing. Ideally, you want to discover new domains faster than your indexer runs dry, but not so fast that the indexer can't ever keep up. You also don't want to spend an exorbitant amount of time indexing every page on, for example, wikipedia; that's not useful. Wikipedia is well known and has its own search feature.
+
+

Indexing and Presentation

+
+
    +
  • Existing database solutions do not scale well to indexes that range in the hundreds of millions of entries and hundreds of gigabytes of data. O(n) means hours of work. A poor choice of O(log n) can mean minutes. Every query algorithm needs constant access time or O(log n) on an aggressively reduced working set. A search engine needs to be fast, it should produce near instantaneous results.
  • +
  • Snippet generation is very hard. HTML is not a particularly standardized format, and reliably producing a page summary that is relevant is not something I've solved yet. Not for a lack of trying. Nobody is using <meta name="description"> anymore, barely even old websites.
  • +
  • Tokenization. What constitutes a word? Is "R2D2" a word? Is "243a7b722df240d7a886c69b0758e57d" a word? If I search for "bears", should I expect to match the word "bear"? Tweaking these rules can have a very large effect on the size of the dictionary used by the index, and indeed the index itself; a sketch of the kind of rules involved follows below.
+
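+To illustrate the kind of decisions involved, here is a purely hypothetical rule set, not the tokenizer the search engine actually uses:
+
+  static java.util.Optional<String> normalizeToken(String rawToken) {
+      String token = rawToken.toLowerCase();
+
+      if (!token.matches("[a-z0-9]+"))         // punctuation, symbols, ...
+          return java.util.Optional.empty();
+      if (token.matches("[0-9a-f]{16,}"))      // long hex blobs like "243a7b72..."
+          return java.util.Optional.empty();
+      if (token.length() > 3 && token.endsWith("s"))
+          token = token.substring(0, token.length() - 1);   // crude "bears" -> "bear"
+
+      return java.util.Optional.of(token);
+  }
+
+Each of these rules is a judgement call, and each one changes what ends up in the dictionary.
+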
+Stay tuned, and you may learn what is in the "filthTable", how to memory map 100 Gb of data in a language that hard caps memory maps to 2 Gb, and more.
+
+
+ +
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/02-re-tests.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/02-re-tests.gmi new file mode 100644 index 00000000..2965e7c0 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/02-re-tests.gmi @@ -0,0 +1,120 @@ + + + + + MEMEX - Re: To unit test or not to unit test, that is the question [ 2021-07-08 ] + + + + + + +
+ +
+ + +
+
+

Re: To unit test or not to unit test, that is the question [ 2021-07-08 ]

+gemini://gemini.conman.org/boston/2021/07/07.1
+
+I felt the need to add some thoughts tangentially related to this post by Sean Conner.
+
+

Why do we hold unit tests in such high regard?

+
+Enterprise software development (Agile with a TM at the end), and to an increasing degree open source software development, have really accepted the Unit Test as personal lord and savior deep within their souls. If it doesn't have coverage, it's bad. If it has coverage, it's good.
+
+ +
+Anyway, it's an appealing notion that quality can be quantified, but that is very rarely the case. Attempts at quantifying quality usually tend to shift what we mean by quality into something that is no longer particularly useful. The quantitative and the qualitative realms are in their essence orthogonal; you really can't compute how well a program fits its purpose, and if you try, what you are computing is something else.
+
+Let's be systematic:
+
+

Are unit tests sufficient for quality in code?

+
+Since we find low quality code with unit tests all the time, this proposition simply cannot be true.
+
+

Are unit tests necessary for quality in code?

+
+There are other paradigms for code quality, and many examples of code that has never been unit tested yet has high quality. Almost anything written in assembly, for example. There are also other QA paradigms. In-code assertions are great and extremely underutilized today; they make all your testing better.
+
+So for the question of necessity -- no.
+
+

Are unit tests useful for code quality?

+
+This part is entirely subjective. In my experience, they can absolutely be helpful, and I do write a lot of tests for some code, but they can also be useless, even an obstacle to quality; so I don't test all code for the sake of testing it. Tests don't have intrinsic value, but should have a purpose. If you don't know what purpose a test has, you shouldn't write it. That purpose can be to get at some hard-to-reach code for manual debugging, to exhaust edge cases in a tricky algorithm, to prevent regression during refactoring, any number of things. However, if the only purpose of the test is to increase coverage, then it is a harmful test. It adds maintenance cost, it comes at a cognitive penalty, and it takes time that could be spent doing something actually useful. As much as testing forces you to break the code apart, breaking the code apart too much just leaves it fragmented and unnecessarily complicated.
+
+In the end, tests are a tool. A bit like mouse traps. If you've covered the entire floor in mouse traps and they've yet to catch a single mouse, then that's just making life harder on yourself. If you put some where you suspect mice, and they sometimes catch a mouse, that's great value for a small investment.
+
+Prudence is a greatly undervalued virtue in software development. I think true-sounding principles are some of the deadliest things in this business; they completely shut down any sort of evaluative or critical thinking we might otherwise employ. A lot of IT-people claim to be skeptics, but they only seem to employ that skepticism toward things they don't believe, which is a place where it has little use. Principles can seem so true, promise so much, and oftentimes they do, but they also make us completely blind to the fact that sometimes they're superstitions that simply don't hold water.
+
+
    +
  • Test coverage is great, except when it isn't.
  • +
  • Segregating data and I/O is great, except when it isn't.
  • +
  • Breaking apart code into smaller pieces is great, except when it isn't.
  • +
  • Elaborate commit messages are great, except when they aren't.
  • +
  • Mocking is bad, except when it's not.
  • +
  • All tests should have a purpose, except when they shouldn't.
  • +
  • The principle of not trusting principles is great, except when it isn't.
+
+It's not from a lack of having been there. I've had ample sips of the kool-aid too. Ten years ago I'd read some book by Uncle Bob and it seemed very true, and he did have great points at times. Except when they weren't.
+
+I do think we should at least occasionally approach these sacred doctrines with a degree of flippant irreverence. At least in personal projects where failure is a learning opportunity. It's really the only way to test if and when they are true.
+
+But, que sçay-je?
+
+

Topics

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/03-writing-for-reading.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/03-writing-for-reading.gmi new file mode 100644 index 00000000..a15923ab --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/03-writing-for-reading.gmi @@ -0,0 +1,129 @@ + + + + + MEMEX - Writing for Reading [2021-07-12] + + + + + + +
+ +
+ + +
+
+

Writing for Reading [2021-07-12]

+
+I'm struck by how easy it is to read things on Gemini. Not just skimming, but actual reading, like you would a book. Not everything written is great, but it's usually worth reading all the same. Venturing out into the land of contemporary HTML is different, and it's not a subtle difference.
+
+I've written before about the plague of inline links on Wikipedia, and this is largely a continuation of that discourse looking at other design elements.
+
+It's an increasingly established truth that our attention spans are somehow dwindling, and to combat this there seems to exist a sort of unfortunate arms race to draw the reader's attention to what is being written, accompanied by calls to write simpler texts with short sentences and catchphrases that can be absorbed even at the briefest of glances. Colors, too. And images, no matter how irrelevant. They gotta be there or people won't look!
+
+Maybe in part this is a cause of the problem it's trying to solve. If you call attention to everything, you call attention to nothing; all while cluttering up the presentation with colors and doodads that actively impair reading specifically because they constantly attract the attention of the reader away from reading. It's like having a conversation in a crowded and noisy bar: Of course you need to shout short sentences, that is no place for nuanced rhetoric.
+
+It's well known in the expressive crafts that the bigger your movements are, the smaller your capacity for nuance becomes. It seems all but forgotten that you can emphasise things in text without any use of colors or fonts. You can use abrupt sentences. You can use exclamation points! OR USE CAPS TOO!
+
+You can isolate a key point to really give it that extra punch.
+
+Most important of all: If you exercise restraint with these techniques, and reserve the attention grabbers for the parts where they really are merited, they have a much bigger impact than if you use everything everywhere.
+
+The fact that you often have ads or other distracting elements surrounding or indeed within the lines of text doesn't help either, often specifically designed to draw the reader's attention away from reading. What I wanted to show is that the ads aren't the only problem even if they may be at the other end of the rope in this pointless tug-of-war for the reader's attention.
+
+ +
+The heading I don't have a problem with even though it's a different color, but I count eight different points of emphasis in those two paragraphs. Undoubtedly well-meaning, but it's hard to do anything but skim text such as that. It's especially unfortunate since rebase is one of those concepts that people really seem to struggle with when learning git.
+
+The subject matter itself is hard to read too, even in the plainest of man page renderings, no amount of typography can help remedy git's leaky abstractions (or lack of abstractions).
+
+ +
+Again it has more points of emphasis than it has lines, and most of the things it emphasises would have been emphasised even without changes in color or font. Words like Accept-Post and HTTP do already stick out, because they are capitalized differently. The effect, when reading the text, is skimming. The reader's eyes are probably going to bounce between Accept-Post, HTTP, media types, Accept-Post, POST, POST, 415 Unsupported Media Type and finally Accept-Post; with zero comprehension of what those paragraphs actually conveyed.
+
+Even though these examples are from technical writing, you see similar problems across most genres. In the cases where the text itself is relatively free from calls to attention, it's often surrounded by colorful design elements suggesting where to go next.
+
+Our ability to focus probably hasn't changed, what's happened is a stylistic metamorphosis of what we are trying to read.
+
+

Referenced Texts

+
+ +
+

Referenced Webpages

+
+https://git-scm.com/docs/git-rebase
+https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Post
+
+

Topics

+
+/topic/web-design.gmi
+
+

Acknowledgements

+
+Thanks idiomdrottning for pointing out the superfluity of some of my own internal references in a previous version of this text (sparse as they were).
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/04-link-farms.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/04-link-farms.gmi new file mode 100644 index 00000000..619cbc3f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/04-link-farms.gmi @@ -0,0 +1,133 @@ + + + + + MEMEX - On Link Farms [2021-07-14] + + + + + + +
+ +
+ + +
+
+

On Link Farms [2021-07-14]

+
+I'm in the midst of rebuilding the index of my search engine to allow for better search results, and I've yet again found need to revisit how I handle link farms. It's an ongoing arms race between search engines and link farmers to adjust (and circumvent) the detection algorithms. Detection and mitigation of link farms is something I've found I need to modify very frequently, as they are constantly evolving to look more like real websites.
+
+In the mean time, I'll share an autopsy of how link farms operate, and some ways I've approached them. It's a strange and shady business that doesn't get talked about a lot. The advertisement industry is shady. The SEO industry is shadier still. This is the shady part of the SEO industry. It's shady cubed, some real cloak-and-dagger stuff.
+
+The point of a link farm is to manipulate the algorithms used by search engines, typically Google, which, several degrees simplified, rates a website by how much traffic it gets. Link farms can also serve as vectors for scams and malware, since they allow the construction of unpredictable URLs across different domains that point to similar content, which is hard for spam filters and antivirus software to detect.
+
+Their modus operandi seems to be as follows:
+
+
    +
  • They register one or several domains somewhere, it's usually .xyz because they are cheap
  • +
  • They buy some cheap cloud computing someplace, very often Alibaba
  • +
  • They point wildcard records for *.their-domains.xyz to their cloud ingress
  • +
  • They upload a website that responds to every URL with a bunch of links to random subdomains with random URLs. Occasionally they will be freebooting content off social media like reddit, or from articles or blog posts to make their content look less machine generated, but surprisingly often they'll straight up be lists of keywords and links.
  • +
  • They buy expiring domain names and put up links to the link farm on them, and also spam the links in forums and on free wordpress, blogspot, etc. blogs.
+
+The fact that they are often using the cheapest domain names should indicate that they register a lot of domains. Often they are shilling hotels or travel-related products; there's also a strange cluster squatting on domains that once belonged to pages about blues music; and there's finally a large operation that seems to target the east-asian online shopping market.
+
+The age of man will have expired before you're done indexing just one of these effectively endless tangles of domains and hyperlinks so simply powering through is not really an option.
+
+I do have some flagging of domains with large numbers of subdomains, but that's a pretty expensive operation that is only possible to run every 10 minutes, and by the time they're detectable, they've already polluted the index quite a bit. Think links across 10 domains x 500 subdomains x 10000 known URLs; for one link farming operation. So far I've identified nearly ten thousand domains, and I do not think this is exhaustive. This is a last resort measure to catch the ones that get through.
+
+It's much better to weed out the rotten eggs before they enter the machinery, and I've found by far the most effective solution is to apply scorched earth tactics and indiscriminately exclude entire swathes of addresses from crawling. My index is never going to be a complete one anyway, no search engine's index is, so I'll ruthlessly take any measure that increases the quality.
+
+I'm restricting the crawling of subdomains in the new generic TLDs and some ccTLDs. As mentioned earlier, .xyz is especially rife with these sites. I think it's a combination of cheap domain names and weak oversight; I've read that they have been a major source of email spam as well. An unfortunate side effect is that this cuts off a lot of domain hacks. "cr.yp.to" is one site I for example currently will not index despite it having otherwise interesting content.
+
+I'm also IP-blocking sites that don't use the www-subdomain, when they are hosted in Hong Kong, China, Taiwan, India, Russia, Ukraine, or South Africa. It's not the least bit fair, as there are legitimate websites of interest hosted in these countries and domains, but again it's very effective.
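+
+Schematically, the pre-crawl filtering amounts to something like this (a hypothetical sketch; the actual rules and lists are more involved):
+
+  static boolean shouldCrawl(String domain, String countryCode) {
+      var riskyTlds      = java.util.Set.of("xyz", "top", "icu", "pw", "online", "site");
+      var riskyCountries = java.util.Set.of("HK", "CN", "TW", "IN", "RU", "UA", "ZA");
+
+      String[] parts = domain.split("\\.");
+      String tld = parts[parts.length - 1];
+
+      if (riskyTlds.contains(tld) && parts.length > 2)
+          return false;     // no subdomains within the problem TLDs
+      if (riskyCountries.contains(countryCode) && !domain.startsWith("www."))
+          return false;     // no non-www sites hosted in the problem countries
+      return true;
+  }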
+
+Repeatedly I'm met with the disheartening conclusion that we just can't have nice things.
+
+

Appendix: Number of identified link farm domains by TLD

+
+
+xyz    2622 gTLD
+com    1776 gTLD
+tw     535  ccTLD Taiwan
+online 511  gTLD
+top    265  gTLD
+pw     249  ccTLD Palau
+icu    204  gTLD
+net    167  gTLD
+asia   117  gTLD
+site   72   gTLD
+
+I would present a breakdown by country, but that would entail making nearly ten thousand DNS queries in rapid succession, and that's just an unnecessary waste of resources.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/05-minds-field.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/05-minds-field.gmi new file mode 100644 index 00000000..2e37e6e0 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/05-minds-field.gmi @@ -0,0 +1,77 @@ + + + + + MEMEX - The Mind's A Field [2021-07-18] + + + + + + +
+ +
+ + +
+
+

The Mind's A Field [2021-07-18]

+
+I find if I do not regularly plant interesting thoughts in my mind, it will rarely grow them spontaneously. I need to read interesting books, or have interesting conversations; if I do not, I'll look back at my ideas and find I haven't really had any in a very long time. Thoughts will grow whether I take care to plant them or not, but what grows if I have been careless is weeds.
+
+I find if I do not regularly harvest interesting thoughts from my mind, they will occupy the same space forever and completely displace new ideas. I need to file them away in writing somewhere to be able to let them go.
+
+I find if I do not ever rotate the type of thoughts I attempt to grow, if I never change ambitions, never try new activities but always stick to the same old groove, my mind will eventually grow barren and malnourished and stricken with the debilitating notion that I already know what there is to know.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/06-optimization.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/06-optimization.gmi new file mode 100644 index 00000000..b951d89a --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/06-optimization.gmi @@ -0,0 +1,259 @@ + + + + + MEMEX - Index Optimizations [2021-07-23] + + + + + + +
+ +
+ + +
+
+

Index Optimizations [2021-07-23]

+
+
+ Don't chase small optimizations
+
+Said some smart person at some particular time, probably. If not, he ought to have; if worse comes to worst, I'm declaring it now. The cost of 2% here and 0.5% there is high, and the benefits are (by definition) low.
+
+I have been optimizing Astrolabe, my search engine. The different kind of Search Engine Optimization. I've spent a lot of time recently doing soft optimization, improving the quality and relevance of search results, with great results. I'll write about that later.
+
+This post is all about war stories.
+
+The search index simply grew beyond what the code could deal with. The characteristic behavior of dealing with very large amounts of data is that whatever you're doing works well, until you hit a brick wall, where it suddenly doesn't work at all. This has happened a few times already.
+
+

Problem #1 - Wildly random writes

+
+Part of the search engine reads a list of URLs and words. I'm presenting a scheme of the file here so that you can get a grasp of the layout. Imagine letters are URLs and numbers are words here. In reality it's all integers but we can pretend it's not.
+
+
+(A) 1 5 7 8 (B) 3 2 (C) 1 5 7 (E) 2 8 9 etc...
+
+This is converted into two files that make up an implicit look-up table, and a sorted list of URLs grouped by which words they contain. I'll attempt to illustrate the layout.
+
+First the two files, horizontally and side-by-side. Presented vertically is the value the lookup table will arrive at for each index (1-indexed).
+
+
+  0 0     2   3   4 4   5 5     7     9    WORDS (position in URLs)
+  | | A C | B | B | | C | | A C | A E | E  URLS
+0 + |     |   |   | |   | |     |     |
+1 --+     |   |   | |   | |     |     |
+2 --------+   |   | |   | |     |     |
+3 ------------+   | |   | |     |     |
+4 ----------------+ |   | |     |     |
+5 ------------------+   | |     |     |
+6 ----------------------+ |     |     |
+7 ------------------------+     |     |
+8 ------------------------------+     |
+9 ------------------------------------+
+
+So to find URLs that contain word '7', you would look at the range in the urls file starting at words[7] and ending at words[8]; in this case, that's indices 5 and 7, so the URLs are A and C.
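+
+In code, the lookup is nothing more than this (a sketch assuming both files are loaded as plain long arrays; the real files are of course far too large for that and are read from disk):
+
+  // URLs for a word lie between two adjacent entries in the words file
+  long[] urlsForWord(long[] words, long[] urls, int wordId) {
+      int start = (int) words[wordId];
+      int end   = (int) words[wordId + 1];
+      return java.util.Arrays.copyOfRange(urls, start, end);
+  }
+
+With the example above, urlsForWord(words, urls, 7) copies urls[5] and urls[6], that is A and C.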
+
+It's confusing, but what matters is this: The input file is typically of the order of a few gigabytes, and the output files can be in the tens of gigabytes. To rearrange the data in this fashion requires a lot of random writes, the order of the input file doesn't correlate with the order of the output file, and it's too much data to buffer in memory.
+
+The destination is a consumer grade SSD. These SSDs do not deal well with tiny random writes at all. It's just too slow.
+
+The first order solution I was using was to mmap the file and let the operating system sort out the write order, which worked until it suddenly didn't. Converting a large index, a procedure that repeats this process 20 times, usually took around an hour. That is below the pain threshold. This is run once or twice a day while actively crawling the web, without having much of an impact on the operations of the search engine, so that's tolerable.
+
+Then out of the blue, it stopped taking an hour, the conversion time increased to over 24 hours.
+
+What had happened is that the file had gotten too big to entirely keep in memory, and consequently the random writing pattern incurred extreme thrashing, with ceaseless page faults.
+
+The file in the example would write in this order:
+
+
+A_________
+A____A____
+A____A_A__
+A_B__A_A__
+A_BB_A_A__
+ACBB_A_A__
+ACBBCA_A__
+ACBBCACA__
+ACBBCACAE_
+ACBBCACAEE
+
+The solution was to first write >writing instructions< in a series of files on disk, that is arranging them in buckets based on their destination address in the final file. This effectively increases the amount of data to be written by 150%, but that's fine as long as it's fast. (Nobody look too carefully at the SMART values for the SSD I'm using exclusively as a working space for these index files)
+
+The instructions, schematically, look like this:
+
+File 1: A@0 B@2 B@3 C@1
+File 2: A@5 C@4 C@6
+File 3: A@7 E@8 E@9
+
+These can be evaluated on a by-file basis to organize the writes to eliminate thrashing, and so the writing speed is back to being comparable with the original solution.
+
+The instructions above would evaluate like this
+
+
+A_________ - File 1 -
+A_B_______
+A_BB______
+ACBB______
+ACBB_A____ - File 2 -
+ACBBCA____
+ACBBCAC___
+ACBBCACA__ - File 3 -
+ACBBCACAE_
+ACBBCACAEE
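+
+In code, the two passes look roughly like this (a simplified sketch, not the actual converter; imports from java.io, java.nio.file and java.util are assumed):
+
+  // Pass 1: append (destination index, value) pairs to one of several
+  // bucket files, chosen by which slice of the output the write lands in
+  void writeInstruction(DataOutputStream[] buckets, long destIndex, long value, long bucketSize) throws IOException {
+      DataOutputStream bucket = buckets[(int) (destIndex / bucketSize)];
+      bucket.writeLong(destIndex);
+      bucket.writeLong(value);
+  }
+
+  // Pass 2: replay one bucket at a time in sorted order; each bucket only
+  // touches a small contiguous region of the output file, so the page
+  // cache stops thrashing
+  void evaluateBucket(Path bucketFile, RandomAccessFile output) throws IOException {
+      long count = Files.size(bucketFile) / 16;
+      List<long[]> instructions = new ArrayList<>();
+
+      try (var in = new DataInputStream(new BufferedInputStream(Files.newInputStream(bucketFile)))) {
+          for (long i = 0; i < count; i++) {
+              instructions.add(new long[] { in.readLong(), in.readLong() });
+          }
+      }
+
+      instructions.sort(Comparator.comparingLong(instruction -> instruction[0]));
+      for (long[] instruction : instructions) {
+          output.seek(8 * instruction[0]);
+          output.writeLong(instruction[1]);
+      }
+  }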
+
+
+

Problem #2 - A Very Large Dictionary

+
+A few days later I ran into a problem with keeping the search term dictionary in memory. The dictionary is a one-way mapping from a string (a word) to a unique integer id. These IDs are the "words" from the previous section.
+
+The index crashed when the dictionary reached approximately 380 million terms. This needs to be very fast, and there aren't a lot of canned solutions that deal with this particular scenario. I've been using GNU Trove's custom hash tables. From experimentation, the B+-trees popular in SQL databases don't deal gracefully with this type of usage. The disk size of the dictionary was 6 Gb, but the memory footprint was closer to 24 Gb and the dreaded OOM-killer kept killing my process.
+
+

Java is wasteful

+
+The thing about having on the order of a billion items is that every byte per item translates to a gigabyte of memory. Normally a few bytes here and there really don't matter, but in this domain, you need to be extremely frugal.
+
+First I needed to work around the fact that Java has a 16 byte object header associated with every object. The solution was to allocate off-heap memory (through an extremely unpleasant interface that gives access to basic malloc()-style memory) rather than 380 million byte[]-instances. I also ended up implementing my own hash table and memory allocator specifically for this scheme.
+
+This shaves 4 or so Gb off the memory footprint. Down to 20 Gb for 6 Gb of data. Better, but still not good.
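+
+The gist of the off-heap scheme, heavily simplified (a sketch; the real allocator needs several buffers since a single direct buffer is capped at 2 Gb, and the hash table that sits on top of this is omitted entirely):
+
+  import java.nio.ByteBuffer;
+
+  class OffHeapEntries {
+      // one big slab of malloc()-style memory instead of millions of
+      // byte[] objects, each of which would carry its own object header
+      private final ByteBuffer arena = ByteBuffer.allocateDirect(1 << 30);
+
+      // copies an entry into the arena and returns its offset
+      int add(byte[] entry) {
+          int offset = arena.position();
+          arena.put((byte) entry.length);   // assumes entries shorter than 256 bytes
+          arena.put(entry);
+          return offset;
+      }
+
+      // reads an entry back out, given its offset
+      byte[] get(int offset) {
+          int length = arena.get(offset) & 0xFF;
+          byte[] result = new byte[length];
+          for (int i = 0; i < length; i++) {
+              result[i] = arena.get(offset + 1 + i);
+          }
+          return result;
+      }
+  }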
+
+ +
+

Text is redundant

+
+The dictionary entries themselves are single-byte encoded strings, sometimes joined by underscores to represent sequences of words. The heading of this section would produce the terms "text", "is", "redundant", "text_is", "is_redundant", "text_is_redundant". That's a lot of redundancy.
+
+
+0  text
+1  is
+2  redundant
+3  text_is
+4  is_redundant
+5  text_is_redundant
+
+As an observation based on what the data looks like, there are more joined words than regular words. One would indeed expect there to be more permutations of the items of a set than items in the set for sets that are larger than two items. This would imply two avenues of improvement:
+
+

Reduce the number of single words

+
+Not much to do here, I implemented better language identification based on dictionary overlap with 1000-most-common-words lists for the target languages. The search engine targets English, Swedish and Latin; the languages I can understand. This is in part to reduce the dictionary to a feasible size, and in part because I can't quality control search results I can't read.
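+
+The identification itself is nothing fancy; schematically it's something along these lines (a hypothetical sketch, not the actual code; java.util imports assumed):
+
+  // fraction of the document's words that appear in a language's
+  // 1000-most-common-words list; keep the best match, or reject the
+  // document outright if nothing matches well enough
+  static String identifyLanguage(Set<String> documentWords, Map<String, Set<String>> commonWords) {
+      String best = null;
+      double bestOverlap = 0;
+
+      for (var language : commonWords.entrySet()) {
+          long hits = documentWords.stream().filter(language.getValue()::contains).count();
+          double overlap = hits / (double) Math.max(1, documentWords.size());
+          if (overlap > bestOverlap) {
+              bestOverlap = overlap;
+              best = language.getKey();
+          }
+      }
+
+      return bestOverlap > 0.05 ? best : null;    // the threshold here is made up
+  }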
+
+Languages that join words without hyphens are especially problematic. Looking at you, Germany; I found the first instance of "programmierungsmodelle" after over 300 million dictionary entries.
+
+

Optimize how joined words are stored

+
+Perhaps a way forward is using the fact that the dictionary already is a mapping from string to integer, to compress the data. For some function F, the data can be stored as
+
+
+0 "text"
+1 "is"
+2 "redundant"
+3 F(0,1)
+4 F(1,2)
+5 F(3,4)
+
+As long as the output of F is in a separate binary namespace from regular strings, that's fine. To this end, the integers need to be prefixed by a marker byte; luckily there are 32 items at the bottom of the ASCII table that are guaranteed never to appear in the dictionary entries, which is what I used. Integers are 4 bytes each though, and the marker byte is another, so this would only be helpful for strings that are in excess of 9 bytes.
+
+But! These integers are often smaller than a full integer; you can represent all the integers in the example with <= 3 bits. You could store the entire pair in a single byte if you really try, like so:
+
+F(0,1) = Marker+(0000001)
+F(1,2) = Marker+(0000110)
+F(3,4) = Marker+(0011100)
+
+The 32 available marker bytes can then encode how many bits from the right the break between the numbers is. This is extremely fiddly programming and I freely admit it took several hours to iron out all the corner cases.
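+
+For the curious, the encoding half of such a pairing function F might look like this in Java (a sketch, assuming the ids are non-negative 32 bit integers):
+
+  // packs two dictionary ids into a marker byte followed by the fewest
+  // bytes that can hold both numbers; the marker (0..31, a range that
+  // never occurs in the string entries) records how many bits from the
+  // right the break between the two numbers is
+  static byte[] encodePair(int a, int b) {
+      int bitsForB = Math.max(1, 32 - Integer.numberOfLeadingZeros(b));
+      long packed = ((long) a << bitsForB) | b;
+
+      int totalBits  = Math.max(1, 64 - Long.numberOfLeadingZeros(packed));
+      int totalBytes = (totalBits + 7) / 8;
+
+      byte[] encoded = new byte[1 + totalBytes];
+      encoded[0] = (byte) bitsForB;                  // the marker byte
+      for (int i = 0; i < totalBytes; i++) {
+          encoded[1 + i] = (byte) (packed >>> (8 * (totalBytes - 1 - i)));
+      }
+      return encoded;
+  }
+
+  // encodePair(0, 1) = [1, 0000001]
+  // encodePair(1, 2) = [2, 0000110]
+  // encodePair(3, 4) = [3, 0011100]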
+
+I got it right in the end, mostly thanks to a comprehensive battery of unit tests, and suddenly the size of the dictionary binary data was almost halved.
+
+Likewise, I devised a few schemes for representing integers in the smallest necessary binary format, helpful as there are a lot of random integers floating around on the internet. There are a few more schemes you could implement, but then you are chasing small percentages and that's not worth it.
+
+Actually evaluating these compressed byte schemes would be pretty slow, but luckily there's no need for that. The bytes are used exclusively as keys for the dictionary. All they need to be is a unique representation of the input that is cheap to calculate.
+
+In all, this reduced the memory footprint of the dictionary by 8Gb, from in excess of 24Gb to 16Gb; and the entries seem to be encoded at an average of 6 bytes per entry, down from 15. In case anyone thinks it would be "good enough" to just calculate a hash wide enough to ensure there are probably no collisions: that would almost certainly be more expensive. Even a 10 byte hash would feel pretty sketchy for a billion+ items (10^-7 collision rate).
+
+This was helpful, but the precious cherry on top is realizing the applicability of Zipf's law. Preparing the dictionary with a list of dictionary items in order of most common occurrence gives a compression ratio of 60-70%, since the bit-length of the index effectively becomes inversely related to the probability of finding the word! The most common words get the fewest bits!
+
+GZip compresses the old data by 63% (that's the ballpark my own compression arrived at!), and the new one by 21%. That's not at all bad given how cheap it is.
+
+--
+
+About half of this is live and running on the search engine right now, the rest will probably go live next week.
+
+

Links and further reading

+
+ +
+ + +

Topics

+
+/topic/programming.gmi
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/07-local-backlinks.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/07-local-backlinks.gmi new file mode 100644 index 00000000..74d4b56b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/07-local-backlinks.gmi @@ -0,0 +1,99 @@ + + + + + MEMEX - Local Backlinks [2021-07-26] + + + + + + +
+ +
+ + +
+
+

Local Backlinks [2021-07-26]

+
+Maintaining links is difficult. My gemini server doesn't have a lot of pages, but already maintaining links between relevant pages is growing more tedious by the page. It's going to become untenable soon.
+
+In part inspired by Antenna, I had the idea of extracting local backlinks, and automatically appending them to the pages that are linked. That way all local links are effectively bidirectional. If a new post links to an old post, the old post automatically links to the new post. Old pages will thus over time accumulate more links to new pages without manual maintenance.
+
+Extracting this information was a relatively easy scripting job; the output ends up in two easily parsable text files, one with links and one with page titles.
+
+These can then be read by the server and used to create the links dynamically, as well as used to lint existing links and highlight dead ones. This does require a modicum of discipline when writing the gemini markup, as it expects all local links to start with the pattern "=> /", but that is also something that can be checked automatically.
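+
+The extraction is little more than this (a sketch in Java rather than the actual script, with the exact file layout guessed at):
+
+  import java.io.IOException;
+  import java.nio.file.*;
+  import java.util.*;
+  import java.util.stream.Collectors;
+
+  class BacklinkExtractor {
+      public static void main(String[] args) throws IOException {
+          Path root = Path.of(args[0]);
+          Map<String, List<String>> backlinks = new TreeMap<>();
+          Map<String, String> titles = new TreeMap<>();
+
+          List<Path> pages;
+          try (var files = Files.walk(root)) {
+              pages = files.filter(p -> p.toString().endsWith(".gmi")).collect(Collectors.toList());
+          }
+
+          for (Path page : pages) {
+              String pageUrl = "/" + root.relativize(page);
+              for (String line : Files.readAllLines(page)) {
+                  if (line.startsWith("=> /")) {            // the local link convention
+                      String target = line.split("\\s+")[1];
+                      backlinks.computeIfAbsent(target, k -> new ArrayList<>()).add(pageUrl);
+                  }
+                  else if (line.startsWith("# ") && !titles.containsKey(pageUrl)) {
+                      titles.put(pageUrl, line.substring(2));
+                  }
+              }
+          }
+
+          // links.txt: one "target source" pair per line
+          Files.write(root.resolve("links.txt"), backlinks.entrySet().stream()
+                  .flatMap(e -> e.getValue().stream().map(source -> e.getKey() + " " + source))
+                  .collect(Collectors.toList()));
+          // titles.txt: one "page title" pair per line
+          Files.write(root.resolve("titles.txt"), titles.entrySet().stream()
+                  .map(e -> e.getKey() + " " + e.getValue())
+                  .collect(Collectors.toList()));
+      }
+  }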
+
+I've written before about the over-linking problem on Wikipedia, that is something I'm careful about not recreating here as the backlinks would further amplify the problem.
+
+An unexpected emergent feature is that automatic back-linking allows for the creation of topical ad-hoc indices. Merely creating an empty file and referring to it in pages will populate it with links to those pages. Is this useful? I don't know yet, but I will experiment and see if it brings any value. I do think it may help reduce the urge to recreate such topical indices within the posts themselves, and thus to mitigate the risk of over-linking.
+
+

The Code

+
+ + +/links.txt
+/titles.txt
+
+

Referenced Pages

+
+ +
+

Referenced Websites

+
+ +
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/08-whatever-happened-to-the-memex.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/08-whatever-happened-to-the-memex.gmi new file mode 100644 index 00000000..9cc13e0b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/08-whatever-happened-to-the-memex.gmi @@ -0,0 +1,98 @@ + + + + + MEMEX - Whatever happened to the Memex? [2021-07-28] + + + + + + +
+ +
+ + +
+
+

Whatever happened to the Memex? [2021-07-28]

+
+I stumbled upon the Memex, which is a spiritual predecessor to hypertext technology. It was supposed to be a sort of personal data store that allows the user to link and annotate various documents in order to produce a sort of external memory, a private knowledge bank that associates ideas in a similar way a human brain does. The operator could also save and share associative trails through the information.
+
+I found it fascinating because it seems so extremely doable with today's technology, yet at the same time extremely foreign to how we use technology today, i.e. as a social tool, a mechanism for self-promotion.
+
+There are elements of wiki in the Memex, but the big difference is that the Memex is a personal database, rather than one big shared "world-brain" like wikipedia could be described as; the latter is problematic as it shuts down independent thought rather than augmenting it. Truth becomes something to look up, rather than investigate.
+
+Andy Matuschak seems to have discovered a similar idea with his "evergreen notes"-concept, but seems to emphasize the habitual note-writing over the technological aspect. He derives his ideas from an interesting 17th century index card technique called "zettelkasten" that also has similarities to hypertext.
+
+There may be a benefit to having relatively immutable entries in a knowledge database, such as with the Memex (which was supposed to use microfilm), or boxes full of index cards. Having editable pages, like in a wiki, may lead to endless fiddling with largely inconsequential details.
+
+I do think that it's very easy to fall into the trap of worrying so much about your organizational tools and habits that you don't actually use them for much of anything useful other than writing about your organizational tools and habits; and some of the people involved with zettelkasten seem to have fallen into that trap head first.
+
+The rabbit hole further took me to a recording of "The Mother of All Demos", a 1968 live-demo of hypertext technology and various computer-human-interaction experiments. The demo is 90 minutes long, live from multiple locations. That is impressive to say the least. I found some of the presentational and navigational capabilities fascinating, like a strange hybrid between a wiki and vim.
+
+In closing I think we aren't leveraging hypertext nearly as much as we could. Almost everything is using *HyperText* Markup Language, but almost nothing makes use of the rather astounding associative capabilities of hypertext.
+
+

A "memex trail" to follow

+
+ + + +https://notes.andymatuschak.org/About_these_notes
+ +
+

Replies

+
+ +
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/09-system-upgrade.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/09-system-upgrade.gmi new file mode 100644 index 00000000..3e3f3d05 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/09-system-upgrade.gmi @@ -0,0 +1,167 @@ + + + + + MEMEX - The System Upgrade [2021-07-30] + + + + + + +
+ +
+ + +
+
+

The System Upgrade [2021-07-30]

+
+Early this winter, when I set up the server that would eventually become marginalia.nu, I did so in order to try out some technology I thought looked cool (proxmox, zfs), and some stuff I was exposed to at work and didn't really see the point of, to see if we could get on better terms if I had more control over it (kubernetes).
+
+I based the system on ProxMox, a Linux based virtualization server, which ran a series of virtual machines and containers.
+
+
+ProxMox
+┠─ git (ubuntu-server)
+┠─ mariadb container
+┠─┒ kubernetes (ubuntu-server)
+┃ ┠─┒ WMSA (my software)
+┃ ┃ ┠─┒ search engine
+┃ ┃ ┃ ┠ crawler x 2
+┃ ┃ ┃ ┠ crawler orchestrator
+┃ ┃ ┃ ┠ index server
+┃ ┃ ┃ ┠ assistant server
+┃ ┃ ┃ ┠ archive server
+┃ ┃ ┃ ┖ search backend
+┃ ┃ ┠ rendered page cache
+┃ ┃ ┠ static page renderer
+┃ ┃ ┠ reddit front-end
+┃ ┃ ┠ podcast RSS aggregator
+┃ ┃ ┖ SMHI API front-end (swedish weather forecasts)
+┃ ┠ elastisearch
+┃ ┠ fluentd
+┃ ┠ prometheus
+┃ ┠ kibana
+┃ ┠ grafana
+┃ ┠ letsencrypt automation
+┃ ┠ nginx server 
+┃ ┠ docker repository
+┃ ┖ nginx ingress
+┖─ Gemini (ubuntu-server)
+
+This set-up grew increasingly untenable. Not only was it very difficult to get an overview of what was actually happening, but all of these choices also have small costs associated with them, of RAM, of space, of CPU; and taken together, I ended up only being able to productively use about half of the RAM on my server for what I wanted to.
+
+The Linux OOM killer kept reaping the search engine index process even with 50 out of 128 Gb of memory seemingly available; it was just lost somewhere in the layers of abstraction.
+
+I also have some really neat hardware coming soon: an Optane 900P, which I'm very excited to see what I can do with. It promises low-latency random I/O, which is exactly what I want. This also mandated a rethinking of how this all works in order to make good use of it.
+
+Someone famously declared
+
+
+ “Let's use Kubernetes!”
+ Now you have 8 problems
+
+I do think this is largely a correct analysis. There may be a scale at which you'll see more benefits from kubernetes than drawbacks, but that scale is enormous. For smaller operations like mine, certainly anywhere you can count the servers on a few hands, I do think there's a Thoreauvian conclusion to draw here: The complexity of working with a solution like kubernetes can only be handled using a tool like kubernetes. In the small domain, such automation creates *more* work, not less. This abstraction is a complication, rather than a simplification, if the concrete isn't already very complicated.
+
+You have logs across dozens of containers, so you can't grep them anymore, so you need elasticsearch and fluentd. But raw elasticsearch is a headache, so you need kibana too. Oh hey, now it's gotten even more complicated. Can't even see when stuff goes down. Better set up monitoring that alerts you. Let's see, prometheus is good. But the GUI is nasty, better get grafana too.
+
+This is how the snowball rolls. Adding things makes the set-up more complicated, which mandates adding even more things to deal with the complexity, which makes them more complicated, which...
+
+ +
+I'm going to be very blunt and say I don't like kubernetes. Things keep changing and breaking, and when you look for a solution, what you find doesn't work because some variable has changed name again, or a repository has been renamed.
+
+The ecosystem seems very immature. When it works it's not bad, but when it breaks (and boy does it ever break), you're in for a very unpleasant time. I get a sort of Vincent Adultman-vibe from the entire ecosystem. Everyone talks about what is suitable for production, but everything keeps inexplicably breaking, nothing is ever easy to fix; and the solution is always some inexplicable snippet on stackoverflow you're just supposed to blindly run without really understanding.
+
+I also get the feeling dealing with kubernetes that YAML is the new XML. The problem with XML wasn't really the formatting, that's just an inconvenience. The problem was the megabytes worth of configuration in enterprise software. The YAML keeps growing to meet the needs of the growing YAML.
+
+It's not all bad though. I do actually like the idea of microservices, if you do them properly and unix-like, while at the same time not getting *too* in love with them to see that bigger services can be good sometimes too. They're a big reason why my stuff actually works. I can redeploy parts of the system while others are running. That's amazing, because my index server has a boot-up time of up to an hour.
+
+

The new set-up

+
+Migration took about 12 hours, and that included changes to the software and setting up git hooks for easy deployment. I got rid of proxmox and zfs and went with Debian Buster and ext4 instead. I kicked out kubernetes and half of that ecosystem, and I'm not using any containerization at all.
+
+It's as simple as that. I have one memory in one kernel, one system to keep up to date and patched. I can actually tell you most of what is running on it and what it's doing.
+
+This is it:
+
+
+Debian
+┠─ mariadb
+┠─┒ WMSA (my software)
+┃ ┠─┒ search engine
+┃ ┃ ┠ crawler x 2
+┃ ┃ ┠ crawler orchestrator
+┃ ┃ ┠ index server
+┃ ┃ ┠ assistant server
+┃ ┃ ┠ archive server
+┃ ┃ ┖ search backend
+┃ ┠ rendered page cache
+┃ ┠ static page renderer
+┃ ┠ reddit front-end
+┃ ┠ podcast RSS aggregator
+┃ ┖ SMHI API front-end (swedish weather forecasts)
+┠─ nginx
+┠─ git is just a /home directory
+┖─ gemini server
+
+

Topics

+
+/topic/server.gmi
+
+

External Links

+
+ +
+ +
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/10-astrolabe-2-sampling-bias.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/10-astrolabe-2-sampling-bias.gmi new file mode 100644 index 00000000..4aeabe31 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/10-astrolabe-2-sampling-bias.gmi @@ -0,0 +1,151 @@ + + + + + MEMEX - The Astrolabe Part II: The Magic Power of Sampling Bias [2021-08-03] + + + + + + +
+ +
+ + +
+
+

The Astrolabe Part II: The Magic Power of Sampling Bias [2021-08-03]

+
+As I have mentioned earlier, perhaps the biggest enemy of PageRank is the hegemony of PageRank-style algorithms. Once an algorithm like that becomes not only dominant, but known, it also creates a market for leveraging its design particulars.
+
+Homogenous ecosystems are almost universally bad. It doesn't really matter if it's every computer running Windows XP, or every farmer planting genetically identical barley, what you get is extreme susceptibility to exploitation.
+
+It's why we have link farms, it's why there's an SEO industry, and it's in part why the internet has gotten so bad since those that cater to the algorithm are shaped by it, and those who don't are invisible.
+
+

Quality Assessment

+
+To get search results that are interesting again, some different method needs to be devised.
+
+If the problem is that everyone is trying to cheat the popularity contest, maybe we can cut the gordian knot by looking at something other than popularity.
+
+Maybe we can infer that websites that specifically don't try to win the popularity contest have some intrinsic value. Maybe we can cook up a measurement that looks for indicators of SEO, and punishes that.
+
+With this in mind, I created a score based on mark-up. Simplified, it roughly gauges how "plain" a webpage is.
+
+
+       length_text     -script_tags
+  Q =  -----------  x e
+       length_markup
+
+There are other factors too; specific words also reduce the score, mostly pertaining to porn, bitcoin and warez, as those are problem areas that yield very few legitimate results and a lot of spam.
+
+For the rest of the post when I use the word quality, I will refer to this score. "Low quality" is not a judgement, but a number.
+
+Note that for each script tag, quality is reduced by 63%.
+
+
    +
  • 1 script tag and quality can be no more than 37%
  • +
  • 2 script tags and quality can be no more than 13%
  • +
  • 3 script tags and quality can be no more than 5%
+
+... and so forth. Script tags are the biggest factor in a web page's quality assessment.
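+
+As a sketch in code, using jsoup for the HTML parsing here (the actual implementation differs, and the word-based penalties mentioned above are left out):
+
+  static double quality(String html) {
+      var document = org.jsoup.Jsoup.parse(html);
+
+      double lengthText   = document.body().text().length();
+      double lengthMarkup = html.length();
+      int scriptTags      = document.getElementsByTag("script").size();
+
+      return (lengthText / lengthMarkup) * Math.exp(-scriptTags);
+  }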
+
+There are drawbacks to this, not every use of javascript is exploitative. Sometimes it brings usefulness, but those web sites will be de-prioritized.
+
+

Indexing

+
+This score drives the crawling priority of each website the crawler discovers. It flavors the quality of the outgoing links too, so that, on a best-effort basis, websites are crawled in decreasing order of quality.
+
+Naturally the assumption doesn't hold that a website looks like the websites that link to it, but I think the reverse assumption is better. Low quality websites rarely link to high quality websites.
+
+The search engine will only index one or two of the low quality pages it encounters and then probably never look back.
+
+Indexed websites are then sorted in eleven different buckets based on their quality (actually its negated logarithm, from 0 through -10). These buckets allow the index to be queried in order of decreasing quality, as the index has no other awareness of the pages' quality.
+
+Given that there are very real constraints on how big the index can get, maybe 20-30 million URLs, the main priority in crawling is finding the most salient pages and aggressively rejecting everything else. One million high quality URLs is better than a billion low quality URLs.
+
+While in general I am a friend of Voltaire and advocate tolerance well beyond what most people would consider reasonable, in this case I promote extreme prejudice. Ruthless concessions need to be made to ensure quality. If it raises the quality of the index, nothing is off limits.
+
+I talked about that a bit in the post on link farms I made earlier.
+
+

Relevant Search Results

+
+When it's time to query the index, during searching, the index buckets are queried in decreasing order of quality. The results are then sorted in order of how many incoming links the domain has weighted by the page's quality.
+
+Superficially this is an outdated and broken way of building a search engine since link farms and other trash results will almost by definition produce high numbers of incoming links, but what makes it work is the skewed sample created by the crawling process. It is possible to find results from the full gamut of quality, but low quality results are just rarer.
+
+It's not that the search results are picked in order of how many links they have; the results that have already been picked are prioritized in that order so that the best ones are presented first.
+
+I implemented this last point relatively recently, and the result has been pretty remarkable. As long as you are within an area where there actually is pages to find, the search engine not only finds them, but often shows relevant results at the top. I'm really happy with how well it's working now.
+
+Then there's the problem areas, where you can't find anything relevant. I mentioned porn and bitcoin earlier, but also travel, security systems, locksmithing, SEO; these topics do not produce good results. They seem absolutely inundated with spam. I've blacklisted the spam domains, but it's been like peeling layers off an onion. The more I removed the less there remained, until eventually there was nothing at the core.
+
+It remains a niche search engine. I do use it as my default search engine on my phone mostly because I believe in eating your own dogfood, but it's still challenging. I keep bouncing between it and the big search engines. If I can't find it on mine, I try theirs. If I can't find it there, I try mine some more. It's a coin toss sometimes.
+
+ + +
+https://search.marginalia.nu/
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/11-dying-every-day.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/11-dying-every-day.gmi new file mode 100644 index 00000000..5add930b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/11-dying-every-day.gmi @@ -0,0 +1,90 @@ + + + + + MEMEX - Dying, Every Day (Re: Last times) [2021-08-04] + + + + + + +
+ +
+ + +
+
+

Dying, Every Day (Re: Last times) [2021-08-04]

+
+Dece's post "last times" brought to mind one of my favorite thoughts from the Roman philosopher Seneca, who was counting his days after having fallen out of favor with Emperor Nero.
+
+In the first of his moral epistles to Lucilius, he asks:
+
+
+ Quem mihi dabis [...] qui intellegat se cotidie mori?
+
+
+ Who can you show me [...] that understands he is dying every day?
+
+It's a fascinating and useful reversal of perspective. Our final day is not the day our life is suddenly taken away from us, but the last of our allotted days. Every preceding day has already been marked by our dying. Every day is a day that will never return, every moment is sand irreversibly flowing through the hour glass.
+
+We are like wax candles that live only through perishing. What we should fear is not the day we run out of wax and the flame goes out, but that we squandered that flame, that we weren't bringing enough light and warmth into the world.
+
+How some people have dealt with these last two years' pandemic is a pretty good indicator that we, as a whole, should meditate more on the fate we're headed for. We all inevitably die, but making a fool out of ourselves doing so is less inevitable.
+
+The theme of contemplating death has been a recurring sentiment in western thought until fairly recently, and I think it's a shame it has gone away because it actually is a quite useful topic of meditation that lets us come to terms with our mortality.
+
+Public clockworks were often inscribed with reminders like "ultima forsan"--perhaps the last [hour]. The words "carpe diem" have survived, but what has been largely lost is that they carry the same grim urgency that tomorrow may not come, so you'd better not squander the present.
+
+In Horace's poem, "Carpe diem" is preceded by the words "even as we talk, envious time is running out", and followed by a call not to put trust in tomorrow.
+
+gemini://dece.space/notes/20210803-last-times.gmi
+
+

Topic

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/12-bye-bye-gmail.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/12-bye-bye-gmail.gmi new file mode 100644 index 00000000..ba5222ba --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/12-bye-bye-gmail.gmi @@ -0,0 +1,86 @@ + + + + + MEMEX - Bye, Bye, Gmail [2021-08-04] + + + + + + +
+ +
+ + +
+
+

Bye, Bye, Gmail [2021-08-04]

+
+I finally got around to moving @marginalia.nu off gmail. I've been planning to do so for a while, and vacillated between self-hosting and using a provider. I ended up going for the latter. Even though I do like to self-host my stuff as much as possible, email servers seem like a lot of work. And I got a VPS account included in the price, which is nice. Means I can use it to do some off-site backups without having to use dropbox or similar.
+
+Maybe this is a strange sentiment around these parts, but privacy is the smallest beef I have with google. I don't particularly care if they can read my emails or learn of my surfing habits.
+
+What I do care about is the fact that gmail is a terrible email provider that seems outright hostile to its users.
+
+When it launched fifteen years ago, it was fantastic. It was streets ahead of the competitors. It filtered out so much spam, it was clean and sleek and had features, it was the superlative free email provider. Inbox had even more features that made it even cleaner and sleeker.
+
+The inbox space you got was absolutely humongous too. This was at a time when you would have to constantly delete the tail of your inbox to keep it below 100 Mb or whatever. Gmail gave you gigabytes of space at launch, and that number just kept growing. It was a pretty decent publicity stunt.
+
+To jog your memory, here's a screenshot of gmail around launch.
+
+ +
+Then over the years, something happened. Looking squarely at my gmail inbox, almost all I see is spam. Sales, promotions, offers; left, right, center. Inexplicably it's being let through the spam filter. Some of it has even been placed there by google as "promoted" items. That is, I'm getting spam emails that weren't even sent to me.
+
+This just isn't what I signed up for. Looking back I can see that it's been going on for a while, but I've just gotten so used to the state of things that I didn't really think about how absurd it is.
+
+So I'm saying goodbye to gmail as my primary email provider, and good riddance.
+
+

Topics

+
+/topic/platforms.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-static-html.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-static-html.gmi new file mode 100644 index 00000000..3882c48e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-static-html.gmi @@ -0,0 +1,145 @@ + + + + + MEMEX - Rendered static HTML [2021-08-13] + + + + + + +
+ +
+ + +
+
+

Rendered static HTML [2021-08-13]

+
+The technological choices we make determine the rules we have to abide by.
+
+If every page load incurs hundreds of database calls on the server, and 30 seconds of javascripting on the front-end, then obviously you need to reduce the number of page loads to a minimum. They are frustrating for the user and expensive for the server. This makes the front-end even more slow and stateful, and so the urgency for reducing page loads increases even further.
+
+So what if we don't do any of that? What if we just serve static HTML instead? For the server it's such a light load that even a raspberry pi can hold the fort at moderate traffic, and on the front-end it's just as fast.
+
+Of course, rendering HTML isn't free. Depending on how much data we're talking about, it can take time. But it's time you spend once, or at least infrequently. Not only is the result faster, it's better for the environment. We can host more stuff on less server, and the clients don't need to use nearly as much wattage presenting it. As long as your data is fetched more often than it is altered, it's an improvement.
+
+The sacrifice is of course all those small alterations: modifying content is what becomes expensive, while everything else is virtually free. This means you can't afford to change the content based on the visitor's history. Everyone gets the same page. In this paradigm, you need hardware in proportion to the rate your content is mutated, not the amount of content, or really even the number of users. This is because you can cache the content extremely cheaply using ETags.
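+
+To make the ETag part concrete, here is a minimal sketch of how a server can answer a conditional GET for a pre-rendered page using the JDK's built-in HTTP server. This is not the code I actually run; the file path and port are made up for the example.
+
+import com.sun.net.httpserver.HttpServer;
+
+import java.net.InetSocketAddress;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+public class StaticEtagServer {
+    public static void main(String[] args) throws Exception {
+        Path file = Path.of("rendered/index.html"); // hypothetical pre-rendered page
+        HttpServer server = HttpServer.create(new InetSocketAddress(8080), 0);
+        server.createContext("/", exchange -> {
+            // Cheap ETag derived from the file's last modification time
+            String etag = "\"" + Files.getLastModifiedTime(file).toMillis() + "\"";
+            if (etag.equals(exchange.getRequestHeaders().getFirst("If-None-Match"))) {
+                // The client already has this version; no body needs to be sent
+                exchange.sendResponseHeaders(304, -1);
+            } else {
+                byte[] body = Files.readAllBytes(file);
+                exchange.getResponseHeaders().set("ETag", etag);
+                exchange.getResponseHeaders().set("Content-Type", "text/html");
+                exchange.sendResponseHeaders(200, body.length);
+                exchange.getResponseBody().write(body);
+            }
+            exchange.close();
+        });
+        server.start();
+    }
+}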
+
+What I want to show is the profound downstream effects of making a different design decision. A piece of counterfactual web-design history.
+
+

Case 1: Reddit

+
+I have experimented with this approach for a while, and among my first attempts was a front-end for reddit. It's a relatively kind use case, where I use their APIs to fetch the few subreddits I frequent, render the threads and comments, and keep the results in memory, backed by disk-based long-term storage for fault tolerance. I also wrap their submission API; posting through it triggers an immediate re-rendering of the affected thread or subreddit, giving the illusion that it's always fresh when in practice it's usually maybe 10 minutes behind the real deal.
+
+It's overall pretty fast and light. "Real" reddit has approximately an 8 Mb payload. My front-end's payload usually sits around 1-2 Kb. It pulls some stylesheets and a font or two, still rarely going above 50 Kb.
+
+Of course my design is also a lot more stripped down, aiming for a degree of functionality somewhere between your average mailing list and a pre-2000s internet forum. What I originally wanted to explore was how the reddit experience would change if you removed votes, direct messages and most images, and made it a pure text-based discussion board. The result has a very different feel to it, when you must judge each comment for itself, without the ability to see how other people have judged it.
+
+

Case 2: Wikipedia

+
+Why not go for broke, right? I've harped about the questionable design choices of wikipedia before, and while they do let you inject CSS (if you log in), page loads are still incredibly slow and it's bringing me a lot of frustration.
+
+They do license their page content under CC-BY-SA, so why not use that license to impose my flavor of design improvements and produce a version of wikipedia designed with the singular purpose of making it as easy to read as possible, purging it of inline links and footnotes, images and most tables?
+
+Wikipedia doesn't want you to scrape their live site because it's apparently very expensive to render.
+
+How delightfully apropos! I guess that is what's up with the slow page loads.
+
+A way around that is that they do offer data dumps for download in various formats. So I grabbed a ZIM archive--that's a relatively standardized archive format for offline wikipedia readers--and found an abandoned library for reading such files, tinkered with it a bit because it was apparently written in the time of Hildegard of Bingen and so read the file data a single byte at a time. The library was as a result about 100 times slower than it needed to be.
+
+After that I wrote a program that extracts every HTML page and subjects it to a pretty severe DOM-massage that removes most inline links and stuffs them at the end of the page. Then I write the result as gzip-compressed HTML to disk. The output is for the most part pristine HTML. You don't even need a browser to read it. Netcat is plenty.
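+
+For the curious, the link-scrubbing step boils down to something like the following sketch. It is not the actual code, and it assumes jsoup for the DOM work, but it captures the idea: pull the anchors out of the running text, keep their labels, and append them as a plain list at the bottom before gzipping the result.
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.zip.GZIPOutputStream;
+
+public class LinkScrubber {
+    public static void scrub(String html, Path out) throws Exception {
+        Document doc = Jsoup.parse(html);
+
+        // Collect the inline links, replacing each anchor with its plain text
+        Element linkList = new Element("ul");
+        for (Element a : doc.select("a[href]")) {
+            linkList.appendElement("li").text(a.text() + " - " + a.attr("href"));
+            a.unwrap();
+        }
+        doc.body().appendChild(linkList);
+
+        // Write the massaged page as gzip-compressed HTML
+        try (var writer = new OutputStreamWriter(
+                new GZIPOutputStream(Files.newOutputStream(out)), StandardCharsets.UTF_8)) {
+            writer.write(doc.outerHtml());
+        }
+    }
+}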
+
+Formulas were a bit tricky, and the best solution I could find was rendering them into PNG and inserting them directly into the HTML. As long as nobody tells Donald Knuth, I think I may get away with this cruel affront to typesetting mathematics ;-)
+
+Rendering takes about 24 hours and produces some 14 million files, 60 Gb in total. I have no doubt it could be done faster, but a day's worth of cooking really isn't even that bad since these dumps come out about once every six or so months.
+
+

Thoughts

+
+Two things become apparent after using the scrubbed encyclopedia for a while.
+
+The first is that it really is a lot easier to read once you remove all the points of distraction. I start reading it like a book. I've gotten stuck reading articles in a way I rarely do in Wikipedia. I've learned quite a lot too. This has been my hypothesis since before I embarked on this project, that inline hyperlinks and images do more to disrupt readability than to enhance it.
+
+The second observation is more surprising: I find it far more apparent when I don't fully grasp a topic. It is as though hyperlinks make us think that information is available to us, and because of that, we estimate that we essentially already understand the topic, because we could find out later.
+
+This is of course not sound logic at all, but I think that is what happens when we see an underlined word and aren't quite sure what it means. So we keep reading as though we did know, and never go back to click the link, because if you click every link, you won't get past the first sentence in any article.
+
+The experience when reading the scrubbed encyclopedia is one of needing to take notes of things to look up later, one of barely understanding the text even in areas I'm quite well versed in, even on pages I've previously read on Wikipedia.
+
+I wonder if this effect is part of why there are so many experts these days. Covid breaks out, and everyone is suddenly an immunologist. IPCC report drops and everyone is suddenly a climate scientist. If there's a war, everyone is a general; if someone wants to lose weight, everyone is an expert on that too (even if they've never tried themselves). Nevermind the fact that it takes a decade of studies to even get a PhD, nobody seems to need any of that.
+
+

Links

+
+ + +
+https://reddit.marginalia.nu/
+https://encyclopedia.marginalia.nu/
+https://encyclopedia.marginalia.nu/wiki/Hildegard_Of_Bingen
+
+https://dumps.wikimedia.org/
+
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-test.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-test.gmi new file mode 100644 index 00000000..d0decd24 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/13-test.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/log/13-test.gmi is gone

+

+This was just a test file!! + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/14-enter-the-circle-of-blame.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/14-enter-the-circle-of-blame.gmi new file mode 100644 index 00000000..14cbeebb --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/14-enter-the-circle-of-blame.gmi @@ -0,0 +1,95 @@ + + + + + MEMEX - Enter the Circle of Blame [2021-08-15] + + + + + + +
+ +
+ + +
+
+

Enter the Circle of Blame [2021-08-15]

+
+So that IPCC report, huh. It's provoked interesting behavior in people. I'll skip over the minority that deny the findings; many people seem to agree that the report isn't great news. Then they stop to look around and start pointing fingers.
+
+In summary:
+
+
    +
  • The producers blame the consumers for making the wrong purchases.
  • +
  • The consumers blame the producers for producing the wrong things.
  • +
  • The voters blame the politicians for incorrect policies.
  • +
  • The politicians blame the voters for expressing the wrong wants.
  • +
  • The right blames the market for not adapting fast enough.
  • +
  • The left blames capitalism for being shortsighted.
  • +
  • The dogs blame the cats and the cats blame the mice.
+
+We can run around this endless circle of blame until it's 2121, and we still won't have gotten any closer to finding a solution. What we're looking for is some witch to burn, someone to be really mad at for causing us grief; but what we need is to change our behavior. Don't forget: The producers are people. The consumers are people. The voters are people. The politicians are people. The rich, the poor, the right, the left, the market, capitalism; it's all people.
+
+It's all people. Making decisions. With consequences.
+
+But in our heads, what it's all down to is: It's someone else's fault. It's always someone else's fault. It becomes a discussion of who is to blame, who to judge, rather than what we ourselves can do.
+
+Guilt, in the legal sense, must be on some party. This feels like doing something, but it's a great mirage. In the grand scheme of things, blaming someone else, dragging them to the stake to be burned, it does nothing, it does less than nothing: It wastes time. The problems they caused still remain and we are no closer to solving them.
+
+That does not hold true for our ability to effect change. Sometimes, if we want the world to change, we can't wait for the people who are to blame to fix the problem. Odds are they never will, given that they are the same people who willfully neglected the problem into existence in the first place.
+
+Almost every single one of us ten billion people are, to some degree, to blame for this mess. Who is more to blame is utterly irrelevant to fixing the problem. Here's a plot twist: All the people pointing fingers and assigning blame are perfectly correct. Why yes, there are other people who are sinners too. But so are we.
+
+We can't control other people's actions, but we can control our own; so that's where it must start. If we think the world should be in such-and-such way, it is our obligation to act in a way that is aligned with that.
+
+The question that remains is this: How long will you wait until you start acting the way you, yourself, think you should act? How long will you wait until you start taking responsibility for your actions?
+
+Just sitting on our hands and blaming other people for not fixing our problems doesn't align with any agenda other than making us sanctimonious hypocrites. It may feel good, but it doesn't do good.
+
+Almost every single one of us needs to make changes if we want to course correct. And no single person can fix it alone. I can't fix it alone. You can't fix it alone. Elon Musk can't fix it alone, Xi Jinping can't fix it alone. Everyone needs to look to themselves and what they can do, regardless of what other people are doing. We simply cannot compel other people to think like we do, as the only thing we do have absolute control over is our own everyday decisions.
+
+But nevermind all that, let's get back to pointing fingers and doing nothing instead. Maybe if we get really angry about some group that thinks differently, that'll surely get us somewhere!
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/15-stages-of-being.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/15-stages-of-being.gmi new file mode 100644 index 00000000..a66f119b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/15-stages-of-being.gmi @@ -0,0 +1,130 @@ + + + + + MEMEX - Stages of Being [2021-08-23] + + + + + + +
+ +
+ + +
+
+

Stages of Being [2021-08-23]

+
+@sdfgeoff asked an interesting question on station just a while ago
+
+
+ How much of your lives do you spend living (or watching) someone else's?
+
+It reminded me of an interesting tool kit for understanding being.
+
+The neoplatonists describe a hierarchy of being. This is sometimes attributed to Renaissance enfant terrible Pico della Mirandola's Oration on the Dignity of Man, but there it's given only the most cursory mention. I've attached the quote at the bottom of the post. Plato's Republic is probably a better source, even if it doesn't draw up the hierarchy quite in this fashion.
+
+

Existence

+
+At the lowest rung is existence. Examples of things that merely exist are rocks, water, the sky. These existences can change, but can not enact change.
+
+

Growing

+
+At the next rung is growing. Examples of things that exist and grow are trees, grass, mushrooms, algae, germs. These existences can enact change, but only in a simple and mindless fashion.
+
+

Sensing

+
+At the next rung is sensing and reacting. Examples of things that exist, grow and sense/react are most animals. They are slavishly compelled to seek food and sex and to avoid harm. This is a nature of being that is reactive. It's a nature of behaving that is easily replicated with a computer.
+
+

Thinking

+
+At the final rung is thinking. Examples of things that exist, grow, react and think are human beings. We are capable of overriding our instincts, and of changing our judgements and wants. We are capable of choosing to endure almost any hardship if we opt to. We are capable of reshaping the world to suit our will, or reshaping our will to suit the world. This is a nature of being that is active. Thinking existence means we can engage in a dialogue with the universe, rather than go down a series of if-then-else statements based on what nature throws our way.
+
+It's an anthropocentric point of view, but it's out of an anthropocentric treatise on the potential of human beings to step beyond their base animal nature and actually do great things, a potential that Mirandola posits exists in all humans.
+
+The key point is that all things in the hierarchy have all the natures of the rungs below them as well. This is what justifies putting them in that order.
+
+Human beings are not just capable of having an active human nature; we can also regress to a reactive animal nature, become vain and vicious, helpless slaves to the carrot and stick that is pleasure and pain; or even sink to a vegetative life that is only marked by mindless consumption. But both those human existences are a waste of human potential compared to the life where thinking is allowed to pilot the ship.
+
+
+ [N]ever think, my friend, that you are free while your belly rules you and the part below the belly, since you will then have masters who can either furnish you the means of pleasure or deprive you of them [...]
+
+- Emperor Julian, Oration VI
+
+I do not think we need to live as ascetics who renounce every pleasure and choose every hardship, but I do suggest that maybe the spiritual practice of periods of fasting and abstinence may not be entirely about following arbitrary religious mandates: Bouts of denial do help exercise those muscles of willpower and resolve that seem so severely atrophied in many of us these days. After all, hunger makes the metaphorical food, when it comes, all the tastier. At least for the man who can endure the wait.
+
+

Links

+
+gemini://station.martinrue.com/sdfgeoff/80736a6a7b88497ea98a50784d9d8e77
+ + +
+

Pico's quote

+
+
+ But upon man, at the moment of his creation, God bestowed seeds pregnant with all possibilities, the germs of every form of life. Whichever of these a man shall cultivate, the same will mature and bear fruit in him. If vegetative, he will become a plant; if sensual, he will become brutish; if rational, he will reveal himself a heavenly being; if intellectual, he will be an angel and the son of God.
+
+- Pico della Mirandola, Oration on the Dignity of Man
+
+
+

Topic

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/16-cursed-motivation.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/16-cursed-motivation.gmi new file mode 100644 index 00000000..edf4a806 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/16-cursed-motivation.gmi @@ -0,0 +1,96 @@ + + + + + MEMEX - Cursed Motivation [2021-08-27] + + + + + + +
+ +
+ + +
+
+

Cursed Motivation [2021-08-27]

+
+A question I often see asked is one along the lines of
+
+
+ How do I motivate myself to do (something)?
+
+... where something may be eat healthier, go to the gym, work on some project, study hard, &c.
+
+This idea of motivation is interesting. I think it in part comes from the school system, where teachers and parents often talk about motivating the children to study, perhaps with some sort of reward system. I haven't been able to pinpoint exactly who introduced the idea, but my hunch is based on never seeing the particular usage of the word in a book printed before the late 20th century.
+
+What we seem to mean by the word is that feeling of excitement for a task that compels us to do it. But in practice, motivation is a fair weather friend at best. Motivation goes away at the first hint of an obstacle. The eye of the tiger screeches to a halt and your montage is replaced by the harsh reality that work is still work. As a means of compelling yourself (or someone else) to follow through on something, it's a disastrously useless tool. It is however a very useful money-making tool. There is an entire industry dedicated to selling useless motivation. Motivational speakers, motivational books, motivational posters. But it is, to be perfectly clear, snake oil they are selling.
+
+Moving beyond the assumption that motivation will be the solution, there is a second problem with the question being asked. The asker sees himself as two people, when in reality, he is one person with two sets of wants: short-term wants and long-term wants; wants that he doesn't realize are in conflict and doesn't know to weigh against each other.
+
+We can characterize the short-term wants as those wants that reflect on what we want to experience. These are the wants of motivation, of craving, of fear, of anxiety. Long term wants on the other hand are wants of what sort of person we want to be. These are existential wants, of ideals, of morals.
+
+This is not a modern problem. Saint Augustine, living some 1600 years ago, famously declared
+
+
+ Give me chastity, but not just yet.
+
+This disparity between what we actually do and how we think we ought to act is one humans seem to have rediscovered at regular intervals. Modern economists and behavioral scientists talk of the value-action gap; the moral philosophers of antiquity talked of moral incontinence. They're arguably different words for the same thing.
+
+A part of the problem is understanding. As mentioned, it seems oddly popular to view yourself as two different people. All that does is breed a sense of helplessness and a feeling of not being in control. But as much as you can raise your arm on the command of your will, you are in control now and always. This appearance of a lack of control is a mirage brought on by refusing to acknowledge your vacillating goals and unclear ambitions.
+
+The superior approach, I think, is to reflect on your actions, and to build an understanding of things as they are in contrast to what they promise they will be. For example, many of the things we crave only seem a good idea in the future, but seem a bad idea in the past. Alcohol is perhaps the best example of this effect, but even beyond things that cause a literal hangover, this is a common pattern. The opposite pattern of appearances can be found in things that seem like chores. Going to the gym on a rainy day, cleaning your house, going to the dentist: Few look forward to these things, but fewer still regret having done them. Should we really trust these questionable promises of the future in the face of what the past teaches us again and again?
+
+This practice of reflection also needs to extend to long-term wants. Are they actually things we want, or things we think others will be impressed by? Fulfilling other people's goals isn't necessarily fulfilling your own. The reason we compel ourselves to follow through with these long-term wants is because it is what we want. If we take resolute steps in the wrong direction, if we follow through on what we don't want, we're worse off than if we had wandered aimlessly and not followed through on anything. We also need to be brutally honest with ourselves about the reasons why we do (or avoid) some things for this to work.
+
+As a closing note, I'll mention self-esteem, as a highly related topic people seem to have all manner of strange ideas about. They think they need to go to the gym and get buff in order to get self-esteem; they do, and improve their self-esteem, but it was following through on that long-term want, rather than shifting physical appearance, that improved their self-esteem. It is in the name, *self*-esteem. You build regard for yourself by doing the things you think you should do. You lose it by doing things you think you shouldn't do.
+
+

Topic

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/17-git-isnt-a-web-service.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/17-git-isnt-a-web-service.gmi new file mode 100644 index 00000000..e505dbaf --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/17-git-isnt-a-web-service.gmi @@ -0,0 +1,109 @@ + + + + + MEMEX - Git Isn't A Web Service [2021-08-28] + + + + + + +
+ +
+ + +
+
+

Git Isn't A Web Service [2021-08-28]

+
+This is an expansion on a comment I left on Lettuce's gemlog post, "Personal Experiences and Opinions on Version Control Software".
+
+I've seen similar questions posed several times recently, in essence people searching for a good git provider.
+
+The thing is you don't need a git provider. Git is a shell command, and you can host a server yourself with almost no extra work. You can even host it off a system you don't have administrative access to.
+
+It's a shame git has become synonymous with this large web-application overgrowth for so many these days. It is and remains a fairly complete shell command, and github and its clones are third party extras that have latched themselves onto the ecosystem and seem to be doing their best to suck the life out of it through gamification and other morally questionable startup practices.
+
+Remember sourceforge, once the paragon of everything open source? Remember when they were bought up and subsequently caught with their fingers in the cookie jar, bundling malware with the software on their site? The lesson we should have learned wasn't "let's move everything from one huge platform to another huge platform"; the lesson we so desperately needed to learn was that we should host our projects ourselves if we want to retain any sense of control over them.
+
+

Set-up

+
+Self-hosting git is extremely easy to set up. All you need is a raspberry pi, or any other computer with ssh access and git installed.
+
+I linked to the official documentation below, but the really quick way to get started is to do this on the server:
+
+
+$ mkdir my-project
+$ cd my-project
+$ git init --bare
+
+And then you do this on the client from your existing git project:
+
+
+$ git remote add origin git-user@server:my-project
+
+If you want to move from one existing remote to your new server, you use 'set-url' instead of 'add'.
+
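+One step the quick version above glosses over: nothing actually lands on the server until you push. Assuming your branch is called master, that's just
+
+$ git push -u origin master
+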
+That's it! Now you have an off-site backup for your code.
+
+If you want a web interface for sharing your code more publicly, something like gitweb is a good alternative to gitlab and similar; it's much more lightweight (and a bit barebones), but also very easy to set up. Please refer to the git book links below for instructions.
+
+

Links

+
+gemini://gemini.ctrl-c.club/~lettuce/git-services.gmi
+
+ + +
+

Topic

+
+/topic/platforms.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/18-soaring-high.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/18-soaring-high.gmi new file mode 100644 index 00000000..4fe668d3 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/18-soaring-high.gmi @@ -0,0 +1,97 @@ + + + + + MEMEX - Soaring High [2021-09-02] + + + + + + +
+ +
+ + +
+
+

Soaring High [2021-09-02]

+
+I'm currently indexing with my search engine. This isn't an always-on sort of an affair, but rather something I turn on and off as it tends to require at least some degree of babysitting.
+
+I've also been knocked out by the side-effects of the vaccine shot I got the other day, so it's been mostly hands-off "parenting".
+
+What I'm trying to figure out is just how far I can take it. I really don't know. I took some backups and just let it do its thing relatively unmonitored.
+
+I've done this several times before. I let it go, and find where it falls apart, fix it, and let it go again to see which new soaring heights it will reach.
+
+So far this run there have been a few points of data congestion that needed clearing out. A semaphore here, some optimization there, but for the most part, it has just been chugging along with an ominous ease.
+
+I've run it for a few days now, and the index is about thirty-five million URLs in size. The size when I started was about fifteen million. Thirty five million is already breaking a record. Will it make fifty million? A hundred? I really don't know. It could easily blow up in a calamitous fireball in the next fifteen minutes.
+
+The true limits are beyond my comprehension. There are too many variables at play to predict where they are, and whether any given one is *the* limit. So far there has always been another optimization that has been able to save the day. For how long will those boons continue? Surely there must be some upper bound to a search engine hosted in an apartment living room.
+
+I know of one boundary: The dictionary hash table. I know it will start to degrade noticeably, like most hash tables, when it's about 75% full. I know its growth seems linear, or at least linear-bounded, slowly ticking up by about six million entries per hour. I know that its full capacity is 2.1 billion entries, and that it's currently at 752 million. That means we are half way to 75% full. That is a vague boundary. A lethargic death-by-brownout, rather than a splat against some unseen brick wall.
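+
+Back-of-the-envelope, using the figures above: 75% of 2.1 billion is roughly 1.6 billion entries, the table holds 752 million, and it grows by about six million entries per hour; so the brownout zone is a bit over 800 million entries, or roughly five to six days of indexing, away.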
+
+I know of a brick wall, too: the partition holding the files for the mariadb instance that keeps track of URL metadata is about 37% full, at 44 Gb. That could be what blows up. MariaDB could also quite simply grind to a halt because it doesn't get enough ram. I've assigned it 40 Gb. It should be enough for a while. But I really don't know for how long.
+
+Maybe it's some envious plot of the operating system resource management that will ground the flight of the search engine. Right now the server OS gets 55 Gb of buffer memory. So far that keeps the disk thrashing at bay. So far.
+
+Incidentally, searching is a tad slow now, but that's not because it's approaching some performance limit, but because the caches aren't properly populated. Searching more fixes the problem. But there's also a script I run which just goes through a dictionary making queries that brings the query time down. Looks like a DOS attack, just spamming dozens of searches per second in a loop, but it's actually the opposite. Ironically, the worst thing for the search engine's performance is not getting enough search queries.
+
+Meanwhile, the search engine keeps indexing. Perhaps like a bumblebee crossing a road, blissfully unaware of the windshield hurtling its way.
+
+

Links

+
+https://search.marginalia.nu/
+gemini://marginalia.nu/search
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/19-website-discoverability-crisis.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/19-website-discoverability-crisis.gmi new file mode 100644 index 00000000..396ad4f6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/19-website-discoverability-crisis.gmi @@ -0,0 +1,132 @@ + + + + + MEMEX - The Small Website Discoverability Crisis [2021-09-08] + + + + + + +
+ +
+ + +
+
+

The Small Website Discoverability Crisis [2021-09-08]

+
+There are a lot of small websites on the Internet: Interesting websites, beautiful websites, unique websites.
+
+Unfortunately they are incredibly hard to find. You cannot find them on Google or Reddit, and while you can stumble onto them with my search engine, it is not in a very directed fashion.
+
+It is an unfortunate state of affairs. Even if you do not particularly care for becoming the next big thing, it's still discouraging to put work into a website and get next to no traffic beyond the usual bots.
+
+You get a dead-sea effect. Traffic is evaporating, and small websites are dying, which brings even fewer visitors. Rinse and repeat.
+
+Blogs limp along through RSS and Atom, but relying on feeds shapes everything you write into a blog entry. It's stifling, homogenizing. The blogosphere, what remains of it, is incredibly samey.
+
+I feel there ought to be a solution to this, a better way of doing things that can help, and perhaps the Internet as a whole is an irredeemable mess that will never mend, but maybe we can (somehow) make it easier for those who are actually looking to find what they seek.
+
+Maybe there are lessons that can be drawn from what works on Gemini, and what doesn't work on HTTP, that can synthesize into a sketch for a solution.
+
+Gemini seems to be discovering automatic link feeds (e.g. Antenna), and on gemini-scale it works pretty well. But I'm just going to state that automatic link feeds do not seem to work on HTTP any more. You end up with a flood of astroturfing, vapid click-bait and blogspam (i.e. reddit). Stemming the flood demands a ton of moderation and still results in dismal results.
+
+As a whole, I think centralized and algorithmic approaches are extremely exposed to manipulation when applied on the internet.
+
+Web rings are cute, but I think they are a bit too random to help. Likewise, curated link directories were a thing back when the Internet was in its infancy, but the task of maintaining such a directory is a full time job.
+
+You could go for some sort of web-of-trust model to only allow trusted submitters access to an automatic link feed, but that practice is exclusionary and creates yet more walled gardens, which impairs the very discoverability I'm trying to improve.
+
+Instead, perhaps there is a much simpler solution.
+
+

Simple federated bookmarking

+
+A proposal, dear reader: Create a list of bookmarks linking to websites you find interesting, and publish it for the world to see. You decide what constitutes "interesting".
+
+The model is as recursive as it is simple. There is nothing preventing a list of bookmarks from linking to another list of bookmarks.
+
+The creation of a bookmark list is a surprisingly fun project; it has some of the appeal of scrapbooking, and the end result is also appealing to browse through.
+
+It's a bit strange; almost nobody seems to be doing this. Looking through a sample of personal websites, very few of them have links to other personal websites. A hyperlink isn't a marriage proposal. It is enough to find some redeeming quality in a website to link to it. It costs nothing, and helps bring traffic to pages that you yourself think deserve it.
+
+If we actually want these small websites to flourish as a healthy community, we need to promote each other much more than we do. It is advertisement, yes, but in earnest. I like it when other people link to my stuff. What sort of hypocrite would I then be if I only ever linked to my own websites?
+
+Leading by example, I set up my own list of bookmarks:
+
+https://memex.marginalia.nu/links/bookmarks.gmi
+
+

Replies and Comments

+
+gemini://station.martinrue.com/kevinsan/a25c7f2fd24b487483244c938f2217e7
+gemini://szczezuja.flounder.online/gemlog/2021-09-08-Planned-undiscoverability-of-small-sites.gmi
+
+

Topic

+
+/topic/web-design.gmi
+/topic/platforms.gmi
+/topic/astrolabe.gmi
+/links/bookmarks.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/20-dot-com-link-farms.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/20-dot-com-link-farms.gmi new file mode 100644 index 00000000..1dd8caac --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/20-dot-com-link-farms.gmi @@ -0,0 +1,130 @@ + + + + + MEMEX - The Curious Case of the Dot-Com Link Farms [2021-09-09] + + + + + + +
+ +
+ + +
+
+

The Curious Case of the Dot-Com Link Farms [2021-09-09]

+
+I spent some time today weeding out yet more link-farms from my search engine's index.
+
+Typically what I would do is just block the subnet assigned to the VPS provider they're on, and that does seem to work rather well. The cloud providers that don't police what they host are almost always home to quite a lot of this stuff, so I don't particularly mind scorching some earth in the name of a clean index.
+
+Today's link farms turned out to be more of a three-pipe problem. They had a pretty predictable address pattern, so it wasn't incredibly difficult to round them all up. Below are two examples out of a million or so URLs I flagged.
+
+I'm redacting the full addresses. If you click on them, at best you end up at an online casino or a porn site, but there's a pretty decent chance you'll be exposed to malware.
+
+
+http://█████-███████████████.com/radiorules/wp-content/plugins/book/bacterial-activation-of-type-i-interferons-2014/
+http://███████████.com/pdf/download-a-companion-to-renaissance-drama-blackwell-companions-to-literature-and-culture.html
+
+It's strange because a large portion of them had .com domains, some had .org and a few even .edu. That's unusual, because these top level domains are expensive and inaccessible. We're also talking about 20,000 domains.
+
+My initial response was something like "wow, this operation has deep pockets! That's a quarter of a million dollars per year in registration fees alone." Actually, a bit too deep; the more I thought about the economics of it all, the less it added up.
+
+One curious aspect is that they didn't quite seem to link very closely to each other. Most link farms do, but the most reliable way of finding these links was to go on URL pattern alone.
+
+Visiting the domain's index page without the full URL usually presented a reasonably innocent-looking website, a few of them were personal sites, some were businesses. Sometimes with signs of poor maintenance, but it seemed to be something someone at some point put actual work into building; not just some low-effort copy-paste facade put up to fool the VPS provider.
+
+That's another clue. Often times link farms will try to look innocent, but I think that's only part of what's going on here.
+
+It slowly dawned upon me
+
+

It's all compromised WordPress deployments!

+
+Yeah, what if these web sites aren't merely fronts, but actual websites made by people and not scripts? Maybe the reason they can afford a quarter of a million dollars in registration fees is because they aren't paying any of it? What if what I'm looking at is in fact 20,000 hacked WordPress deployments?
+
+If you have a web server (or really any TCP port open to the internet), you've probably seen the constant probing. You know, the stuff...
+
+
+2021-09-08T05:54:22+02:00 "GET //site/wp-includes/wlwmanifest.xml HTTP/1.1"
+2021-09-08T05:54:23+02:00 "GET //cms/wp-includes/wlwmanifest.xml HTTP/1.1"
+2021-09-08T05:54:24+02:00 "GET //sito/wp-includes/wlwmanifest.xml HTTP/1.1"
+2021-09-08T09:53:28+02:00 "GET /wp-login.php HTTP/1.1"
+2021-09-08T09:53:29+02:00 "GET /wp-login.php HTTP/1.1"
+2021-09-08T09:53:30+02:00 "GET /wp-login.php HTTP/1.1"
+2021-09-08T10:00:03+02:00 "GET /wp-content/plugins/wp-file-manager/readme.txt HTTP/1.1"
+2021-09-08T14:32:41+02:00 "GET /wp/ HTTP/1.1"
+2021-09-08T23:52:56+02:00 "GET /wp-content/plugins/wp-file-manager/readme.txt HTTP/1.1"
+2021-09-08T23:52:59+02:00 "GET /wp-content/plugins/wp-file-manager/readme.txt HTTP/1.1"
+
+I think this is what they do to you if you actually do happen to run an older WordPress installation.
+
+

Related Links

+
+https://search.marginalia.nu/
+/log/04-link-farms.gmi
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/21-new-solutions-old-problems.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/21-new-solutions-old-problems.gmi new file mode 100644 index 00000000..357d48a2 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/21-new-solutions-old-problems.gmi @@ -0,0 +1,162 @@ + + + + + MEMEX - New Solutions Creating Old Problems [2021-09-14] + + + + + + +
+ +
+ + +
+
+

New Solutions Creating Old Problems [2021-09-14]

+
+I've spent some time the last week optimizing how the search engine identifies appropriate search results, putting far more consideration into where and how the search terms appear in the page when determining the order in which results are presented.
+
+Search-result relevance is a pretty difficult problem, but I do think the changes have brought the search engine in a very good direction.
+
+A bit simplified, I'm building tiered indices, ranging from
+
+
    +
  • Words in the title and first H1-tag
  • +
  • Words in the title, all H*-tags, <B>-tags, and keyword meta-tags.
  • +
  • Capitalized Words in text
  • +
  • Words in text
+
+The indices are queried in the order listed above, so that (hopefully) most relevant results are extracted before mere off-hand mentions.
+
+Another change is that queries are broken down into several possible N-grams, which are searched in decreasing order of length. I did this to a very basic degree before, but this is much more exhaustive.
+
+Determining that a term doesn't exist in the index is an incredibly fast O(1) process, so performing many queries for N-grams that don't exist isn't a problem, even if this results in a large number of queries for a single search.
+
+Example: If you type "Starcraft 2 Legacy of the Void" into the search bar, the search server will perform these queries:
+
+
+starcraft_2_legacy_of|the_void 
+starcraft_2|legacy_of_the_void 
+starcraft_2_legacy|of_the_void 
+starcraft_2_legacy|of_the|void 
+starcraft_2|legacy_of_the|void 
+starcraft_2|legacy|of_the_void 
+starcraft_2|legacy_of|the_void 
+starcraft_2|legacy|of_the|void 
+
+The search code only constructs (up to) 4-grams, and caps them to at most 16 to prevent denial-of-service searches that generate astronomical numbers of queries in the backend.
+
+There is no "starcraft|2|legacy|of|the|void" because "2", "of", and "the" are stop words; that is words that are not indexed in isolation and can be trivially discarded from consideration.
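+
+This is not the actual query construction code, but the general idea can be sketched like so: recursively pick the length of the next n-gram (up to four words), and throw away any segmentation that leaves a stop word standing alone. The class and names below are made up for the illustration.
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+public class NGramSketch {
+    static final Set<String> STOP_WORDS = Set.of("2", "of", "the"); // illustrative list
+    static final int MAX_NGRAM = 4;
+    static final int MAX_QUERIES = 16;
+
+    // Enumerate the ways of chopping the query into consecutive n-grams of up to
+    // four words, skipping any split that leaves a stop word as a lone term.
+    static List<String> segmentations(List<String> words) {
+        List<String> out = new ArrayList<>();
+        recurse(words, 0, new ArrayList<>(), out);
+        return out;
+    }
+
+    static void recurse(List<String> words, int pos, List<String> acc, List<String> out) {
+        if (out.size() >= MAX_QUERIES) return;
+        if (pos == words.size()) {
+            out.add(String.join("|", acc));
+            return;
+        }
+        for (int len = Math.min(MAX_NGRAM, words.size() - pos); len >= 1; len--) {
+            List<String> gram = words.subList(pos, pos + len);
+            if (len == 1 && STOP_WORDS.contains(gram.get(0)))
+                continue; // stop words are never queried in isolation
+            acc.add(String.join("_", gram));
+            recurse(words, pos + len, acc, out);
+            acc.remove(acc.size() - 1);
+        }
+    }
+
+    public static void main(String[] args) {
+        segmentations(List.of("starcraft", "2", "legacy", "of", "the", "void"))
+                .forEach(System.out::println);
+    }
+}
+
+The real query construction evidently prunes the set further (the list above only has eight entries, where this sketch produces a few more), but the shape of the problem is the same.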
+
+I think I've made good progress, since a lot of the problems I'm starting to encounter aren't teething problems, but the sort of problems "real" search engines struggle with. That's actually pretty exciting!
+
+

Keyword Stuffing and Search Engine Manipulation

+
+Keyword stuffing is really an old problem, and is why many search engines, for example, disregard keyword-tags altogether. It really is what it sounds like. I ended up looking at the tag only when it is sufficiently short. This seems like a workable compromise for now.
+
+I also had some problems with extremely SEO-savvy sites showing up in the top results. Like, your mobile apps and stuff, but that turned out to be the result of a bug in the order the indices were prioritized, so now they are back at the bottom of the page.
+
+

Very Esoteric Queries

+
+If you search for "Scamander", you'll get an idea of what I mean.
+
+It's a river in Turkey, known today as Karamenderes. In the Iliad, Achilles, who is known for his mood swings, gets so goddamn angry he picks a fight with the river Scamander, known as Xanthos by the gods (yeah, I don't get it either). More recently, Newt Scamander is also some J.K. Rowling character.
+
+There just aren't any good results for Scamander. If you scroll down quite a bit you may find a passage in Cratylus by Plato where Socrates is appealing to the wisdom of the Iliad to make a point about names and their relationship to what they represent, but that's the absolute highlight of the search results.
+
+You get better results if you qualify the search as "scamander iliad", or "newt scamander", but this is a tricky one. It hopefully will improve as I index further.
+
+To be fair, there really aren't any good results on google either. Just storefronts shilling Harry Potter merchandise, but that's to be expected.
+
+

Political Extremism, and Other Distasteful Content

+
+There has always been some amount of results where the author is frothing at the mouth over cultural marxists or the jews or Trump or various culture wars nonsense, but that's just the nature of the Internet in the 2020s. For a while it felt like I was getting too many of these results, even in queries where they really shouldn't show up, but it seems to have settled down a bit.
+
+In general, I do not believe it is my job to police other people's ideas, no matter how much I disagree with them. Thought-policing is a far greater evil than disagreeable ideas.
+
+At the same time I don't want my search engine to become the go-to search engine for extremists. That's not a good look. But I'll cross that bridge when I come to it.
+
+So far I'm doing nothing as long as they aren't doing bait-and-switch tactics that cause them to show up in innocent queries. If I find something especially distasteful I might just blacklist the site.
+
+I've employed a similar tactic toward porn, escort sites, and the like. If I find them while searching for something innocent, I'm blacklisting them; but I'm not going out of my way to make sure they don't exist anywhere in the index, as even if I wanted to, that's just not feasible. There is a lot of smut on the Internet.
+
+

Search Engine

+
+https://search.marginalia.nu/
+
+

See Also

+
+https://encyclopedia.marginalia.nu/wiki/N-gram
+https://encyclopedia.marginalia.nu/wiki/Stop_word
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/22-against-the-flood.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/22-against-the-flood.gmi new file mode 100644 index 00000000..19ed320e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/22-against-the-flood.gmi @@ -0,0 +1,133 @@ + + + + + MEMEX - Against the Flood [2021-09-19] + + + + + + +
+ +
+ + +
+
+

Against the Flood [2021-09-19]

+
+So hacker news apparently discovered my search engine, and really took a liking to the idea. Actually that's a bit of an understatement: the thread has gotten 3.3k points and lingered on the front page for half a week. And I wasn't planning for it to go quite that public yet. It has quietly been online for a while, but it was only very recently it started to feel like it was really coming together. It wasn't perfect; there was still a lot of jankiness and limitations that could have been fixed with more time. The index was half the size it should have been. Someone discovered it and shared it. It took off like a rocket, and I'm still at a loss for words at the reception it's gotten. I have received so many encouraging comments, emails, and offers of collaboration; a few have even joined the patreon. I've been working through all the messages and I aim to reply to them all, but it takes time. I'm very grateful for all of this, since I half thought I was alone in this.
+
+In building this, I had a hunch I was the next TempleOS-guy, quietly building something ambitious the world just wouldn't be able to relate to. Turns out that just isn't the case at all.
+
+But rewinding back a bit to last Thursday when this all began. I looked at a log and noticed I got more searches than usual. It quickly turned into a lot more searches. The logs just kept scrolling at a dizzying rate as I was tailing them. I didn't know then, but the server was getting about 2 search queries per second, a sustained load that lasted most of the night. The server withstood the barrage without going down, without even feeling slow.
+
+To be perfectly clear, my server (and I have just one of them) is a single computer. It is not a 42U tower like what you see on /r/homelab, but simple consumer hardware. The motherboard is a kinda shitty mATX board, the CPU is a Ryzen 3900X, and it has 128 Gb of RAM but no ECC. Stick a high end GPU in it and it would basically be a gaming PC with a silly amount of RAM and a weird disk configuration. The modest little cube sits quietly humming in my living room next to a UPS I got a few weeks ago because of all the thunderstorms and outages this summer.
+
+My home network flows through a cheap 100 mbit router I've had since 2006; I purchased it when I first moved into my own apartment. I really think this is the craziest part of the whole story. If anything were to just keel over and die at managing tens of HTTP requests per second, it would be that piece of IBM-beige antiquity (actually, looking at the backside reveals that it was once grayish-white, but sitting in the sun for 15 years does things to plastic).
+
+I had done some performance testing, and knew the search engine ought to hold up to a decent search pressure. But you don't really know if the ship floats until it's in the water, and here it suddenly found itself on an unexpected maiden voyage across a stormy ocean. There's a lot of moving parts in software this complex, and only one of them needs to scale poorly to bring it all down. But apparently not. In fact, due to how memory mapping interacts with disk caching, it searches faster now than it did before.
+
+

How is this even possible?

+
+I'm too well-acquainted with survivorship bias to pretend I know exactly what the secret sauce is. But I can offer some guesses:
+
+
    +
  • I serve most things that don't need to be dynamic as static HTML off nginx. This means that page loads are tiny; in many cases they can be less than 10 KB. I do load some fonts, but they should only load once, and even so the page load is about 100 KB.
  • +
  • I don't use cookies except to manage log-ins to MEMEX and my reddit front-end. This means the server doesn't have to keep track of session data for anyone other than myself. I don't have exact figures for how many people visited my server, but if I go on how many searches I got, it's probably around half a million to a million visits. That's half a million sessions that didn't need to be managed by the webserver.
  • +
  • I originally built the search engine targeting a Raspberry PI cluster. It's been quite a while since I migrated off it, but I do think this shaped the original design in a way where it needed to be extremely thrifty in terms of making use of hardware. Overall I think targeting small hardware is a very good practice when designing performant software, as it becomes extremely evident whenever you are doing something that is inefficient.
  • +
  • Java, for all its warts, boilerplate, and unfashionable enterprise-ness, is pretty good for building reliable web services.
+
+
+

The Future

+
+I'm still processing all of this. It's extremely encouraging how many people seem to like the idea. The project is in its infancy, and I have many ideas for improvements. There are also things that need to be tested to see if they work. It's probably going to be a pretty bumpy road, but I'm extremely grateful that I have people with me.
+
+Below are the things I'm working toward right now.
+
+

Short term

+
+
    +
  • There are some pretty arbitrary limitations on the search terms. I do think they can be softened a bit.
  • +
  • When you search for something and there are no good results, you currently get seemingly random links instead of an empty page. I'd like to try to see if I can prevent this, as it makes people think the search engine isn't working properly.
  • +
  • There's a lot of junk in the index due to a few bugs I recently discovered; binary soup, and pages with character encoding errors. These are hard to get rid of, so I need to re-crawl these pages and reconstruct the index. I will probably do this in a few weeks when the public attention has died down a bit, as it means taking it all down for a day, and then having awful search results for a few more days.
  • +
  • I want to see if I can, if not automatically perform, at least suggest alternative search queries, pluralization, term re-ordering, etc. NLP is pretty hard though, and there don't seem to be good libraries.
  • +
  • I'm thinking of resurrecting my pi cluster and using it as a smaller test environment so that I don't break "production" as much now that I have actual users. Should also help with keeping the performance in check.
+
+

Long term

+
+
    +
  • I may opensource a few of the specialized components used in the search engine. I built them typically because I couldn't find anything available that fit my rather unique requirements.
  • +
  • I want to crawl gemini space as well as HTTP.
  • +
  • I want to experiment with using link descriptions to paint additional search terms onto the pages they point to. This is nontrivial from a storage and computation standpoint when operating under my hardware constraints.
+
+

Pictures

+
+ + +
+

Links

+
+ + +
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/23-re-software-and-branding.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/23-re-software-and-branding.gmi new file mode 100644 index 00000000..f0704eb6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/23-re-software-and-branding.gmi @@ -0,0 +1,87 @@ + + + + + MEMEX - Re: Software and Branding [2021-09-21] + + + + + + +
+ +
+ + +
+
+

Re: Software and Branding [2021-09-21]

+
+gemini://friends.riverside.camp/~clarity/journal/branding.gmi
+gemini://idiomdrottning.org/re-branding
+
+Some interesting thoughts going around on the topic of branding in software and websites. I've had thoughts like these too, and purposefully designed a lot of my website un-branded. I have no logos, no banners, barely any navigational links. I figured I would just see what happens if you subvert this paradigm of web design, since it is stuff you typically just "scroll past" to get to what you care about, like you skip the beginning of every youtube video. Since the point has never been to create a brand, I figured I would elevate the message so far above everything else that there simply is nothing else.
+
+In light of how it's turning out, I'm starting to re-evaluate this decision a bit. I see people get really confused when they visit my web pages. All the pages have the same nicotine color, the same visual style, and there is no logo to really distinguish them; to the extent I sign my work, it's hidden in some corner, as I believe the message is always more important than the messenger.
+
+As annoying as it is to have branding, I'm beginning to get the sense that navigation hinges upon visual landmarks and distinct features, on the established language of expectations and signs. And removing all these landmarks creates a disorienting whiteout.
+
+The bigger my website is getting, the more confusing it is becoming to navigate. I think I need to think about how this all should connect together. Right now almost none of it does.
+
+I have these pieces, but going from one to the other isn't easy at all, it hinges upon either exploring my defunct blog (in Swedish), or going deep into the projects directory of the memex.
+
+It's been in the back of my mind for a while now, I need to figure out a way to tie these things all together:
+
+https://memex.marginalia.nu/
+https://encyclopedia.marginalia.nu/
+https://search.marginalia.nu/
+https://reddit.marginalia.nu/
+https://www.marginalia.nu/podcast/new.html
+
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/24-silly-hats.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/24-silly-hats.gmi new file mode 100644 index 00000000..ea84a3dc --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/24-silly-hats.gmi @@ -0,0 +1,103 @@ + + + + + MEMEX - Thoughts on Silly Hats [2021-09-27] + + + + + + +
+ +
+ + +
+
+

Thoughts on Silly Hats [2021-09-27]

+
+If you look back in history to the turn of the 20th century, you will find a lot of people wearing hats. Women wore some arguably pretty funny over-the-top hats. A while earlier, the men wore arguably funny top hats. This isn't the first time men and women wore funny clothes. The aristocrats in 1680s France looked pretty silly too.
+
+The reason we do this, wear silly hats, is because it is fashionable. Compliance with some perceived fashion trend is one way we compete with fellow human beings, a measuring stick we use to evaluate our standing within society. Oh, you merely wear a modest and peculiar hat? Well mine is bigger and sillier still, therefore I am better!
+
+Eventually everyone collectively realizes things have gotten out of hand, and things die down for a few decades until a new form of hat starts to emerge.
+
+It isn't just in fashion we do this. Any norm can be a hat. Teenagers often seek out really obscure music or movies for the sake of having something that the other kids don't have; it creates identity, even if it's "the guy that listens to micronesian corecore music from the '70s". Another harmless example is imposing limitations and strictures on what we eat.
+
+It absolutely happens in software too. There are definitely people who perceive themselves as gods among men for using the most insanely obscure compile-everything-by-hand Linux distribution, or only using software that adheres to some super strict set of license requirements.
+
+Before you break out the pitchforks, I'm not saying that any of this is pointless. In fact, eschewing norms can be another example of a silly hat; and a society without values, or with values we do not subscribe to, is not something that is conducive to happiness (c.f. Durkheim's Anomie). That is, we need these measuring sticks to impose some semblance of structure on our social surroundings, and that structure isn't inherently a prison, but something we crave. We are animals that feel good wearing silly hats, and naked without them. There really is no getting away from the silly hats.
+
+The hats have a dark side, too.
+
+Some participants of hustle culture make a silly hat of their poor life balance, working 160 hours a week and barely stopping to sleep. Some people make a silly hat out of their physique, starving themselves to stay impressively thin, or living in a gym to stay impressively wide.
+
+Intolerance is a hat many compete in growing to silly proportions. When they perceive that some intolerance is approved of, they grow theirs even larger.
+
+The opposite can also be a silly hat, turning the other cheek even in the face of the most grievous insult.
+
+On that note, according to Eusebius of Caesarea, the Christian ascetic Origen of Alexandria supposedly took his pious chastity so far that he castrated himself. It's questionable whether this actually happened. It more than likely was a smear campaign levied against Origen, but the fact that this was an accusation someone thought sounded credible does say a lot on its own about the self-destructive power of silly hats.
+
+While there appears to be no way of getting rid of hat-wearing without causing far bigger problems than the hats ever were, I think it's good to be aware of when we are engaged in this practice, as it is a force that can drive us to do silly things indeed. Maybe we should be better at deflating the hats before they grow silly beyond all proportion.
+
+

Illustrations

+
+ + +
+

Further Reading

+
+https://encyclopedia.marginalia.nu/wiki/Anomie
+
+

Replies

+
+https://matthewgraybosch.com/blog/re-thoughts-on-silly-hats.html
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/25-october-update.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/25-october-update.gmi new file mode 100644 index 00000000..43436bc3 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/25-october-update.gmi @@ -0,0 +1,139 @@ + + + + + MEMEX - Astrolabe - The October Update [2021-10-01] + + + + + + +
+ +
+ + +
+
+

Astrolabe - The October Update [2021-10-01]

+
+https://search.marginalia.nu
+
+The October Update is live. It introduced drastically improved topic identification and an actual ranking algorithm, and the result is interesting to say the least. What's striking is how much it's beginning to feel like a search engine. When it fails to find stuff, you can kinda see how.
+
+I've played with it for a while now and it does seem to produce relevant results for a lot of topics. It's a trade down in whimsical results, but a big step up if you are looking for something specific, at least within the domain of topics where there are results to find.
+
+What's really cool is how non-commercial a lot of the results are. If you search for say "mechanical keyboards", at the time of writing, 9 out of the first 10 entries are personal blogs. The Google result is... uh... yeah, a good example of why I started this project.
+
+

Ranking Algorithm Overview

+
+The ranking algorithm is a weighted link count that tallies distinct links on a domain-by-domain basis, given that they come from sites that have been indexed sufficiently thoroughly.
+
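+For illustration, here's a minimal Java sketch of that counting scheme. The types and names are hypothetical stand-ins (not the actual code), and the page-appearance weighting is left out:
+
+import java.util.*;
+
+class DomainLinkRanker {
+    record DomainLink(String sourceDomain, String destDomain) {}
+
+    // Count how many distinct, sufficiently-indexed domains link to each domain.
+    static Map<String, Integer> rank(Collection<DomainLink> links, Set<String> thoroughlyIndexed) {
+        Map<String, Set<String>> sources = new HashMap<>();
+        for (DomainLink link : links) {
+            if (!thoroughlyIndexed.contains(link.sourceDomain())) continue; // skip shallowly indexed sources
+            if (link.sourceDomain().equals(link.destDomain())) continue;    // ignore self-links
+            sources.computeIfAbsent(link.destDomain(), k -> new HashSet<>())
+                   .add(link.sourceDomain());                               // distinct source domains, not raw link counts
+        }
+        Map<String, Integer> score = new HashMap<>();
+        sources.forEach((domain, s) -> score.put(domain, s.size()));
+        return score;
+    }
+}
+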
+It really does seem to produce pretty decent results. Here are the current top 15 domains.
+
+
++-------------------------------+---------+
+| URL_PART                      | QUALITY |
++-------------------------------+---------+
+| www.fourmilab.ch              | 92.8000 |
+| www.debian.org                | 91.8000 |
+| digital.library.upenn.edu     | 77.7000 |
+| www.panix.com                 | 77.1000 |
+| www.ibiblio.org               | 75.7000 |
+| users.erols.com               | 73.6000 |
+| www.openssh.com               | 70.5000 |
+| xroads.virginia.edu           | 66.7000 |
+| www.openbsd.org               | 65.4000 |
+| www.levity.com                | 63.4000 |
+| www.catb.org                  | 61.7000 |
+| www.webspawner.com            | 59.9000 |
+| www-personal.umich.edu        | 59.0000 |
+| onlinebooks.library.upenn.edu | 55.7000 |
+| www.postfix.org               | 49.1000 |
++-------------------------------+---------+
+
+

Walls of Text

+
+A strange thing that's happened is that it seems to really strongly prefer long form wall-of-text style pages, especially with very little formatting. I'd like to tweak this a bit; it's looking a bit too 1996, and this isn't supposed to be a "live" Wayback machine.
+
+Part of this may be because the search engine favors results where keywords appear the most frequently in a page, especially when they overlap with the title. It does trip up a lot of keyword stuffing-style SEO, since if you put all the keywords in a page, then nothing sticks out. However, in shorter pages, topical words may not appear sufficiently often.
+
+I've implemented optional filtering based on HTML standards, and I think with some adjustments I might be able to just add a "modern HTML" filter that picks up on stuff that looks like it's written after y2k based on the choice of tags and such. Unfortunately just going by DTD doesn't seem to work very well, as it appears many have "upgraded" their HTML3 stuff to HTML5 by changing the DTD at the top of the page and keeping the page mostly the same. I'm gonna have to be cleverer than that, but it feels reasonably doable.
+
+

Red October?

+
+I received some justified complaints that there was a bit too much right wing extremism in the search results in the August index. I haven't removed anything, but I've tweaked the relevance of some domains and it does seem to have made a significant difference.
+
+I did the same for some very angry baptists who kept cropping up telling video game fans they were going to burn in hell for eternity if they didn't repent and stop worshiping false idols.
+
+My main approach to this is to go after the stuff that is visible. If you go out of your way to look for extremist stuff, then you are probably going to find it. However if this type of vitriol shows up in other searches it is a problem.
+
+The commies seem less likely to crop up in regular search results, so I haven't gone after them quite as hard. This may give the current state of the search engine a somewhat left-wing feel. One could argue it does compensate for the far-right feel of the September index.
+
+Ultimately I really don't care about politics. I think loud political people are exhausting. Maybe you care about politics, that's entirely fine; I probably care about some things you don't want to hear about as well. I just don't want hateful tirades showing up in any search results, whether they are left, right, religious, atheist, pro-this, anti-that. These angry people feel so strongly about their convictions that they think they are entitled to impose them on everyone, whether they want to listen or not. It's really the last part I disagree with.
+
+

Link Highlights

+
+To wrap things up, I wanted to highlight a few cool links I've found these last few days. Topically they are all over the map. Just see if you find something you enjoy.
+
+http://papillon.iocane-powder.net/
+https://meatfighter.com/castlevania3-password/
+http://www.sydlexia.com/top100snes.htm
+https://www.tim-mann.org/trs80/doc/Guide.txt
+https://schmud.de/
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/26-personalized-pagerank.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/26-personalized-pagerank.gmi new file mode 100644 index 00000000..892aacb5 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/26-personalized-pagerank.gmi @@ -0,0 +1,460 @@ + + + + + MEMEX - Experimenting with Personalized PageRank [2021-10-02] + + + + + + +
+ +
+ + +
+
+

Experimenting with Personalized PageRank [2021-10-02]

+
+The last few days I've felt like my first attempt at a ranking algorithm for the search engine was pretty good, like it was producing some pretty interesting results. It felt close to what I wanted to accomplish.
+
+The first ranking algorithm was a simple link-counting algorithm that did some weighting to promote pages that look a certain way. It did seem to keep the page quality up, but it also seemed, as a strange side-effect, to promote very "1996"-looking websites. This isn't quite what I wanted to accomplish; I wanted to promote new sites as well, as long as they were rich in content.
+
+This morning I was reading through the original paper on PageRank, an algorithm I had mostly discounted as I thought it would be too prone to manipulation, mostly based on Google's poor performance. I had done some trials earlier and the results weren't particularly impressive. Junk seemed to float to the top and what I wanted at the top was in the middle somewhere.
+
+Then I noticed toward the end that the authors mention something called "Personalized PageRank": a modification of the algorithm that skews the results toward a certain subset of the graph.
+
+The authors claim:
+
+
+ These types of personalized PageRanks are virtually immune to manipulation by commercial interests. For a page to get a high PageRank, it must convince an important page, or a large number of non-important pages to link to it.
+
+Huh. My interest was piqued.
+
+The base algorithm models a visitor randomly clicking links and bases the ranking on the distribution of where the visitor is most likely to end up.
+
+The modification of the algorithm, in essence, introduces a set of pages that a hypothetical visitor spontaneously goes back to when they get bored with the current domain. The base algorithm instead has the visitor leaving for a random page. In the base algorithm this helps escape from loops, but in the modified algorithm it also introduces a bias toward pages adjacent to that set.
+
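+Here is a minimal power-iteration sketch of the modified algorithm in Java. It is not the actual implementation; the adjacency-list graph representation and the damping parameter are assumptions for illustration:
+
+import java.util.*;
+
+class PersonalizedPageRank {
+    // outLinks.get(i) holds the node ids that node i links to; seedSet is the set the bored visitor returns to.
+    static double[] rank(List<int[]> outLinks, Set<Integer> seedSet, double damping, int iterations) {
+        int n = outLinks.size();
+        double[] rank = new double[n];
+        double[] next = new double[n];
+        Arrays.fill(rank, 1.0 / n);
+
+        for (int iter = 0; iter < iterations; iter++) {
+            Arrays.fill(next, 0);
+            double dangling = 0;
+            for (int node = 0; node < n; node++) {
+                int[] links = outLinks.get(node);
+                if (links.length == 0) {
+                    dangling += rank[node];                        // dead end: hand its mass to the seed set
+                } else {
+                    double share = damping * rank[node] / links.length;
+                    for (int dest : links) next[dest] += share;
+                }
+            }
+            // the bored visitor jumps back to the seed set instead of a random page
+            double teleport = (1.0 - damping) + damping * dangling;
+            for (int seed : seedSet) next[seed] += teleport / seedSet.size();
+            double[] tmp = rank; rank = next; next = tmp;
+        }
+        return rank;
+    }
+}
+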
+I implemented the algorithm. PageRank is a very simple algorithm, so this didn't take more than a few hours. I used my own memex.marginalia.nu as the set of pages the bored visitor goes to, as it has a lot of links to pages I like. The algorithm ran for a few seconds and then converged into something beautiful: a list of small personal websites.
+
+No, wait. This doesn't cut it.
+
+

Jesus. H. Christ. On. An. Actual. Penny. Farthing. What. I. Don't. Even. HUH?!

+
+The top 1000 results were almost ALL personal websites, of the sort that is actually interesting! It's... it's the small web! It's the living breathing blogosphere! It's *everything* I wanted to make available and discoverable! I did some testing on a smaller index, and it actually kinda worked. I pushed it into production, and it works. It's amazing!
+
+What's great is that even though I didn't plan for this, my search index design allows me to actually roll with *both* algorithms at the same time; I can even mix the results. So I put a drop down where you can choose which ranking algorithm you want. I could probably add in a third algorithm as well!
+
+It's very exciting. There is probably more stuff I can tweak but it seems to produce very good results.
+
+

Read More

+
+ +
+

Appendix - A Lot Of Domains

+
+This is going to be a lot of domains, a top-25 ranking based on which domain the PageRank biases towards. I'm not hyperlinking them, but sample a few with copy&paste. They are mostly pretty interesting.
+
+

memex.marginalia.nu

+
+The current seed
+
+search.marginalia.nu
+twtxt.xyz
+wiki.xxiivv.com
+www.loper-os.org
+lee-phillips.org
+memex.marginalia.nu
+www.lord-enki.net
+jim.rees.org
+www.ranprieur.com
+ranprieur.com
+john-edwin-tobey.org
+tilde.town
+www.ii.com
+equinox.eulerroom.com
+cyborgtrees.com
+lobste.rs
+www.teddydd.me
+collapseos.org
+0xff.nu
+antoine.studio
+parkimminent.com
+jitterbug.cc
+www.awalvie.me
+www.lambdacreate.com
+desert.glass
+mineralexistence.com
+milofultz.com
+ameyama.com
+nchrs.xyz
+ftrv.se
+www.wileywiggins.com
+www.leonrische.me
+forum.camendesign.com
+nilfm.cc
+terra.finzdani.net
+kokorobot.ca
+www.tinybrain.fans
+void.cc
+akkartik.name
+100r.co
+sentiers.media
+llllllll.co
+www.paritybit.ca
+sr.ht
+eli.li
+usesthis.com
+marktarver.com
+mvdstandard.net
+blmayer.dev
+dulap.xyz
+
+

stpeter.im

+
+Let's try someone who is more into the humanities.
+
+monadnock.net
+coccinella.im
+www.coccinella.im
+kingsmountain.com
+metajack.im
+anglosphere.com
+www.kingsmountain.com
+test.ralphm.net
+ralphm.net
+badd10de.dev
+xmpp.org
+memex.marginalia.nu
+copyfree.org
+etwof.com
+chrismatthewsciabarra.com
+www.chrismatthewsciabarra.com
+www.igniterealtime.org
+www.xmcl.org
+www.jxplorer.org
+search.marginalia.nu
+www.bitlbee.org
+perfidy.org
+www.gracion.com
+stpeter.im
+www.ircap.es
+www.ircap.net
+www.ircap.com
+dismail.de
+wiki.mcabber.com
+www.knowtraffic.com
+www.rage.net
+fsci.in
+trypticon.org
+www.riseofthewest.net
+www.riseofthewest.com
+fsci.org.in
+www.planethofmann.com
+www.badpopcorn.com
+muquit.com
+www.muquit.com
+git.disroot.org
+www.hackint.org
+www.skills-1st.co.uk
+glyph.twistedmatrix.com
+www.thenewoil.xyz
+leechcraft.org
+anarchobook.club
+ripple.ryanfugger.com
+swisslinux.org
+mikaela.info
+
+

lobste.rs

+
+These results are pretty similar to the MEMEX bunch, but with a bigger slant toward the technical I feel. Most of these people have a github link on their page.
+
+siskam.link
+brandonanzaldi.com
+neros.dev
+matthil.de
+www.gibney.org
+www.possiblerust.com
+kevinmahoney.co.uk
+werat.dev
+coq.io
+64k.space
+tomasino.org
+axelsvensson.com
+call-with-current-continuation.org
+secretchronicles.org
+adripofjavascript.com
+alexwennerberg.com
+nogweii.net
+evaryont.me
+reykfloeter.com
+www.chrisdeluca.me
+hauleth.dev
+mkws.sh
+danilafe.com
+knezevic.ch
+mort.coffee
+writepermission.com
+danso.ca
+chown.me
+syuneci.am
+feed.junglecoder.com
+magit.vc
+antranigv.am
+nathan.run
+barnacl.es
+soap.coffee
+www.craigstuntz.com
+pzel.name
+eloydegen.com
+robertodip.com
+vincentp.me
+vfoley.xyz
+www.uraimo.com
+creativegood.com
+stratus3d.com
+shitpost.plover.com
+forums.foundationdb.org
+hristos.co
+hristos.lol
+julienblanchard.com
+euandre.org
+
+

www.xfree86.org

+
+Next up is an older site, and the results seem to reflect the change in seed quite well. Not all of them are old, but the *feel* is definitely not the same as the previous ones.
+
+x-tt.osdn.jp
+www.tjansen.de
+www.blueeyedos.com
+asic-linux.com.mx
+checkinstall.izto.org
+hobbes.nmsu.edu
+www.stevengould.org
+greenfly.org
+www.parts-unknown.com
+www.afterstep.org
+lagarcavilla.org
+brltty.app
+aput.net
+openmap-java.org
+www.splode.com
+links.twibright.com
+www.dolbeau.name
+www.dbsoft.org
+dbsoft.org
+www.sanpei.org
+www.dubbele.com
+www.sgtwilko.f9.co.uk
+www.anti-particle.com
+www.climatemodeling.org
+www.sealiesoftware.com
+sealiesoftware.com
+openbsdsupport.org
+www.momonga-linux.org
+www.varlena.com
+www.semislug.mi.org
+www.dcc-jpl.com
+www.tfug.org
+www.usermode.org
+www.mewburn.net
+www.herdsoft.com
+xfree86.org
+www.xfree86.org
+www.tinmith.net
+tfug.org
+james.hamsterrepublic.com
+www.dummzeuch.de
+arcgraph.de
+www.fluxbox.org
+www.treblig.org
+josephpetitti.com
+www.lugo.de
+fluxbox.org
+petitti.org
+shawnhargreaves.com
+ml.42.org
+
+

xroads.virginia.edu

+
+Old academic website related to American history.
+
+www.sherwoodforest.org
+www.expo98.msu.edu
+www.trevanian.com
+www.lachaisefoundation.org
+www.toysrbob.com
+darianworden.com
+twain.lib.virginia.edu
+dubsarhouse.com
+www.carterfamilyfold.org
+essays.quotidiana.org
+va400.org
+webpage.pace.edu
+www.wyomingtalesandtrails.com
+wyomingtalesandtrails.com
+bbll.com
+graybrechin.net
+genealogy.ztlcox.com
+www.bbll.com
+www.graybrechin.net
+www.thomasgenweb.com
+thomasgenweb.com
+www.granburydepot.org
+www.northbankfred.com
+www.melville.org
+www.stratalum.org
+mtmen.org
+www.mtmen.org
+onter.net
+www.tommymarkham.com
+www.robert-e-howard.org
+www.straw.com
+www.foucault.de
+www.antonart.com
+www.footguard.org
+www.taiwanfirstnations.org
+jmisc.net
+www.jmisc.net
+www.thegospelarmy.com
+jimlong.com
+pixbygeorge.info
+www.boskydellnatives.com
+www.imagesjournal.com
+www.onter.net
+silentsaregolden.com
+imagesjournal.com
+www.frozentrail.org
+www.pocahontas.morenus.org
+vinnieream.com
+www.historyinreview.org
+www.sandg-anime-reviews.net
+
+

www.subgenius.com

+
+www.quiveringbrain.com
+revbeergoggles.com
+www.seesharppress.com
+www.vishalpatel.com
+www.revbeergoggles.com
+seesharppress.com
+www.digital-church.com
+lycanon.org
+www.lycanon.org
+all-electric.com
+www.wd8das.net
+fictionliberationfront.net
+www.fictionliberationfront.net
+www.radicalartistfoundation.de
+cca.org
+cyberpsychos.netonecom.net
+www.stylexohio.com
+StylexOhio.com
+www.theleader.org
+theleader.org
+www.annexed.net
+principiadiscordia.com
+www.evil.com
+www.the-philosophers-stone.com
+the-philosophers-stone.com
+www.hackersdictionary.com
+kernsholler.net
+www.kernsholler.net
+www.booze-bibbing-order-of-bacchus.com
+www.westley.org
+www.bigmeathammer.com
+www.littlefyodor.com
+www.isotopecomics.com
+sacred-texts.com
+www.tarsierjungle.net
+www.monkeyfilter.com
+www.slackware.com
+www.nick-andrew.net
+www.eidos.org
+www.templeofdin.co.uk
+saintstupid.com
+www.saintstupid.com
+www.rapidpacket.com
+www.mishkan.com
+www.consortiumofgenius.com
+www.xenu-directory.net
+www.cuke-annex.com
+www.nihilists.net
+nihilists.net
+madmartian.com
+
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/27-getting-with-the-times.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/27-getting-with-the-times.gmi new file mode 100644 index 00000000..769345c2 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/27-getting-with-the-times.gmi @@ -0,0 +1,103 @@ + + + + + MEMEX - Getting with the times [2021-10-06] + + + + + + +
+ +
+ + +
+
+

Getting with the times [2021-10-06]

+
+Since my search engine has expanded its scope to include blogs as well as primordial text documents, I've done some thinking about how to keep up with newer websites that actually grow and see updates.
+
+Otherwise, as the crawl goes on, it tends to find fewer and fewer interesting web pages, and as the interesting pages are inevitably crawled to exhaustion, it accumulates an ever growing amount of junk.
+
+Re-visiting each page and looking for new links in previously visited pages is probably off the table; that's something I can maybe do once a month.
+
+Thinking about this for more than a few minutes, the obvious answer is syndication. Most blogs publish either RSS or Atom feeds. They are designed to let you know when there has been an update, and they are pretty trivial to parse, especially if you are just looking for links.
+
+Extracting a bunch of RSS feeds from previously downloaded web pages was an easy enough affair, took about an hour to chew through some gigabyte of compressed HTML and insert the result into a database table.
+
+It struck me that this would be incredibly vulnerable to search engine manipulation if I just crawled every link I found in the RSS feeds in fair and democratic order. Some content mill could just spew out thousands of articles per day full of links.
+
+There do seem to be some easy ways of limiting the potential damage:
+
+
    +
  • Only consider documents from the same domain.
  • +
  • Reduce the number of documents per visit to a low number (currently 6).
  • +
  • Don't count these documents towards the link database.
+
+Since the goal is to add new documents without allowing websites to use the mechanism for manipulating the search rankings, this seems like a good set-up.
+
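+In code, the per-feed selection might look something like this minimal Java sketch; the FeedItem type and the hostOf helper are hypothetical stand-ins, not the actual implementation:
+
+import java.net.URI;
+import java.util.*;
+import java.util.stream.Collectors;
+
+class FeedLinkSelector {
+    record FeedItem(String url) {}
+
+    // Keep at most maxPerVisit same-domain links from a feed, per the rules above.
+    static List<String> selectUrlsToFetch(String feedDomain, List<FeedItem> items, int maxPerVisit) {
+        return items.stream()
+                .map(FeedItem::url)
+                .filter(url -> feedDomain.equalsIgnoreCase(hostOf(url)))  // only the feed's own domain
+                .distinct()
+                .limit(maxPerVisit)                                       // currently 6
+                .collect(Collectors.toList());
+    }
+
+    private static String hostOf(String url) {
+        try {
+            String host = URI.create(url).getHost();
+            return host == null ? "" : host;
+        } catch (IllegalArgumentException e) {
+            return "";  // malformed URL: never matches the feed domain
+        }
+    }
+}
+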
+The next problem is a problem of priority. I identified 290,000 RSS feeds, and I don't want to visit them all as 90% of what I would get is crap. Sturgeon's Law seems to apply to the Internet as much as anywhere.
+
+If only there was some sort of ranking algorithm for websites... yeah. Of course! Limiting the RSS spider to the top 15,000 domains according to BlogRank cuts out *most* of the crap, while isolating exactly the sort of websites that I would like to keep refreshed.
+
+It should take approximately a day to run through the RSS feeds. That also seems a reasonable poll rate.
+
+It's an experiment. We'll see how it turns out. If it works out, maybe it will be able to read about the Facebook outage in a few days...
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/28-web-browsing.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/28-web-browsing.gmi new file mode 100644 index 00000000..6061084b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/28-web-browsing.gmi @@ -0,0 +1,88 @@ + + + + + MEMEX - Web Browsing [2021-10-09] + + + + + + +
+ +
+ + +
+
+

Web Browsing [2021-10-09]

+
+An idea I've had for a long time with regards to navigating the web is to find a way to browse it.
+
+"Browse" is a difficult word to use, because it has a newer connotation of just using a web browser. I mean it in the old pre-Internet sense: browse like when you flip through a magazine, or peruse an antiques shop, not really looking for anything in particular, just sort of seeing if anything catches your eye.
+
+Stumbleupon used to do this pretty well, although completely randomly. I wanted something with more direction.
+
+In a previous attempt, I had an idea that you could use outgoing links to accomplish this effect, but the result just wasn't particularly impressive. With the discovery (and subsequent bastardization) of the PPR algorithm, I gave it another shot.
+
+I calculated a modified personalized pagerank for every domain in my search engine and stored it away in a database. This is about a million domains, but I excluded the periphery of the graph, so in practice it was more like 150k domains that needed a ranking. It's easy to run this in parallel, so it only took about two hours, which is manageable.
+
+The presentation is super sketchy and not nearly finished, but the effect is so cool I wanted to share it.
+
+A sample of origin points:
+
+ +
+ +
+ +
+ +
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/29-botnet-ddos.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/29-botnet-ddos.gmi new file mode 100644 index 00000000..71fc1279 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/29-botnet-ddos.gmi @@ -0,0 +1,102 @@ + + + + + MEMEX - The Mystery of the Ceaseless Botnet DDoS [2021-10-10] + + + + + + +
+ +
+ + +
+
+

The Mystery of the Ceaseless Botnet DDoS [2021-10-10]

+
+I've been dealing with a botnet for the last few days that's been sending junk search queries at an increasingly aggressive rate. They were reasonably easy to flag and block, but they just kept increasing the rate until that stopped working.
+
+Long story short, my patience ran out and I put my website behind cloudflare. I didn't want to have to do this, because it does introduce a literal man in the middle and that kinda undermines the whole point of HTTPS, but I just don't see any way around it. I just can't spend every waking hour playing whac-a-mole with thousands of compromised servers flooding me with 50,000 search requests an hour. That's five-six times more than when I was on the front page of HackerNews, and the attempts only increased.
+
+I don't understand what their game is.
+
+The thought crossed my mind that it could be a racket to get people to sign up for CDN services, it wouldn't be the first time someone selling protective services arranged problems to solve, but it doesn't quite add up. These queries I'm getting...
+
+The search queries they've been sending are weird.
+
+I've had, for quite some time, bots spamming queries for casino sites and online pharmacies and what have you; I assume this is to estimate their search ranking and figure out if their SEO is doing its job.
+
+A second guess is that it could also be some sort of attempt to manipulate search engines that build predictive models based on previous search queries for automatic suggestions, but I don't do that so that's not accomplishing anything.
+
+This traffic has only been a harmless smattering of visits, so I've let them do this since they've mostly been wasting their time and not doing me any harm.
+
+These new bots have been searching for... keywords, often related to downloading pirated software or movies.
+
+At first I thought it was someone looking for content to file DMCA complaints about, but they were really aggressive so I blocked them, and then they started cropping up from other IPs and it became pretty apparent it was a botnet. Addresses were very random and the requests were well orchestrated.
+
+Out of curiosity I pointed my web browser to a few of the IPs, and perhaps unsurprisingly the ones that responded showed login pages for enterprise grade routers and similar hardware. Not hard to imagine how they ended up as part of the bot net.
+
+But as for the keywords, it looks eerily similar to the sort of keyword stuffing you get in compromised wordpress sites. I wonder if the two are related somehow. Maybe it's the same people doing the wordpress compromising who are spamming the search engine?
+
+It's really strange because they can't be looking at the search results at all, they're way overspecified so they are almost never going to return any meaningful responses. I guess that does speak for the suggestion manipulation hypothesis.
+
+I have a lot more questions than I have answers at this point.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/30-unintuitive-optimization.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/30-unintuitive-optimization.gmi new file mode 100644 index 00000000..8ceb09ec --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/30-unintuitive-optimization.gmi @@ -0,0 +1,120 @@ + + + + + MEMEX - Unintuitive Optimization [2021-10-13] + + + + + + +
+ +
+ + +
+
+

Unintuitive Optimization [2021-10-13]

+
+Optimization is arguably a lot about intuition. You have a hunch, and see if it sticks. Sure you can use profilers and instrumentation, but they are more like hunch generators than anything else.
+
+This one wasn't as intuitive, at least not to me, but it makes sense when you think about it.
+
+I have an 8 Gb file of dense binary data. This data consists of 4 Kb chunks and is an unsorted list containing first a URL identifier with metadata and then a list of word identifiers. This is a sort of journal that the indexer produces during crawling. Its main benefit is that this can be done quickly with very high fault tolerance. Since it's only ever added to, if anything does go wrong you can just truncate the bad part at the end and keep going.
+
+I construct a reverse index out of this journal. The code reads this file sequentially multiple times to create pairs of files, partitioned first by search algorithm and then by which part of the document the word was found in.
+
+Roughly
+
+
+For each partition [0...6]
+  For each sub-index [0..6]:
+    Figure out how many URLs there are
+    Create a list of URLs
+    Write an index for the URL file
+
+This takes hours. It does several slow things, including unordered writing and sorting of multiple gigabytes of binary data, but the main bottleneck seems to be just reading this huge file 105 times (it's reading from a mechanical NAS drive), so you can't just throw more threads at it and hope the problem goes away.
+
+I had the hunch I should try to pre-partition the file, see if maybe I could get it to fit in the filesystem cache.
+
+This part feels a bit unintuitive to me. The problem, usually, is that you are doing disk stuff in the first place, so the solution, usually, is to reduce the amount of disk stuff. Here I'm adding to it instead.
+
+New algorithm:
+
+
+For each partition [0...6]
+  Write chunks pertaining to the partition to a new file
+
+For each partition [0...6]
+  For each sub-index [0..6]:
+    Figure out how many URLs there are
+    Create a list of URLs
+    Write an index for the URL file
+
+As the partitions do overlap, it means writing approximately 13 Gb to a slow mechanical drive, but it also means the conversion doesn't need to re-read the same irrelevant data dozens of times. The prepartitioned files are much smaller and will indeed fit snugly in the filesystem cache.
+
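+For a rough idea of what the pre-partitioning pass looks like in Java, here is a minimal sketch; the chunk-to-partition mapping is a hypothetical placeholder, not the real logic:
+
+import java.io.*;
+import java.nio.file.*;
+import java.util.*;
+
+class PrePartitioner {
+    static final int CHUNK_SIZE = 4096;  // the 4 Kb journal chunks
+
+    // Split the big journal into one smaller file per partition, so later passes
+    // only re-read data relevant to the partition they are building.
+    static void prePartition(Path journal, Path outputDir, int partitions) throws IOException {
+        List<OutputStream> outputs = new ArrayList<>();
+        for (int p = 0; p < partitions; p++) {
+            Path dir = outputDir.resolve(Integer.toString(p));
+            Files.createDirectories(dir);
+            outputs.add(new BufferedOutputStream(Files.newOutputStream(dir.resolve("preconverted.dat"))));
+        }
+        try (InputStream in = new BufferedInputStream(Files.newInputStream(journal))) {
+            byte[] chunk = new byte[CHUNK_SIZE];
+            while (in.readNBytes(chunk, 0, CHUNK_SIZE) == CHUNK_SIZE) {
+                // a chunk may belong to several partitions, hence the ~13 Gb of overlap
+                for (int p : partitionsOf(chunk, partitions)) {
+                    outputs.get(p).write(chunk);
+                }
+            }
+        }
+        for (OutputStream out : outputs) out.close();
+    }
+
+    // Placeholder: in reality the partitions would be decided from the chunk's URL metadata.
+    static int[] partitionsOf(byte[] chunk, int partitions) {
+        return new int[] { Math.floorMod(Arrays.hashCode(chunk), partitions) };
+    }
+}
+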
+This does reduce the amount of stuff to read by quite a lot; if you crunch the numbers it goes from 1.2 Tb to 267 Gb (assuming 21 passes per partition).
+
+
+884M    0/preconverted.dat
+1.6G    1/preconverted.dat
+91M     2/preconverted.dat
+928M    3/preconverted.dat
+192M    4/preconverted.dat
+1.2G    5/preconverted.dat
+7.8G    6/preconverted.dat
+
+The last of the files is bigger because the last partition accepts the 90% of domains that no algorithm thinks are particularly worthwhile. Sturgeon's Law is extremely applicable to the field.
+
+Running through the last partition takes as long as running through partitions 0-5. Conversion time was slashed from hours to just over 40 minutes.
+
+A success!
+
+

Topics

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/31-ngram-needles.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/31-ngram-needles.gmi new file mode 100644 index 00000000..bd8b9a02 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/31-ngram-needles.gmi @@ -0,0 +1,133 @@ + + + + + MEMEX - Shaking N-gram needles from large haystacks [2021-10-22] + + + + + + +
+ +
+ + +
+
+

Shaking N-gram needles from large haystacks [2021-10-22]

+
+A recurring problem when searching for text is identifying which parts of the text are in some sense useful. A first order solution is to just extract every word from the text, and match documents against whether they contain those words. This works really well if you don't have a lot of documents to search through, but as the corpus of documents grows, so does the number of matches.
+
+It's possible to bucket the words based on where they appear in the document, but this is not something I'm doing at the moment and not something I will implement in the foreseeable future.
+
+A next order solution is to consider N-grams of words, that is pairs, triples, quadruples, etc. On paper this is a great idea, as it allows you to basically perform limited free text search. The caveat is that the number of potential N-grams grows extremely quickly, and a very small amount of them are ever going to be useful; this makes enumerating them an effective impossibility (and enumeration is necessary to save space and reduce query time).
+
+Extracting some N-grams from the previous sentences, you can see that there are some possibly useful for search: "potential N-grams", "free text search", "next order solution"; they refer to something. But many more N-grams don't mean anything: "are ever", "of them are", "is that", "of potential". They are numerous, and they are word salad taken in isolation.
+
+One way of reducing the number of N-grams to a reasonable level is to look for repetition in a document, or things that are Capitalized As Names. Both of these methods will retrieve very useful needles from the haystack that make very good search results. The problem is that this leaves a lot of meat on the bone. This is the bulk of my current keyword processing, and the result has been just that: the results are often good but usually few.
+
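+For illustration, a minimal Java sketch of the repetition heuristic (not the actual implementation; the underscore-joining mirrors the appendix below):
+
+import java.util.*;
+
+class RepeatedNgrams {
+    // Keep N-grams that occur at least minCount times within the same document.
+    static Set<String> recurring(List<String> words, int n, int minCount) {
+        Map<String, Integer> counts = new HashMap<>();
+        for (int i = 0; i + n <= words.size(); i++) {
+            String ngram = String.join("_", words.subList(i, i + n));
+            counts.merge(ngram, 1, Integer::sum);
+        }
+        Set<String> keep = new HashSet<>();
+        counts.forEach((ngram, count) -> { if (count >= minCount) keep.add(ngram); });
+        return keep;
+    }
+}
+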
+Returning to the impossible task of enumerating all possible N-grams, maybe we can somehow reduce the scope. Maybe if we had a list of the sort of N-grams that refer to things, places, people, we could escape the combinatorial hellscape of enumerating all possible word combinations. This extends beyond nouns, and includes things like "The Man from UNCLE", "The Word of God", "Dancing Shoes". Maybe a grammarian somewhere has a good word for this class, but let's just call them keywords. A noteworthy part is that these types of noun-like sentence fragments seem to have less ambiguity than words alone. A word like "chair" can be both something you sit on and a boardmember. Reducing ambiguity is always useful for a search engine.
+
+One approach to reducing the number of N-grams to consider is to grab the list of N-grams found through the repetition method, and to look for them in all documents. This does effectively reduce the scope, but the method has flaws. It tends to bias toward certain segments, especially religious terminology, since it is very common to paraphrase scripture in those circles, which creates repetition. Another concern is that it is vulnerable to manipulation through keyword spam.
+
+The other model is to create a keyword lexicon from an external source. There are many possible sources, but it turns out that Wikipedia is very useful for this. Most of their inline links contain viable keywords, in all hundreds of millions of samples, so it is quite feasible to grab the keywords that appear more than a couple of times. That is in itself relatively straightforward from an OpenZIM dump. Sideloading additional keywords from tens of millions of documents will take a while, but I'm doing it as an experiment to see if this approach needs adjustment before doing a full rebuild.
+
+Twenty four cores on full blast and a load average in the mid 30s for a couple of days is totally fine <.<
+
+

Topics

+
+/topic/astrolabe.gmi
+
+

See Also

+
+/log/21-new-solutions-old-problems.gmi
+
+

Appendix A - Reusing Previously Extracted Words

+
+Note: This algorithm ignores single words
+
+
+music_magazine_may_1994
+recordings_of_die_zauberflöte
+the_absolute_sound_issue
+the_gramophone_january_2006
+american_record_guide_march
+bbc_music_magazine_february
+x_window_system
+iroquois_county_genealogical_society
+omega_opera_archive
+international_opera_collector
+
+

Appendix B - Wikipedia

+
+
+rolex_sports_car_series
+wellington
+metropolitan_opera_radio_broadcasts
+red_cross
+anime
+composer
+the_saturday_evening_post
+pitcher
+court_of_appeal
+indianapolis
+microsoft
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/32-bot-apologetics.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/32-bot-apologetics.gmi new file mode 100644 index 00000000..aaa798e6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/32-bot-apologetics.gmi @@ -0,0 +1,118 @@ + + + + + MEMEX - Bot Apologetics [2021-10-25] + + + + + + +
+ +
+ + +
+
+

Bot Apologetics [2021-10-25]

+
+There has been a bit of discussion over on Gemini recently regarding poorly behaved bots. I feel I need to add some perspective from the other side; as a bot operator (even though I don't operate Gemini bots).
+
+Writing a web spider is pretty easy on paper. You have your standards, and you can test against your own servers to make sure it behaves before you let it loose.
+
+You probably don't want to pound the server into silicon dust, so you add a crawl delay and parallelize the crawling, and now you have code that's a lot harder to comprehend. This is likely the cause of some weird bot behavior, including mishandling of redirect loops or repeated visits to the same address. Multi-threaded orchestration based on a rapidly mutating data set is difficult to get right (the working set of the spider by necessity changes as it goes). You can iron a lot of this out locally, but some problems won't crop up until you really push the limits with real-world scenarios.
+
+Next, the practical reality of web servers is that standards are more like vague recommendations, and no local testing can prepare your bot for encountering real data, which is at best malformed and sometimes straight up adversarial.
+
+The only way to exhaustively test a bot is to let it run and see if it seems to do what it should.
+
+The Internet, whether over HTTP or Gemini, is a fractal of unexpected corner cases. In Gemini this is compounded by the fact that a lot of people have written their own servers; in HTTP, servers are (usually) somewhat compliant, but oh boy is HTML a dumpster fire.
+
+It's a bit difficult to figure out what you are getting from the server. You can get Content-type as a server header or an HTML header. You can also get the charset as a meta tag. HTML is served with dozens upon dozens of DTDs.
+
+This one is fun:
+
+<!DOCTYPE HTML PUBLIC "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//EN">
+
+Server error handling sometimes causes some problems for a spider:
+
+
    +
  • You fetch a URL, http://www.example.com/foo
  • +
  • The page you get in return is a file-not-found error page, but it's served with an OK status code. The error page contains the relative URL bar/
  • +
  • You index http://www.example.com/foo/bar and get the same error page
  • +
  • You index http://www.example.com/foo/bar/bar and get the same error page
  • +
  • You index http://www.example.com/foo/bar/bar/bar and get the same error page
+
+&c
+
+This class of errors shouldn't happen according to the standards, but it crops up relatively often. It's part of a wider problem with assuming that the Internet is a bunch of static files, when it in practice is often dynamically generated at-visit. This also means you can't just do a simple hash of the pages you've visited to detect a loop like this, since they may include a generation timestamp or some other minor difference.
+
+The wider problem of degenerate URLs is a constant obstacle, and normalization that repairs every case is probably impossible; even a passing solution involves a decent amount of mind-reading and guesswork.
+
+Example: Is "page" in "http://example.com/page" a poorly normalized path ("page/"), or a file with no ending? Both are valid interpretations.
+
+Then there's robots.txt. In this file, you will find things like:
+
+
    +
  • Every character encoding known to man
  • +
  • ASCII art
  • +
  • Emojis
  • +
  • PHP errors
  • +
  • MySQL errors
  • +
  • HTML code
  • +
  • DIY directives
  • +
  • Infinite crawl-delays (eff. days/page)
  • +
  • Robots also get directives from HTML tags, sometimes conflicting with robots.txt.
+
+This was just a short sampler of the types of stuff a bot needs to deal with.
+
+What I wanted to say is that writing a bot is a lot harder than one would think. It's unfair to assume malice or incompetence when a bot misbehaves: probably the only way you will ever get a reasonably well behaving web spider is to build a somewhat poorly behaving one and go from there.
+
+

Topics

+
+/topic/web-design.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/33-rude-guests.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/33-rude-guests.gmi new file mode 100644 index 00000000..601dfde6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/33-rude-guests.gmi @@ -0,0 +1,95 @@ + + + + + MEMEX - The Parable of A Rude Guest [2021-10-28] + + + + + + +
+ +
+ + +
+
+

The Parable of A Rude Guest [2021-10-28]

+
+You are invited to a dinner party. After talking for a while food is served on the table. You pounce. "Haha, suckers!", you think, and load all the food on your plate and leave nothing but scraps for the hosts. You feel victorious. Serves them right for inviting you into their home. You wolf down the food with ravenous appetite while they look on.
+
+That was tasty, but now you've got a piece of meat stuck between your teeth, so you go to the bathroom, borrow some floss, and use one of the hosts' toothbrushes. You also use the toilet but don't flush, because you don't think you are going to use it again.
+
+You walk around in your underwear because the jeans you were wearing became uncomfortably tight after you stuffed yourself with food intended for four people.
+
+You notice the hosts have a painting you don't like, so you take it down and put up a poster for a band instead. Not because you like the band, but they pay you a small sum of money for every home you put up one of their posters in, and you gotta make a living somehow. You sit down on their couch, still in your underwear, and light a cigar. The hosts cough and signal that you should put it out, but you don't let that get in the way of celebrating a victorious dinner.
+
+That's no way to behave as a guest. Taking more than what is appropriate and exploiting a family that opened their home in friendship may not be overtly illegal, but it certainly is rude and inconsiderate. The behavior would have been fine if we did it in our own homes, but we weren't at home, we were guests.
+
+While this story is exaggerated and a bit preposterous, it is also very close to how many websites seem to operate.
+
+Users open their browser to a website with no more text than this page has, and the website promptly downloads 25 Mb worth of junk and connects to hundreds of servers to share information about the user.
+
+The reader is constantly disturbed by popovers that prompt them to subscribe to newsletters, and ads keep appearing in the text, which animate and blink and make the text move around so that it becomes much harder to read. The article may also misrepresent itself to make it look like it has answers it in fact does not provide, intentionally wasting the time lent to the website by the reader.
+
+The website consumes greedily whatever resources are available, both in terms of computation and user attention, to serve ends that at best are orthogonal to the needs of the user, and often outright act against the users' benefit. Again, none of this is overtly illegal, but it is rude and inconsiderate.
+
+For some reason we tend to view the website as the host and the visitor as the guest, but in practice it is the other way around, given the way modern websites push most of the computation to the clients.
+
+Like guests at a dinner party, we as website owners should respect our hosts, the users, by not gorging ourselves on resources and overstepping their boundaries, and especially not by viewing the users as suckers for foolishly allowing us to borrow their computers and time. We aren't entitled to use their resources; we've been invited to borrow them, and that's a trust we shouldn't abuse.
+
+

Topics

+
+/topic/web-design.gmi
+
+
+

Discussion

+
+https://maya.land/responses/2022/02/23/website-visitors-are-still-the-visitors.html
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/34-internet-arguments.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/34-internet-arguments.gmi new file mode 100644 index 00000000..f1e267d8 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/34-internet-arguments.gmi @@ -0,0 +1,96 @@ + + + + + MEMEX - A Polemic Against Internet Arguments [2021-11-02] + + + + + + +
+ +
+ + +
+
+

A Polemic Against Internet Arguments [2021-11-02]

+
+I want you to consider for a moment all the human lifetime wasted in ideological stalemates on the Internet, all that energy, all that anger and frustration. Imagine if you take even a fraction of that time, and put it to creating something constructive instead, learning skills, doing anything meaningful.
+
+It boggles the mind, doesn't it? It must amount to entire human lifetimes every week.
+
+Ideology, or in a wider sense ethics, is all about what should be done. How we should live. These aren't statements about the world, but opinions about what the world should look like. They aren't true or false, and any argument against them always boils down to "I disagree!". Arguing about ethical systems is one of the least constructive things a human can do. It is more pointless than masturbation, which at least feels good for a moment.
+
+I'll even argue these stalemates can be discarded as fruitless based on the fact that they exist. If there truly was a clear cut answer as to which ethical system was superior, we wouldn't be stuck in endless disagreement. It feels like you can make headway, but it's a Chinese finger trap: the harder you try, the more stuck you get. The only way to get out is to accept that other people, in fact, do not agree.
+
+Instead, some get really angry that other people don't share their views, and spend hours every day frothing at the mouth over this. They send emails, they boycott, they rage themselves to sleep every night that the universe dares permit these insolent other-thinkers to continue existing in the same universe.
+
+This is terribly myopic and self-centered. The assumption is that these other people insist on believing the wrong thing in spite of the clear evidence to the contrary. You can see it, so clearly they must be able to, yet they willfully ignore the truth.
+
+What if they just have a different perspective than you have? The question you should be asking is what they are seeing, as kind, rational and intelligent beings, that makes them come to such radically different conclusions from yours. I'm not saying this is by any means easy; it requires a distance that few seem to possess.
+
+-
+
+Meanwhile some of those other people are doing something productive with their time. It's constructive human action that shapes the world. Arguing about ethics doesn't. It never has, other than by letting resentment fester in those hearts that allowed it to enter.
+
+Take the time spent arguing about nonsense like the philosophical purity of software licenses, and put it to building software that has a license you agree with instead.
+
+Take the time spent infuriated at Covid policy, and spend it on living your life instead. Regardless of your position, if you are concerned about loss of life, it's a deep irony how frivolously you and everyone debating this over the last two years have been letting your own precious life time run out in the sand.
+
+Instead of attempting to tear down what others have built, or sabotaging them because they don't agree with you on some ideological point, build something yourself that embodies your ideas of good. Or just flourish as a human being. The only way humans end up pathetic is through self-sabotage; all of us have far more potential than we dare to imagine. Even if you spend your entire life learning and creating, you will still never be done; even on your death bed there will be more things you want to do and understand.
+
+Show the world the sort of great things your system of values will lead to. That is much more convincing than theories. That is why people like Socrates or Jesus are role models to this day. Neither of them spent their days complaining on the Internet about how shitty everything was.
+
+If you are frustrated and think the world is blind to what is good, don't spend all day trying to convince others with words or theoretical justifications: Show them, embody your values.
+
+Once you've become that saintly paragon of good, when you are truly beyond reproach, then you can convincingly tell others how to live, and then your words will carry weight.
+
+

Topics

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/35-keeping-gemini-difficult.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/35-keeping-gemini-difficult.gmi new file mode 100644 index 00000000..aeafa0b6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/35-keeping-gemini-difficult.gmi @@ -0,0 +1,82 @@ + + + + + MEMEX - Keeping Gemini Difficult [ 2021-11-04 ] + + + + + + +
+ +
+ + +
+
+

Keeping Gemini Difficult [ 2021-11-04 ]

+
+This is a response to the post "Making Gemini Easy" over on ~tomasino, and the title is a bit tongue-in-cheek haha-but-no-really.
+
+gemini://tilde.team/~tomasino/journal/20211103-making-gemini-easy.gmi
+
+I think the idea that we need to shield the users from how technology works is a terrible, terrible mistake. It disempowers the users, and concentrates power in the hands of a technological elite, and that divide is only going to grow.
+
+We already have an alarming number of people working with computers, some of whom may even be programmers, who simply do not understand how computers work. Their only concept of a computer is the user interface on the screen. The rest is unintelligible wizardry. Nobody has told them; it's been deemed too complicated, nothing for them to worry their little heads about.
+
+If you treat people like children, they act like children, they think like children, they for all intents and purposes become trapped in perpetual childhood. Helpless and dependent, forever. What we need to do is treat people as dignified human beings capable of learning and understanding and overcoming challenges. If we do, they become capable, they learn and rise to the occasion, they are empowered and become independent; in short, they're allowed to enter adulthood.
+
+We need to teach the users of technology how to make a fire on their own, even though they may get burnt, and even though it requires more than installing an app on the store.
+
+In that regard, I think one of the best aspects of Gemini is that it actually has a bar to entry. If you want to do more than just read, you need to invest some time into understanding what you are contributing to. It's not incredibly difficult, but it does demand a modicum of technological understanding.
+
+That obstacle is ultimately a good thing. As much as it excludes those who are unwilling to invest in overcoming it, it also elevates those who do.
+
+

Topics

+
+/topic/platforms.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/36-localized-programming-languages.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/36-localized-programming-languages.gmi new file mode 100644 index 00000000..e73e392c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/36-localized-programming-languages.gmi @@ -0,0 +1,103 @@ + + + + + MEMEX - Localized Programming Languages [ 2021-11-05 ] + + + + + + +
+ +
+ + +
+
+

Localized Programming Languages [ 2021-11-05 ]

+
+This is a reply to a series of posts on anglo-centrism in programming languages that have been floating around on Gemini lately.
+
+gemini://nytpu.com/gemlog/2021-10-31.gmi
+gemini://alsd.eu/en/2021-11-04-thoughts-anglocentrism-cs.gmi
+
+Around thirty years ago I was a kid with a computer. I learned to program quite a few years before I learned English. I also used DOS without understanding English. I knew what to type to do things, but I didn't know what the words meant. I could start programs, I'd play in QBASIC, write small programs and amusements. To me "PRINT" was the word that made text appear on the screen. I learned years later the word meant something in English. To show you what my child eyes saw, I think rot13 does convey the experience quite well;
+
+
+X = 0
+QB
+  CEVAG "X=", X
+  X = X + 1
+JUVYR X < 10
+
+I didn't know what any of the words meant, but I knew what they did, and I don't think it was that much of an obstacle. I also don't think having the words in my native Swedish would have furthered my understanding very much. What 6-7 year old knows words like "variable" or "vector"? I knew them by what they did.
+
+Sometimes the compiler gave me an error I didn't understand, but like most children, I was tenacious and didn't let that deter me from drawing moving shapes on the screen.
+
+In the years between then and now, I have learned English.
+
+It should also be added that localized programming languages have been tried. I don't remember which version it was, but some '90s version(s) of Microsoft Office shipped with a weird localized BASIC-language, where all the keywords were translated into the local language. It was bizarre to say the least, and unexpectedly not a big hit.
+
+In practice, having localized programming languages means you need to live in a big country with lots of programmers to get quality software for your computer. The local developers are also cut off from sharing or selling their code internationally. This really makes things much worse in the very small countries it's supposedly trying to help.
+
+Likewise, trying to design some Esperanto programming language to somehow level the playing field, I dunno, I don't think it actually helps. Or rather, I don't think English is as big of an obstacle as it's made out to be.
+
+In the end, the names and keywords in programming languages are hints as to what the keywords do. Even a programming language that draws heavily on English doesn't require a particularly large vocabulary or deep grammatical understanding; you can get away with knowing a couple of dozen simple words. That is not a steep price to pay for the ability to partake in a global community of software development.
+
+The more I think about it, the less I understand what the problem is.
+
+
+

Replies

+
+https://text.eapl.mx/re-localized-programming-languages
+
+

Topics

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/37-keyword-extraction.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/37-keyword-extraction.gmi new file mode 100644 index 00000000..80ace108 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/37-keyword-extraction.gmi @@ -0,0 +1,121 @@ + + + + + MEMEX - A Jaunt Through Keyword Extraction [ 2021-11-11 ] + + + + + + +
+ +
+ + +
+
+

A Jaunt Through Keyword Extraction [ 2021-11-11 ]

+
+Search results are only as good as the search engine's ability to figure out what a page is about. Sure a keyword may appear in a page, but is it the topic of the page, or just some off-hand mention?
+
+I didn't really know anything about data mining or keyword extraction starting out, so I've had to learn on the fly. I'm just going to briefly list some of my first naive attempts at keyword extraction, just to give a context.
+
+
    +
  • Extract every keyword.
+
+
    +
  • Extract recurring N-grams.
+
+
    +
  • Extract the most frequent N-grams, and N-grams that are Capitalized Like Names or occur in titles.
+
+
    +
  • Use a dictionary extracted from Wikipedia data to extract names-of-things.
+
+These approaches are ignorant of grammar, and really kind of blunt. As good as the keywords they find are, they also hoover up a lot of grammatical nonsense and give a decent number of false positives. Since they lack any context, they can't tell whether "care" is a noun or a verb, for example.
+
+Better results seem to require a better understanding of grammar. I tried Apache's OpenNLP, and the results were fantastic. It was able to break down sentences, identify words, tag them with grammatical function. Great. Except also extremely slow. Too slow to be of practical use.
+
+Thankfully I found an alternative in Dat Quoc Nguyen's RDRPOSTagger. Much faster, and still much more accurate than anything I had used before. In practice I usually prefer dumb solutions to fancy machine learning. The former is almost always faster and usually more than good enough.
+
+Armed with a part-of-speech tagger, and most of the same regular expressions used before to break down sentences and words, I could successfully experiment with standard keyword extraction algorithms such as TF-IDF and TextRank.
+
+TF-IDF is a measure of how often a term appears in a document in relation to how often it occurs in all documents.
+
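+As a rough sketch of the idea (the log-smoothing below is one common convention, not necessarily what the search engine uses), a TF-IDF score can be computed like this:
+
+  // tf: how often the term appears in this document, relative to its length
+  // idf: how rare the term is across the whole collection
+  static double tfIdf(int termCountInDoc, int docLength,
+                      int docsWithTerm, int totalDocs) {
+    double tf = (double) termCountInDoc / docLength;
+    double idf = Math.log((double) totalDocs / (1 + docsWithTerm));
+    return tf * idf;
+  }
+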
+TextRank is basically just PageRank applied to text. You create a graph of adjacent words and calculate the eigenvector. It's fast, works well, and shares PageRank's ability to be biased toward certain sections of the graph. This means it can be used to extract additional useful sets of keywords, such as "keywords related to the words in the topic".
+
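+The core iteration is compact enough to sketch out. This is the generic PageRank-style update, assuming the word-adjacency graph has already been built as a weight matrix; a typical choice is a damping factor of 0.85 and a few dozen iterations, which may not match what the search engine actually does:
+
+  // w[i][j] = how often word i occurs adjacent to word j in the document
+  static double[] textRank(double[][] w, int iterations, double damping) {
+    int n = w.length;
+    double[] outWeight = new double[n];
+    for (int i = 0; i < n; i++)
+      for (int j = 0; j < n; j++)
+        outWeight[i] += w[i][j];
+
+    double[] score = new double[n];
+    java.util.Arrays.fill(score, 1.0);
+
+    for (int iter = 0; iter < iterations; iter++) {
+      double[] next = new double[n];
+      for (int i = 0; i < n; i++) {
+        double sum = 0;
+        for (int j = 0; j < n; j++)
+          if (outWeight[j] > 0)
+            sum += w[j][i] / outWeight[j] * score[j];
+        next[i] = (1 - damping) + damping * sum;
+      }
+      score = next;
+    }
+    return score; // a high score means the word is central to the document
+  }
+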
+How many of these approaches flag a given keyword can further be used to create tiered sets of keywords. If every algorithm agrees a keyword is relevant, hits for such a keyword are prioritized over keywords that only one of the algorithms considers important.
+
+There is a considerable amount of tweaking and adjusting and intuition involved in getting these things just right, and I've been fussing over them for several weeks and could probably have kept doing that for several more, but eventually decided that it has to be good enough. The improvements are already so large that they ought to provide a significant boost to the relevance of the search results.
+
+I'm almost ready to kick off the November upgrade. Overall it's looking really promising.
+
+

Topic

+
+/topic/astrolabe.gmi
+
+

See Also

+
+/log/31-ngram-needles.gmi
+/log/26-personalized-pagerank.gmi
+/log/21-new-solutions-old-problems.gmi
+
+https://github.com/datquocnguyen/RDRPOSTagger
+ + +https://encyclopedia.marginalia.nu/wiki/TF-IDF
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/38-old-and-new.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/38-old-and-new.gmi new file mode 100644 index 00000000..520a4779 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/38-old-and-new.gmi @@ -0,0 +1,94 @@ + + + + + MEMEX - Old and New [ 2021-11-12 ] + + + + + + +
+ +
+ + +
+
+

Old and New [ 2021-11-12 ]

+
+I've been thinking recently about the emphasis put on "new", specifically for search engines, but the discussion has some merit even in a wider context. I will start wide and narrow down.
+
+It is common to conflate new with good, and most people who were young sometime between 1950 and 2000 will indeed have seen marvellous improvements in quality of life and technology with each passing year. In light of that, it's at least easy to explain how one might confuse the two.
+
+Some even came to think that this period of stable prosperity, unlike the others that came before it, is somehow an exception, or that we are all an exception set apart from history. In practice, and like all the other periods of stable prosperity, it seems to have been a historical fluke, a trend that, if it hasn't outright reversed, has at least slowed down significantly. Change is inevitable, improvement is not.
+
+When something that we thought brought prosperity stops working, we have the unfortunate habit of doubling down on what is no longer working. Human sacrifice no longer appeases the gods? Sacrifice more humans! Change stopped bringing prosperity? Worship change harder.
+
+Progress has brought so many gifts that nobody seems to have even thought of asking the question "toward what?" Progress implies if not a destiny at least some destination. Well what do I know, maybe there is some MC Escher-alternative where we can keep progressing at the same rate forever toward nothing in particular.
+
+That was a bit of a tangent, but the point is that there is really no reason to think that new things are better or worse than old things. Oftentimes, the way things have gotten old is that they have worked so well nobody thought to replace them. In all times, there were good and bad things. The bad things rarely stick around, which means that the old things that do linger often have good reason to do so. The bar for good that new things must overcome keeps getting higher.
+
+The opposite isn't true either, old isn't necessarily better. The axes of old/new, good/bad are probably mostly orthogonal.
+
+

Recency in Search

+
+It seems important to some search engine developers, to pick up on changes very quickly.
+
+For most search results, new is oftentimes completely irrelevant. If I'm looking for a recipe, does it matter if it's a new recipe? Do I even want the newest recipe? Isn't it a selling point that it's your grandma's original recipe? I may want a novel recipe, one that I haven't tried, but that is not the same as saying it was just published. Of course it is a good thing if the food I eat is new, but food is perishable in a way that a recipe is not.
+
+The ones that stand to benefit the most from search results being fresh are the websites. They want to make changes in people's behavior by making changes to their site. They want to watch in near real-time how the visitors come pouring in. Does that actually benefit the users in any way?
+
+Even in the case of news, new isn't always better. More often than not, news articles that deal with a current topic consist almost entirely of speculation, rumor, gossip and opinion. It is only much later, if ever, once the dust has settled, that balanced factual information gets printed.
+
+From a practical standpoint, the most recently published documents are the documents a search engine knows the least about. They could be good or they could be bad. Keeping up with changes seems like a job for RSS. A search engine stands to benefit from being far more judicious.
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/39-normie-hypothesis.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/39-normie-hypothesis.gmi new file mode 100644 index 00000000..38d91f49 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/39-normie-hypothesis.gmi @@ -0,0 +1,84 @@ + + + + + MEMEX - A brief hypothesis about "normies" [2021-11-13] + + + + + + +
+ +
+ + +
+
+

A brief hypothesis about "normies" [2021-11-13]

+
+The phenomenon of "normies" is an interesting one. The term itself is a bit problematic and not one I'd typically use, but as a phenomenon they are still worth investigating.
+
+Their perhaps biggest distinguishing feature is that they don't get "it", whatever it is. It's tempting to think that these are an especially mindless type of person with no personality and little in terms of thought going on.
+
+I have a theory that normies may not actually exist. That is, you can't actually show me a person that is a normie. They are a mirage.
+
+Rather, the appearance of normies may be a symptom of a forum, social medium, or group seeing an influx of outsiders so large and fast that they can't integrate into the culture of the group.
+
+A group can assimilate a small, steady influx of outsiders; they will quickly learn the established in-jokes, values, and cultural paradigms. But if the influx becomes too big, the newcomers will mostly be interacting with other newcomers, and transmission of the group's culture will be slow and distorted.
+
+Because a majority of the people they see are fellow newcomers, this creates an emergent "stranger culture" that is a vague imitation of the original culture of the group, but much more impersonal and reserved. If this is a one-time thing, the culture may stabilize, but if new members keep joining at a fast rate, "normies" appear.
+
+A normie is an outsider that thinks they are an insider, made possible because everyone they meet is an outsider too.
+
+Normies are a property of the dynamics of the social medium, not of its members. Any forum that sufficiently aggressively promotes growth will ultimately struggle with this.
+
+Close communities turn into anonymous bustling city streets.
+
+A community needs an element of stability: social authorities, known dynamics, collective values. In a rapidly growing social medium, you very much can't step into the same river twice; it is a river that washes away any complex social dynamics and relationships. You are always talking to new people, strangers, normies.
+
+Additionally you get a sort of dead sea effect, where those that create a real sense of community go elsewhere to look for that community, because it can't be found in a place where everyone is always a stranger. Eventually what you are left with is a transitory hub, nobody sticks around because there is nothing to stick around for.
+
+Isolationism is perhaps not a measured response to this, but growth for the sake of growth is undeniably extremely harmful for any community.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/40-wasted-resources.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/40-wasted-resources.gmi new file mode 100644 index 00000000..edb534e4 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/40-wasted-resources.gmi @@ -0,0 +1,103 @@ + + + + + MEMEX - Wasted Resources [2021-12-04] + + + + + + +
+ +
+ + +
+
+

Wasted Resources [2021-12-04]

+
+At a previous job, we had a new and fancy office. The light switches were state of the art. There was an on button, and a separate off button. When you pressed the on button, the lights would fade on. When you pressed the off button, they would fade off. In the cloud somewhere were two functions that presumably looked a bit like this:
+
+
+fun turnOnLamp() {
+  while (!bright()) increaseBrightness();
+}
+fun turnOffLamp() {
+  while (!dark()) decreaseBrightness();
+}
+
+I have deduced this from the fact that if you pressed both buttons at the same time, the lights would flicker on and off until someone was contacted to restart something. It is a marvellous time to be alive when you need to reboot your light switches because of a race condition. Modern computers are so fast that we often don't even recognize when we are doing things inefficiently. We can send messages halfway around the world to turn on the lights, and it seems like it's just a wire between the switch and the lamp.
+
+In my code there was a performance snag recently with a piece of logic that used Java streams quite liberally. I had written it that way because this logic was pretty hairy and streams can be a lot more expressive; I tend to prioritize expressiveness in the first version of the code and optimize later when necessary.
+
+The code iterated over an array and looked for spans that matched a combination of criteria. Imagine a couple of dozen constructions of this general shape:
+
+
+  return IntStream.range(2, words.size())
+           .filter(i -> predA(sentence, i))
+           .filter(i -> predB(sentence, i-1))
+           .filter(i -> predC(sentence, i-2))
+           .mapToObj(i -> new Span(i-2, i+1))
+           .toArray(Span[]::new);
+
+I replaced it with code of the form
+
+
+  ArrayList<Span> ret = new ArrayList<>();
+  
+  for (int i = 2; i < words.size(); i++) {
+    if (predA(sentence, i) && predB(sentence,i-1) && predC(sentence,i-2)) {
+      ret.add(new Span(i-2, i+1));
+    }
+  }
+  
+  return ret.toArray(Span[]::new);
+
+The code was about an order of magnitude faster as a result. I do feel a bit uneasy about this. If it wasn't for the fact that I work with humongous datasets, I wouldn't have noticed there was a difference. Both are fast on a modern CPU. Yet, a lot of the code we write simply isn't as fast as it could be, and while speed may not be the biggest deal, it's consuming resources, both system resources and energy.
+
+I do think Parkinson's law is applicable. The inefficiency of the code grows to meet the performance of the hardware. This is probably why user interfaces today hardly seem faster than they did 25 years ago. Back then they were slow because they read data off a floppy disk, today they are slow because they are executing 40 megabytes of javascript and sending data across the entire world to render a button.
+
+I've always felt that targeting slow hardware makes your code better on all systems. If it performs well on a raspberry pi, it performs well everywhere under any circumstances.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/41-search-result-relevance.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/41-search-result-relevance.gmi new file mode 100644 index 00000000..b3dd310f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/41-search-result-relevance.gmi @@ -0,0 +1,144 @@ + + + + + MEMEX - Search Result Relevance [2021-12-10] + + + + + + +
+ +
+ + +
+
+

Search Result Relevance [2021-12-10]

+
+This entry is about a few problems the search engine has been struggling with lately, and how I've been attempting to remedy them.
+
+Before the article starts, I wanted to share an amusing new thing in the world of Internet spam.
+
+For a while, people have been adding things like "reddit" to the end of their Google queries to get less blog spam. Well, guess what? The blog spammers are adding "reddit" to the end of their titles now.
+
+/pics/reddit-spam.png
+
+One of the great joys of this project is watching the spammers' strategies evolve in real time.
+
+

Few Results

+
+A persistent problem I've had is simply not getting a lot of results. A part of this is because the index is small, sure, but it still seems like there should be more. Oftentimes there *are* more, if you alter the query a little bit, but that's really hard to see.
+
+I've had some code generating alternate queries for a while (like pluralizing/depluralizing words), but it's been almost comically dumb and only added additional terms in a few rare cases. A big constraint is budgetary, I simply can't try every possible permutation.
+
+A new approach is to use part-of-speech information to limit which variants are tested, as well as using a term frequency dictionary to filter out alternatives that probably don't exist anywhere in the index.
+
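+As a sketch of the shape of that filter (pluralize, depluralize and the frequency map are hypothetical stand-ins for the real POS-aware logic):
+
+  // only emit variants that occur somewhere in the corpus at all;
+  // a variant the term frequency dictionary has never seen is a wasted query
+  static List<String> variants(String word, Map<String, Long> termFrequencies) {
+    List<String> ret = new ArrayList<>();
+    ret.add(word);
+    for (String alt : List.of(pluralize(word), depluralize(word))) {
+      if (!alt.equals(word) && termFrequencies.getOrDefault(alt, 0L) > 0) {
+        ret.add(alt);
+      }
+    }
+    return ret;
+  }
+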
+To give you an idea of what it's generating, this is the n-grams it will search for if you enter "The Man of Tomorrow".
+
+the_man_of_tomorrow
+man_of_tomorrow
+the_man, tomorrow
+the_man, tomorrows
+man, tomorrow
+man, tomorrows
+
+I'm choosing this not only because it illustrates the re-writing logic, but also because it's a bit of a pathological case that shows some bad rewrites. Some of these are clearly more relevant than others. "man, tomorrows" is pretty useless. The queries are evaluated in the listed order, so in most cases it doesn't matter too much.
+
+It will also try some additional rewrites, such as concatenating terms under certain circumstances, and breaking them apart in others.
+
+"TRS80" will produce "trs80" and "trs_80", and conversely "TRS-80" will also yield a "trs80"-term.
+
+"Raspberry Pi 2" will produce
+
+raspberry_pi_2
+raspberrypi, 2
+raspberry, pi_2
+raspberry_pi, 2
+raspberry, pi, 2
+
+

Query Refinement

+
+The next big problem has been that the search engine has been spectacularly good for narrow topics. If your search term was one topic, and that topic was broadly within the range of things covered by the index, oh boy did it occasionally produce some stellar results.
+
+If you, however, tried to refine the results by adding more search terms, the results often got drastically worse.
+
+For example: If you searched for "graph algorithms", you found a beautiful page on graph algorithms, including Strongly Connected Components. If you searched for "graph algorithms SCC", that page ranked very low, and instead most of what floated to the top was junk. That's pretty weird. It took a while to figure out what was going wrong.
+
+While the search engine had gotten reasonably good at figuring out which search terms are relevant to a document, it was bad at figuring out which search terms are relevant to a query. This is fine if there is only one term, but for multiple terms, things fall apart. It would, in short, use the relevance of the least relevant term (with regard to the document) to rate the relevance of the search result.
+
+If we consider a query like "C++ tutorial", ignoring N-grams, we can see that these terms are not equal. Ideally we'd like all terms to be highly relevant, but in the case that they aren't, it's much better to show results that are highly relevant to "C++" but only briefly mention "tutorial", than results that are highly relevant to "tutorial" but only briefly mention "C++".
+
+A way of using this is to consider the term frequency of the search term across all documents. Terms that occur often are probably less informative than terms that are rarer.
+
+Ideally you would use something like Okapi BM25, but the information that ranking function requires isn't readily available the way the search index is currently implemented, so I've had to cook up something that behaves in a similar way using what I have available: an average weighted by inverse document frequency.
+
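+Something in that spirit might look like the sketch below, where each term's score against the document is averaged with a weight proportional to the term's inverse document frequency; the exact formula here is illustrative, not the one used in production:
+
+  // rare terms (high idf) dominate the average, so a weak match on "tutorial"
+  // hurts far less than a weak match on "c++"
+  static double combinedScore(double[] termScores, long[] docsWithTerm, long totalDocs) {
+    double weightedSum = 0;
+    double weightSum = 0;
+    for (int i = 0; i < termScores.length; i++) {
+      double idf = Math.log((double) totalDocs / (1 + docsWithTerm[i]));
+      weightedSum += idf * termScores[i];
+      weightSum += idf;
+    }
+    return weightedSum / weightSum;
+  }
+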
+Both these changes are pretty rough, and still need some more polish, but I do think they are steps in a good direction. At the time of writing, these features are incubating, and only fully enabled for the 'Experimental' index. When I'm happy with how it works, I will apply it to the other indices.
+
+

See Also

+
+https://encyclopedia.marginalia.nu/wiki/TF-IDF
+https://encyclopedia.marginalia.nu/wiki/Okapi_BM25
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/42-dark.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/42-dark.gmi new file mode 100644 index 00000000..9e8758fb --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/42-dark.gmi @@ -0,0 +1,84 @@ + + + + + MEMEX - Dark [2022-01-02] + + + + + + +
+ +
+ + +
+
+

Dark [2022-01-02]

+
+As is often the case these dark winter seasons, I've fallen into a bit of a funk. Inspiration it seems is as rare as sunlight, and sunlight is scarce indeed in the winters of the north.
+
+I do know what is missing: novelty. I've fallen into consuming "content". Infinite scroll is the torture rack of the spirit. What is necessary is doing new things, seeing new inspiring sights, and exposing myself to new inspiring thoughts.
+
+Given a lack of options, I perused the section of my bookshelf that contains books I have not yet read, and after some vacillation decided on Herodotus' Histories.
+
+What a treat! I've only read a few dozen pages and already it has provided many strange anecdotes, from Gyges' unlikely usurpation of the Lydian throne to the Persian King Cyrus II's surrogate mother named "bitch". This is exactly the medicine.
+
+I keep coming back to this conclusion, that if I do not mind what I consume, and fall into consuming what is uninspiring and boring, then I too find myself having uninspiring and boring thoughts; and an uninspired and boring life.
+
+

See Also

+
+/log/05-minds-field.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/43-pseodonymous.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/43-pseodonymous.gmi new file mode 100644 index 00000000..b80cf213 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/43-pseodonymous.gmi @@ -0,0 +1,92 @@ + + + + + MEMEX - Pseudonymous [2022-01-15] + + + + + + +
+ +
+ + +
+
+

Pseudonymous [2022-01-15]

+
+A person might think I'm elusive, writing and working under a pseudonym. It's not that I'm hiding; if you send me an email, I'll respond to you with an email address containing a decent chunk of my real name. It's not out of shame I wear clothes.
+
+Besides bringing utility, marginalia.nu is an experiment, a bit of an art project, a place to challenge conventions and see what is and isn't necessary.
+
+There is risk for a conflict of interest if your website is both your resumé and where you put your creative works and thoughts. Those are from different spheres and perhaps can't mix too much without detriment to the authenticity of the latter.
+
+Further, I think that there is far too much emphasis on identity. There is value in being a bit of a blank canvas. I like reading historical authors specifically because there is so much ambiguity in who they were, what they looked like, what they were like. This lets their words speak for themselves.
+
+In the end, I write and work under a pseudonym for the same reason none of the pages on this website have logos, why I don't gather statistics and analytics, why I've built all this software myself:
+
+To see what happens.
+
+This is not science; I don't run A/B tests on unsuspecting users like the sociopaths in big tech. My visitors aren't lab rats to be studied. I'm the rat, and marginalia.nu is my maze; you're the one with the clipboard and white coat.
+
+Maybe some of this is taking away too much, maybe it isn't. We'll never know if nobody tests the waters.
+
+

Replies

+
+gemini://szczezuja.space/gemlog/2022-01-16-Re-pseudonymous.gmi
+gemini://capsule.usebox.net/gemlog/20220118-re-pseudonymous.gmi
+gemini://lyk.so/gemlog/009-why-a-pseudonym.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/44-discovery-and-design.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/44-discovery-and-design.gmi new file mode 100644 index 00000000..5b6ce021 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/44-discovery-and-design.gmi @@ -0,0 +1,102 @@ + + + + + MEMEX - Discovery and Design Considerations [ 2022-01-18 ] + + + + + + +
+ +
+ + +
+
+

Discovery and Design Considerations [ 2022-01-18 ]

+
+It's been a productive several weeks. I've got the feature pulling updates from RSS working, as mentioned earlier.
+
+I've spent the last few weeks overhauling the search engine's web design, and did the MEMEX too for good measure.
+
+It needed to be done, as the blog theme the design was previously based off of had several problems, including loading a bunch of unnecessary fonts and not using the screen space of desktop browsers well at all.
+
+Contrary to what one might think, I don't hate colors or non-brutalist design; I just dislike how it's often abused to the detriment of the visitor.
+
+An important consideration is having a clean interface that doesn't unnecessarily drag attention away from what the visitor is attempting to focus on. I've previously mentioned the disastrously noisy web design of Wikipedia. The search engine has gotten a bit noisier than it was before, but hopefully it's not gotten too noisy.
+
+Furthermore, I've overhauled the random exploration mode.
+
+Discovery is one of the main missions of this project, and it's been a vision for quite some time to offer some alternative means of literally browsing the internet, perusing its domains the way you would flip through a magazine.
+
+On the one hand, you can get a random selection from about 10,000 domains in the personal website sphere, but it's also possible to manually direct the search and show sites adjacent to a particular domain, using a combination of straight link-information and Personalized PageRank.
+
+The mechanics of extracting random interesting links have been around for a while, but the design was more than a little bit rough.
+
+An idea came to my mind that perhaps it would work better with some visual element to offer a taste of the flavor of the websites. It's easy enough to slap together a script that does that: Take one headless chromium, sprinkle in a pinch of python, and a couple of weeks later you have one low-res screenshot per domain across half a million or so domains. (It's still running, by the way.)
+
+https://search.marginalia.nu/explore/random
+
+You can either just refresh the "random"-page to get new domains, or click the "Similar Domains"-links to get results adjacent to that particular domain. It's pretty entertaining.
+
+The problem is just how to get visitors to discover this feature, since I specifically don't want distracting elements that draw attention to themselves. This is doubly tricky because of the strict no-cookie policy of search.marginalia.nu. Many sites would probably have something like a one-time dismissible window, or effect, or animation. That is simply not doable here.
+
+The single remaining option is to improve the signal-to-noise ratio so that the links don't vanish in the noise.
+
+

See Also

+
+/log/00-linkpocalypse.gmi
+/log/03-writing-for-reading.gmi
+/log/27-getting-with-the-times.gmi
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/45-unfuck-internet-discoverability.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/45-unfuck-internet-discoverability.gmi new file mode 100644 index 00000000..f5730b43 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/45-unfuck-internet-discoverability.gmi @@ -0,0 +1,88 @@ + + + + + MEMEX - Can we unfuck internet discoverability? [ 2022-02-04 ] + + + + + + +
+ +
+ + +
+
+

Can we unfuck internet discoverability? [ 2022-02-04 ]

+
+I've been thinking a lot about how difficult it has become to discover quality content on the Internet, not because it isn't there, but because the signal to noise ratio is really bad, and most venues of discovery don't seem to be able to handle it.
+
+Recommendation algorithms seem to work almost too well, to the point where it's all kind of just showing you things you already like, rarely anything new that you might like. It's an absolute tragedy both for small websites and for their potential audience.
+
+Certainly discovery on the Internet could be made better.
+
+I've tried discussing this problem in various avenues, but mostly what you get is long tirades about how bad google or reddit is. Let's not even dwell on what other people are doing that isn't working; instead, let's build something that does work. If I walk into a library and ask for 20 good books to read, then I will get 20 books and most of them will be good. Why couldn't that be a thing with websites as well?
+
+It's why I built my search engine, and it's what I've tried to mitigate with exploration mode. Neither are perfect, but both seem close. Dealing with the search engine database I have, and doing various experiments, I think it should be possible to build something genuinely useful in this space. I'm not at all sure how but I think there are entirely new things that could be tried.
+
+If you too want to work on this, please let me know. Maybe we can collaborate somehow. I'm trying to gather some like-minded people. I'm sitting on a lot of data from my search engine, and have at least some hardware to spare.
+
+For inspiration, I'm making available a fun and useful dataset, a link database. It's available under CC-BY-SA-NC 4.0. To keep it manageable, it's aggregated at the domain level, making it 13 million entries. You can download it below. This is real production data. Build something cool, make graphviz diagrams, whatever. Have fun!
+
+https://downloads.marginalia.nu/
+
+

See Also

+
+/log/19-website-discoverability-crisis.gmi
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/46-anatomy-of-search-engine-spam.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/46-anatomy-of-search-engine-spam.gmi new file mode 100644 index 00000000..bbfc76d9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/46-anatomy-of-search-engine-spam.gmi @@ -0,0 +1,132 @@ + + + + + MEMEX - The Anatomy of Search Engine Spam [2022-02-07] + + + + + + +
+ +
+ + +
+
+

The Anatomy of Search Engine Spam [2022-02-07]

+
+Black hat SEO is an endlessly fascinating phenomenon to study. This post is about some of the tactics spammers use to make their sites rank higher.
+
+The goal of blackhat SEO is to boost the search engine ranking of a page nobody particularly wants to see, usually ePharma, escort services, online casinos, shitcoins, hotel bookings; the bermuda pentagon of shady websites.
+
+The theory behind most modern search engines is that if you get links from a high ranking domain, then your domain gets a higher ranking as well, which increases the traffic. The reality is a little more complicated than that, but this is a sufficient mental model to understand the basic how-to.
+
+

Comment Spam

+
+Creating a bot that spams links in forums, guestbooks, comment fields, wikis is a really old-fashioned method. These links were never intended for humans to click on, but for search engines to register.
+
+In practice, since rel=nofollow became standard, this is not particularly effective anymore, as the attribute tells search engines to disregard the link. Some comment spam lingers as a mechanism for controlling botnets, sharing some of the cryptic eeriness of the numbers stations of the cold war.
+
+Source control systems, mailing lists, issue trackers, pull request systems, and so forth are also targets for spam, and some of them do not, to this date, append rel=nofollow to their links.
+
+

Dead Links

+
+An often overlooked side of link rot is that when a site dies, links to the domain often linger elsewhere. This allows a spammer to simply register that domain and immediately inherit its search engine clout.
+
+This seems like a fairly low-level problem that probably won't be fixed without changes to DNS or the way HTML addresses resources.
+
+

Hacked Websites

+
+This is another way of piggybacking on a domain's ranking.
+
+Especially in older websites you can find strange hidden links. They may be hidden from rendering (style="display: none"), or they may be hidden from the human editor (perhaps 400 blank spaces to the right of a line of text). This seems to be manual work.
+
+

Link Farms, Link Rings

+
+There are websites full of almost nothing but links to similar websites. Not intended for humans, but for search engines. The pages appear dynamically generated with wildcard subdomains, almost invariably on cheap clouds and with cheap TLDs.
+
+Alone this isn't very useful, but combined with some of the other techniques, it appears to act as a sort of lens, magnifying the ranking of a target domain.
+
+ +
+

Wordpress

+
+Among newer websites, there are a lot of hacked wordpress instances; anyone with a web server will see probes for wordpress vulnerabilities several times per hour. What happens when they succeed is often not immediately noticeable, but frequently hundreds or thousands of hidden pages are added, full of link spam, taking the same rough shape as the link farms mentioned previously.
+
+ +
+

Questionable Sponsorships

+
+Online casinos almost seem to have marketing as their primary expense, and have been observed sponsoring open source projects in exchange for a link to their domains.
+
+It may of course be hard to reject money, especially when in need, but maybe this practice should be stigmatized more than it is.
+
+

In Closing

+
+There are no doubt other techniques being used as well, but these appear to be the most common. It's an uphill battle, but knowing what to look for is a big part of combating the problem.
+
+Beyond all else, "rel=nofollow" should be mandatory for all links submitted by users, if nothing else because you become a far less appealing target for spammers.
+
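+For server-rendered HTML that is only a few lines with something like jsoup (a sketch; it assumes user-submitted markup passes through a sanitization step before being rendered):
+
+  // mark every user-submitted link as nofollow before it is rendered
+  Document doc = Jsoup.parse(userSubmittedHtml);
+  for (Element link : doc.select("a[href]")) {
+    link.attr("rel", "nofollow");
+  }
+  String safeHtml = doc.body().html();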
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/47-drive-failure.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/47-drive-failure.gmi new file mode 100644 index 00000000..5d6ac5a6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/47-drive-failure.gmi @@ -0,0 +1,96 @@ + + + + + MEMEX - Drive Failure [ 2022-02-19 ] + + + + + + +
+ +
+ + +
+
+

Drive Failure [ 2022-02-19 ]

+
+Not what I had intended to do this Saturday, but a hard drive failed on the server this morning, or at least so it seemed. MariaDB server went down, dmesg was full of error messages for the nvme drive it's running off. That's a pretty important drive.
+
+The drive itself may actually be okay; the working hypothesis is that either the drive or the bus overheated and reset. After a reboot the system seems fine.
+
+That particular drive, an Optane NVMe stick, has worked impressively well for quite a while. It cost an arm and a leg and has some fairly impressive performance, so I would be sad if it failed.
+
+Fortunately there doesn't appear to be any actual data loss. fsck is fine, mysqlcheck is fine. Even if there was data loss, there is a good system of weekly backups of critical data on a different hard drive that should prevent serious data loss from individual drives failing.
+
+Even if there turns out to be some sort of quiet creeping corruption that only unravels after festering for weeks, the worst that will happen is that the server resets back to the state of last week and that's really that.
+
+In the meantime, the system is up and running again. We'll have to see if this was a one-off event or if one or more components require replacement.
+
+I've been putting off an upgrade of this system. The motherboard I'm using also doesn't appear to be entirely stable which is more than a bit uncomfortable. The chassis is too small and runs hot, and I have a few SSDs that are getting pretty worn. Time is fast approaching when I have to upgrade this system.
+
+I'm very happy I have generous Patreons to soften the blow. Hardware is not cheap.
+
+

dmesg

+
+If anyone is curious what the error looked like, I'm appending it below.
+
+
+[17160266.929320] nvme nvme0: controller is down; will reset: CSTS=0xffffffff, PCI_STATUS=0xffff
+[17160266.985525] print_req_error: I/O error, dev nvme0n1, sector 195060096
+[17160267.013350] nvme nvme0: Removing after probe failure status: -19
+[17160267.041306] print_req_error: I/O error, dev nvme0n1, sector 153936816
+[17160267.041466] EXT4-fs warning (device nvme0n1p1): ext4_end_bio:323: I/O error 10 writing to inode 15 (offset 0 size 0 starting block 19242118)
+
+
+

Topic

+
+/topic/server.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/48-i-have-no-capslock.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/48-i-have-no-capslock.gmi new file mode 100644 index 00000000..bf170413 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/48-i-have-no-capslock.gmi @@ -0,0 +1,162 @@ + + + + + MEMEX - I have no capslock and I must scream [ 2022-02-21 ] + + + + + + +
+ +
+ + +
+
+

I have no capslock and I must scream [ 2022-02-21 ]

+
+In a near future, a team of desktop computer designers are looking at the latest telemetry and updating the schematics of the hardware-as-a-service self-assembling nanohardware.
+
+Steve: "Hmm, they don't seem to be using the power button very often."
+
+Bob: "Compared to the other buttons, it's only used 0.1% of the time"
+
+Steve: "Remove it?"
+
+Bob: "Remove it!"
+
+Computers now instantly boot up when plugged into the wall, and run until the plug is pulled. No more start-up time, the cases are aesthetically cleaner, and manufacturing cost is down at least a fraction of a dollar.
+
+Confident with this success, they turn their attention to the keyboard.
+
+Steve: "Have you noticed that most letters the users type are lower case? Only in rare instances are they in upper case!"
+
+Bob: "Hmm, but sometimes it may be useful to write in upper case too."
+
+Steve: "Let's compromise and put the shift and capslock keys in a little hatch on the back of the keyboard for a test group, maybe that's enough for the power users. That will clean up the keyboard design for everyone else..."
+
+Bob: "Done! And deployed!"
+
+this is just great, who is going to bother turning their keyboard over just to write an upper case letter?
+
+steve: "now the usage has dropped to 0.000001% in the test group! see, nobody needs those keys!"
+
+bob: "wow! you were right, let's push a patch immediately to remove them for everyone!"
+
+steve: "now i'm looking at the punctuation, they're also a lot more rare than the letters"
+
+bob: "put them in the hatch on the back?"
+
+steve: "sure"
+
+alright i guess this is how it is now xxxx we'll adapt
+
+steve xxx you know im looking at these letter distributions and most letters seem to be in the group xxx etaoinshrdlu xxx
+
+bob xxx youre right xxx who knew two thirds of the alphabet was redundant xxx imagine how clean the keyboard would be with less than twenty keys
+
+steve xxx i wanna xxx lets just do it
+
+bob xxx the design is going to be so minimalist and clean after this xxx not even jobs thought this far outside the box
+
+
+i ant euen estress hou hrustratin this data driuen desin nonsense is use isnt usefulness interoetatin data is really dittioult
+
+steue aaa it ould e oleaner to ust aue one ei thou
+ooo aaa iea it ould aaa e have one this ar hu not all the uai
+
+
+aa aa aaa aaaaaa aaaaaa aa aaaaaaa aaaaaaaa
+aaaaaaaaaaa aaaaa aaaaaa aaaaaaaaa aaaaaaaa
+aaaaa aaa aa aaa aaa aaaaaa aaaaaaaaaaa
+aaaaaaaa aaaaa aaaaaa aaaaaaaaaaa aaaaaa aaaa
+aaa aa aaaaaaaa aaaaaaa aaaaaaaaaa aaaaa
+
+

aaaaaa

+
+ +
+ +
+

+
+
+       a
+     a    a
+  a     a
+       a
+       a
+       a
+  a    a
+        a
+          a
+
+

Discussion

+
+This post seems to have resonated with people. There's a discussion here, among other places:
+
+https://news.ycombinator.com/item?id=30421399
+
+

Topics

+
+/topic/satire.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/49-marginalia-1-year.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/49-marginalia-1-year.gmi new file mode 100644 index 00000000..74631e37 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/49-marginalia-1-year.gmi @@ -0,0 +1,85 @@ + + + + + MEMEX - Marginalia Search: 1 year [ 2022-02-25 ] + + + + + + +
+ +
+ + +
+
+

Marginalia Search: 1 year [ 2022-02-25 ]

+
+I've caught some bug and don't have the energy to write more than a brief note.
+
+I want to commemorate the fact that work on the Marginalia search engine started one year ago. The first commit was on February 26th 2021, and contained a sketch for a website crawler and some data models.
+
+In many ways, the paint is barely dry, yet it feels like this project has been around for a long while.
+
+One year ago:
+
+
    +
  • There was no search.marginalia.nu.
  • +
  • There was no encyclopedia.marginalia.nu
  • +
  • There was no memex.marginalia.nu
+
+I built all this on my own, from scratch, just following a crazy idea to its logical conclusion.
+
+I think that's pretty neat, but what I'm the most proud of is that it has actually had an impact. It's let people discover new things and rediscover old, it's made people think and re-evaluate the state of the Internet. I even know a few who have created websites inspired by what they've found using my little search engine.
+
+It's just been a success beyond my wildest imagination.
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/50-meditation-on-software-correctness.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/50-meditation-on-software-correctness.gmi new file mode 100644 index 00000000..b3835965 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/50-meditation-on-software-correctness.gmi @@ -0,0 +1,140 @@ + + + + + MEMEX - A meditation on correctness in software [ 2022-03-14 ] + + + + + + +
+ +
+ + +
+
+

A meditation on correctness in software [ 2022-03-14 ]

+
+Let's define a simple mathematical function, the function will perform integer factoring. It will take an integer, and return two integers, the product of which is the first integer.
+
+
+  F(int32 n) = (int32 A, int32 B) 
+
+so that
+
+  
+  A*B = n
+
+This is fairly straightforward, mathematical, objective. Let's examine some answers an implementation might give.
+
+
+  F 50 = (5, 10) on ARM
+  F 50 = (10, 5) on Intel
+
+This seems like a bug, so let's add the requirement that A <= B for deterministic results.
+
+  
+Depending on the language, what comes next may or may not be defined behavior, but let's use a programming language where signed integers wrap around on overflow; then we might get this result:
+
+  
+  F 2 = (-2, 2147483647)
+
+Now, as everyone no doubt will recognize, 2147483647 is a Mersenne prime (2^31 - 1), and the answer satisfies every requirement posed so far. This again *seems* like a bug, we clearly meant to say A and B must be positive.
+
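+(This is easy to verify in any language with wrapping 32-bit integer arithmetic, Java for instance:)
+
+  int a = -2;
+  int b = 2147483647;         // Integer.MAX_VALUE, i.e. 2^31 - 1
+  System.out.println(a * b);  // prints 2: -4294967294 wraps around modulo 2^32
+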
+New scenario! F(60):
+
+  
+  F 60 = (2, 30) on most days
+  F 60 = (1, 60) on the programmer's birthday
+  F 60 = (5, 12) during monsoon season
+  F 60 = (6, 10) when Venus is in retrograde
+  
+Yet again, this seems wrong, we don't expect a mathematical function to depend on the calendar. Perhaps we meant that A must be the lowest prime factor.
+
+Let's consider F(7)
+
+
+  F 7 ?= (1, 7) -- no, 1 isn't a prime
+  F 7 ?= (7, 1) -- no, 7 is greater than 1
+  F 7 = error!
+  
+These requirements are impossible to satisfy when n = 7. What we meant to say was that A must be a prime factor, or 1 if n is prime.
+
+That actually still leaves F(1):
+
+
+  F 1 ?= (1,1) -- no, A=1 isn't a prime, and B isn't a prime so A isn't permitted to be 1.
+
+So now A must be a prime factor, or 1 if n is a prime or 1.
+
+Let's leave those particular weeds and consider F(-4)
+
+
+  F -4 ?= (-2, 2) -- no, -2 isn't a prime
+  F -4 ?= (-4, 1) -- no, -4 isn't a prime
+  F -4 ?= (1, -4) -- no, A > B
+  F -4 ?= (2, 2147483646) -- yes!(?!?)
+
+The last entry satisfies every requirement (again in signed integer arithmetic): 2 is a prime and a factor of -4, indeed the smallest one; 2 is less than 2147483646; 2 is positive. ... yet it feels like a bug. Let's just do like Alexander and bring a sword to this knot, and require that n > 0; this also gets rid of the degenerate zero case.
+
+Some reader may object and say this is because of signed integers, but believe me, floating point isn't better, fixed point has gotchas as well. This post isn't really about integers, it's about our relationship to requirements.
+
+While the requirements may seem simple, the function may strictly speaking open a socket to some SaaS-service that performs prime factoring. From the requirements it's impossible to tell. It would be unexpected for a factoring function to run out of file descriptors or not work during a network outage, but given the requirements provided so far, it might; and we might call that a bug too.
+
+This is how software development goes, on all levels, low level programming, high level programming, front-end programming, back-end programming.
+
+What I want to argue is that this is something that happens a lot: bugs, more often than not, aren't breaches of requirements, but rather the code surprising us in some fashion, upon which we quickly invent some new implicit requirement the code is breaking, one we would not have been able to state before the discovery of the bug.
+
+Software correctness is indeed praised by many, but in many cases it's not entirely clear what it even means for software to be correct. In reality, it often boils down to some hand-wavy principle of least surprise, where a staggering amount of software requirements are entirely made up on the fly in response to the behavior of the code.
+
+You may violently disagree with the inflammatory accusation that comes next, but if this is the case, is there any other word for software that repeatedly surprises its users through frequent design changes than this?: Buggy.
+
+

Topics

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/51-the-file-startup.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/51-the-file-startup.gmi new file mode 100644 index 00000000..08552b40 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/51-the-file-startup.gmi @@ -0,0 +1,189 @@ + + + + + MEMEX - The Static File Startup [ 2022-03-18 ] + + + + + + +
+ +
+ + +
+
+

The Static File Startup [ 2022-03-18 ]

+
+Note: This is satirical in nature. Slight CW if you are at a point in life where "Office Space" has unveiled itself as a disturbing existential horror movie. This taps into that same darkness.
+
+
+
+A tale of six brave Internet pioneers.
+
+
+  Senior Business Founder / Senior CEO -- Zach
+  Senior Tech Lead / Senior Architect / Senior CTO -- Kevin
+  Senior Backend dev
+  Senior Frontend dev -- Erin
+  Two Senior UX engineers
+
+Deadline: 6 months
+
+

Mission

+
+We're going to disrupt Internet hosting and serve static files in a novel way through a cloud based SaaS. In the MVP it's just "Thus Spake Zarathustra" in the original German and the lyrics to Auld Lang Syne (we found both on archive.org under the public domain).
+
+We're really stoked to innovate in this space, and think our SaaS-offering will really disrupt how the Internet works.
+
+

Sprint 1

+
+We've decided to store the files in a NoSQL database, as files have a hierarchical nature, and relational databases are from the past. We're all very committed to using modern technology.
+
+We've opted for Cassandra, as that was the first ad that came up when googling "2022 nosql database". Nobody knows how to use Cassandra, but that will probably work out fine. Most of the team spent the sprint trying to find tutorials on stackoverflow.
+
+We've gotten started on the REST APIs, it's slow work because everyone has very strong opinions.
+
+The debate was between
+
+
+  GET /files/:filename
+
+and
+
+
+  GET /file/:filename
+
+We went with the second after Kevin threatened to quit.
+
+
+We finished the Sprint with a hackathon.
+
+

Sprint 2

+
+The backend will be implemented in Java 18 with SpringBoot and WebFlux; it will load configuration from a yaml file and then pass requests along to cassandra. Our backend guy had a nervous breakdown so Kevin, Erin and the UX team are piecing this together from Baeldung samples. So far it works fairly well, although we're struggling to get WebFlux to perform as well as we'd hoped. We're not sure why.
+
+The frontend will be a SPA that reads the URL-fragment and sends a request to the backend and fetches the document. We found a javascript library that sends asynchronous requests and loops until they return (asynchronous is good because it's faster), and another that replaces the current page contents with the string you provide, and finally one that checks whether a string is empty, although that last one has a lot of dependencies so it takes like 10 minutes to build...
+
+

Sprint 3

+
+We felt that javascript was kind of from the past, so we're going all in on Kevin's hackathon proof of concept that transpiles a functional dialect of Algol 68 into webassembly (except Kevin was adamant all the variable names must be emojis, and when pressed why he just sent an aubergine and left Slack). It's been a lot of work getting it to work but it feels stable now.
+
+We also got a tote bag with the company logo, pretty sweet.
+
+

Sprint 4

+
+The UX team is really not happy with the document just loading when you visit, they want to add a transition animation and smooth scrolling when you load the page, we also really need analytics and tracking to see how far the visitor scrolls and when they select something. To get all this to work we need to render the text onto a canvas with WebGL. We apparently also urgently need third party fonts but the CEO says they are illegal in Europe so now we need a consent popover for European IPs before they can view the file.
+
+

Sprint 5

+
+It turns out the Algolmoji transpiler was really buggy, and Kevin has quit, so most of us have been attempting to fix it. Whenever you use emoji skin color modifiers the code crashes. It got out on twitter and there is a real shit-storm about it from two really angry accounts. The VCs are really nervous and threatening to pull our funding. This is a big problem, all hands are on deck, and everyone has been crunching hard to address it.
+
+It also turns out that Nietzsche was a German. We're really nervous the angry twitter accounts will discover this, but we've invested too much to change that 50% of the MVP now.
+
+

Sprint 6

+
+Most of the code works now, although there is a memory leak in the Algolmoji interpreter so the browser crashes the tab if you scroll too far into Also Sprach Zarathustra. We're trying to figure out why. We suspect it may be a bug in the operating system. We're not convinced any window can scroll that far, it's a very long text file, like far longer than anything else, by our calculations the graphics card can't have enough vram to store that many pixels at once. Hopefully it's not a showstopper.
+
+It also turns out we can't use GET for the REST API, we need to do a weird thing and use POST because the path may contain "../" and then for some reason the app server starts resolving files in the system directory. Yikes, that was a close call. This is not idiomatic REST but we're too far in to find a good solution.
+
+

Sprint 8

+
+This sprint has been devoted to getting the DevOps flow going, we've set up a process for building docker images of the front-end and back-end, with git hooks that deploy directly to kubernetes. Most of us have only written YAML these two weeks, including the founders. How can there be this much YAML?
+
+At the same time, we've gotten far! Grafana, Kibana, Elasticsearch, Prometheus, Letsencrypt, it's all set up and working. We're very proud. We're also scaling automatically, which may be necessary, as the back-end code seems very slow and keeps crashing for some reason.
+
+Serving static files isn't easy but thankfully we have all these great open source solutions to help. Can't even imagine the work we'd have to do keeping this system running manually.
+
+The feeling is good. We're confident we're gonna make it big.
+
+

Sprint 9

+
+We've finally managed to set up Cassandra! It's been really stressful, but we finally found a tutorial that worked after three months of googling! PHEW! The entire MVP almost works, although the character encodings are all wrong. Hopefully that doesn't matter too much.
+
+

Sprint 10

+
+Another hackathon.
+
+Erin was experimenting with old fashioned web technology and discovered you can just serve files with a regular web server, like straight from a directory, and these web servers are like free and super easy to set up. She demonstrated our entire start-up can be replaced with a raspberry pi. Over two months of work. Millions of dollars of start-up swag down the drain? We've had a meeting. Do we gamble that nobody knows and push ahead? We can't compete with this if it's common knowledge. Do we pivot to hosting images instead? The founders were reassuring us that we could still make it big. Monday morning we discovered Zach took the remaining money and ran. We don't know what to do.
+
+VC is threatening to sue.
+
+Help.
+
+

Topics

+
+/topic/programming.gmi
+/topic/satire.gmi
+
+

Responses

+
+gemini://perplexing.space/2022/re-the-static-file-startup.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/52-growing-pains.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/52-growing-pains.gmi new file mode 100644 index 00000000..36dfdefd --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/52-growing-pains.gmi @@ -0,0 +1,163 @@ + + + + + MEMEX - Growing Pains [ 2022-03-23 ] + + + + + + +
+ +
+ + +
+
+

Growing Pains [ 2022-03-23 ]

+
+The search engine index has grown quite considerably the last few weeks. It's actually surpassed 50 million documents, which is quite some milestone. In February it was sitting at 27-28 million or so.
+
+About 80% of this is from side-loading all of stackoverflow and stackexchange, and the rest is additional crawling.
+
+The crawler has to date fetched 91 million URLs, but only about a third of what is fetched actually qualifies for indexing, for various reasons: some links may be dead, some may be redirects, and some may just have too much javascript and cruft to qualify.
+
+As a result of this growth spurt, some scaling problems have made themselves apparent. This isn't the first time this has happened, and it's nothing that can't be fixed.
+
+

Preconversion

+
+The search crawler writes index data in a journal. It does this in an order that makes sense from its point of view, basically groups of words that occur per document, since the crawler downloads one URL at a time.
+
+Schematically the data looks like this:
+
+
+  in memex.marginalia.nu/log/48-i-have-no-capslock.gmi these keywords were important: "capslock", "keyboard", "computer", "memex"
+
+
+  in memex.marginalia.nu/log/50-meditation-on-software-correctness.gmi these keywords were important: "software", "bug", "requirements", "memex"
+
+For a search engine to actually be able to search, it can't go through records like that one by one. This would start falling apart at just a few thousand documents. Instead, the data needs to be transposed, so that it is arranged in terms of documents per keyword. That is, like this:
+
+
+   "memex" occurs in: memex.marginalia.nu/log/50-meditation-on-software-correctness.gmi, memex.marginalia.nu/log/48-i-have-no-capslock.gmi
+   
+   "keyboard" occurs in: memex.marginalia.nu/log/48-i-have-no-capslock.gmi
+   
+   "bug" occurs in: memex.marginalia.nu/log/50-meditation-on-software-correctness.gmi
+   
+This transposition is a fairly slow process as every document to be indexed needs to be considered. At the current time, it's about 35 Gb worth of dense binary data (both keywords and URLs are represented as numbers).
+
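+Conceptually the transposition is nothing more exotic than the sketch below, although the real implementation works in multiple passes over large files of numeric IDs rather than in-memory maps of strings:
+
+  // journal: document -> keywords that were deemed important for it
+  // returns: keyword -> documents it occurs in
+  static Map<String, List<String>> transpose(Map<String, List<String>> journal) {
+    Map<String, List<String>> index = new HashMap<>();
+    journal.forEach((document, keywords) -> {
+      for (String keyword : keywords) {
+        index.computeIfAbsent(keyword, k -> new ArrayList<>()).add(document);
+      }
+    });
+    return index;
+  }
+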
+To make it possible to search different parts of the internet, to search among blogs or among academia separately, the search engine has multiple smaller indices that contain only some websites. Originally, when building these indices, the entire file was considered.
+
+The transposition process reads through the file multiple times: first to figure out how many documents there are per word, then to tentatively put those documents into a file ordered by word, and then to create a mapping of keywords.
+
+Because it takes this many passes, it is faster to have a pre-conversion step that breaks it down into the data each index is interested in beforehand.
+
+Preconversion reads through the file once, and produces eight different files containing only a subset of the original data. This has worked well to speed up transposition until recently, but has now gotten untenably slow.
+
+The problem was that it was reading the big index file and writing to the eight smaller index files on the same mechanical hard drive, causing the disk to seek very aggressively, which ate into the time available for reading and writing and degraded its performance.
+
+The fix was easy: write the preconversion output to another physical hard drive. The payoff was shaving the 20 minute preconversion down by five minutes. I would like it if it was faster than that, but this is still a big improvement.
+
+

SQL

+
+The search engine uses a SQL database for some of its metadata, MariaDB to be specific.
+
+The database contains mappings from internal numeric IDs to domains, urls; mappings between domains and URLs, linking information between domains, and so on. The usage is fairly basic but there's just a lot of data in the database. At this point, it takes 30 seconds to do a select count(*) from the URLs table, which contains at the time of writing 218 million known URLs.
+
+I'm low-key looking to migrate away from this as it's not all that well suited for the job, and doesn't make good use of the Optane drive it's on. It's just I don't quite know where I'm going to go. If I can't find anything, I may whip up a bespoke solution, but I'd rather not as there is enough work with the (bespoke) search engine index.
+
+Until then I'm putting out fires. I've tried tuning the database a bit better, and it may have gotten faster but not enough to make a real difference.
+
+Most of the slow queries are joins, especially touching the URLs table.
+
+Fixing them is not as easy as just adding an index. These tables are well indexed, but they're getting so large that even *with* appropriate indices, queries can easily take several seconds (without indices it would be more like 30 minutes). Ten second page loads are not a great experience, and it's also a bit of a vulnerability on my end, as one could feasibly overload the SQL server by refreshing a page that triggers such a query a couple of times per second.
+
+I'm getting around it by precalculating data that doesn't necessarily need to be live, such as the views for the number of known, visited and indexed documents in the site overviews. The downside is that the information may be stale. In most cases this isn't a huge problem, but from a maintenance perspective, it's one more thing that can go wrong.
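+
+As a sketch of what such precalculation can look like (hypothetical table and class names, not the actual service code): refresh a cached count on a schedule instead of running the slow COUNT(*) on every page load.
+
+  import javax.sql.DataSource;
+  import java.sql.SQLException;
+  import java.util.concurrent.Executors;
+  import java.util.concurrent.TimeUnit;
+  import java.util.concurrent.atomic.AtomicLong;
+
+  class KnownUrlCountCache {
+      private final AtomicLong cachedCount = new AtomicLong();
+
+      KnownUrlCountCache(DataSource ds) {
+          Executors.newSingleThreadScheduledExecutor()
+                   .scheduleAtFixedRate(() -> refresh(ds), 0, 1, TimeUnit.HOURS);
+      }
+
+      long knownUrls() {            // cheap to call, possibly slightly stale
+          return cachedCount.get();
+      }
+
+      private void refresh(DataSource ds) {
+          try (var conn = ds.getConnection();
+               var stmt = conn.createStatement();
+               var rs = stmt.executeQuery("SELECT COUNT(*) FROM EC_URL")) {   // hypothetical table name
+              if (rs.next()) cachedCount.set(rs.getLong(1));
+          }
+          catch (SQLException ex) {
+              // keep serving the previous value if the query fails
+          }
+      }
+  }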
+
+There was also a pretty significant performance degradation in the exploration feature for similar reasons, but that should be fixed now as well. Some of the images currently load fairly slowly, but that should pass in a couple of days. That particular drive is being used for a very large calculation and sees a lot of access contention.
+
+

MariaDB Memory leak

+
+It also turns out that MariaDB has a bit of a memory leak. It's probably not a bug like forgetting to de-allocate resources; judging by what other people are saying, it seems to be a problem with heap fragmentation. The effect is the same: it very slowly accumulates memory usage.
+
+For more conventional use, this might be fine, but as it stands, I'm really pushing the limits of the hardware, so it's important that services stay relatively fixed in their memory requirements, otherwise stuff starts getting killed off by the dreaded OOMKiller.
+
+I'm not a huge fan of having cron jobs that restart services at random intervals so I would like to avoid that if at all possible. Word is that you can mitigate this memory creep by changing the memory allocator from malloc to something like tcmalloc or jemalloc. This does seem to at least slow it down a tad.
+
+

What's Next?

+
+It works until it doesn't, and then I go looking for something else that works. That's how building this search engine has gone, pretty much from day one. Getting to a million documents required optimizations, and so did getting to ten million. I think I can make it to a hundred million, and no doubt that will require yet more tweaks and fine-tuning.
+
+It's difficult to predict what will break beforehand. I know the index transposition will get slower and slower, I know the SQL database is struggling, but it may be something else entirely that blows out next.
+
+There are paradigms that are almost guaranteed to scale well up to a very large scale without these problems, but the crux is that they have a fairly high constant cost.
+
+That is, building the system that way would not allow me to do as much with the hardware the system is running on. The admittance fee to large scale computing is large scale computers.
+
+I'd like to see if I can't do the same with a small computer instead, if nothing else than for the sake of shifting the power balance. What if you don't actually need a data center and a budget the size of the GDP of a small country to run a search engine? What if it's within the reach of humans? Wouldn't that be a kicker...
+
+

See Also

+
+ +
+/log/06-optimization.gmi
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/53-better-hard-drive-metaphor.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/53-better-hard-drive-metaphor.gmi new file mode 100644 index 00000000..eff1a17f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/53-better-hard-drive-metaphor.gmi @@ -0,0 +1,92 @@ + + + + + MEMEX - Is There A Better Hard Drive Metaphor? [ 2022-04-03 ] + + + + + + +
+ +
+ + +
+
+

Is There A Better Hard Drive Metaphor? [ 2022-04-03 ]

+
+This is mostly a post to complain about something that chafes. I wish there was a programming language (ideally several) that acknowledged that computers have hard drives, not just a processor, RAM and other_devices[].
+
+Something that has struck me when I've been working with the search engine is how unfinished the metaphor for accessing physical disks is in most programming languages. It feels like an after-thought, half left to the operating system to figure out, a byzantine relic of the days when computers had tape drives and not SSDs.
+
+Reading and writing files is clunky and awkward no matter how you do it. Objects and classes are representations of bytes in memory, effortlessly integrated in the language. Why can't they be representations of bytes on a disk? Between mmap and custom allocators, this seems extremely doable.
+
+It's a jarring contrast to the rest of almost any programming language other than perhaps C. In fact, what you've got is effectively C, with all of its problems and more.
+
+In the rest of the languages, there may be some token effort toward reading files structured as streams of objects, but in general, you are stuck filling buffers with bytes. There is a marked lack of expressiveness in this type of programming.
+
+This has fed into this divide where there are robed greybeard mystics who have peered beyond the veil and know how to access a hard drive effectively, performing eldritch rites with B-trees, ring buffers, and other forbidden data structures you vaguely may remember from your deepest slumber during that one class as a second year undergraduate; and the rest of the programmers who will never awaken to this unspeakable knowledge.
+
+Often we use a DBMS as a stopgap solution to get around the sorry state of disk access, but that in itself is a kludge in many cases. Object-relational mapping is a shoe that never quite fits, and if possible SQL integrates even worse into other programming languages than disk access does.
+
+Besides, relational databases are hard; it's all still too arcane. Let's just turn it into a kv-store with JSON blobs for each value.
+
+The file system itself is actually a database too. They're based on pretty much the exact same principles and data structures as your average DBMS.
+
+So what you've got is a nightmarish turducken, the file system as the OG NoSQL database from generations past, containing a relational SQL database, containing an ad-hoc NoSQL database. It's like that old Xzibit meme, putting a database in a database so you can database while you database.
+
+Feels like a state of surrender. Sorry guys, hard drives were too difficult to figure out, let's bury them in abstractions and forget they exist.
+
+We're off on a bad track here, probably starting with the operating system attempting to hide from userspace the fact that hard drives are block devices, while this is something you urgently need to lean into if you want your program to be fast.
+
+With all the development on new programming languages going on, each one guaranteed to be more memory safe than the last, is anyone working on a language that properly integrates hard drives as a first class citizen? It would be a neat thing to explore.
+
+

Topics

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/54-bargain-bin-btree.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/54-bargain-bin-btree.gmi new file mode 100644 index 00000000..ab1481c4 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/54-bargain-bin-btree.gmi @@ -0,0 +1,170 @@ + + + + + MEMEX - The Bargain Bin B-Tree [ 2022-04-07 ] + + + + + + +
+ +
+ + +
+
+

The Bargain Bin B-Tree [ 2022-04-07 ]

+
+I've been working lately on a bit of an overhaul of how the search engine does indexing. How it indexes its indices. "Index" is a bit of an overloaded term here, and it's not the first that will crop up.
+
+Let's start from the beginning, build up, and examine the problem of searching for a number in a list of numbers. You have a long list of numbers; let's sort them, because why not.
+
+I will print a short list of numbers, but extend it in your mind.
+
+
+  1 3 4 5 7 9 10 14 16 17 18 20 21 27 28
+
+You're tasked with finding whether 20 is a member of this list. The simplest solution is to go over it one by one, as a linear search.
+
+This is fine if the list is short, but our list is unimaginably long. Think at least 15 megabytes, not 15 items. We want finding these items to be blazingly fast, too. People are waiting for the search results page to load. We can't spend 20 minutes checking every item on the list.
+
+This list actually indexes itself. The next step from a linear search is to use a binary search. Any sorted list implicitly forms a search tree structure.
+
+Check the middle item, [14], is that smaller or larger than 20? It's smaller. Check the item in the middle of the remaining upper half? Well what do you know, that's [20]. A hit in two tests.
+
+The worst case for a list of 15 items is four tests, the average is about three, or rather, approximately log2(N) - 1. This is pretty great when dealing with RAM. Even for 15 megabytes, the expected number of tests is only 24. The distribution is skewed very heavily toward this value, with a 75% chance to get 24 or 25 tests. Even at 150 gigabytes you get just 38 tests. Memory reads are very fast, so this is pretty sweet! Mission accomplished!
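+
+For reference, a minimal sketch of the search and of the back-of-the-envelope probe counts above (ordinary Java, nothing project specific):
+
+  import java.util.Arrays;
+
+  class BinarySearchDemo {
+      public static void main(String[] args) {
+          long[] list = {1, 3, 4, 5, 7, 9, 10, 14, 16, 17, 18, 20, 21, 27, 28};
+          System.out.println(Arrays.binarySearch(list, 20) >= 0);   // true, found in a handful of probes
+
+          // the number of probes grows with the logarithm of the item count
+          for (long n : new long[] { 15, 15_000_000L, 150_000_000_000L }) {
+              long probes = (long) Math.ceil(Math.log(n) / Math.log(2));
+              System.out.printf("%,d items -> ~%d probes%n", n, probes);
+          }
+      }
+  }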
+
+

SSDs

+
+Now, if the list is 150 gigabytes, we probably can't keep it in memory. We must keep it on disk instead. No worry, you might think. SSDs are great, they have good random access performance! That's a statement that should come with an asterisk of the sort you find after "free magazine subscription".
+
+Programming for SSDs is complicated, and on top of that, there's what the SSDs themselves do, what the SSD controller does, what the SATA protocol does, what the operating system does, what the programming language does. All these factors affect how much juice you get when you squeeze.
+
+For most tasks it doesn't matter, you can treat SSD reads as effectively free. A search engine index isn't most tasks, however. It's a weird niche of a task that breaks the mold even for most traditional DBMS approaches.
+
+Practically speaking, we can imagine that SSDs read and write data in chunks. The size is device specific and pretty hard to actually find out, but 4096 bytes is a good guess; even if it's wrong, it aligns with the page size of the operating system, which is another thing we really need to align ourselves with if we want to go fast.
+
+What you need to know is that when you tell your operating system to read a byte off a memory mapped SSD, you will get a 4K page of data. The OS may decide to read more than that, but it won't read less. The same goes when you write, except this time it's the SSD hardware that forces each write to be exactly one block. It tries to pool and gather writes, but that's not always possible. The bottom line is that sequential I/O is fast, and the more random and small the I/O is, the worse things get. You don't need to wait for a seek like you do with mechanical hard drives, but that doesn't mean random I/O is free like in RAM.
+
+There is a name for this phenomenon: I/O amplification. There are multiple sources of I/O amplification: the hardware, the operating system, the algorithm you're using. It's a measure of how much work is done relative to the minimum the task requires. If inserting 1 byte in the middle of an array list means you have to shift 40,000,000 bytes to the right in that list, the write amplification is 40 million. If you want to find 1 byte in a linked list and you have to go through 25,000,000 bytes worth of nodes to get to it, then the read amplification is 25 million.
+
+The operating system reads and caches the surrounding data in case you might need it. If you read one byte, you read it from disk, and if you then go to read the byte next to it, you're reading from memory and not from disk, which is so much faster it may as well be considered free.
+
+If you are reading scattered bytes off an SSD, as you would during a binary search, you're going to cause dozens of these page faults; that is, the operating system will actually have to go fetch the bytes rather than read them from its cache, and not only those bytes, but the blocks they belong to.
+
+It is much, much, much slower to read from a disk than from main memory, so the performance of a disk based index can be measured in how many pages it touches. The bad news is that our binary search touches a lot of them. Our 38 tests from before are roughly equivalent to touching 28 pages. Even though we're only looking at 38 positions in the index file (a total of 304 bytes), we're causing the operating system to read something like 112 Kb of data. Ouch.
+
+There must be a better way.
+
+

The Bargain Bin B-Tree

+
+If we lean into the reality that reading data within a 4K page is cheap to the point of nearly being free, then we can create a tree structure based on 4K pages. You'd normally go for a proper B-tree or an LSM-tree, but since this is a static index with no real-time updates, we can go for something a bit stupider.
+
+The search engine uses 64 bit words for these indexes. Each 4K page fits 512 words.
+
+Let's build an implicit B-tree with branching factor 512. This is a balanced search tree of height Θ(1+log512(N)), where each node is a sorted list of numbers implicitly indexing the children of the node. I'll spare you the details of the implementation.
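+
+Details aside, a rough in-memory sketch of the idea might look like the following (a simplified illustration, not the search engine's actual code): each index level stores the last key of every 512-key block in the level below, and a lookup does one small binary search per level.
+
+  class BargainBinBTree {
+      static final int B = 512;      // branching factor: one 4K page of 64-bit words
+
+      private final long[] data;     // the sorted data itself
+      private final long[][] levels; // index levels, levels[0] is the root
+
+      BargainBinBTree(long[] sortedData) {
+          this.data = sortedData;
+          var built = new java.util.ArrayList<long[]>();
+          long[] below = sortedData;
+          while (below.length > B) {                        // build index levels bottom-up
+              long[] level = new long[(below.length + B - 1) / B];
+              for (int i = 0; i < level.length; i++)        // last key of each block below
+                  level[i] = below[Math.min(below.length, (i + 1) * B) - 1];
+              built.add(level);
+              below = level;
+          }
+          java.util.Collections.reverse(built);
+          levels = built.toArray(new long[0][]);
+      }
+
+      boolean contains(long key) {
+          int block = 0;                                    // candidate block in the level below
+          for (long[] level : levels) {
+              int lo = block * B, hi = Math.min(level.length, lo + B);
+              int pos = lowerBound(level, lo, hi, key);
+              if (pos == hi) return false;                  // larger than everything in the tree
+              block = pos;
+          }
+          int lo = block * B, hi = Math.min(data.length, lo + B);
+          int pos = lowerBound(data, lo, hi, key);
+          return pos < hi && data[pos] == key;
+      }
+
+      private static int lowerBound(long[] a, int lo, int hi, long key) {
+          while (lo < hi) {                                 // first index in [lo,hi) with a[i] >= key
+              int mid = (lo + hi) >>> 1;
+              if (a[mid] < key) lo = mid + 1; else hi = mid;
+          }
+          return lo;
+      }
+  }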
+
+This data structure is like a conventional B-tree's provincial cousin, disregarding any concern for inserts or data mutation for the sake of reducing the index size. I will admit, I don't know what this data structure is called. It's simple in so many ways it may not even have a name. I can imagine Donald Knuth looked at it, furrowed his brows, and brushed it off into the garbage like stray pencil shavings.
+
+I'm tentatively calling it The Bargain Bin B-Tree until someone tells me it has an established name.
+
+Before we dismiss it, let's nevertheless see how it performs. At first glance each look-up touches exactly 2+log512(N) pages. We can reduce those 28 average page reads from the binary search to 6 guaranteed page reads in every case. This is a decent start.
+
+But wait, there's more! The physical size of this index is a fraction over 1/512 that of the original data for large data volumes. The index is only about 300 Mb if our data-to-be-indexed is 150 gigabytes!
+
+We can easily fit the entire tree neatly in memory, and then we only ever need a single disk read to find our needle. Only if the data creeps into the terabytes do we need to start looking at two reads per look-up.
+
+Huh!
+
+This is pretty good, as long as this model of performance holds! But does it...?
+
+It turns out it's extremely difficult to actually benchmark disk reads, because you're dealing with layers and layers of caching abstractions. It's easier to do with writes, but even then it's hardly trivial.
+
+The more times you run a read benchmark, the faster it gets as the operating system begins to cache the data even beyond what you'd expect to see in the field. With Java, the virtual machine also speculatively recompiles the code as it figures out which parts are hot.
+
+This observer effect is very difficult to get around. The closer you examine the problem, the harder it is to tell what you are even benchmarking (other than the benchmark itself).
+
+Overall, it's really hard to find good resources on programming for SSDs, perhaps in part because it's so hard to benchmark. I don't like being told not to worry about this when disk I/O is the primary limiting factor of my search engine.
+
+So I've gathered a pittance of links I could find here.
+
+ + +
+ +
+ +
+ +
+
+Please let me know if you are aware of additional reading on this topic.
+
+

Write-in suggestions

+
+ + +
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/55-lexicon-rubberduck.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/55-lexicon-rubberduck.gmi new file mode 100644 index 00000000..28b7096c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/55-lexicon-rubberduck.gmi @@ -0,0 +1,128 @@ + + + + + MEMEX - Lexicon Architectural Rubberducking [ 2022-04-11 ] + + + + + + +
+ +
+ + +
+
+

Lexicon Architectural Rubberducking [ 2022-04-11 ]

+
+I'm going to think out loud for a moment about a problem I'm considering.
+
+RAM is a precious resource on any server. Look at VPS servers, and you'll be hard pressed to find one with much more than 32 Gb. Look at leasing a dedicated server, and it's the RAM that really drives up the price. My server has 128 Gb, and it's so full it needs to unbutton its pants to sit down comfortably. Anything I can offload to disk is great.
+
+A significant amount of the memory usage is in the lexicon. The lexicon is a mapping from search terms, words (sometimes N-grams), to unique numeric IDs, as these IDs are a lot more space-efficient than indexing words as strings.
+
+The contract for the lexicon is that every time you enter a specific string, you get the same number back. This number is unique to the string.
+
+At the moment of writing, the lexicon has about 620,000,000 entries.
+
+These strings are of average length 6-7 bytes, so the smallest it's ever going to get is about 4-5 Gb. The strings are already compressed.
+
+What I'm using is:
+
+
+  8 Gb off-heap for a hash table 
++ 6 Gb on-heap for metadata
++ 5 Gb off-heap for the string data itself
+-------
+= about 20 Gb
+
+Assigning unique IDs to arbitrary length strings isn't an entirely trivial problem when the number of IDs creeps toward the billions, but this memory consumption is still unreasonable.
+
+Maybe a DBMS can fix this? The URL mapping in MariaDB, some 200 million URLs, is just ridiculously large at ~40 Gb. MariaDB probably can't solve this with the hardware I have available. Maybe some other database?
+
+

What if we just use hashes as identifiers?

+
+Can we find a hash of such a size that we can accept hash collisions as so unlikely it won't matter?
+
+The Birthday Paradox becomes a significant problem when the number of items N is such that the number of distinct hash values M < N^2.
+
+
+M   = 18446744073709551616 = 2^64 
+N^2 = 384400000000000000   = (620,000,000)^2
+
+It *could* work with a 64 bit hash, but a 128 bit hash would feel a lot less sketchy. It would also use a lot more space. Caveat: I would need a very good hash function for this math to work out. Murmur3?
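+
+As a sanity check of that math, here is a throwaway calculation (plain Java, not project code) of the standard birthday approximation P ≈ 1 - exp(-N^2/(2M)): with 620 million entries, a 64 bit hash gives roughly a 1% chance of at least one collision anywhere in the lexicon, while a 128 bit hash makes it vanishingly unlikely.
+
+  public class CollisionOdds {
+      public static void main(String[] args) {
+          double n = 620_000_000d;          // lexicon entries
+
+          // probability of at least one collision: 1 - exp(-n^2 / (2m))
+          double p64  = -Math.expm1(-n * n / (2 * Math.pow(2, 64)));
+          double p128 = -Math.expm1(-n * n / (2 * Math.pow(2, 128)));
+
+          System.out.printf("64-bit hash:  %.4f%n", p64);    // ~0.01, about a 1% chance
+          System.out.printf("128-bit hash: %.2e%n", p128);   // ~6e-22, i.e. never
+      }
+  }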
+
+

Hold my beer...

+
+What if we create a hash table on disk, where the key is the hash from above, and we size it to 2^32 entries? This should allow for a lexicon of ~2^31 entries with good retrieval performance.
+
+Disk size would be 32 or 64 Gb depending on whether we use 64 or 128 bit hashes. We can use the cell number the final hash is put into as an ID.
+
+This is just crazy enough to work, but it would depend on having extremely solid random write IOPS on the disk, or enough RAM to do the construction entirely in memory. Maybe journal the writes and then reconstruct the hash table only after a crash. This *may* be acceptable, but makes a ton of RAM and/or enterprise SSDs mandatory for running this software.
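+
+A bare-bones sketch of the slot-as-ID idea (hypothetical code; a plain array stands in for what would really be a memory-mapped file, and the table is assumed never to fill up):
+
+  class HashtableLexicon {
+      private final long[] table;    // 0 means "empty slot"
+      private final int mask;
+
+      HashtableLexicon(int sizeAsPowerOfTwo) {
+          table = new long[1 << sizeAsPowerOfTwo];
+          mask = table.length - 1;
+      }
+
+      /** Returns the ID for a term hash, claiming a slot if it is new. */
+      int getOrInsert(long termHash) {
+          if (termHash == 0) termHash = 1;                // keep 0 reserved for empty slots
+          int slot = (int) (termHash & mask);
+          while (true) {
+              if (table[slot] == termHash) return slot;   // seen before: same ID as last time
+              if (table[slot] == 0) {                     // free slot: the slot number becomes the ID
+                  table[slot] = termHash;
+                  return slot;
+              }
+              slot = (slot + 1) & mask;                   // linear probing
+          }
+      }
+  }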
+
+An additional drawback is that this mapping can't ever grow beyond 2 billion entries. This may be acceptable; it might be possible to scooch it up by multiples of 2 by partitioning on some bit that isn't part of the table hash. The drawback is that this configuration can't be changed without reconstructing the entire index.
+
+The real upside is that this may make it possible to remove the requirement for 7 bit ASCII keywords.
+
+Need to sleep on this.
+
+

Topics

+
+/topic/programming.gmi
+/topic/astrolabe.gmi
+
+

See Also

+
+/log/06-optimization.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/56-uncertain-future.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/56-uncertain-future.gmi new file mode 100644 index 00000000..1906fb89 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/56-uncertain-future.gmi @@ -0,0 +1,92 @@ + + + + + MEMEX - Uncertain Future For Marginalia Search [ 2022-04-28 ] + + + + + + +
+ +
+ + +
+
+

Uncertain Future For Marginalia Search [ 2022-04-28 ]

+
+I found myself effectively without a job on short notice.
+
+I'm not at all worried about finding another one, I have savings, and I have experience, and I have demonstrable skill. What I am concerned about is finding a source of income that's compatible with putting some time on my personal projects.
+
+For the last bunch of years, I've been working 32 hour weeks, which is a pretty sweet deal, especially combined with the zero hour commute you get working from home during the pandemic. Not every employer is fine with that, and while I do have options, I'm in a worse bargaining position than I have been before.
+
+In the short term I'm going to take a few months' hiatus and work full time on the search engine. I have a lot of ideas that I haven't had the time to thoroughly explore that will hopefully produce decent improvements to some of the areas that are most lacking.
+
+Perhaps the biggest thing is looking over the design a bit in terms of affordances. Coming from something like Google, it can be difficult to find the sort of queries that produce interesting and worthwhile results, and I think this can give a bad first impression.
+
+Another thing that needs fixing is the summary text for each search result. It's bad more often than it is good as things stand right now, and often makes good search results look like bad hits.
+
+Finally, I'm working on changing the crawler model to be more aligned with WARCs, the format used by the Internet Archive and various other crawlers such as Common Crawl. This is half-done already, but it should hopefully make the data more portable and open up for collaborating with other crawling efforts.
+
+Ideally I would love to be able to work on this for a longer time, but search engines just aren't all that profitable, less still with my general lack of business sense. I'm incredibly grateful that I have patreon supporters and so on, but right now they just about pay the cost of running the hardware. But who knows, maybe I'll build something so impressive in the next months as to open up some alternatives.
+
+Not a likely turn of events, but still.
+
+Reaching for the stars may not let us grasp them, but we stand taller all the same through the effort.
+
+

Topics

+
+/topic/astrolabe.gmi
+
+

See Also

+
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/57-dont-know-how-to-build-software.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/57-dont-know-how-to-build-software.gmi new file mode 100644 index 00000000..e8feffd2 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/57-dont-know-how-to-build-software.gmi @@ -0,0 +1,114 @@ + + + + + MEMEX - I don't know how to build software [ 2022-05-06 ] + + + + + + +
+ +
+ + +
+
+

I don't know how to build software [ 2022-05-06 ]

+
+There are a lot of ways of building software, there are many languages you could choose to build it with, many libraries to rely on, many frameworks to leverage, many architectural approaches, many platforms to choose, many paradigms of daily operations to follow.
+
+It takes years to get in-depth experience with just one permutation of these options.
+
+I've been programming for over twenty years, only half the time professionally, but that is how long I've been building software. I've built about twelve applications in my twenty years of development, of varying size and complexity.
+
+This has granted me in-depth experience with three programming languages out of dozens, a few dozen libraries out of thousands, a few architectural principles, two operating systems, three source control systems, two operations paradigms.
+
+I have seen approaches coincide with problems, and I've seen approaches coincide with success, but usually only one or two times; from this I can't actually know whether the failures and successes were accidental to the approach.
+
+I could have hopped languages and stacks more often, and I have indeed dabbled in many more, but not enough to actually judge their merits. Doing this, I would have cause to be even less certain.
+
+I think this is fairly representative. No matter how long you've been working with this, I don't think a human being exists that has actually tried everything extensively enough to be able to say what works, especially with the fractalline explosion of libraries and frameworks and languages that have emerged in the last few years. How could anyone judge which of these is the best, given nobody can have experience with them all?
+
+Most of the critics of waterfall haven't actually worked in such a project. Most of the critics of either C or javascript haven't actually built anything noteworthy with it. Most of the critics of OOP don't have much practical experience with it. Most of these criticisms are criticisms of the strange and other, not the deeply familiar.
+
+Still when I go talk to other programmers, I find very strong and confident opinions about what is the best way of building software, of why this-and-that is the superior approach to all others.
+
+Putting aside judgements about whether this is good or bad, and just evaluating the discourse for what it is, here is a fascinating question: Given there isn't enough human lifetime that this actually can come from experience, where do all these confident opinions about building software actually come from?
+
+Dunning-Kruger may spring to some people's minds, but let's quickly dismiss that notion, as the public conception of D-K isn't actually how it works, and the effect may not even exist[1].
+
+Do we merely parrot other people's seemingly confident opinions? This seems insufficient, there still must be a source for these parroted opinions somewhere.
+
+Maybe if you're a cloud provider or if you've developed a project or you're a consultant selling services, maybe then you have a vested interest in promoting your way of doing it, of promoting using your tools.
+
+Is it all self-promotion? Is this what we're all parroting?
+
+This could be it, but where does that actually leave programmers, where does that leave the discussion? Is it just a pointless tug-of-war between who can most successfully recruit people to shill their product?
+
+A lot of large software projects fail fairly spectacularly. Many successful projects started experimentally as some guy hacking on something, despite not having armies of seasoned consultants and top of the line architects with all the right certifications.
+
+I think it would be beneficial to start thinking about programming more from personal experience, and less from theoretical models because these theoretical models don't seem particularly well founded in reality.
+
+It's funny how many in engineering consider themselves skeptics, but they often extend this skepticism only toward what they already doubted, rarely to what they take for granted (which is where it would actually do good).
+
+What do *I* know? This is the question, not "what is regarded as true".
+
+I don't know how to build software.
+
+

See Also

+
+ +
+https://encyclopedia.marginalia.nu/wiki/Useful_idiot
+
+

Topic

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/58-marginalia-open-source.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/58-marginalia-open-source.gmi new file mode 100644 index 00000000..1e2a790f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/58-marginalia-open-source.gmi @@ -0,0 +1,120 @@ + + + + + MEMEX - marginalia.nu goes open source [ 2022-05-27 ] + + + + + + +
+ +
+ + +
+
+

marginalia.nu goes open source [ 2022-05-27 ]

+
+After a bit of soul searching with regards to the future of the website, I've decided to open source the code for marginalia.nu, all of its services, including the search engine, encyclopedia, memex, etc.
+
+A motivating factor is that the search engine has sort of grown to a scale where it's becoming increasingly difficult to work on productively as a personal solo project. It needs more structure. What's kept me from open sourcing it so far has also been the need for more structure. The needs of the marginalia project and the needs of an open source project have effectively aligned.
+
+So fuck it. Let's make Marginalia Search an open source search engine.
+
+I don't know how much traction this will get in terms of contributions, but as search is like a fractal of fun and interesting problems to be tackled, it's almost a bit cruel to keep it all to myself.
+
+There's some effort in documenting the project and cleaning up the build process needed before this can get going in earnest, but that will be an ongoing task for quite some while. This work was needed regardless, and if nothing else this serves as a good vehicle for introducing some process into the development of this project and getting around to slaying some of those ancient dragons (this is necessary at this point regardless).
+
+

Sources and Hosting

+
+I feel GitHub has taken an incredibly toxic turn with its emphasis on social features, and in general dislike the notion of renting space on the Internet, therefore I'm hosting the sources on a gitea instance.
+
+https://git.marginalia.nu/marginalia/marginalia.nu
+
+As of right now the code is very much as-is. There is still some work needed to get it to a point where it's even possible to run it on another machine.
+
+I'm currently looking for hosting for a large term frequency data file that is necessary for several of the search engine's core functions. I really don't have the bandwidth to serve it myself. It's only a couple of hundred megabytes so it'll probably be solvable somehow.
+
+

Q&A

+
+

What if the SEO people learn all the secrets?

+
+They're probably going to figure them out anyway. If Google teaches us anything, it's that attempting to hide what you are doing from the SEO industry flat out doesn't work.
+
+What shields Marginalia from SEO spam isn't security through obscurity, but that it places demands on websites that are mutually contradictory to Google's demands. As long as Marginalia Search is smaller than Google, Marginalia is safe.
+
+

I don't like Java

+
+I know a lot of people break out in eczema when exposed to this language. Rest assured it's not enterprisey Java, and between the JVM's ability to interoperate with other languages (including Python and Scheme), and the fact that the entire system is based around web services, there's *probably* something that can be done to accommodate for other languages-of-choice.
+
+

What is the license?

+
+It's AGPLv3.
+
+

I have strong negative opinions on something about the project

+
+If you feel the need to complain about how something doesn't align with your personal philosophical convictions and fails to satisfy your criteria for ideological purity, please write a really long and angry essay about this topic, and send it to <kontakt@marginalia.nu>.
+
+Don't forget to press caps lock as you begin typing to save your pinky fingers, I wouldn't want to be responsible for nasty RSI.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/59-anchor-text.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/59-anchor-text.gmi new file mode 100644 index 00000000..56fca95f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/59-anchor-text.gmi @@ -0,0 +1,195 @@ + + + + + MEMEX - Fun with Anchor Text Keywords [ 2022-06-23 ] + + + + + + +
+ +
+ + +
+
+

Fun with Anchor Text Keywords [ 2022-06-23 ]

+
+Anchor texts are a very useful source of keywords for a search engine. An older version of the search engine used the text of such hyperlinks as a supplemental source of keywords, but due to a few redesigns, this feature has fallen off.
+
+The last few days have been spent trying to re-implement it in a new and more powerful fashion. This has largely been enabled by a crawler re-design from a few months ago, which offers the crawled data in a much more useful fashion and allows much more flexible post-processing.
+
+It is easy enough to grab hyperlinks within the same domain that is being crawled and process them on the spot and assign the keywords to each document.
+
+Unfortunately these are often not very useful.
+
+Not only are the keywords often non-descriptive, 'read more'-type stuff, there's an additional benefit to external links: they are other people describing websites. That tends to align better with the sort of keywords people enter into a search engine. When we use a search engine, we're not infrequently describing the document we're looking for.
+
+"python manual"
+"cheap car parts"
+"job interview tips"
+
+This is why the best links are other websites' links, but they are also the hardest links to deal with.
+
+There are practical problems, as the keywords are not located near the document they refer to, but rather scattered over other documents. Before being loaded, they must be deduplicated and grouped by the document they refer to.
+
+The grouping is necessary because it saves a lot of work for the index construction to be able to say "here is a document, and these are its keywords: [...]", rather than loading them one by one.
+
+Grouping can be done by pre-sorting into a few dozens or hundreds different output files, making the file sizes manageable for fine-grained in-memory sorting and loading later.
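+
+The pre-sorting can be as simple as routing each record to an output file by a hash of the destination URL, something along these lines (an illustrative, hypothetical helper):
+
+  // every mention of the same URL lands in the same output file,
+  // so each file can be sorted and loaded independently later
+  static int outputFileFor(String destinationUrl, int numberOfFiles) {
+      return Math.floorMod(destinationUrl.hashCode(), numberOfFiles);
+  }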
+
+Of these, deduplication is the harder problem due to the sheer volume of data. To show why keyword deduplication is tricky, let's break out the napkin math!
+
+
    +
  • If we have 100,000,000 documents
  • +
  • Each document has on average 4 unique keywords
  • +
  • Each keyword is on average 9 bytes
  • +
  • Each URL is on average 51 bytes
  • +
  • Then all (document,keyword) requires at least 4x60x100,000,000 bytes
  • +
  • That's roughly 24 Gb
  • +
  • That's without considering any sort of language overhead!
+
+Oof :-(
+
+This has the potential to be a real memory hog. Maybe you could get away with it, but it seems super sketchy. You could of course keep it on disk, but then it would be impossibly slow and a real nasty IOPS hog.
+
+There are enough weeks long processing jobs in this search engine, and it really doesn't need more of them.
+
+Thinking about this for a while, the solution that sprang to mind was pretty simple.
+
+A big old bloom filter.
+
+Make it 2 Gb or so, which means a bit set with a cardinality of 16 billion. Hash collisions are to be expected, as the birthday paradox limit, where there is a 50% chance of a single hash collision, is at sqrt(16 billion)≈126k. That's arguably acceptable: at the expected 4 keywords per document, the filter is only populated to a degree of 0.00025%, which also becomes its worst case false rejection rate assuming a perfect hash function.
+
+Call it an optimist's hash set. Sometimes good enough is good enough, and the solution is nice and constant in both time and space.
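+
+A sketch of what such an optimist's hash set could look like (assuming the caller supplies a good 64 bit hash of each (document, keyword) pair; this is an illustration, not the production code):
+
+  class OptimistsHashSet {
+      // 2 Gb worth of bits = 16 billion positions
+      private final long[] bits = new long[250_000_000];
+      private final long positions = 64L * bits.length;
+
+      /** True the first time a hash is seen, false for likely duplicates.
+          Very rarely it will falsely reject something it has never seen. */
+      boolean add(long hash) {
+          long pos  = Long.remainderUnsigned(hash, positions);
+          int  word = (int) (pos >>> 6);
+          long bit  = 1L << (pos & 63);
+
+          if ((bits[word] & bit) != 0)
+              return false;          // probably seen before
+          bits[word] |= bit;
+          return true;
+      }
+  }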
+
+

Results

+
+Having run some trials extracting keywords for links to documents currently indexed by the search engine, the results are promising.
+
+The code is extremely fast, almost surprisingly so: it runs through even a large body of documents such as StackOverflow in about an hour.
+
+The raw output from the experiment can be downloaded here:
+
+ + + +
+Below are keywords sorted by frequency, which will tend to raise the least informative keywords to the top. It illustrates how there is a significant lexicon of junk keywords that needs to be excluded, demonstratives like 'here' and 'this', navigation elements and so forth.
+
+

External Links, Crawled Data Subset 10k domains

+
+
+    408 website
+    399 page
+    350 link
+    201 race
+    200 web
+
+Note: The sample is skewed by a racing website that basically has the anchor text 'race' for a lot of links.
+
+

Internal Links, Crawled Data Subset 10k domains

+
+
+  17385 content
+  17276 skip
+  14664 next
+  10986 previous
+   7549 read
+
+

External Links, StackOverflow

+
+StackOverflow seems to provide high value keywords overall, even its junk words are frequently informative.
+
+
+   4701 documentation
+   3061 docs
+   2680 link
+   2418 page
+   2418 here
+   1885 article
+   1813 tutorial
+   1539 example
+   1207 guide
+   1174 official
+   1079 doc
+
+Wikipedia seems less useful, because a lot of its links just mirror the title of the website they link to, which means they don't provide any additional information.
+
+It would be interesting to look at Reddit comments as well. While it is basically the internet capital of astroturfing, given that the links are filtered by all the criteria needed for inclusion in the search database, it may still be a good source.
+
+In general, the limited scope of the search engine and the existing filtering is probably something that has a decent chance of limiting the impact of spam links.
+
+

Closing thoughts

+
+This is far from finished, but it's a very promising lead.
+
+There will be a major upgrade of the search engine coming in about a month or so, mostly necessitated by running out of disk space on the database hard drive, and there is no way of moving forward with this without essentially rebuilding the database. I have ample backups so it's not as scary as it sounds, worse comes to worst it'll go back to being a stale copy of itself. It's honestly a welcome opportunity to fix old design mistakes to make the code more workable. This feature is slated to be included in that upgrade.
+
+

Topics

+
+/topic/astrolabe.gmi
+
+

See Also

+
+ + +
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/60-prescriptive-descriptions.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/60-prescriptive-descriptions.gmi new file mode 100644 index 00000000..de5bbc0f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/60-prescriptive-descriptions.gmi @@ -0,0 +1,97 @@ + + + + + MEMEX - On Prescriptive Descriptions [ 2022-07-14 ] + + + + + + +
+ +
+ + +
+
+

On Prescriptive Descriptions [ 2022-07-14 ]

+
+I'd like to discuss a mental somersault that I've found has caused me a lot of grief in the past, which is prescriptive descriptions. Let's break this down a bit:
+
+
    +
  • A descriptive statement is a statement about how something is.
  • +
  • A prescriptive statement is a statement about how something must be, a rule or a law.
+
+If I stay up a bit late, do most of my work in the evenings, wake up tired and just sort of putter about until noon, I might describe myself as a night person because of this.
+
+I might not realize that if I cut down on the coffee, get more physical activity, and get to bed a bit earlier, I (the same person) might fly out of bed in the morning and do most of my work before noon, and then get sort of tired and not do much work later that evening. That's what a morning-person does.
+
+Now, let me ask you a question: Am I a night person because I stay up late, or do I stay up late because I am a night person?
+
+I will argue that the nature of descriptive statements is that they describe how things are (or have been). Having reason to describe something is the effect of the appearance of things, not their cause. Describing a tomato as red is caused by the appearance of the tomato being red; it does not cause the tomato to be red. Being a violinist does not cause a person to play the violin; playing the violin causes them to earn the description of a violinist.
+
+Likewise, being a night-person never caused me to stay up late, staying up late caused me to earn the description of night-person.
+
+It's common, and I've certainly fallen for this trap as well in the past, to flip this around and look at descriptions of yourself as immutable laws of nature.
+
+Granted, things have a tendency toward momentum, and what has been in the past often tends to continue into the future, but generalizing this into some iron law of nature essentially leads us to the absurd conclusion that nothing can ever change; doubly absurd when we're applying this to the context of decision-making: Whatever I have done before, I must always do again?!
+
+I am lazy, I am shy, I am clumsy, I am stupid, I can't learn new things, I dislike new experiences, I'm not into that thing I haven't ever tried. You hear this stuff all the time, often presented as reasons not to try or do things. These types of descriptions can be a significant obstacle to enjoying life. They're self-imposed shackles.
+
+Even on the flip side, if I say I am strong, I am productive, I am a paragon of virtue, I never lie, I never fail, I can do anything; these types of descriptions can pave the way to burn-out, a source of self-loathing or regret, an obstacle to accepting yourself and seeking much needed help.
+
+The conclusion isn't to change how you describe yourself, but to understand the nature of descriptions as describing the past rather than prescribing the future; and to look to understand things as they are, rather than how they appear to be based on past experiences.
+
+
+
+Also, before someone inevitably crawls out of the woodwork to tell me how they really are a night-person: Like, sure.
+
+

Topic

+
+/topic/moral-philosophy.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/61-botspam-apocalypse.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/61-botspam-apocalypse.gmi new file mode 100644 index 00000000..d3463899 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/61-botspam-apocalypse.gmi @@ -0,0 +1,102 @@ + + + + + MEMEX - Botspam Apocalypse [ 2022-08-03 ] + + + + + + +
+ +
+ + +
+
+

Botspam Apocalypse [ 2022-08-03 ]

+
+Bots are absolutely crippling the Internet ecosystem.
+
+The "future" in the film Terminator 2 is set in the 2020s. If you apply its predictions to the running of a website, it's honestly very accurate.
+
+Modern bot traffic is virtually indistinguishable from human traffic, and can pummel any self-hosted service into the ground, flood any form with comment spam, and is a chronic headache for almost any small scale web service operator.
+
+They're a major part in killing off web forums, and a significant wet blanket on any sort of fun internet creativity or experimentation.
+
+The only ones that can survive the robot apocalypse are large web services. Your reddits, and facebooks, and twitters, and SaaS-comment fields, and discords. They have the economies of scale to develop viable countermeasures, to hire teams of people to work on the problem full time and maybe at least keep up with the ever evolving bots.
+
+The rest are forced to build web services with no interactivity, or seek shelter behind something like Cloudflare, which discriminates against specific browser configurations and uses IP reputation to selectively filter traffic.
+
+If Marginalia Search didn't use Cloudflare, it couldn't serve traffic. There has been upwards of 15 queries per second from bots. There is just no way to deal with that sort of traffic, barely even to reject it. The search engine is hosted on residential broadband, it's hosted on a souped up PC.
+
+I can't afford to operate a datacenter to cater to traffic that isn't even human. This spam traffic is all from botnets with IPs all over the world. Tens, maybe hundreds of thousands of IPs, each with a relatively modest query rate, so rate limiting accomplishes bupkis.
+
+The only option is to route all search traffic through this sketchy third party service. It sucks in a wider sense because it makes the Internet worse, it drives further centralization of any sort of service that offers communication or interactivity, it turns us all into renters rather than owners of our presence on the web. That is the exact opposite of what we need.
+
+The other option would be to require a log-in from users, which, besides being inconvenient, has another problem: I don't want to know who is using the search engine, but if I don't know who is using the search engine, I can't know who is abusing the search engine.
+
+Cloudflare is the *lesser* evil in this case. It's not fair, but it at least allows the service to stay open and serve traffic in a way that at least doesn't inconvenience all human visitors all the time.
+
+The API gateway is another stab at this: you get to choose between a public API with a common rate limit, or revealing your identity with an API key (and sacrificing anonymity).
+
+The other alternatives all suck to the extent of my knowledge, they're either prohibitively convoluted, or web3 cryptocurrency micro-transaction nonsense that while sure it would work, also monetizes every single interaction in a way that is more dystopian than the actual skull-crushing robot apocalypse.
+
+If anyone could go ahead and find a solution to this mess, that would be great, because it's absolutely suffocating the internet, and it's painful to think about all the wonderful little projects that get cancelled or abandoned when faced with the reality of having to deal with such an egregiously hostile digital ecosystem.
+
+

See Also

+
+/log/29-botnet-ddos.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/62-marginaliacoin.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/62-marginaliacoin.gmi new file mode 100644 index 00000000..61885830 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/62-marginaliacoin.gmi @@ -0,0 +1,93 @@ + + + + + MEMEX - Marginaliacoin, and hidden forums [ 2022-08-18 ] + + + + + + +
+ +
+ + +
+
+

Marginaliacoin, and hidden forums [ 2022-08-18 ]

+
+I discovered someone has made a cryptocurrency called "Memex Marginalia Inu". It appears to have been created February 23, which is around when the entry "I Have No Capslock And I Must Scream" went absurdly viral to the point where Elon Musk tweeted a link to it.
+
+ +
+Mr Musk's twitter orbit is exceptionally strange. The tweet was followed by a deluge of bizarre activity, strange emails with calls about stonk canine lunar expeditions, and apparently also a cryptocurrency land-grab of sorts. I can't claim to understand why, but many of the emails I got after the tweet were on the theme "what does this mean?", almost as though Elon's tweet was some sort of prophetic omen.
+
+Needless to say, I do not endorse this. As I will chant like a gregorian monk every time I encounter anything related to cryptocurrencies: Crypto Is A Ponzi Scheme.
+
+
+   ***
+
+The forums are hiding. At least, that's how it appears. I've started a new batch of website-crawling, and made some changes to bring more forums into the search engine.
+
+I like forums and wish to help them get more visibility, what few are still alive. In some sense it will also serve as a sort of inventory.
+
+I did discover that many forums, even though they run traditional forum software (like phpBB or vBulletin), appear to masquerade as static HTML or WordPress sites in their URL paths, using rewrite rules. Very curious.
+
+It could be to help with Google's search ranking, or maybe it's to hide from bot spam.
+
+It looks like it will take maybe 14 days to complete the crawl. Operating a search engine on a small network budget is something that requires patience.
+
+

Topic

+
+/topic/astrolabe.gmi
+
+

See Also

+
+/log/61-botspam-apocalypse.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/63-marginalia-crawler.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/63-marginalia-crawler.gmi new file mode 100644 index 00000000..ae938dcb --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/63-marginalia-crawler.gmi @@ -0,0 +1,193 @@ + + + + + MEMEX - The Evolution of Marginalia's crawling [ 2022-08-23 ] + + + + + + +
+ +
+ + +
+
+

The Evolution of Marginalia's crawling [ 2022-08-23 ]

+
+In the primordial days of Marginalia Search, it used a dynamic approach to crawling the Internet.
+
+It ran a number of crawler threads, 32 or 64 or some such, that fetched jobs from a director service, which grabbed them straight out of the URL database. These jobs were batches of 100 or so documents that needed to be crawled.
+
+Crawling was not planned ahead of time; rather, a combination of how much of a website had been visited and the quality score of that website determined where to go next. It also promoted crawling websites adjacent to high quality websites.
+
+Tweaking this process to get a good mix of depth and breadth was pretty tricky, but for a moment, this approach worked very well.
+
+Initially the crawling was seeded with a few dozen URLs. This dynamic crawling approach allowed the process to bootstrap itself.
+
+The crawler did on-the-fly processing, that is, extraction of keywords and so on, and loading, that is, insertion into the URL database and search engine index. Downloaded websites were saved in a big shared tarball in the order they were retrieved across all threads.
+
+While great for bootstrapping, this approach doesn't scale. Eventually the crawler spent more time waiting for the database to offer up new crawl instructions than it did waiting for websites to load. It also demanded the database to have a lot more indices than otherwise necessary, making writes slower and the disk footprint bigger than necessary.
+
+The benefits were bigger when the search engine was small and starting up. At this point it already knows a fairly decent chunk of the Internet, at least the sort of websites that are interesting to crawl.
+
+Orchestration was also incredibly finicky, as is to be expected. A director process kept track of all the crawling; it knew which domains were being crawled to avoid multiple crawler threads attacking the same domain at the same time. State was distributed over the SQL database, the director, and the crawler.
+
+This was a multi-process multi-threaded application with a rapidly mutating shared state, with a patchwork of caches to make it perform well. There's good reason we usually avoid those types of designs when possible.
+
+I knew this as I designed the thing, but at the time, the search engine was just a hobby project with no users and in such a context it's fun and educational to try cursed design patterns.
+
+At best it worked decently well. Better than you'd expect for a design that is inevitably going to have more mystery race conditions than that time the Scooby Doo-gang went to visit the Nürburgring.
+
+I would never quite figure out why it seemed to sometimes re-crawl some jobs.
+
+Just as big a problem was that the crawler did everything all at once, which made debugging very difficult. The idea of archiving the downloaded HTML was good, but the execution was lacking, since it was all in huge tarballs and out of order.
+
+Tarballs do not allow random access, so retrieving the HTML code for a specific address to investigate how it interacted with the code could take several minutes as the system had to comb through hundreds of gigabytes of data to find it. (I store these archives on a cheap 5k RPM NAS drive).
+
+This design was scrapped for something more robust and scalable.
+
+

Batch Crawling

+
+First, a crawl plan is created; this is essentially a compressed file where each line is a JSON entry containing an ID, a domain name, and a list of URLs to crawl. This is specified ahead of time rather than on-the-fly like before.
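+
+To illustrate the shape of such a specification file, reading it might look something like this (hypothetical class and field names, with gzip standing in for whatever compression is actually used):
+
+  import com.google.gson.Gson;
+  import java.io.BufferedReader;
+  import java.io.FileInputStream;
+  import java.io.IOException;
+  import java.io.InputStreamReader;
+  import java.util.List;
+  import java.util.function.Consumer;
+  import java.util.zip.GZIPInputStream;
+
+  class CrawlSpecEntry {       // one line in the plan
+      String id;
+      String domain;
+      List<String> urls;
+  }
+
+  class CrawlPlanReader {
+      static void forEachEntry(String planFile, Consumer<CrawlSpecEntry> task) throws IOException {
+          Gson gson = new Gson();
+          try (var reader = new BufferedReader(new InputStreamReader(
+                  new GZIPInputStream(new FileInputStream(planFile))))) {
+              for (String line; (line = reader.readLine()) != null; ) {
+                  task.accept(gson.fromJson(line, CrawlSpecEntry.class));
+              }
+          }
+      }
+  }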
+
+The IDs are randomized, and used to determine order of crawling. This shuffles the order of domains, and reduces the likelihood of the crawler visiting the same backing server under different domain names even for servers with many subdomains (like sourceforge or neocities).
+
+The process is broken into three sequential steps, all mediated by compressed JSON. Crawling, Processing, Loading.
+
+Schematically:
+
+
+    //====================\\
+    || Compressed JSON:   ||  Specifications
+    || ID, Domain, Urls[] ||  File
+    || ID, Domain, Urls[] ||
+    || ID, Domain, Urls[] ||
+    ||      ...           ||
+    \\====================//
+          |
+    +-----------+  
+    |  CRAWLING |  Fetch each URL and 
+    |    STEP   |  output to file
+    +-----------+
+          |
+    //========================\\
+    ||  Compressed JSON:      || Crawl
+    ||  Status, HTML[], ...   || Files
+    ||  Status, HTML[], ...   ||
+    ||  Status, HTML[], ...   ||
+    ||     ...                ||
+    \\========================//
+          |
+    +------------+
+    | PROCESSING |  Analyze HTML and 
+    |    STEP    |  extract keywords 
+    +------------+  features, links, URLs
+          |
+    //==================\\
+    || Compressed JSON: ||  Processed
+    ||  URLs[]          ||  Files
+    ||  Domains[]       ||
+    ||  Links[]         ||  
+    ||  Keywords[]      ||
+    ||    ...           ||
+    ||  URLs[]          ||
+    ||  Domains[]       ||
+    ||  Links[]         ||    
+    ||  Keywords[]      ||
+    ||    ...           ||
+    \\==================//
+          |
+    +------------+
+    |  LOADING   | Insert URLs in DB
+    |    STEP    | Insert keywords in Index
+    +------------+    
+    
+
+The emphasis of this design is that each computational step is isolated and repeatable, and the intermediate data steps are portable and inspectable. It works as you would expect a networked application to work, except the "network traffic" is written as a record in a file in the filesystem and acted upon in a later and separate step.
+
+Each step in crawling and processing is resumable and idempotent. A journal file tracking what's confirmed to be finished is used to continue if the process is aborted.
+
+The first design lacked these aspects, which made developing new features quite miserable since it needed to be done either on small ad-hoc datasets, or live in production.
+
+It should be conceded that the new design would probably not have worked well for bootstrapping itself. The first design was a necessary intermediate step to obtain the data to move on to this one.
+
+The original crawler was also smarter in many ways, and since it did everything all at once was able to short-circuit crawling if it detected that it didn't find anything interesting at all. The new crawler is dumber and has much worse signal-to-noise ratio when it comes to what is actually downloaded.
+
+Compressed JSON isn't the fastest format for reading or writing, but it doesn't need to be either since the bottleneck is the network connection. As such the intermediate protocol can be optimized for what's most convenient for development.
+
+The crawler consists of a number of threads that each take a domain name and set of URLs and downloads them in sequence, while adding newly discovered URLs to the queue up until a crawl depth limit. These threads are mostly waiting, either for I/O or for the delay between crawls.
+
+This could be designed to reduce the number of threads, by rotating tasks among a smaller set of threads using some form of priority queue, but as it stands the network is the bottleneck so that's probably just complicating things for no reason at this point.
+
+A gotcha I recently ran into when attempting to scale up crawling was that by default, an astronomical number of sockets ended up stuck in TIME_WAIT, which is a sort of clean-up state the Linux kernel puts sockets in to avoid data loss. For 1024 parallel connections, I saw tens of thousands of sockets in this state. This filled up the conntrack-table of the router, packets were dropped, connections were refused. That's not great. Setting SO_LINGER with a low number reduced this to a more manageable 2-3x the number of connections.
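+
+For reference, a minimal sketch of how that is set on a Java socket (the exact value used is not specified above and is a judgment call):
+
+  java.net.Socket socket = new java.net.Socket();
+  // a linger value of zero closes the connection with a reset, skipping TIME_WAIT entirely
+  socket.setSoLinger(true, 0);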
+
+Part of the problem is that some of the network hardware is pretty low powered, far removed from anything enterprise grade, and it just doesn't deal with huge numbers of connections well. (If anyone knows good ways of tweaking Linux servers and OpenWRT routers to deal with this, please email them to me ;-)
+
+Another approach to widening the crawl is to speed it up. There's not much to do when there's a robots.txt specifying crawl-delay or if you get HTTP Status 429 requesting a slow-down, but for the rest, when there is no indication of how fast to crawl, instead of sleeping for a fixed 1-second crawl interval as has been the default, the crawler has been modified to mirror the behavior of the server. If it takes 400ms to serve the request and process the data received (including write it to disk), the crawler will wait 400ms to request again (but no less than 250 ms, and no more than 2500 ms). This way slow servers get a more lenient treatment, while faster servers that can handle more don't get unnecessary courtesy.
+
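+As a sketch, the delay logic described above amounts to something like the following (hypothetical method, not the crawler's actual code):
+
+    // wait roughly as long as the previous request took, clamped to [250 ms, 2500 ms]
+    long crawlDelayMs(long previousRequestTimeMs) {
+        return Math.max(250, Math.min(2500, previousRequestTimeMs));
+    }
+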
+There is also a point to not crawling too aggressively across sites, to reduce the number of DNS queries and avoid tripping bot detection algorithms.
+
+The big benefit of this crawling overhaul is the portable data; it is much easier to work with and develop against. It's much easier to inspect the behavior of the code and to find bugs. It's accelerated the development cycle of things like advertisement-detection. I built a filter for detecting which documents contain recipes, from idea to working feature in less than two hours. It would simply not have been possible without the portable crawl data that's now available. The design also allows testing the system with subsets of real production data.
+
+It is an improvement in almost every way and what's most exciting is the doors it opens for developing the rest of the search engine.
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/64-hundred-million.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/64-hundred-million.gmi new file mode 100644 index 00000000..1d094248 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/64-hundred-million.gmi @@ -0,0 +1,84 @@ + + + + + MEMEX - Marginalia's Index Reaches 100,000,000 Documents [ 2022-10-21 ] + + + + + + +
+ +
+ + +
+
+

Marginalia's Index Reaches 100,000,000 Documents [ 2022-10-21 ]

+
+A very brief note to announce reaching a long term goal and major milestone for marginalia search.
+
+The search engine now indexes 106,857,244 documents!
+
+The previous record was a bit south of seventy million. A hundred million has been a pie-in-the-sky goal for a very long time. It has seemed borderline impossible to index that many documents on a PC. Turns out it's not. It's more than possible.
+
+Twice this may even be technically doable, but it is well past the pain point of sheer logistics. It's already a real headache to deal with this much data.
+
+
  • The crawl takes two weeks.
  • Processing the crawl data to extract keywords and features takes several days.
  • Loading the processed data into the database takes another day.
  • Constructing the index takes another day.
+
+A hundred million is probably more than good enough.
+
+Focus should instead be on improving the quality of what is indexed, on making it better, faster, more relevant. Sadly it's not as easy to find vanity goals like hitting 100,000,000 in that area.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/65-scaling-doesnt-scale.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/65-scaling-doesnt-scale.gmi new file mode 100644 index 00000000..2c5c9c6a --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/65-scaling-doesnt-scale.gmi @@ -0,0 +1,126 @@ + + + + + MEMEX - Scaling doesn't scale [ 2022-10-25 ] + + + + + + +
+ +
+ + +
+
+

Scaling doesn't scale [ 2022-10-25 ]

+
+By which I mean there are deeply problematic assumptions in the very notion of scaling: Scaling changes the rules, and scaling problems exist in both directions. If what you are doing effortlessly scales up, it almost always means it's egregiously sub-optimal given your present needs.
+
+These assertions are all very abstract. I'll illustrate with several examples, to try and build an intuition for scaling. You most likely already know what I'm saying is true, but you may need reminding that this is how it works.
+
+Let's look at nature first.
+
+Put a fly on a body of water, and it will stand on the water. Put a horse on a body of water, and assuming the water is deep enough it will sink to its neck and begin to swim ashore. The fly can walk on the ceiling, the horse can not. Horses can't even walk on vertical surfaces. Neither can cats, dogs, humans, really anything bigger than a small lizard.
+
+Why? Because if you make something bigger, its weight grows more quickly than its surface area. Things like surface tension and tensile strength scale with area, while weight scales with volume.
+
+If you are tasked with towing a heavy truck that has broken down somewhere, odds are you will pick a thick rope of modest length before you choose the extremely long string laying next to it. If you pick the string, odds are you will double it several times to increase its cross sectional area at the expense of its length. It is only increasing its cross section that makes it stronger, not its length.
+
+While the examples are from physics, the phenomenon is more fundamental than that. It affects nearly everything. For example, it also affects social relations.
+
+Consider a group of friends.
+
+
  • Alice and Bob are in a room; there is 1 social relation in the room. Alice knows Bob.
  • Eve enters the room; there are 3 social relations in the room. Alice knows Bob, Bob knows Eve, Eve knows Alice.
  • Steve comes along; there are now 6 social relations in the room. (AB, AE, AS, BE, BS, ES).
  • Finally James kramers into the room; there are 10 social relations in the room. (AB, AE, AS, AJ, BE, BS, BJ, ES, EJ, SJ)
+
+If you double the number of members of a social setting, you roughly quadruple the number of potential interpersonal relations. In practice, it's even worse because relationships may involve more than two actors. Bob may be jealous of James and Alice who are in a romantic relationship and holy crap is Eve secretly sleeping with James too! Alice is going to be so mad at James and Eve and thankful to Bob. Steve will scratch his head at the soap opera plot among the other four.
+
+The formula for the number of 2-person relationships is n x (n-1)/2, or n choose 2, which brings us to combinatorics and probabilities.
+
+Let's say you found a great deal on extremely unreliable hardware online, and now have a warehouse full of computers that only give the correct response 50% of the time.
+
+You want to make use of these computers. You decide to set up a cluster to reduce the error rate, use three computers that will use a consensus algorithm to vote on the response.
+
+With three nodes, the probability of at least a single failure is 87.5%, at least double failure is 50%, and triple failure is 12.5%.
+
+Wait, adding computers seems to have made no difference! The odds of a double error or more are the same as using a single computer!
+
+What if we use 5 computers? The probability of seeing at least a single failure is 96.9%, at least double failure is 81.2%, triple is 50%, quadruple is 18.7%, quintuple failure is 3.1%. Three is a majority, but the probability of failure is still 50%.
+
+It turns out if your error rate is greater than or equal to 50%, then no matter how many computers you add into the voting pool, it just doesn't improve the situation at all. In this scenario, scaling does nothing.
+
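+The arithmetic is easy to check with a short throwaway function (a hypothetical helper, not from any codebase):
+
+    // probability that a majority of n voters, each wrong with probability p, is wrong
+    static double majorityFailure(int n, double p) {
+        double sum = 0;
+        for (int k = n / 2 + 1; k <= n; k++) {
+            sum += choose(n, k) * Math.pow(p, k) * Math.pow(1 - p, n - k);
+        }
+        return sum;
+    }
+
+    static double choose(int n, int k) {
+        double c = 1;
+        for (int i = 1; i <= k; i++) c = c * (n - i + 1) / i;
+        return c;
+    }
+
+For p = 0.5 this returns 0.5 for both n = 3 and n = 5, matching the figures above.
+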
+If you think about it, it's not that strange. A 50% failure rate is a pretty pathological case, but it does fly in the face of the notion that redundancy improves fault tolerance. It usually does but often not as much as you would think.
+
+On a single machine, RAID and error correcting RAM are typically not necessary because the probability of failure is extremely low. In a data center, you are a fool if you're not doing both. The probability of a single drive failing among tens of thousands is staggering. You're replacing them daily. ECC is a must, because cosmic rays flipping bits is a real problem on this scale.
+
+A bigger cluster also has far more realistic probabilities of otherwise nearly impossible multiple simultaneous faults, creating a need for yet bigger clusters to compensate.
+
+You may be tempted to think that because a data center requires double redundancy, RAID-1, and ECC RAM, your dinky little single-server operation needs them as well; that this is somehow the serious way of running a professional server.
+
+The reality is that if you go that route, you're more than likely paying 10-15 times more for a solution to a problem that is indeed a huge headache for a data center but virtually unheard of on a smaller scale.
+
+This absolutely goes for software architecture and development methodology as well. A very common mistake software companies make is in their eagerness to grow emulating the methodology of a much bigger company.
+
+Google is doing this! We should do this too! Netflix is doing that. We should do that too!
+
+Most likely, no, you shouldn't. Netflix has problems that are Netflix-sized. If you aren't Netflix-sized, then you have smaller problems, with smaller solutions.
+
+You may object that these small-scale solutions don't scale up, but the point of this whole essay is that while indeed they don't, scaling problems exist in both directions. Google's solutions don't scale down. If you copy their homework, you're disastrously hobbling yourself at the very time when you could be running circles around such lumbering giants.
+
+It's not only OK to be small, it's advantageous.
+
+You can do incredible magical things if you actually lean into it and make use of the fact that you play by completely different rules than the big guys. You can walk on water, they can not.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/66-carbon-dating.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/66-carbon-dating.gmi new file mode 100644 index 00000000..eac64ed4 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/66-carbon-dating.gmi @@ -0,0 +1,122 @@ + + + + + MEMEX - Carbon Dating HTML [ 2022-10-27 ] + + + + + + +
+ +
+ + +
+
+

Carbon Dating HTML [ 2022-10-27 ]

+
+One of the more common feature requests I've gotten for Marginalia Search is the ability to search by date. I've been a bit reluctant because this has the smell of a surprisingly hard problem. Or rather, a surprisingly large number of easy problems.
+
+The initial hurdle we'll encounter is that among structured data, pubDate is available in RDFa, OpenGraph, JSON+LD, and Microdata.
+
+A few examples:
+
+<meta property="datePublished" content="2022-08-24" />
+<meta itemprop="datePublished" content="2022-08-24" />
+<meta property="article:published_time" content="2022-08-24T14:39:14Z" />
+<script type="application/ld+json">
+{"datePublished":"2022-08-24T14:39:14Z"}
+</script>
+
+So far, not that bad. This is at least a case where the website tells you that here is the pub-date; the exact format of the date may vary, but this is solvable.
+
+HTML5 also introduces a <time> tag, which is sometimes useful.
+
+
+<time pubdate="pubdate" datetime="2022-08-24T14:39:14" />
+<time itemprop="datePublished" datetime="2022-08-24T14:39:14">August 24 2022</time>
+<time datetime="2022-08-24T14:39:14">August 24 2022</time>
+
+The last one may or may not be the timestamp we're looking for, but maybe it is in the right ballpark anyway.
+
+Thus we've taken a first step into the realm of dubious heuristics. Sometimes the URL path contains the year a document was created, typically of the form
+
+
+https://www.example.com/2022/04/why-im-so-great/
+
+Of course /four digits/ may just be some numbers as well. It's not possible to be quite sure, but usually it's right. We can clamp the year to [1989,current year+1] and reduce the false positives.
+
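+A hedged sketch of that heuristic, with assumed names (using java.util.regex and java.time from the standard library):
+
+    // look for a path segment that is plausibly a year, clamped to [1989, current year + 1]
+    OptionalInt yearFromUrlPath(String path) {
+        Matcher m = Pattern.compile("/((?:19|20)\\d{2})/").matcher(path);
+        int maxYear = LocalDate.now().getYear() + 1;
+
+        while (m.find()) {
+            int year = Integer.parseInt(m.group(1));
+            if (year >= 1989 && year <= maxYear)
+                return OptionalInt.of(year);
+        }
+        return OptionalInt.empty();
+    }
+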
+The HTTP header 'last-modified:' (or Last-Modified) may also provide a hint. It may also be the last time the file was copied on disk. Or complete nonsense. It's also probably an RFC-1123 date.
+
+Alright, this will provide a date for about a quarter of the websites. For the rest, more than likely, none of these things work. Well, to really grasp at straws, we can look for bylines and similar in the DOM using common class names.
+
+It's not really computationally feasible to look at *all* the elements, but these classes usually contain publish or change-dates:
+
+
+.entry-meta
+.byline
+.author
+.submitted
+.footer-info-lastmod
+
+We can also look for text nodes with strings like "Copyright", "Published", "(c)", and so forth.
+
+Although copyright notices aren't great. You often see stuff like "(c) Bob Smith 1997-2017". How to narrow it down? Well, we can just split the difference and say 2007, and we would probably be closer to the truth than if we went with 1997 or 2017, but we can actually guess better than that.
+
+By looking at the HTML standard, we can make a coarse guess about roughly which decade a website belongs to. New HTML3 is very rare in 2022; HTML5 is impossible in 1995. HTML4 and XHTML are typically indicative of 1999-2014.
+
+So from "(c) Bob Smith 1997-2017", and HTML3 we can take the average of 1997 and 2017, which is 2007, and make an educated guess from the HTML standard, say 1997, average those and arrive at 2002 and then clamp it to 1997-2017 and arrive at an educated guess that the website content from 2002.
+
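+As a sketch, with hypothetical names, the blended guess works out to:
+
+    // midpoint of the copyright range, averaged with the HTML-standard guess,
+    // then clamped back into the copyright range
+    int blendedYearEstimate(int copyrightFrom, int copyrightTo, int htmlStandardGuess) {
+        int copyrightMidpoint = (copyrightFrom + copyrightTo) / 2;
+        int blended = (copyrightMidpoint + htmlStandardGuess) / 2;
+        return Math.max(copyrightFrom, Math.min(copyrightTo, blended));
+    }
+
+With (1997, 2017, 1997) this gives 2002, as in the example above.
+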
+In all honesty, I have no good argument why this should work; in fact, averaging averages is rarely a good idea, but in this case it does give very plausible estimates. In general, this heuristic is mostly necessary when dealing with older web pages, which often don't strictly have a well defined publishing date.
+
+Finally, in keeping with the 30 year old Internet tradition, my own website flagrantly disregards the part of the HTML5 standard that says <articles> must have a <time pubdate>... but is correctly dated using Last-Modified.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/67-best-ideas-afk.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/67-best-ideas-afk.gmi new file mode 100644 index 00000000..b317017b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/67-best-ideas-afk.gmi @@ -0,0 +1,89 @@ + + + + + MEMEX - The best ideas come AFK [ 2022-11-07 ] + + + + + + +
+ +
+ + +
+
+

The best ideas come AFK [ 2022-11-07 ]

+
+I get my best ideas when I'm not working.
+
+This seems paradoxical, but past a point, the more I work on a project the slower it seems to go. I'll find changes to do, but lose any sort of vision.
+
+If I'm not programming at all, I rarely get good ideas either.
+
+There appears to be some magic stoichiometric mixture where I work on a project for a while, then force myself to take a break somewhere far away from any keyboard for a day or two, and the ideas start to roll in at a pace where I can barely keep up writing them down.
+
+It seems important that the break from work is a bit boring. Sometimes it takes a day or two to kick in. Too much entertainment appears to ruin the process.
+
+When it works I'm a hazard in traffic and I'll struggle to keep up with a conversation, because my mind is so preoccupied with algorithms and designs I barely notice my surroundings.
+
+[ I'm open to the fact this may also just be some weird brain damage from coding since I was 7... Is it normal to be able to move your visual focus -- eyes stationary -- like a mouse cursor, like even draw selection boxes and vividly imagine context menus ? ]
+
+In many cases I'll come back and implement half a dozen new features in a few hours. Typing the code is never the hard part, it was just a matter of finding the clarity, of realizing it could be done.
+
+The brain appears to do a lot of background work like this.
+
+It is as though you can load it with a problem by working on it actively for a while, but if you overfeed it with information by working too long, it just stalls. It needs a period of no new active inputs to actually arrive at useful results, so you get the aha in the shower, or while taking a walk.
+
+Given this is how the mind appears to work, it would be interesting to know how to best spawn a "background task" like this on command or via some procedure, how to best curate the inputs to produce the most beneficial outputs.
+
+

See Also

+
+/log/05-minds-field.gmi
+/log/42-dark.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/68-wizards-vs-sorcerers.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/68-wizards-vs-sorcerers.gmi new file mode 100644 index 00000000..307438ee --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/68-wizards-vs-sorcerers.gmi @@ -0,0 +1,94 @@ + + + + + MEMEX - On Wizards and Sorcerers [ 2022-12-23 ] + + + + + + +
+ +
+ + +
+
+

On Wizards and Sorcerers [ 2022-12-23 ]

+
+While this post is about programming, it also draws an extended analogy to dungeons and dragons, specifically two of its classes, that correspond to two attitudes toward programming.
+
+In D&D, wizards study magic. They prepare their magic spells ahead of time. While they may learn a large number of magic spells, they need to prepare them ahead of time and can't just cast them at will.
+
+Wizard programmers prefer up-front design. They apply reason and logic to divide and conquer a large problem, they rely on building blocks like design patterns and algorithms. Wizards rely on explicit knowledge.
+
+D&D sorcerers have an innate connection to the magic. They wield tremendous forces that they sometimes don't quite understand, it's wild and unpredictable. It's not something they've learned to do, but something they've discovered in themselves, a talent.
+
+Sorcerer programmers prefer bottom-up design. The process is creative and chaotic, it's the molding of clay rather than arrangement of bricks. Development relies on building pieces of the software, not yet clear how they will fit together, meanwhile forming an ever deeper intuitive understanding of the problem. Their software is grown, not designed. Sorcerers rely on tacit knowledge.
+
+In practice this is a spectrum. While most developers will probably lean more toward one side of the practice, many will utilize both methods to some extent.
+
+There is notably some degree of misunderstanding between developers on the far ends of the spectrum.
+
+Strong wizards consider sorcerers to be recklessly sloppy and undisciplined. They will often fail to appreciate the depth of the sorcerer's intuitive understanding. They think sorcery looks like the undisciplined programming of a complete beginner; it is only through uncanny luck that they ever get anything done at all, let alone on time.
+
+Strong sorcerers in turn find wizards overbearing and bureaucratic. They don't understand the wizard's need for rules and documents. They feel it is as though the wizards never dare remove their training wheels. They find the wizards' solutions bloated and clunky, and the entire development process unbearably slow and tedious.
+
+In practice, both of these methods work, although they have their weak and strong points.
+
+For well understood problem domains, wizards are far more reliable in crafting something that works, since the entire wizard's process relies on translating explicit knowledge into a software design.
+
+For poorly understood problem domains, sorcerers will often outperform wizards by a fairly significant margin through their ability to quickly build an intuitive understanding of any problem; the act of producing the code is the same as the act of understanding the problem.
+
+Arguably, there's an aspect of personality and talent as well. A strong sorcerer may never be more than a mediocre wizard, no matter how hard they try to apply the same tools; and the opposite may be even more true.
+
+That said, there is probably something to be gained for those on the extreme ends of the spectrum from attempting to at least understand how the other side works, to dabble a bit in the other method. Even though you probably won't be able to master both classes, a strong sorcerer with a modicum of the wizard's discipline, or a strong wizard with just some of the sorcerer's intuition, is a force to be reckoned with indeed.
+
+

Topic

+
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/69-creepy-website-similarity.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/69-creepy-website-similarity.gmi new file mode 100644 index 00000000..7e717658 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/69-creepy-website-similarity.gmi @@ -0,0 +1,169 @@ + + + + + MEMEX - Creepy Website Similarity [ 2022-12-26 ] + + + + + + +
+ +
+ + +
+
+

Creepy Website Similarity [ 2022-12-26 ]

+
+This is a write-up about an experiment from a few months ago, in how to find websites that are similar to each other. Website similarity is useful for many things, including discovering new websites to crawl, as well as suggesting similar websites in the Marginalia Search random exploration mode.
+
+ +
+The approach chosen was to use the link graph to look for websites that are linked to from the same websites. This turned out to work remarkably well.
+
+There are some alternative feature spaces that might have been used, such as TF-IDF data. Using incident links turned out to be surprisingly powerful, almost to an uncanny degree as it's able to find similarities even among websites that Marginalia doesn't index.
+
+As a whole the feature shares a lot of similarity with how you would construct a recommendation algorithm of the type "other shoppers also bought", and in doing so also exposes how creepy they can be. You can't build a recommendation engine without building a tool for profiling. It's largely the same thing.
+
+If you for example point the website explorer to the fringes of politics, it will map that web-space with terrifying accuracy.
+
+ +
+Note again how few of those websites are actually indexed by Marginalia. Only those websites with 'MS' links are! The rest are inferred from the data. On the one hand it's fascinating and cool, on the other it's deeply troubling: If I can create such a map on a PC in my living room, imagine what might be accomplished with a datacenter.
+
+You might think "Well what's the problem? QAnon deserves all the scrutiny, give them nowhere to hide!". Except this sort of tool could concievably work just as well as well for mapping democracy advocates in Hong Kong, Putin-critics in Russia, gay people in Uganda, and so forth.
+
+

Implementation details

+
+In practice, cosine similarity is used to compare the similarity between websites. This is a statistical method perhaps most commonly used in machine learning, but it has other uses as well.
+
+Cosine similarity is calculated by taking the inner product of two vectors and dividing by their norms
+
+
+       a · b
+  p = ---------
+      |a| |b|
+
+As you might remember from linear algebra, this is a measure of how much two vectors "pull in the same direction". The cosine similarity of two identical vectors is unity, and for orthogonal vectors it is zero.
+
+This data has extremely high dimensionality: the vector space consists of nearly 10 million domains, so most "standard" tools like numpy/scipy will not load the data without serious massaging. That juice doesn't appear to be worth the squeeze when it's just as easy to roll what you need on your own (which you'd probably need to do regardless to get it into those tools, random projection or some such).
+
+Since the vectors in question are just bitmaps, either a website has a link or it does not, the vector product can be simplified to a logical AND operation. The first stab at the problem was to use RoaringBitmaps.
+
+
+    // cosine similarity for bit vectors: |a AND b| / (sqrt(|a|) * sqrt(|b|))
+    double cosineSimilarity(RoaringBitmap a, RoaringBitmap b) {
+        double andCardinality = RoaringBitmap.andCardinality(a, b);
+        andCardinality /= Math.sqrt(a.getCardinality());
+        andCardinality /= Math.sqrt(b.getCardinality());
+        return andCardinality;
+    }
+
+
+This works but it's just a bit too slow to be practical. Sacrificing some memory for speed turns out to be necessary. Roaring Bitmaps are memory efficient, but a general purpose library. It's easy to create a drop-in replacement that implements only andCardinality() and getCardinality() in a way that caters to the specifics of the data.
+
+A simple 64 bit bloom filter makes it possible to short-circuit a lot of the calculations since many vectors are small and trivially don't overlap. The vector data is stored in sorted lists. Comparing sorted lists is very cache friendly and fast, while using relatively little memory. Storing a dense matrix would require RAM on the order of hundreds of terabytes so that's no good.
+
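+One plausible way to build such a filter (an assumption; the real implementation may differ) is to fold each element into a single 64 bit word:
+
+    // set the bit corresponding to each value modulo 64; two sets whose filters
+    // share no bits cannot have any elements in common
+    long bloomFilter(int[] sortedValues) {
+        long hash = 0;
+        for (int v : sortedValues) {
+            hash |= 1L << (v & 63);
+        }
+        return hash;
+    }
+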
+The actual code has been rewritten for brevity; as a sketch, the and-cardinality calculation looks like this, and it performs about 5-20x faster than RoaringBitmaps for this specific use case:
+
+
+
+    int andCardinality(AndCardIntSet a, AndCardIntSet b) {
+
+        // 64 bit bloom filter check: if no bits overlap, the sets can't intersect
+        if ((a.hash & b.hash) == 0) {
+            return 0;
+        }
+
+        // merge-style intersection of the two sorted backing lists
+        int i = 0, j = 0;
+        int card = 0;
+
+        do {
+            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
+
+            if (diff < 0) i++;
+            else if (diff > 0) j++;
+            else {
+                i++;
+                j++;
+                card++;
+            }
+        } while (i < a.getCardinality() && j < b.getCardinality());
+
+        return card;
+
+     }
+
+
+This calculates similarities between websites at a rate where it's feasible to pre-calculate the similarities between all known websites within a couple of days. It's on the cusp of being viable to offer ad-hoc calculations, but not quite without being a denial-of-service hazard.
+
+To do this in real time, the search space could be reduced using some form of locality-sensitive hash scheme, although for a proof of concept this performs well enough on its own.
+
+

Closing thoughts

+
+This has been online for a while and I've been debating whether to do this write-up. To be honest this is probably the creepiest piece of software I've built.
+
+At the same time, I can't imagine I'm the first to conceive of doing this. To repeat, you almost can't build a suggestions engine without this type of side-effect, and recommendations are *everywhere* these days. They are on Spotify, Youtube, Facebook, Reddit, Twitter, Amazon, Netflix, Google, even small web shops have them.
+
+In that light, it's better to make the discovery public and highlight its potential so that it might serve as an example of how and why these recommendation algorithms are profoundly creepy.
+
+

Topic

+
+/topic/astrolabe.gmi
+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/70-faster-index-joins.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/70-faster-index-joins.gmi new file mode 100644 index 00000000..fd01fdc1 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/70-faster-index-joins.gmi @@ -0,0 +1,251 @@ + + + + + MEMEX - Faster Index Joins [ 2023-01-03 ] + + + + + + +
+ +
+ + +
+
+

Faster Index Joins [ 2023-01-03 ]

+
+The most common (and most costly) operation of the marginalia search engine's index is something like: given a set of documents containing one keyword, find the documents that also contain another keyword.
+
+The naive approach is to just iterate over each document identifier in the first set and do a membership test in the b-tree containing the second. This is an O(m log n)-operation, which on paper is pretty fast.
+
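+As a point of reference, a sketch of that naive approach (the names are assumptions, not the actual index code):
+
+    // keep only the documents that also appear in the other keyword's b-tree
+    int retainNaive(long[] docIds, int count, BTreeReader other) {
+        int kept = 0;
+        for (int i = 0; i < count; i++) {
+            if (other.contains(docIds[i])) {   // one O(log n) tree lookup per document
+                docIds[kept++] = docIds[i];
+            }
+        }
+        return kept;
+    }
+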
+It turns out it can be made faster.
+
+A property of the original problem is that you can actually recycle a lot of the calculations when the first set of identifiers is stored in a sorted list. Luckily the data in this case is already sorted by the b-tree structure from which it is retrieved.
+
+When you look up a document in the b-tree, you're provided with the offset in the data block as well as knowledge of an upper bound for the data in that block.
+
+This makes it possible to determine whether the next document in the list also falls within the range of the same block, and if so, a linear search can be performed. This can be repeated until it's known that no additional documents exist in the data block.
+
+The linear search replaces comparatively costly repeated b-tree traversals, and the search is guaranteed to terminate immediately whenever the next item does not belong in the current data block.
+
+The best case performance of this operation is linear, O(m), although this requires a very particular and unrealistic arrangement of the data[1]. The average and worst case is O(m log n), with a constant factor that is strictly less than or equal to the naive algorithm's.
+
+It's extremely hard to provide a good benchmark for how much faster this code is. This type of operation is highly resistant to benchmarking in a meaningful way due to the many layers of caches. Properties of the data may also affect the result. However, attempting to isolate or remove such effects is also questionable, as the effects are present in any real-world application.
+
+With caveats underway, using real world data the new algorithm was found to be between 2x-12x faster than the naive algorithm.
+
+The magnitude of this performance improvement came as a bit of a surprise. It would be expected that it might be faster, as linear searching in many ways rubs the modern CPU the right way while binary searching in many ways does not. Still, an order-of-magnitude performance improvement is so remarkable I had to go back and double-check that the code is actually doing what it is supposed to do. It does.
+
+Part of why it's so fast is that each tree traversal incurs 3-5 binary searches, each across 4096 bytes of data. Trading that for a single linear search that often stays within the same cache line is a beneficial trade-off indeed. Adding to this, it also saves a lot of index offset calculations, which, while just arithmetic, do add up when performing them tens of thousands of times as is expected during a query. The fastest calculations are those never performed, and this does remove quite a lot of calculations.
+
+A contributing factor in the speed-up is, as alluded to previously, that real-world search engine data is not random but rather heavily prone to runs where adjacent document identifiers often belong to the same domain, which in turn tends to contain similar keywords. That said, a speed-up was observed even on poorly correlated synthetic data, albeit a much smaller one.
+
+[1] For a query where a maximal number of items in the query buffer can be retained or rejected for each data block, the computational work is completely dominated by the linear search. It technically still is O(m + m/B log n), but the block size B will eclipse log n for any and all realistic workloads.
+
+

Code listing

+
+This code attempts to walk down the b-tree as long as there is data in the buffer.
+
+
+public void retainEntries(LongQueryBuffer buffer) {
+    for (BTreePointer pointer = new BTreePointer(header); 
+         buffer.hasMore(); 
+         pointer.resetToRoot()) 
+     {
+        long val = buffer.currentValue();
+        if (!pointer.walkToData(val)) {
+            buffer.rejectAndAdvance();
+        }
+        else {
+            pointer.retainData(buffer);
+        }
+    }
+}
+
+Excerpt from BTreePointer, which encapsulates parameters related to offset calculations (which are omitted for the benefit of the readers' sanity):
+
+
+
+long boundary = an upper bound for the data in the current data block;
+    
+public void retainData(LongQueryBuffer buffer) {
+
+    long dataOffset = findData(buffer.currentValue());
+    
+    if (dataOffset >= 0) {
+        buffer.retainAndAdvance();
+	
+        long searchEnd = ... // omitting distracting offset calculations
+	
+        if (buffer.currentValue() <= boundary) {
+            data.range(dataOffset, searchEnd).retain(buffer, boundary);
+        }
+   }
+   else {
+        buffer.rejectAndAdvance();
+
+        long searchStart = ...
+        long searchEnd = ... 
+	
+        if (buffer.currentValue() <= boundary) {
+            data.range(searchStart, searchEnd).retain(buffer, boundary);
+        }
+   }
+
+}
+
+long findData(long value) {
+        update boundary and return the data layer offset for "value", 
+	       or a negative value if none is found
+}
+
+
+
+Listing of LongArray$retain as referenced in retainData() above:
+
+
+    long get(long offset) {
+      return data from the block at "offset"
+    }
+    
+    void retain(LongQueryBuffer buffer, long boundary, long searchStart, long searchEnd) {
+
+        if (searchStart >= searchEnd) return;
+
+        long bv = buffer.currentValue();
+        long av = get(searchStart);
+        long pos = searchStart;
+
+        while (bv <= boundary && buffer.hasMore()) {
+            if (bv < av) {
+                if (!buffer.rejectAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            else if (bv == av) {
+                if (!buffer.retainAndAdvance()) break;
+                bv = buffer.currentValue();
+                continue;
+            }
+            // when (bv > av) we keep scanning through the block
+
+            if (++pos < searchEnd) {
+                av = get(pos);
+            }
+            else {
+                break;
+            }
+        }
+    }
+    
+    // ...
+
+Note the invariant above: bv <= boundary ensures that the code only keeps searching as long as it is known that one or more of the buffer's values can still be retained or rejected.
+
+Although the names are fairly self-explanatory, for the sake of clarity, the query buffer structure operates with two pointers and is outlined below:
+
+
+public class LongQueryBuffer {
+
+    public final long[] data;
+    public int end;
+
+    private int read = 0;
+    private int write = 0;
+    
+    public LongQueryBuffer(long[] data, int size) {
+        this.data = data;
+        this.end = size;
+    }
+    
+    public long currentValue() {
+        return data[read];
+    }
+
+    public boolean rejectAndAdvance() {
+        return ++read < end; // true when more data can be read
+    }
+
+    public boolean retainAndAdvance() {
+        if (read != write) {
+            long tmp = data[write];
+            data[write] = data[read];
+            data[read] = tmp;
+        }
+
+        write++;
+
+        return ++read < end; // true when more data can be read
+    }
+    
+    public boolean hasMore() {
+        return read < end;
+    }
+    
+    public void finalizeFiltering() {
+        end = write;
+        read = 0;
+        write = 0;
+    }
+
+}
+
+

See Also

+
+/log/54-bargain-bin-btree.gmi
+
+

Topic

+
+/topic/programming.gmi
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/71-memex-design.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/71-memex-design.gmi new file mode 100644 index 00000000..f701427d --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/71-memex-design.gmi @@ -0,0 +1,153 @@ + + + + + MEMEX - Memex Design [ 2023-01-13 ] + + + + + + +
+ +
+ + +
+
+

Memex Design [ 2023-01-13 ]

+
+For clarification, the only thing called Memex discussed here is memex.marginalia.nu, the website you're probably visiting right now. That, or you're reading this over gemini at marginalia.nu, which is serving the same content over a different protocol.
+
+I wanted to build a cross-protocol static site generator designed in a way that is equally understandable by both humans and machines. This groundedness is an appealing property I really admire about the gemini protocol and gemtext format, and I want to explore whether it's possible to extend it to software in general.
+
+It will turn out that, by designing the system from basic principles, it's possible to extract unexpected emergent features from it, including tag-based categorization.
+
+

1. Filesystem

+
+The memex leans heavily into an explicit filesystem metaphor.
+
+This means that the system is based on entities called documents, and each document has a path that identifies it. This fact is not hidden, but rather front and center. Directory traversal is one of the primary ways to navigate the site.
+
+Filesystems are neat for organizing information in a way that makes sense both for humans and computers.
+
+On a software level, filesystems are the OG NoSQL database that everybody takes for granted and many don't seem to realize even is a database. So they put a database in their database for storing files within their files.
+
+Sometimes if you have millions of entries or want to do non-trivial queries that's absolutely merited, but for a small blog/wiki/digital garden thing, it's more like some tired old Xzibit meme from 2010 that serves to add complexity for no reason.
+
+Filesystems are also complemented well with git.
+
+Every time I create, edit, move, rename, or do anything else on the memex, a commit is pushed to a local repository. There are currently 853 commits in the log. This means I don't have to worry too much when making changes. They can always be reverted.
+
+Having everything backed by files is great for portability. I could throw away the entire memex rendering software; as long as I keep the data, it's trivial to render a website from it. This will be as true in twenty years as it is today.
+
+

2. Hypertext

+
+So we have a bunch of files in a filesystem. So far, so FTP.
+
+A drawback with a filesystem model is that it does require a modicum of organizational effort, and sometimes if you have extremely large filesystems it's easy to misplace files, but for a small system like this I don't think that is even a concern.
+
+To aid with this, and enable easier navigation between related documents, it's neat if documents could link to each other.
+
+Each document is made to be hypertext. Documents may refer to each other by their filesystem path. A link text may add context to the link. To prevent a gradual build-up of dead links, whenever a file is deleted or moved, a line is automatically added to the special files
+
+/special/tombstone.gmi
+
+/special/redirect.gmi
+
+These files are used to render a ghost of the (re)moved files allowing the possibility of adding context as to what has happened to the resource.
+
+To be clear, these /special/-files are not just a rendered representation of a magical hidden database somewhere. They are basic hypertext documents with links. Everything is a file. Everything about the system is encoded in hypertext. Documents are related with links. Link text encodes semantics about the relationship. A list of links with associated link texts is homomorphic to a k-v store. Everything is human readable. This is not just a presentation layer thing. It's turtles all the way down.
+
+

2.1 Backlinks are cool

+
+Unlike what you might expect from hypertext coming from the WWW, links are considered weakly bidirectional within the memex. That is, they reflect information not only about the source, but the destination. When rendering, the full link graph is constructed, and backlink information is added to the destination files as well.
+
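+Constructing the backlink map is essentially just an inversion of the outgoing-link map; a minimal sketch under assumed types (not the memex's actual code):
+
+    // for each document, collect the documents that link to it
+    Map<String, List<String>> backlinks(Map<String, List<String>> outgoingLinks) {
+        Map<String, List<String>> inverted = new HashMap<>();
+        outgoingLinks.forEach((source, destinations) -> {
+            for (String dest : destinations) {
+                inverted.computeIfAbsent(dest, k -> new ArrayList<>()).add(source);
+            }
+        });
+        return inverted;
+    }
+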
+Some of the consequences of doing this are unexpected. To illustrate the sort of things you can do with backlinks, the memex uses them to allow for topical categorization.
+
+For example, given the document you are presently reading is about to link to
+
+/topic/web-design.gmi
+
+it's categorized as a web-design-related document! What you see when you click that link is a list of backlinks.
+
+It would be tempting to abandon the filesystem metaphor and replace it entirely with just a graph of documents linking to each other, although that presents the problem of ... how the heck do you refer to other documents if they have no path? Like you can give them UUIDs or something like that, I suppose, but that system would only make sense to machines.
+
+The beauty (and explicit design goal) of the memex is that it's just as human readable as it is machine readable. You could print the raw data for the memex on paper and still be able to make sense of it.
+
+

3. The remaining owl

+
+The raw sources for the documents are written in a markup language that is a variant of gemtext. The extended gemtext is rendered to HTML (for https://memex.marginalia.nu/ ) and standards compliant gemtext (for gemini://marginalia.nu/ ) whenever a file is created or changed.
+
+To get a feel for the gemtext format, here is an overview:
+
+ +
+The memex uses some extensions in the form of rendering hints that are stripped away when serving the files over gemini. These may direct the renderer to for example generate an Atom feed for a particular directory, or to inline the file listing.
+
+If a directory contains an index.gmi file, it's served alongside the directory listing.
+
+Updating the memex is a bit slow (usually takes up to a second) since it essentially requires traversing the entire link graph and re-rendering multiple documents and then doing a git commit. The slowness ends there however. Since everything is statically generated, serving traffic is very fast and the server's survived some pretty ridiculous traffic spikes.
+
+This web design is admittedly unconventional, and I fully appreciate it is not immediately obvious how to navigate the website as a consequence. This is in part a deliberate effort to challenge web design conventions. I experience a degree of genre fatigue with the blog format, the wiki format, and so on.
+
+It feels like many of these conventions were instituted decades ago for reasons that are largely since forgotten, and nobody's bothered to challenge them since. What if they're no longer valid, those reasons? That's what the memex is about. To violently shake things and see if something comes loose.
+
+

See Also

+
+/log/43-pseodonymous.gmi
+/projects/memex.gmi
+
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-are-you-ok.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-are-you-ok.gmi new file mode 100644 index 00000000..d32f1880 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-are-you-ok.gmi @@ -0,0 +1,86 @@ + + + + + MEMEX - Are you ok? [ 2023-01-27 ] + + + + + + +
+ +
+ + +
+
+

Are you ok? [ 2023-01-27 ]

+
+I don't know if I'm just imagining it, but has the Internet gone progressively more crazy the last decade or so?
+
+It's like everyone is so damn angry all the time. If they aren't angry they're bitter and resentful. And when they aren't angry or bitter, they're so depressed they're barely able to crawl out of bed. And if they aren't angry, bitter, or depressed, they have crippling anxiety. Every other week there's some public blow-out where some person or another just loses their shit.
+
+This is the new normal, but it isn't normal.
+
+I think we should talk about this and try and figure out what's going on.
+
+Obviously it's been a stressful couple of years for most people. Divisive characters have been prolific in politics, there's the war, a sense of hopelessness about global warming, the recession which has been looming on the horizon for a while, and on top of that the social isolation during Covid wasn't doing anyone's mental well-being a service.
+
+That's all recent things. There were arguably signs even before the 2020s.
+
+It's noteworthy how the discussion climate appears to have deteriorated with every election cycle since the early 2010s. It gets especially bad during the actual election season, and then relaxes a bit, but it never goes back to where it was, there are always permanent scars and divisions. This has been the case with every political system I have insight into. This doesn't seem to be sustainable. If we keep going like this we're literally gonna tear ourselves apart.
+
+The social media angle is also worth mentioning.
+
+During the pandemic, for many the only window to the world was social media, along with online news that has largely come to report on what it sees in social media.
+
+As anyone who has actually gone outside to touch grass will attest, what you see in social media is typically so exaggerated and distorted that it may as well be entirely fictional.
+
+Go outside and it's more or less the same as it has always been. What you see with your eyes and what you see with your phone is shockingly difficult to reconcile.
+
+Not that there aren't bad things happening in the world, that's the way it has always been, but your phone will have you believe that they are more frequent and much closer than they actually are.
+
+I can only speculate. I don't know what's going on. Maybe I'm just imagining things. Maybe I'm the one that's gone crazy.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-new-approach-to-ranking.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-new-approach-to-ranking.gmi new file mode 100644 index 00000000..c8e04a49 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/72-new-approach-to-ranking.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/73-new-approach-to-ranking.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/73-new-approach-to-ranking.gmi new file mode 100644 index 00000000..d6eeb685 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/73-new-approach-to-ranking.gmi @@ -0,0 +1,88 @@ + + + + + MEMEX - A new approach to domain ranking [ 2023-02-06 ] + + + + + + +
+ +
+ + +
+
+

A new approach to domain ranking [ 2023-02-06 ]

+
+This is a very brief post announcing a fascinating discovery.
+
+It appears to be possible to use the cosine similarity approach powering explore2.marginalia.nu as a substitute for the link graph in an eigenvector-based ranking algorithm (i.e. PageRank).
+
+The original PageRank algorithm can be conceptualized as a simulation of where a random visitor would end up if they randomly clicked links on websites. With this model in mind, the modification replaces the link-clicking with using explore2 for navigation.
+
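+A hedged sketch of one step of that random-surfer simulation, with assumed names (the actual ranking code is more involved):
+
+    // follow a "similar website" recommendation instead of an outgoing link,
+    // with the usual teleportation step from ordinary PageRank
+    int nextDomain(int current, Random rng, SimilarDomains similar, int domainCount) {
+        if (rng.nextDouble() < 0.85 && similar.count(current) > 0) {
+            return similar.pickWeighted(current, rng);
+        }
+        return rng.nextInt(domainCount);
+    }
+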
+The performance of PageRank has been deteriorating for decades, to the point where it is barely applicable for domain ranking anymore, in part due to changes in how websites link to each other, but also due to a battery of well documented techniques for manipulating the algorithm in order to gain an unfair advantage. You may get decent results at the very top, especially with personalized PageRank, but you don't have to scroll particularly far down in the ranking to find spam earning a conspicuously high ranking using a vanilla PageRank approach.
+
+This new approach seems highly resistant to existing PageRank manipulation techniques. Given a preference vector, it stays "on topic" remarkably well.
+
+ +
+

See Also

+
+/log/69-creepy-website-similarity.gmi
+/log/20-dot-com-link-farms.gmi
+/log/04-link-farms.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/74-marginalia-2-years.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/74-marginalia-2-years.gmi new file mode 100644 index 00000000..af1f55de --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/74-marginalia-2-years.gmi @@ -0,0 +1,105 @@ + + + + + MEMEX - Marginalia Search: 2 years, big news [ 2023-02-26 ] + + + + + + +
+ +
+ + +
+
+

Marginalia Search: 2 years, big news [ 2023-02-26 ]

+
+No time like the project's two year anniversary to drop this particular bomb...
+
+Marginalia's gotten an NLNet grant. This means I'll be able to work full time on this project for at least a year.
+
+https://nlnet.nl/project/Marginalia/
+
+This grant is essentially the best-case scenario for funding this project. It'll be able to remain independent, open-source, and non-profit.
+
+I won't start in earnest for a few months, as I've got loose ends to tie up before I can devote that sort of time. More details to come, but I'll say this much: the first step is a tidying up of the sources and a move off my self-hosted git instance to an external git host yet to be decided.
+
+

Recap

+
+It's been a heck of a year for Marginalia. Some highlights.
+
+The UX has been streamlined quite a bit, with forms for flagging problematic websites and submitting websites to be crawled.
+
+Overall the search result presentation is cleaner. The old search result page used a lot of weird emoji icons to convey information, I was never quite happy with that.
+
+ + +
+The crawler was significantly redesigned.
+
+/log/63-marginalia-crawler.gmi
+
+The index has been almost completely rewritten to be both faster and more space-efficient. I feel a bit bad I still haven't written about this. The re-design allowed the search engine to hit that sweet 100M document milestone a few months ago.
+
+I've had big success experimenting with website similarity metrics, and very recently I combined this method with PageRank. The result is good beyond expectations. The new algorithms are live on the search engine and are working very well.
+
+ + +
+There have been improvements in ad-detection, text-summarization, topic filtering, DOM-pruning, sharp sticks...
+
+With the grant there will definitely be a "Marginalia Search: 3 years"-post. I got most of the above done while juggling a lot of other life-stuff alongside Marginalia Search, as a solo dev. It'll be very interesting to see what sort of ground I'll be able to cover while working on this full time!
+
+

Topics

+
+/topic/astrolabe.gmi
+/topic/nlnet.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/bargain-bin-btree.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/bargain-bin-btree.gmi new file mode 100644 index 00000000..e2d3a5b7 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/bargain-bin-btree.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/index.gmi new file mode 100644 index 00000000..24bf808e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/index.gmi @@ -0,0 +1,73 @@ + + + + + MEMEX - Gemlog + + + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/soaring-high.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/soaring-high.gmi new file mode 100644 index 00000000..2ee5147d --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/soaring-high.gmi @@ -0,0 +1,50 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/log/todo.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/log/todo.gmi new file mode 100644 index 00000000..a5f5c8f9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/log/todo.gmi @@ -0,0 +1,46 @@ + + + + + + + MEMEX - + + + + + +
+ +
+
+
+

/log/todo.gmi is gone

+

+ + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/one-weird-trick.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/one-weird-trick.gmi new file mode 100644 index 00000000..69b4df80 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/one-weird-trick.gmi @@ -0,0 +1,64 @@ + + + + + MEMEX - One Weird Trick + + + + + + +
+ +
+ + +
+
+

One Weird Trick

+
+

Gordon Ramsay hates him for this...

+
+If you want to peel a clove of garlic quickly, put it on a cutting board and firmly press the side of a broad knife against it, until it buckles a bit. Then the skin almost falls off with no frustration or mess.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/pics/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/index.gmi new file mode 100644 index 00000000..9e0549a2 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/index.gmi @@ -0,0 +1,62 @@ + + + + + MEMEX - Pics + + + + + + +
+ +
+ + +
+
+

Pics

+
+This is a "miscellaneous box" of images, for reference in other places.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/pics/links/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/links/index.gmi new file mode 100644 index 00000000..694712d1 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/links/index.gmi @@ -0,0 +1,64 @@ + + + + + MEMEX - Links + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/pics/raster-test/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/raster-test/index.gmi new file mode 100644 index 00000000..12b28068 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/pics/raster-test/index.gmi @@ -0,0 +1,77 @@ + + + + + MEMEX - Floyd-Steinberg dithering + + + + + + +
+ +
+ + +
+
+

Floyd-Steinberg dithering

+
+These are some files from experimenting with dithering.
+
+The purpose is both compression and creating a stylistic alignment with the lo-fi color scheme that constitutes marginalia.nu's visual appearance.
+
+The latter is accomplished through using the colors that appear in the website as a palette for the dithering algorithm. The palette is only 10 colors, which fits with ample room to spare in a 4 bit palette.
+
+The low resolution and somewhat unique appearance is also something that makes the images appear out-of-place anywhere else; a deterrent against freebooting.
+
+Note: Some of the sample files have a bug where white is transparent. I don't really understand what causes it, but it seems the second color in the palette is always made transparent. I've been able to work around it by adding a second white color. Some files (volvo.png and socrates.png) have been re-uploaded with the fixed algorithm.
+
+https://encyclopedia.marginalia.nu/wiki/Floyd-Steinberg_Dithering
+https://encyclopedia.marginalia.nu/wiki/Color_difference
+
+/projects/memex.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge.gmi new file mode 100644 index 00000000..48024841 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge.gmi @@ -0,0 +1,46 @@ + + + + + + + MEMEX - + + + + + +
+ +
+
+
+

/projects/edge.gmi is gone

+

+ + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/about.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/about.gmi new file mode 100644 index 00000000..ca46c95d --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/about.gmi @@ -0,0 +1,159 @@ + + + + + MEMEX - About search.marginalia.nu + + + + + + +
+ +
+ + +
+
+

About search.marginalia.nu

+
+Ever feel like the Internet has gotten a bit... I don't know, samey? There's funny images scrolling by and you blow some air through your nose and keep scrolling and then someone has done something upsetting and you write an angry comment and then you scroll some more.
+
+Remember when you used to explore the Internet, when you used to discover cool little websites made by people and it wasn't just a bunch of low effort content mill listicles and blog spam?
+
+I want to show you that the Internet you used to go exploring is still very much there. There are still tons of small personal websites, and a wealth of long form text from both the past and the present.
+
+So it's a search engine. It's perhaps not the greatest at finding what you already knew was there. Instead it is designed to help you find some things you didn't even know you were looking for.
+
+If you are looking for facts you can trust, this is almost certainly the wrong tool. If you are looking for serendipity, you're on the right track. When was the last time you just stumbled onto something interesting, by the way?
+
+I don't expect this will be the next "big" search engine. This is and will remain a niche tool for a niche audience.
+
+https://search.marginalia.nu/
+
+ +
+ +
+ +
+ +
+

Warning: Experimental software!

+
+This is an experimental toy, and you should expect downtime, data losses, bugs, jank of every flavor. I sadly don't have enough hardware to keep a back-up system running when I take down the server to make alterations. I try not to do this too frivolously, but expect a few minutes of downtime every now and then.
+
+

A Theoretical Justification

+
+In recent years, something has been simmering: Some call it the "Small Internet". I hesitate to call it a movement; that would imply a level of organization and intent that it does not possess. It's a disjointed group of like-minded people who recognize that the Internet has lost a certain je ne sais quoi; it has turned from a wild and creative space into more of a shopping mall. Wherever you go, you're prodded to subscribe to newsletters, to like and comment, to buy stuff.
+
+The formulation of the problem ranges from the purely aesthetic to ones based on political doctrine. I prefer a humanist explanation. The measure of a website should be how well it enriches the life of – and empowers – the visitor, rather than how well it enriches the wallet of the website owner, especially not at the expense of the visitor's long-term interests.
+
+Some would vilify search engines and commercial interests for transforming the Internet in such a way, but that is not a particularly productive thing to dwell upon. Even if that is so, that's just all the more opportunity to build something that's better.
+
+-
+
+The search engine calculates a score that aggressively favors text-heavy websites, and punishes those that have too many modern web design features.
+
+This is in a sense the opposite of what most major search engines do; they favor modern websites over old-looking ones. Most links you find here will be nearly impossible to find on a regular search engine, as they aren't sufficiently search engine optimized.
+
+This may seem a choice born of some sort of nostalgia, which in part is true, but there is more to it. The hypothesis is something akin to the Lindy effect: If a webpage has been around for a long time, then odds are it has some fundamental redeeming quality that has motivated keeping it around for all that time. Looking at design elements is one way of determining the approximate age of a webpage, and thus predicting its usefulness.
+
+The purpose of the tool is primarily to help you find and navigate the non-commercial parts of the internet. Where, for sure, you'll find crack-pots, communists, libertarians, anarchists, strange religious cults, snake oil peddlers, and really strong opinions. Yes, all manner of strange people.
+
+You'll surely find uncomfortable ideas too, and voices from the past that stubbornly refuse to adapt to the present, but I'm sure you'll survive and find the experience worthwhile, because for every turd you step in, there are also plenty of brilliant and interesting gems to find that for one reason or another didn't live up to the standards of the big search engines.
+
+There's also a point to be made about how the technologies we use shape our view of the world. I'm hoping, by holding up this kaleidoscope to the Internet, you'll become more aware of the spectacles you were already wearing without thinking about them. It really does seem like a different reality.
+
+If this were the looking glass through which you viewed the Internet, and I'm not making a serious suggestion that it should be, but if it were, then wouldn't you agree that your idea of the web would seem very different, more human?
+
+

Further Reading

+
+ + + + +
+

Similar Projects

+
+Competition is perhaps not the word; we're all pulling in the same direction, only with subtly different goals and approaches.
+
+ + + + + + +
+ +
+

Have something to say?

+
+Send me an e-mail at kontakt@marginalia.nu.
+
+

Support the Project

+
+/projects/edge/supporting.gmi
+
+

Links

+
+/projects/edge/index.gmi
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/api.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/api.gmi new file mode 100644 index 00000000..91df9d5b --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/api.gmi @@ -0,0 +1,125 @@ + + + + + MEMEX - API + + + + + + +
+ +
+ + +
+
+

API

+An API for the search engine is available through api.marginalia.nu.
+
+The API is simple enough to be self-explanatory. Examples:
+
+
+https://api.marginalia.nu/public/
+https://api.marginalia.nu/public/search/json+api
+https://api.marginalia.nu/public/search/json+api?index=0
+https://api.marginalia.nu/public/search/json+api?index=0&count=10
+
+The 'index' parameter selects the search index, corresponding to the drop down next to the search field in the main GUI.
+
+

Common Key

+For experimentation, the key "public" is available, as used in the examples on this page. This key has a shared rate limit across all consumers. When this rate limit is hit, an HTTP status 503 is returned.
+
+

Key and license

+
+Please send an email to kontakt@marginalia.nu if you want your own key with a separate rate limit. The search engine has seen quite a lot of problems with bot abuse, making this registration step a sad necessity.
+
+No guarantees can be made about uptime or availability.
+
+By default the data is provided under the CC-BY-NC-SA 4.0 license. Other licensing and terms are negotiable.
+
+ +
+

Sample code in python 3

+
+import requests
+
+url = "https://api.marginalia.nu/{key}/search/{query}";
+
+rsp = requests.get(url.format(key='public', query="linear b"));
+
+if rsp.ok:
+  data = rsp.json()
+  print ("Query: ", data['query'])
+  print ("License: ", data['license'])
+  print ("")
+  for result in data['results']:
+      print (result['url'])
+      print ("\t" + result['title'])
+      print ("\t" + result['description'])
+      print ("")
+else:
+    print ("Bad Status " + str(rsp.status_code))
+
+

Something missing?

+
+Please let me know if there are features you would like added to the API.
+
+

See also

+
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/changelog.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/changelog.gmi new file mode 100644 index 00000000..d213f522 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/changelog.gmi @@ -0,0 +1,325 @@ + + + + + MEMEX - Change Log + + + + + + +
+ +
+ + +
+
+

Change Log

+
+Detailed changelog available here:
+
+https://git.marginalia.nu/marginalia/marginalia.nu/graph?branch=refs%2Fheads%2Frelease
+
+
+

2022 August:

+
+
    +
  • Recipe filter
+
+
    +
  • Ad detection
+
+
    +
  • Query time optimization
+
+

2022 June-July:

+
+
    +
  • Overhaul of the crawler and database model, index and database reconstructed.
+
+

2022 May Changes

+
+
    +
  • Project goes Open Source
+https://git.marginalia.nu/marginalia/marginalia.nu
+
+
    +
  • Added support for a few !bangs, currently !g and !ddg
+
+

2022 April Changes

+
+
    +
  • Added type-ahead suggestions for desktop.
+
+
    +
  • New index backend based on a B-tree variant.
+
+
    +
  • Reworked the crawler to be more compatible with the WARC format.
+
+

2022 March Changes

+
+
    +
  • Side-loaded all of StackExchange and StackOverflow.
+
+
    +
  • Improved the blogocentric algorithm to prioritize smaller sites more effectively.
+
+
    +
  • Removed some mastodon instances from random mode as they aren't very interesting to visit, you just get a log-in screen.
+
+
    +
  • Optimized exploration mode as it was getting quite sluggish.
+
+
    +
  • Added a drilldown link on the search results for narrowing the search to the same domain.
+
+
    +
  • Tuned down the amount of Mastodon instances that crop up in Random Exploration mode. I like the idea of these sites, but there are so many of them and they only show you a sign-up screen when you visit them.
+
+

2022 February Changes

+
+
    +
  • Slightly relaxed the hard limit on how much javascript is allowed on a page, since better heuristics have been found, and this limit does throw out a lot of babies with the bathwater.
+
+Work has been almost at a standstill due to some health issues. I hope to get more productive again soon.
+
+

2022 January Changes

+
+
    +
  • Fixed a minor bug that broke among others, the site:-search
+
+
    +
  • Overhaul of the web design for the search engine.
+
+
    +
  • Random-feature has gotten site screenshots to offer a "flavor" of the site. Site-info is much improved as well.
+
+
    +
  • API access
+https://api.marginalia.nu/
+
+

2021 December Changes

+
+
    +
  • Crawling is smarter and uses the ranking algorithm for prioritizing the order of the results.
+
+
    +
  • Search results are better sorted in terms of how important the search terms are in relation to the query.
+
+
    +
  • The query parser is a lot smarter and generates better alternative search terms to supplement the main query (pluralization, concatenation), guided by a term frequency dictionary.
+
+
    +
  • Additional keywords are extracted for each document. This will add more junk results at the bottom of the page, but hopefully more good matches too.
+
+
    +
  • The maximum query length has been restricted.
+
+ +
+

2021 November Update

+
+
    +
  • Further refinements to keyword extraction. The technically minded can read a few words about that here:
+/log/37-keyword-extraction.gmi
+
+
    +
  • Improved crawling logic to offer more leniency toward sites that have high ranking. This improves the chance of pushing through local minima and discovering additional quality content on those sites.
+
+
    +
  • Mended some fences with a few of the websites that blocked my crawler when it was young and unruly, and removed a few sites from the blocklist that didn't belong there. More quality websites in the index!
+
+
    +
  • As an experiment, tagged websites that contain links to amazon, attempt to place cookies on the crawler, contain links to known trackers, contain audio/video tags, and contain javascript.
+
+It's not perfect, it will miss some trackers as well as mistake some honest amazon links for affiliate links.
+
+These special keywords are available:
+
+
+    js:true
+    js:false
+    special:cookies
+    special:affiliate
+    special:media
+    special:tracking
+
+You can of course also exclude them
+
+
+    "keyboard -special:tracking -special:affiliate".
+
+
    +
  • Added outgoing links as search terms. Up to 25 per page. Great for ego-searching.
+
+Example:
+
+    "links:archive.org" 
+will list pages that link to archive.org. This is only available on the highest level of domain, you can't for example search for "search.marginalia.nu", only "marginalia.nu".
+
+

Exploration Mode (Experimental)

+
+If you press the little "🔀" icon next to a search result, you will be brought to a list of domains that might be similar. From there you can keep pressing "🔀" again to explore the web.
+
+This is perhaps best used for navigating the blogosphere, neocities, and similar digital communities.
+
+This is an experimental feature and the user interface is really rough, but it's a lot of fun so that's why I've made it accessible to the public.
+
+ +
+

2021 October Revamp

+
+
    +
  • Introduced a ranking algorithm that takes into consideration both the average quality of the domain, and the number of links to the domain (and their quality). This should mean fewer garbage results and less SEO spam.
+
+
    +
  • Added ANOTHER ranking algorithm along with the first one, a modified PageRank that aggressively biases toward personal websites.
+
+
    +
  • Drastically improved keyword extraction and topic identification.
+
+
    +
  • Support for many new types of keywords, including: C#, .308, 5.56mm, comp.lang.c, #hashtag, 90210.
+
+
    +
  • Added the ability to filter on page properties like javascript and declared HTML standard (based on DTD first and guesswork as a fallback).
+
+

Known Problems

+
+
    +
  • The minus keyword doesn't work super reliably.
+
+
    +
  • Keyword extraction may be a bit too conservative.
+
+

2021 September Bugfixes and Tweaks

+
+
    +
  • Reformulated some error messages to clarify that words can only exist within a Latin-1 encoding. Also added some automatic suggestions when there are few results, with a link to a tips page.
+
+
    +
  • Fixed a bug where the indexes weren't queried in the right order, and good results would in some circumstances be overwritten with worse results.
+
+
    +
  • Fixed a bug where the same domain could appear too many times in the results.
+
+
    +
  • Search profiles have been added, and the default is a more narrow configuration that's intended to reduce the noise of completely irrelevant search results. I'm not sure if this is necessary with the bug fixes above.
+
+
    +
  • Added support for curly quotes, as some operating systems apparently use those.
+
+

2021 September Maintenance

+
+
    +
  • A full index rebuild. This is mainly to allow for a change in internal modelling that will fix some jankiness.
  • +
  • It also allows for an improvement in index bucketing. This will hopefully improve the quality of the results.
  • +
  • Topic extraction has been improved, among the changes, the crawler will use word-capitalization to pick up likely topics of a page.
+
+Further changes:
+
+
    +
  • Unsupported foreign languages are detected and filtered out more aggressively than before. For now the search engine targets: English, Latin and Swedish. Additional languages may come in the future, but I will probably need to recruit help, as I have no way of ensuring the quality of results I can't read.
  • +
  • Even more aggressive link farm detection.
  • +
  • Charset encoding defaults to ISO8859-1 in the absence of UTF-8 being requested. This prevents a lot of garbled descriptions.
+
+

2021 August - Quality of Life updates

+
+A lot of small features have been added to improve the usefulness of the search engine in finding information.
+
+
    +
  • Support for define:-queries that retrieve data from wiktionary.
  • +
  • Mathematical expression evaluations and unit conversions (a bit janky still).
  • +
  • Spell checking for search terms that return no results. If "Farenheit" gives no results, you will be provided with the suggestion to try "Fahrenheit".
  • +
  • The search engine will provide links to (hopefully) useful wikipedia entries.
+
+
+

2021 July Index Rebuild

+
+ The index has been reconstructed (actually several times) to allow for new and exciting dimensions of search. What follows is a summary of some of the bigger feature changes.
+
+
    +
  • Search results are presented in an order that is likely more useful. Results that contain search terms will be boosted, and the number of links to the results will affect the order of presentation, but is not part of the indexing and crawling considerations, so the same set of results will be presented as previously -- this is not, and never will be a popularity contest.
  • +
  • Support for a wider dictionary of search terms, including words that include numbers, and sequences of up to four words. The search engine will automatically try pairs of words when searching, but additional words will be considered if they are placed within quotes.
  • +
  • Resilience improvements! The index can recover from mild data corruption in a highly best-effort fashion, and the index will recover much faster if it needs to restart, from 30-60 minutes down to 5 minutes.
  • +
  • Blacklisting of link- and content-farms is implemented even more aggressively than in previous versions. There are some areas where an especially heavy hand needed to be employed, including pages pertaining to cryptocurrencies and alarm-systems.
  • +
  • Mobile support has been improved, the contents of the page will no longer overflow.
  • +
  • Terminal based browser support has been improved as well.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/design-notes.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/design-notes.gmi new file mode 100644 index 00000000..a8a7b9d9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/design-notes.gmi @@ -0,0 +1,157 @@ + + + + + MEMEX - Notes on Designing a Search Engine + + + + + + +
+ +
+ + +
+
+

Notes on Designing a Search Engine

+
+

robots.txt

+
+People put lists of very specific URLs they do not want you to look at in robots.txt, and I don't specifically mean secret admin log-in pages (even though that happens too), but like embarrassing stuff, dirt, the awkward August 2003 issue of campus magazine when the dean awarded Kony philanthropist of the year. It keeps the search engines out, but human beings can read these files too.
+
+Speaking of robots.txt, there is no standard. Adherence is best-effort by every search engine, and the amount of weird directives you'll find is staggering. Oh, and ASCII art too, little messages. It's cute, but not something you should do if crawler adherence actually matters.
+
+

Standards

+
+The HTML standard is not a standard. A major American university uses <title>-tags for its navigational links. It's a technological marvel how coherently web browsers deal with the completely incoherent web they browse.
+
+

Quality measure

+
+The search engine evaluates the "quality" of a web page with a formula that, a bit simplified, looks like
+
+
+       length_text     -script_tags
+  Q =  -----------  x e
+       length_markup
+
+As a consequence, the closer to plain text a website is, the higher it'll score. The more markup it has in relation to its text, the lower it will score. Each script tag is punished. One script tag will still give the page a relatively high score, given all else is premium quality; but once you start having multiple script tags, you'll very quickly find yourself at the bottom of the search results.
+
+Modern web sites have a lot of script tags. The web page of Rolling Stone Magazine has over a hundred script tags in its HTML code. Its quality rating is of the order of 10^-51.
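+
+As an illustrative aside (not the actual marginalia.nu implementation), the simplified formula above could be sketched in Java roughly like this, using jsoup; the class and method names are made up for illustration:
+
+
+import org.jsoup.nodes.Document;
+
+class QualityEstimate {
+    // Q = (length_text / length_markup) * e^(-script_tags), per the simplified formula above
+    static double score(Document doc) {
+        double textLength = doc.text().length();
+        double markupLength = Math.max(1, doc.html().length());
+        int scriptTags = doc.getElementsByTag("script").size();
+
+        // More text per unit of markup raises the score; every script tag cuts it by a factor of e
+        return (textLength / markupLength) * Math.exp(-scriptTags);
+    }
+}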
+
+/log/10-astrolabe-2-sampling-bias.gmi
+
+

Link Farms

+
+Smut and link farms seem to go hand-in-hand, to the extent that I have at times filtered out the first to get at the other.
+
+/log/04-link-farms.gmi
+
+

Trade-offs

+
+There is a constant trade-off between usefulness and efficiency. That is a necessity when running a search engine, typically reserved for a datacenter, on consumer hardware. Do you need to be able to search for slpk-ya-fxc-sg-wh, the serial number of a Yamaha subwoofer? If it comes at the cost of polluting the index with such highly unique entities? At the cost of speed, and size? What about Day[9]; are the conventions of occasional digital handles enough to justify increasing the search term dictionary by 20%?
+
+

Standard searches

+
+It's hard to quantify qualitative aspects, but I have some standard tasks I use to evaluate how well the search engine works.
+
+
    +
  • I want to be able to find an interesting article on Protagoras
  • +
  • Searching for PuTTY ssh should yield a download link relatively easily
+
+While the goal of the search engine is to give an interesting degree of inaccuracy, it can't be too inaccurate either, to the point of being useless or just returning basically random links. These are challenges of promoting sufficiently relevant results. R.P. Feynman is an interesting man, but that doesn't make his cursory mention of silly putty an interesting result. Likewise, people seem to love to attribute "man is the measure of all things" to Protagoras, but relatively few articles are actually relevant to the man himself.
+
+

Description extraction

+
+The most effective way of extracting a meaningful snippet of text from a web site seems to be to simply look for a piece of text that has a relatively low proportion of markup. 50% seems a decent enough cut-off.
+
+I've tried various approaches, and this relatively simple approach seems to work by far the best. The problem, in general, is identifying what is navigation and what is content. It's better having no summary than having summaries that look like
+
+
+ Home Blog About RSS feed Follow me on instagram | | | | | (C) 2010 Brown Horse Industries CC-BY-SA 3.0
+
+This is the actual code I use
+
+
+// Keep only <p> elements whose text makes up more than half of their HTML,
+// i.e. paragraphs that are mostly prose rather than navigation or markup
+private Optional<String> extractSummaryRaw(Document parsed) {
+  StringBuilder content = new StringBuilder();
+
+  parsed.getElementsByTag("p").forEach(
+        elem -> {
+          if (elem.text().length() > elem.html().length()/2) {
+            content.append(elem.text());
+          }
+      }
+  );
+
+  // Discard trivially short summaries
+  if (content.length() > 10) {
+    return Optional.of(content.toString());
+  }
+  return Optional.empty();
+}
+
+

Links

+
+/projects/edge/index.gmi
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/faq.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/faq.gmi new file mode 100644 index 00000000..220cffa0 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/faq.gmi @@ -0,0 +1,134 @@ + + + + + MEMEX - FAQ + + + + + + +
+ +
+ + +
+
+

FAQ

+
+

What is this search engine's name?

+
+Let's call it Marginalia Search as that's what most people seem to do.
+
+There is some confusion, a perhaps self-inflicted problem, as I'm not really into branding and logos, and to make matters worse I've used a lot of different internal names, including Astrolabe and Edge Crawler. But most people seem to favor "marginalia search". Let's just not worry too much about what the "real" name is and use what gets the idea across.
+
+

I'm working on something cool, may I have some data, or API access?

+
+Send me an email and we can talk about it, I'm more than happy to share, but for logistical reasons I can't just put everything on an FTP for ad-hoc access. The Works is hundreds of gigabytes of data, and much of it is in nonstandard binary formats I've designed myself to save space.
+
+

Why do you only support English?

+
+I'm currently focusing on English web content. In part this is because I need to limit the scope of the search engine. I have limited hardware and limited development time.
+
+I'm just one person, and I speak Swedish fluently, English passably, and understand enough Latin to tell my quids from my quods, but the breadth of my linguistic capability ends there.
+
+As such, I couldn't possibly ensure good quality search results in hundreds of languages I don't understand. Half-assed internationalization is, in my personal opinion, a far bigger insult than no internationalization.
+
+

What is the hardware and software stack?

+
+The software is custom built in Java. I use MariaDB for some ancillary metadata.
+
+The hardware is a single consumer-grade computer, a Ryzen 3900X with 128 Gb of RAM (without ECC). I snatched one of the few remaining Optane 900Ps and it's backing the database.
+
+

How big is the index?

+
+It depends when you ask, but the record is 50,000,000 documents, with room to spare for probably 50-100% more. In terms of disk size, we're talking hundreds of gigabytes.
+
+Index size isn't a particularly good metric. It's good for marketing, but in practice an index with a million documents that are all of high quality is better than an index with a billion documents where only a fraction of them are interesting. Sorting the chaff from the wheat is a much harder problem than just building a huge pile of both.
+
+

Where is the data coming from?

+
+I crawl myself. It seems to peak out at 100 documents per second.
+
+

Is this going to replace Google?

+
+No, and it's not trying to. It's trying to complement Google, by being good at what they are bad at. What the world needs is additional search options, not a new top dog.
+
+

Is this open source?

+
+ +
+

What do I do if a query pops up anything really tasteless or illegal?

+
+Send me an email and I'll see if I can't block the domain.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/for-webmasters.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/for-webmasters.gmi new file mode 100644 index 00000000..2c42c270 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/for-webmasters.gmi @@ -0,0 +1,119 @@ + + + + + MEMEX - For Webmasters + + + + + + +
+ +
+ + +
+
+

For Webmasters

+
+This search engine is a small non-profit operation, and I don't want it to cause any inconvenience.
+
+If it is indeed being a nuisance, please let me know! Send an email to <kontakt@marginalia.nu> and I'll do my best to fix it as soon as possible.
+
+Telling me lets me fix whatever problem there is much faster, and if you are experiencing problems, then so are probably others as well.
+
+

Crawler Fingerprint

+
+
+User-Agent: search.marginalia.nu
+IP address: 81.170.128.21
+

robots.txt

+
+The search engine respects robots.txt, and looks for the user-agent "search.marginalia.nu" in specific, as well as general directives. Note that changes to robots.txt may not take effect immediately.
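+
+As a minimal illustration (the directive syntax is standard robots.txt; the rest of your file may of course differ), a site that wants to block only this crawler could use:
+
+
+User-agent: search.marginalia.nu
+Disallow: /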
+
+You can also send me an email if something is indexed that you want me to remove.
+
+

Why isn't my page indexed?

+
+Odds are it just hasn't been discovered yet. The search engine has a pretty small index, and makes no pretenses of being complete.
+
+There could be several other reasons; some domain names are rejected because they look too much like domain names that are used by link farms. This mostly means .xyz and .icu. If you are hosted in Russia, China, Hong Kong, or Taiwan, you are also not going to get crawled. I feel bad for the innocent websites this affects, but the sad fact is that easily 90% of the link farms are hosted in these countries, and on these TLDs.
+
+For similar reasons, if you are hosted on a large VPS provider, especially Alibaba or Psychz, you are not going to get crawled. Google Cloud is the only VPS provider, so far, that seems to effectively crack down on link farms. So that's the safest bet.
+
+The crawler sometimes gets captchad by CDNs like Fastly and CloudFlare, so it may or may not index them depending on whether the bot is being throttled.
+
+Searching for "site:www.yourdomain.tld" will provide you with an analysis. If the search engine is aware of the domain, there should be a button for slating it for crawling.
+
+If you get nothing, then odds are the search engine has no knowledge about the domain yet. Get in contact if you want me to have a look at what's happening.
+
+

A Call To Action

+
+Please link to other websites you like! Keep a bookmark list, a blog roll, whatever. You don't have to try to trap your visitor by only linking to your own stuff.
+
+Links make the Internet more interesting and fun to explore for humans, give it a community feeling, and they both help my search engine discover websites and help it understand which websites are interesting.
+
+ +
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/index.gmi new file mode 100644 index 00000000..c95dc882 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/index.gmi @@ -0,0 +1,88 @@ + + + + + MEMEX - Edge + + + + + + +
+ +
+ + +
+
+

Edge

+
+Edge (or rather, Edge Crawler) is one of the internal names of a small independent internet search engine hosted on search.marginalia.nu. The search engine seeks out old and old-looking websites with lots of text.
+
+The search engine also goes by the name of Astrolabe.
+
+It's a significant part of WMSA, and has subcomponents that do the usual helpful tasks, such as unit conversion, calculations, spell checking, encyclopedia suggestions.
+
+ + +
+
+/projects/wmsa.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/privacy.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/privacy.gmi new file mode 100644 index 00000000..4d0c8b22 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/privacy.gmi @@ -0,0 +1,97 @@ + + + + + MEMEX - Privacy Considerations [2021-10-21] + + + + + + +
+ +
+ + +
+
+

Privacy Considerations [2021-10-21]

+
+This privacy policy is in effect on search.marginalia.nu.
+
+
+Javascript:             Minimal
+Cookies:                No
+Local Storage:          No
+Tracking Pixels:        No
+Social Media Buttons:   No
+Third Party Requests:   No
+CDN:                    Yes (sadly)
+Access Logs:            Yes
+Log Retention:          Up to 24h
+
+No information about which links are clicked is gathered; it is not possible to historically correlate IP addresses with search terms, and anonymized internal identifiers are designed not to be stable over time.
+
+Overall I try to respect privacy as much as possible while still allowing for things like rate-limiting and bug fixing. There is no tracking and unnecessary logging of IP addresses is reduced to a bare minimum.
+
+Due to a prolonged and aggressive botnet attack I've had to put the server behind a CDN, which means I cannot guarantee perfect anonymity as I do not have insight into what the CDN provider is doing.
+
+Also, with sufficient time and a large IT forensics budget, someone could probably work out who you are and what you have searched for. I have however taken measures to make that as time-consuming and expensive as possible, while at the same time retaining some ability to diagnose problems with the setup and infrastructure.
+
+Nginx access logging >is< enabled, but old logs are not archived; they are shredded and purged every 24 hours.
+
+Internal server logs are retained for a longer time period, but IP addresses are anonymized into a 32 bit hash with a random 96 bit salt that rotates on irregular intervals between 5 and 15 minutes. This is necessary for rate limiting.
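+
+As a rough illustration only (this is not the production code, and the names are made up), a salted, truncated hash of that kind could look something like this in Java:
+
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.SecureRandom;
+
+class IpAnonymizer {
+    private final byte[] salt = new byte[12];        // 96 bit salt
+    private final SecureRandom random = new SecureRandom();
+
+    IpAnonymizer() { rotateSalt(); }
+
+    // Called on an irregular schedule (e.g. every 5-15 minutes) so hashes cannot be correlated over time
+    void rotateSalt() { random.nextBytes(salt); }
+
+    int anonymize(String ipAddress) throws Exception {
+        MessageDigest digest = MessageDigest.getInstance("SHA-256");
+        digest.update(salt);
+        digest.update(ipAddress.getBytes(StandardCharsets.UTF_8));
+        byte[] hash = digest.digest();
+        // Truncate to a 32 bit identifier; enough for rate limiting, useless for identification once the salt rotates
+        return ((hash[0] & 0xFF) << 24) | ((hash[1] & 0xFF) << 16) | ((hash[2] & 0xFF) << 8) | (hash[3] & 0xFF);
+    }
+}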
+
+Don't hesitate to reach out if you have questions or concerns.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/search-tips.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/search-tips.gmi new file mode 100644 index 00000000..f6b506ba --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/search-tips.gmi @@ -0,0 +1,112 @@ + + + + + MEMEX - Search Tips + + + + + + +
+ +
+ + +
+
+

Search Tips

+
+The search engine isn't very good at answering search queries posed as questions. For the best search results, try to imagine the title of the page you are looking for, and search for that. It takes some experimentation to find the right keywords.
+
+Word-order sometimes matters, and so do the exact words chosen, so sometimes reordering the words or using a synonym can give different results.
+
+The best approach is usually starting from one word, and then adding additional terms to refine the search, rather than starting from a complicated query and trying to simplify it.
+
+Usually you will land at queries that may look like these:
+
+
    +
  • twin peaks
  • +
  • commander keen
  • +
  • putty ssh
  • +
  • silly putty
  • +
  • doctor who tom baker
  • +
  • x-files squeeze
  • +
  • eugene tooms x-files
  • +
  • scamander iliad
  • +
  • newt scamander
  • +
  • the master roger delgado
  • +
  • plato's stepchildren
  • +
  • excalibur hubbard
  • +
  • excalibur arthur -hubbard
  • +
  • site:monadnock.net plato symposium
+
+

Search Profiles

+
+You can also try to use a different search profile to shape the results.
+
+The "Experimental" profile will enable the latest beta features, and typically return a very wide range of results.
+
+The "Popular" option will prefer academic websites.
+
+The "Blog"-profile will gravitate toward the indieweb and blogosphere.
+
+

Topics

+
+Since the search engine specializes in older(-looking) webpages, it's typically fairly difficult to find information on current events and bleeding edge technology, especially in web design.
+
+A wealth of information seems available on historical people, television and video games that are a few decades old, and past events, recipes and hobbies. There are also a decent number of personal web pages and blogs, even if this is not an exhaustive directory of the blogosphere by any means.
+
+

Discover More Sites

+
+Use the "Explore"-links to fetch websites that are adjacent to a domain. The "Random Websites"-link up at the top will return a wide array of indieweb-adjacent links.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/supporting.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/supporting.gmi new file mode 100644 index 00000000..dfe6865f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/supporting.gmi @@ -0,0 +1,95 @@ + + + + + MEMEX - Support This Project + + + + + + +
+ +
+ + +
+
+

Support This Project

+
+I'm just one guy building all of this on my own. I'd like to expand the search engine and make it more useful. I'm paying for most of the work out of my own pocket, but that only goes so far.
+
+To this end I have a few donation links. Every last bit helps and even a modest contribution is greatly appreciated.
+
+https://www.patreon.com/marginalia_nu
+
+https://www.buymeacoffee.com/marginalia.nu
+
+You can also just send me an email at kontakt@marginalia.nu and we can discuss other alternatives.
+
+

Other

+
+The cheapest way you can support the search engine is by using it, and talking about it, and telling me when things go wrong. Feedback is invaluable, even if it's just "this-and-that query gives bad results", as I often use exactly such bad queries to figure out how to improve the results. I also just like getting emails. Lots of people have reached out and it's very inspiring.
+
+

Topics

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/top-20.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/top-20.gmi new file mode 100644 index 00000000..bd6a0a35 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/edge/top-20.gmi @@ -0,0 +1,133 @@ + + + + + MEMEX - Top 20 + + + + + + +
+ +
+ + +
+
+

Top 20

+
+

By Incoming Links

+
+A listing of domains by how many incoming links they have. To be specific, this is unique links on a domain-to-domain basis. This is relatively consistent with most top-100 website lists.
+
+
++------------------------+-------+
+| URL_PART               | QTY   |
++------------------------+-------+
+| www.facebook.com       | 65327 |
+| www.youtube.com        | 47293 |
+| twitter.com            | 43001 |
+| en.wikipedia.org       | 31964 |
+| github.com             | 15737 |
+| www.instagram.com      | 14924 |
+| web.archive.org        |  6782 |
+| validator.w3.org       |  5914 |
+| www.apple.com          |  4743 |
+| news.bbc.co.uk         |  4636 |
+| www.gnu.org            |  4619 |
+| www.washingtonpost.com |  4271 |
+| goo.gl                 |  4151 |
+| www.w3.org             |  4065 |
+| www.nps.gov            |  3468 |
+| www.latimes.com        |  2701 |
+| www.ebay.com           |  2312 |
+| books.google.com       |  2265 |
+| www.cafepress.com      |  2257 |
+| www.nasa.gov           |  2113 |
++------------------------+-------+
+

Quality weighted

+
+This is weighted in a similar way to how the search results are ranked by the search engine.
+
+ (quality-weighted incoming links) x (domain quality)
+
+
++----------------------------+--------------------+
+| URL_PART                   | QTY                |
++----------------------------+--------------------+
+| www.gnu.org                | 215.24882000031243 |
+| xroads.virginia.edu        |  99.51551719790447 |
+| www.levity.com             |  79.67583195394936 |
+| www.fourmilab.ch           |  71.41567713044103 |
+| www.leaderu.com            |  67.63774488714816 |
+| www.chiark.greenend.org.uk |  60.27637178044755 |
+| www.rahul.net              | 58.653324671708994 |
+| files.usgwarchives.net     |   58.5809118191951 |
+| www.hartford-hwp.com       |  57.09051984936602 |
+| math.ucr.edu               |  56.33765697298831 |
+| www.marxists.org           |  54.67132159924675 |
+| cryptome.org               |   53.6266881708856 |
+| www.cygwin.com             | 53.083698286914895 |
+| www.openbsd.org            |  52.07737991403203 |
+| raw.githubusercontent.com  | 51.960594446688795 |
+| www.lysator.liu.se         |  51.45728090178886 |
+| www.whale.to               |  49.17518840241005 |
+| history.hanover.edu        |  48.34961964370164 |
+| www.panix.com              |  43.55703754455768 |
+| users.rcn.com              | 43.536098258182015 |
++----------------------------+--------------------+
+
+

Topic

+
+/topic/astrolabe.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/encyclopedia/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/encyclopedia/index.gmi new file mode 100644 index 00000000..f52a9a2c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/encyclopedia/index.gmi @@ -0,0 +1,106 @@ + + + + + MEMEX - High Readability Encyclopedia + + + + + + +
+ +
+ + +
+
+

High Readability Encyclopedia

+
+https://encyclopedia.marginalia.nu/
+
+This is an encyclopedia based on Wikipedia's database that strips away most links and almost all visual clutter to provide a more book-like reading experience with fewer distractions.
+
+This is primarily a helpful utility for a search engine focusing on similarly text-oriented websites.
+
+You are welcome to use it for general article reading as well. This may be useful if you are on a low bandwidth connection, since the download size is typically reduced from megabytes to dozens of kilobytes.
+
+What's taken away is all the design elements that your brain would have to filter out to read the text of the article. It seems as though overburdening this mental process causes the reader to start scanning the text instead of reading it, which is experienced as an inability to focus.
+
+The cleaning process is not perfect and will occasionally produce strange results, but significant problems should be relatively rare.
+
+

Limitations

+
+This is a "stale" copy of wikipedia, based on an archived copy from January 2021. On the other hand, we used to abide printed encyclopedias that didn't update at all.
+
+Be aware that the cleaning strips away a lot of information, including most references, footnotes, quality warnings, and so forth. Refer to the original wikipedia article for that information.
+
+

Legal

+
+The original Wikipedia text is available under the Creative Commons Attribution-ShareAlike 3.0 license, and so is the Wikipedia text forwarded to you through this service.
+
+

Further Reading

+
+ +
+/log/03-writing-for-reading.gmi
+/log/00-linkpocalypse.gmi
+/log/13-static-html.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/gemini-server.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/gemini-server.gmi new file mode 100644 index 00000000..0d805ccc --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/gemini-server.gmi @@ -0,0 +1,105 @@ + + + + + MEMEX - Gemini Server + + + + + + +
+ +
+ + +
+
+

Gemini Server

+
+Note! This document is out of date! The separate gemini-server software I had running has been absorbed into WMSA.
+
+

Software

+The server is extremely bespoke software. It seemed easy enough to implement the protocol.
+
+ +
+

Changes

+
    +
  • 2021-08-21 - Added auto-banning of some port sniffers that clog up the logs with HTTP requests and whatnot
  • +
  • 2021-08-04 - Added a telnet ingress @ marginalia.nu:9999
  • +
  • 2021-07-26 - Added automatic local backlinks
  • +
  • 2021-07-09 - Added automatic navigational footers to static gmi files
  • +
  • 2021-07-09 - Fixed TLS compatibility issues with Amfora and possibly other implementations
+https://lists.orbitalfox.eu/archives/gemini/2021/006379.html
+https://lists.orbitalfox.eu/archives/gemini/2021/006382.html
+
    +
  • 2021-07-06 - Added a proxy for my search engine
  • +
  • 2021-07-05 - Sanity-check tests against marginalia.nu server
  • +
  • 2021-07-05 - Changed the server to run on docker
  • +
  • 2021-07-05 - Guestbook plugin auto-disables if it can't find its database file
  • +
  • 2021-07-05 - Removed non-standard .gem file-endings and added a rewrite plugin that redirects any old URLs that still point there
  • +
  • 2021-07-04 - Fixed some minor security issues
  • +
  • 2021-07-04 - Added rolling logs
+
+

Hardware

+I put the machine together mostly for a search engine. Because I didn't want an actual rack making noise and heat in my living room, the server is made out of consumer hardware:
+
+
    +
  • Ryzen 9 3900X
  • +
  • 128 Gb RAM
  • +
  • 4x4 Tb IronWolf ZFS
  • +
  • A bunch of SSDs for index lookups
+
+

Topics

+/topic/programming.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/index.gmi new file mode 100644 index 00000000..b74ef5da --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/index.gmi @@ -0,0 +1,67 @@ + + + + + MEMEX - Projects + + + + + + +
+ +
+ + +
+
+

Projects

+
+A listing of the various software projects that constitute marginalia.nu. The website is a bit of a testbed for experiments in design and technology, some things work out, some things don't. Both outcomes are interesting data points.
+
+
+It's all open source, and the code is available at
+
+https://git.marginalia.nu
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/memex.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/memex.gmi new file mode 100644 index 00000000..518740d6 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/memex.gmi @@ -0,0 +1,97 @@ + + + + + MEMEX - Memex + + + + + + +
+ +
+ + +
+
+

Memex

+
+Memex is a hypertext engine, backed by gemtext and git. It's files all the way down. All files are readable as well as writable by both human and machine.
+
+You are looking at Memex now, either through HTTP or Gemini, or possibly over telnet. It can be accessed in many ways, and it has several uses.
+
+It's a personal information manager, a wiki, a weblog; all these things and none of them.
+
+The aim is to create something that retains the power of a simple file system abstraction, while also providing the strengths of interconnected hypertext and avoiding some of its problems, especially dead links.
+
+ +
+

Name

+
+It's named after Vannevar Bush's hypothetical invention in the 1945 essay "As we may think".
+
+
+ This is the essential feature of the memex. The process of tying two items together is the important part.
+
+This is true for both his memex, and my memex.
+
+https://encyclopedia.marginalia.nu/wiki/Memex
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/projects/wmsa.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/wmsa.gmi new file mode 100644 index 00000000..731fc6a9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/projects/wmsa.gmi @@ -0,0 +1,103 @@ + + + + + MEMEX - WMSA + + + + + + +
+ +
+ + +
+
+

WMSA

+
+WMSA, and I wish I remembered why it's called that, is the software backing marginalia.nu. It's a Java-based application, built on microservices, that does a lot of different things. It has grown organically over a long period of time. It's bespoke software that serves this website and its many functions, and only this website.
+
+

Name

+
+I'm great at naming things, but less stellar at remembering why I named them such.
+
+
    +
  • Weird Micro-Service Architecture?
  • +
  • World's Most Spectacular Application?
+
+It's really anyone's guess right now.
+
+

Status

+
+ + +
+

Notable parts

+
+ +
+ +/projects/encyclopedia/index.gmi
+
+ +
+ +/projects/edge/index.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/chicken-soup.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/chicken-soup.gmi new file mode 100644 index 00000000..e318f478 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/chicken-soup.gmi @@ -0,0 +1,139 @@ + + + + + MEMEX - Chicken Soup [2021-08-28] + + + + + + +
+ +
+ + +
+
+

Chicken Soup [2021-08-28]

+
+Medium difficulty
+6-8 servings
+30 min prep + 2 hours cooking time
+
+Requires 1 large pot, and one skillet that can handle high heat (=cast iron or carbon steel; no teflon or ceramics, or you will ruin it and your health!)
+
+

Ingredients

+
+
    +
  • 1 large carrot + 1 additional carrot for later
  • +
  • 2 onions
  • +
  • 2-3 stalks of celery
  • +
  • 1 leek
+
+
    +
  • a dozen or more button mushrooms
+
+
    +
  • 800 grams of boneless chicken
+
+
    +
  • chicken stock for 1.5L liquid
  • +
  • 1L water
  • +
  • 250 ml white cooking wine
  • +
  • a splash of cream
+
+
    +
  • bay leaves
  • +
  • fresh thyme
  • +
  • salt
  • +
  • pepper
+
+
    +
  • potatoes
+
+

Instructions

+
+
    +
  • Slice the vegetables and put in a large pot with a small amount of some cooking grease
  • +
  • Put the pot on medium heat and gently brown the vegetables. You should see a gentle column of steam rising from the vegetables (and smell something delicious)
  • +
  • Quarter the mushrooms and put in a skillet, sear on high heat to get rid of the water in them
  • +
  • Put the mushrooms with the vegetables
+
+
    +
  • Cut the chicken into mouth-sized pieces and quickly brown at medium-high heat, salting and peppering as you go
  • +
  • Put the chicken with the vegetables
+
+
    +
  • Add wine, water, chicken stock to the pot
  • +
  • Add a few bay leaves
  • +
  • Make a bouquet garni with the thyme and a carrot (i.e. tie the thyme to a carrot with some cooking twine) -- or just toss it in there if you haven't got the twine or patience.
  • +
  • Bring to a simmer and let it simmer for as long as you have patience, at least 30 minutes but ideally 2+ hours.
+
+
    +
  • 20 minutes before serving, peel and cut potatoes and add to the soup.
  • +
  • Add a splash of cream
  • +
  • Taste and add more salt if necessary
  • +
  • Fish out the bouquet
  • +
  • Serve
+
+Tastes even better the next day!
+
+

Illustrations

+
+ + +
+

Topic

+
+/topic/cooking.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/french-borscht.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/french-borscht.gmi new file mode 100644 index 00000000..494f680c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/french-borscht.gmi @@ -0,0 +1,162 @@ + + + + + MEMEX - French Borscht [2021-07-05] + + + + + + +
+ +
+ + +
+
+

French Borscht [2021-07-05]

+
+I realize this may offend Frenchmen and Eastern Europeans alike. That's fine, because it turned out alright. There can be peace.
+
+Note that this is more of a sketch than an actual recipe. You will need to make flavor adjustments.
+
+Simplicity: Intermediate
+Meal: Dinner
+Cooking time: 2+ hours
+Health: It's healthy for the soul
+Servings: 6-8
+
+Make sure you have 1 small pot, 1 large pot, and a frying pan available.
+
+

Ingredients

+
+

Mirepoix

+
    +
  • 1 large onion (chopped)
  • +
  • 2 stalks of celery (chopped)
  • +
  • 2 carrots (chopped)
  • +
  • 1-2 tbsp tomato puree
+
+

Stew base

+
    +
  • 1 glass of red wine, cooking wine is fine
  • +
  • 2 vegetable bouillon cubes
  • +
  • Water (1L / 4 cups)
  • +
  • 1 bay leaf
  • +
  • Fresh thyme (or dried, whatever)
+
+

Mushrooms

+
+
    +
  • Button mushrooms (sliced)
+
+

Root Vegetables

+
    +
  • 6 red beets cut into smaller pieces, or sliced
  • +
  • 1 parsnip, sliced
  • +
  • 1 parsley root, sliced (can also just use another parsnip)
+
+

Meat

+
    +
  • Meat appropriate for a stew (400g, cut into mouth-pieces)
  • +
  • Bacon, cubed (about 200g)
+
+

Finisher

+
+
    +
  • Creme fraiche
  • +
  • Balsamic vinegar
  • +
  • Salt
+
+

Instructions

+
+
    +
  • Pre-boil the beets in a separate pot
+
+
    +
  • Gently brown the mirepoix vegetables in a stock pot
  • +
  • Add tomato puree and stir around
  • +
  • When you start getting a brown residue at the bottom of the pot, add the stew base
  • +
  • Bring to a slow boil
+
+
    +
  • Add root vegetables
  • +
  • Sear the mushrooms at high heat to get rid of the liquid in them, then add to the stew
  • +
  • Brown the meat in a skillet, salt and pepper, then add to the stew
+
+
    +
  • Let cook for as long as you have patience, but at least 60-90 minutes
  • +
  • Top off with some more water if necessary
  • +
  • It's done when the beets start getting soft and the meat is falling apart
+
+
    +
  • Add some creme fraiche
+
+
    +
  • Give it a taste, if it tastes overly sweet, add some balsamic vinegar
  • +
  • If it tastes off or bland, add salt
+
+

Topics

+
+/topic/cooking.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/index.gmi new file mode 100644 index 00000000..0acb75f3 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/index.gmi @@ -0,0 +1,62 @@ + + + + + MEMEX - Recipes + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/omelette-bacon.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/omelette-bacon.gmi new file mode 100644 index 00000000..6db4197f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/recipes/omelette-bacon.gmi @@ -0,0 +1,114 @@ + + + + + MEMEX - Bacon Omelette [2021-07-04] + + + + + + +
+ +
+ + +
+
+

Bacon Omelette [2021-07-04]

+
+Simplicity: Easy
+Meal: Breakfast
+Health: It's healthy for the soul
+Servings: 4, or 2 if you are hungry
+
+

Ingredients

+
+
    +
  • 7-8 strips of bacon
  • +
  • 6 eggs
  • +
  • Cream
  • +
  • Chives
  • +
  • Cheese (grated or to grate)
  • +
  • Black pepper
+
+

Instructions

+
+
    +
  • Cook bacon in a large skillet until crispy
  • +
  • Take out bacon and leave to dry a bit on a paper towel
  • +
  • Turn off the heat and *leave the bacon grease in the skillet!*
+
+
    +
  • Crack the eggs in a bowl
  • +
  • Add a splash of cream to the eggs
  • +
  • Add some black pepper
  • +
  • Chop the bacon into smaller bits
  • +
  • Put the bacon in the batter
  • +
  • Whisk
+
+
    +
  • Pour the batter into the skillet
  • +
  • Grate some cheese on top of it
  • +
  • Set it on low-medium heat, add a lid if you've got one that fits
  • +
  • Wait until it sets
  • +
  • Let it cool for a bit!
  • +
  • Cut a generous amount of chives sprinkle on top of it
+
+If you are too impatient to let the omelette cool, putting chives on a piping hot omelette will cook them and then they lose all flavor.
+
+
    +
  • Fold the omelette on top of itself and cut into portion sizes
  • +
  • Eat it while it's hot!
+
+

Topics

+
+/topic/cooking.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/search-about.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/search-about.gmi new file mode 100644 index 00000000..73497b2c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/search-about.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/search-about.gmi is gone

+

+Confusingly gemini-specific + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/server.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/server.gmi new file mode 100644 index 00000000..a62292a9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/server.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/special/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/special/index.gmi new file mode 100644 index 00000000..d3229b6f --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/special/index.gmi @@ -0,0 +1,64 @@ + + + + + MEMEX - Special Files + + + + + + +
+ +
+ + +
+
+

Special Files

+
+These special files configure the server to set up "gone"-messages and redirects for files that have been moved or removed.
+
+ + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/special/redirect.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/special/redirect.gmi new file mode 100644 index 00000000..7d20fd59 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/special/redirect.gmi @@ -0,0 +1,92 @@ + + + + + MEMEX - Redirects + + + + + + +
+ +
+ + +
+
+

Redirects

+
+This file contains redirect URLs
+
+

URLs

+
+ + + + + + + + + + + + + + + + + + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/special/tombstone.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/special/tombstone.gmi new file mode 100644 index 00000000..273ed6f3 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/special/tombstone.gmi @@ -0,0 +1,85 @@ + + + + + MEMEX - Tombstone + + + + + + +
+ +
+ + +
+
+

Tombstone

+
+This special file contains information about removed resources.
+
+

Removed links

+ + + + + + + + + + +/junk/DedupTest.gmi
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/test.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/test.gmi new file mode 100644 index 00000000..a3390d2e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/test.gmi @@ -0,0 +1,42 @@ + + + + + + + MEMEX - /test.gmi + + + + + +
+ + +
+
+
+ +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/todo.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/todo.gmi new file mode 100644 index 00000000..7bf0a906 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/todo.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/todo.gmi is gone

+

+Empty file + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/todo/done.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/done.gmi new file mode 100644 index 00000000..9acc3c80 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/done.gmi @@ -0,0 +1,770 @@ + + + + + MEMEX - Done + + + + + + +
+ +
+ + +
+
+ +

Done

+
+

Done 2022-01-30

+
- generate better default thumbnail on the fly (/)
+
+
+

Done 2022-01-19

+
- public API gateway (/)
+
+

Done 2022-01-16

+
- overhaul CSS of MEMEX (/)
+
+
+

Done 2022-01-15

+
- Improved random (/)
+
+      INSERT INTO EC_RANDOM_DOMAINS 
+            SELECT DISTINCT(EC_DOMAIN.ID) FROM EC_DOMAIN_NEIGHBORS A
+            INNER JOIN EC_DOMAIN_NEIGHBORS B ON B.NEIGHBOR_ID=A.DOMAIN_ID
+            INNER JOIN EC_DOMAIN_NEIGHBORS C ON C.NEIGHBOR_ID=B.DOMAIN_ID
+            INNER JOIN EC_DOMAIN ON A.DOMAIN_ID=EC_DOMAIN.ID
+            WHERE C.DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE URL_PART IN (secret-sauce))
+            AND EC_DOMAIN.STATE>=0;
+
+

Done 2022-01-14

+
- Dark Mode (/)
+
- Screengrabs by domain (/)
+
- Revise exploration mode (/)
+
- Improve keyboard navigation (/)
+
+

Done 2022-01-12

+
- Search redesign (/)
+
- Fixed dictionary corruption bug (/)
+
+

Done 2022-01-04

+
- Improve site:-query QOL (/)
+
- Fix byte folder bug (/)
+
+
- refactor EC_URL (/)
+
+  ALTER TABLE EC_URL MODIFY COLUMN PROTO ENUM('http', 'https', 'gemini') NOT NULL;
+
-- put visit-metadata in separate table (/)
+
+
+

Done 2021-12-03

+
+
- fix bug in language detection (/)
+
-- re-fetching some pages (/)
+
+

Done 2021-12-02

+
+
- new approach for query rewriting (/)
+
+

Done 2021-11-14

+
+
- make site:-queries return a dummy entry when no site information is available (/)
+
+

Done 2021-11-11

+
+
- hybridized ordering of domains on reindex, F(previous rank, previous quality). (/)
+
+
- mark documents with audio, video, object tags (/)
+
+

Done 2021-11-10

+
+
- car service <2021-11-18> (/)
+
+
+

Done 2021-10-30

+
- Add auto redirects for guesswork rss/atom/feed-requests to /log/feed.xml (/)
+
+
+

Done 2021-10-29

+
+
- investigate extracting more keywords (/)
+
-- textrank (/)
+ +
-- sideload additional keywords for most popular sites (/)
+
+

Done 2021-10-12

+
+
- refactor index converter (/)
+
- clean up code garbage (/)
+
+

Done 2021-10-05

+
- trial more vanilla PageRank approach as a tertiary algorithm (/)
+
+
- fix a search result prioritization bug for mixed rankings (/)
+
+ +
+    It is reportedly broken
+ +
+
- fix potential DoS where certain search queries with a large number of common but mutually exclusive terms would take forever to process. (/)
+
+    test query: generic stores underground unusual
+
+

Done 2021-10-03

+
+
- prioritize n-gram matches over word matches (/)
+
- show informative error page when the index server reboots (/)
+
+

Done 2021-10-02

+
+
- Personalized Page Rank (/)
+
- Duelling Algorithms (/)
+
+

Done 2021-10-30

+
+
- Launch October Update (/)
+
+

Done 2021-09-26

+
- fix broken search use-cases (/)
+
-- c language (/)
+
-- 67 chevy (/)
+
-- 68000 (/)
+
-- c# (/)
+
-- @twitterhandle (/)
+
-- #hashtag (/)
+
+
- trial tar based archiving to save the poor ext4 fs (/)
+
+
- use words to tag document format etc (/)
+
+
- dynamic re-bucketing based on something like (/)
+
+    SELECT DEST.URL_PART, EXP(DEST.QUALITY)*SUM(EXP(SOURCE.QUALITY)) AS Q
+        FROM EC_DOMAIN DEST
+        INNER JOIN EC_DOMAIN_LINK ON DEST.ID=DEST_DOMAIN_ID
+        INNER JOIN EC_DOMAIN SOURCE ON SOURCE.ID=SOURCE_DOMAIN_ID
+        WHERE DEST.INDEXED>0
+        GROUP BY DEST_DOMAIN_ID
+
+
+

Done 2021-09-19

+
+
- Fix several indexing bugs that hid relevant search results (/)
+
+

Done 2021-09-17

+
+
- Added search profiles (/)
+
+

Done 2021-09-16

+
+
- Rephrased an error message that some people took to mean they weren't speaking a proper language (/)
+
+

Done 2021-09-15

+
+
- Using in-site domain link-names to add search terms (/)
+
- Fixed buggy default content-type (/)
+
- Even more aggressive unicode language detection (/)
+
+

Done 2021-09-11

+
- Status flag for domains (/)
+
+    Indexed, Active, Blocked
+
- Improve topic detection (/)
+
+

Done 2021-09-09

+
+
- Tuned search results to demote very short results (/)
+
+

Done 2021-09-08

+
+
- Encyclopedia tries harder to find the right article if the case match isn't exact (/)
+
+

Done 2021-09-06

+
+
- Breaking changes for next Index-rebuild (/)
+
-- Change writer bucket scaling to 1/4 (/)
+
-- Move protocol and port from EdgeDomain to EdgeURL (/)
+
-- Change database schemas to reflect (/)
+
-- ISO-8859-1/UTF-8 charset sniffer (/)
+
-- Fixed a bug that would occasionally cause the crawler to re-index the same working set multiple times (/)
+
+
+
+

Done 2021-09-02

+
+
- improve edge-director throughput (/)
+
- give edge-director state for semi-blocking tasks (/)
+
+

Done 2021-08-31

+
+
- optimize URL index size (/)
+
+

Done 2021-08-28

+
+
- clean up gemini navigation (/)
+
- Atom feed for HTTPS and Gemini (/)
+
+
+

Done 2021-08-27

+
- Feed gemini server with rendered gmi-content (/)
+
-- Output the content (/)
+
-- Generate feeds (/)
+ +
-- Switch over (/)
+
+
+

Done 2021-08-26

+
- Absorb gemini server into WMSA (/)
+
+

Done 2021-08-25

+
- wildcard domain for marginalia.nu (/)
+
-- move memex to memex-subdomain (/)
+
+
- feeds on FEED pragma (/)
+
+

Done 2021-08-24

+
- Top nav bar overhaul (/)
+
+

Done 2021-08-23

+
- add marker for which files are todo files (/)
+
+    Added %%%/pragmas for toggling behavior
+
-- Added template helpers for consuming pragmas (/)
+
-- Used to improve topic pages (/)
+
+
- Fixes for git (/)
+
+

Done 2021-08-22

+
- File manager (/)
+
-- Delete (/)
+ +
-- Move/Rename (/)
+
--- System for tombstones/redirects (/)
+
+
- Edit for / does not work (/)
+
+    Needed better support for non-normalized URLs, e.g. //index.gmi
+
+
- Backlinks for index (/)
+
+
+

Done 2021-08-21

+
- Git Integration (/)
+
-- Use commit hooks to trigger pull (/)
+https://git-scm.com/book/uz/v2/Appendix-B%3A-Embedding-Git-in-your-Applications-JGit
+
+
- Recursive directory watch (/)
+
+
- Two column layout (/)
+
+

Done 2021-08-20

+
- Overhaul MEMEX navigation (/)
+
-- Navigation bar (/)
+ +
-- Editing (/)
+
--- Add update-root link (/)
+
+
- Tombstones aren't generated properly on-delete (/)
+
+  The tombstone db wasn't properly
+  reloaded after being updated.
+
+
- Just write static files to disk instead of using an intermediary backend server. (/)
+
-- Use alias directive to set different root for memex path. (/)
+
-- Content-type is finicky (/)
+
+  I want to serve html-wrapped .gmi and .html 
+       location ~* \.(gmi|png)$ {
+            types {
+                text/html gmi;
+                text/html png;
+            }
+        }
+
+
+

Done 2021-08-19

+
- Move away from statically generated HTML forms in memex (/)
+
+
- Fix stability of podcast scraper (/)
+
+
- Get crawling up again (/)
+
-- Monitoring (/)
+
--- Extraction (/)
+
--- Status page (/)
+
-- Scraper config (/)
+
-- DNS cache (?)
+
-- IP Block CDNs (/)
+
--- Parse CIDR (/)
+
+    Apache Commons Net SubnetUtils seems to
+    do the job, although it can't deal 
+    with IPv6 :-/ (see the sketch after
+    the IP lists below)
+
--- CloudFlare (/)
+
+    173.245.48.0/20
+    103.21.244.0/22
+    103.22.200.0/22
+    103.31.4.0/22
+    141.101.64.0/18
+    108.162.192.0/18
+    190.93.240.0/20
+    188.114.96.0/20
+    197.234.240.0/22
+    198.41.128.0/17
+    162.158.0.0/15
+    172.64.0.0/13
+    131.0.72.0/22
+    104.16.0.0/13
+    104.24.0.0/14
+    2400:cb00::/32
+    2606:4700::/32
+    2803:f800::/32
+    2405:b500::/32
+    2405:8100::/32
+    2a06:98c0::/29
+    2c0f:f248::/32
+
--- Fastly (/)
+
+    23.235.32.0/20
+    43.249.72.0/22
+    103.244.50.0/24
+    103.245.222.0/23
+    103.245.224.0/24
+    104.156.80.0/20
+    146.75.0.0/17
+    151.101.0.0/16
+    157.52.64.0/18
+    167.82.0.0/17
+    167.82.128.0/20
+    167.82.160.0/20
+    167.82.224.0/20
+    172.111.64.0/18
+    185.31.16.0/22
+    199.27.72.0/21
+    199.232.0.0/16
+
+
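For readers wondering what the "Parse CIDR" note with Commons Net looks like in practice, here is a minimal, illustrative sketch. The class name CdnIpBlocklist and its wiring are assumptions for the example, not the actual crawler code, and Commons Net's SubnetUtils only handles IPv4, so the IPv6 ranges above would need separate treatment:

    import org.apache.commons.net.util.SubnetUtils;
    import java.util.List;

    // Illustrative only: check an IP address against the CIDR ranges listed above.
    public class CdnIpBlocklist {
        private final List<SubnetUtils.SubnetInfo> ranges;

        public CdnIpBlocklist(List<String> cidrs) {
            this.ranges = cidrs.stream()
                    .map(SubnetUtils::new)
                    .map(SubnetUtils::getInfo)
                    .toList();
        }

        public boolean isBlocked(String ipAddress) {
            // SubnetInfo.isInRange throws on malformed input, so validate addresses first
            return ranges.stream().anyMatch(info -> info.isInRange(ipAddress));
        }
    }

    // e.g. new CdnIpBlocklist(List.of("173.245.48.0/20")).isBlocked("173.245.48.17") == true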
- Refactor task management (/)
+
-- Fix prepend (/)
+
-- Add tests (/)
+
+
- Refactor Floyd-Steinberg ditherer (/)
+
+
- Todo move-to-done function puts header last in #Done (/)
+
+

Done 2021-08-16

+
- Pictures-in-HTML (/)
+
-- Implement compression via Floyd-Steinberg dithering (/)
+https://encyclopedia.marginalia.nu/wiki/Floyd%E2%80%93Steinberg_dithering
+http://image4j.sourceforge.net/javadoc/index.html?net/sf/image4j/util/ConvertUtil.html
+
--- Ensure 4 bit (/)
+
--- On upload (/)
+ +
-- Render image views (/)
+
--- Add to index (/)
+
-- Upload form (/)
+
+

Done 2021-08-15

+
- CSS fixes for mobile (/)
+
-- text align for tasks (/)
+
-- indent overflowed tasks (/)
+
+
- Fix CME (/)
+
+    java.util.ConcurrentModificationException: null
+        at java.util.HashMap.forEach(HashMap.java:1428) ~[?:?]
+        at nu.marginalia.wmsa.memex.MemexData.forEach(MemexData.java:51) ~[WMSA-1628951793.jar:?]
+        at nu.marginalia.wmsa.memex.Memex.reRender(Memex.java:49) ~[WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.core.Scheduler$PeriodicDirectTask.run(Scheduler.java:566) [WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.core.Scheduler$Worker$PeriodicTask.run(Scheduler.java:513) [WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.internal.schedulers.ScheduledRunnable.run(ScheduledRunnable.java:65) [WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.internal.schedulers.ScheduledRunnable.call(ScheduledRunnable.java:56) [WMSA-1628951793.jar:?]
+        at java.util.concurrent.FutureTask.run(FutureTask.java:264) [?:?]
+        at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:304) [?:?]
+        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]
+        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]
+        at java.lang.Thread.run(Thread.java:832) [?:?]
+    ERROR 2021-08-14 16:36:39,467 RxCachedThreadScheduler-2 MemexMain           : Uncaught exception
+    java.util.ConcurrentModificationException: null
+        at java.util.HashMap.forEach(HashMap.java:1428) ~[?:?]
+        at nu.marginalia.wmsa.memex.MemexData.forEach(MemexData.java:51) ~[WMSA-1628951793.jar:?]
+        at nu.marginalia.wmsa.memex.Memex.reRender(Memex.java:49) ~[WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.core.Scheduler$PeriodicDirectTask.run(Scheduler.java:566) ~[WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.core.Scheduler$Worker$PeriodicTask.run(Scheduler.java:513) ~[WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.internal.schedulers.ScheduledRunnable.run(ScheduledRunnable.java:65) [WMSA-1628951793.jar:?]
+        at io.reactivex.rxjava3.internal.schedulers.ScheduledRunnable.call(ScheduledRunnable.java:56) [WMSA-1628951793.jar:?]
+        at java.util.concurrent.FutureTask.run(FutureTask.java:264) [?:?]
+        at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:304) [?:?]
+        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) [?:?]
+        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) [?:?]
+        at java.lang.Thread.run(Thread.java:832) [?:?]
+
+
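The trace above is the classic symptom of one thread iterating a plain HashMap while another thread mutates it. As a hedged illustration of the kind of fix this usually calls for (the field and method names below are made up for the sketch, not the actual MemexData API), iterating over a snapshot taken under a lock makes the re-render immune to concurrent writes:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.function.BiConsumer;

    // Illustrative sketch: copy the map under a lock before iterating, so a
    // concurrent put/remove can no longer trigger ConcurrentModificationException.
    class MemexDataSketch {
        private final Map<String, String> documents = new HashMap<>();

        public synchronized void put(String url, String contents) {
            documents.put(url, contents);
        }

        public void forEach(BiConsumer<String, String> consumer) {
            final Map<String, String> snapshot;
            synchronized (this) {
                snapshot = new HashMap<>(documents);
            }
            snapshot.forEach(consumer); // iteration happens on a private copy
        }
    }

A ConcurrentHashMap would also do the job if weakly consistent iteration is acceptable.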

Done 2021-08-14

+
- Automatic TODO task categorization (/)
+
- Login API on separate service (/)
+
-- Set up service (/)
+
-- Route requests (/)
+
+
- Fix header auto-location (/)
+
+
- Display top tasks in index (/)
+
+

Done 2021-08-10

+
+
-- + in URLs? (/)
+
+  proxy_pass with / forces nginx to parse the url (why?)
+  Bad:
+        proxy_pass http://127.0.0.1:5025/public/wiki/
+  Good:
+        rewrite ^ $request_uri
+        rewrite ^/(.*) /public/$1 break;
+        return 400;
+        proxy_pass http://127.0.0.1:5025$uri;
+
+
- Encyclopedia (/)
+
-- Search API (/)
+
-- code tags (/)
+
+
+

Done 2021-08-06

+
+
- Memex (/)
+
-- GemtextParser (/)
+
-- Service skeleton (/)
+
-- Link extraction (/)
+
-- Rendering (/)
+
--- Stylesheet (/)
+ +
-- Updates (/)
+
--- API (/)
+
--- Forms (/)
+
+

Done 2021-08-04

+
+
- Service Lockdown (/)
+
-- X-Public header in code (/)
+
+
-- Move endpoints (/)
+
--- Resource Store (/)
+
--- Search (/)
+
--- Assistant (/)
+
+
-- Update clients (/)
+
--- Resource Store (/)
+
--- Search Service (/)
+ +
+
-- Update nginx (/)
+
-- Update links on website (/)
+
+
- Tune wiki archive fs (/)
+
+    sudo tune2fs -O ^dir_index /dev/nvme0n1p2
+    
+
- marginalia.nu:9999 "BBS" (/)
+
+

Done 2021-08-03

+
+
- encyclopedia.marginalia.nu (/)
+
+
- Verify automatic backup of git (/)
+
+
- Reddit frontend (/)
+
-- Scraper: (/)
+
-- API: Marginalia 2: (/)
+
+
- Wiki (/)
+
-- on Optane (/)
+
-- fix Hildegard of Bingen (/)
+
+
- Block bots on nginx (/)
+
+    https://kb.linuxlove.xyz/nginx-badbotblocker.html
+
+

Done 2021-08-02

+
+
- Install Optane (/)
+
-- Migrate MariaDB (/)
+
+
- Wiki (/)
+
-- redirects (/)
+
-- top notices (/)
+
+
- Bucket4J rate limiting (/)
+
+
- Service Monitoring (/)
+
+

Done 2021-08-01

+
+
- Update Cert (/)
+
- Backups for git (/)
+
+

Done 2021-07-30

+
+
- Load Wikidata from ZIM (/)
+
- Migrate Server to Debian Buster (/)
+
+

Done 2021-07-28

+
+
- Update description generation algorithm (/)
+
-- Recalculate descriptions (...) (/)
+
+
- Wiki data (/)
+
-- Load data (/)
+
-- Wrap wikipedia (/)
+ +
-- Wikipedia Cleaner (/)
+
+

Done 2021-07-27

+
+
- Spell checker service? (/)
+https://github.com/wolfgarbe/SymSpell
+
+
- Calculations (/)
+
-- Detection (/)
+
-- Parser (/)
+
-- Unit conversion (/)
+
--- Temperature (/)
+
--- Distance (/)
+
--- Weight (/)
+
--- Area (/)
+
--- Volume (/)
+
+

Done 2021-07-26

+
+
- Save websites to disk? (/)
+
-- GZipped (/)
+
-- XFS (?)
+
+
- Local backlinks in GMI (/)
+
-- Parse GMI for links and titles (/)
+
-- Create tags system (/)
+
+
- Use prime sizing for HashMap! (/)
+
-- How to find primes (/)
+
+
- Arbitrary size HashMap (/)
+
+
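Picking up the "prime sizing" and "how to find primes" notes above: for hash-table bucket counts, trial division up to the square root is entirely adequate. A minimal sketch, illustrative rather than the project's actual implementation:

    // Illustrative: choose a prime table size >= the requested capacity.
    class PrimeSizing {
        static int nextPrime(int n) {
            if (n <= 2) return 2;
            int candidate = (n % 2 == 0) ? n + 1 : n;
            while (!isPrime(candidate)) {
                candidate += 2;
            }
            return candidate;
        }

        static boolean isPrime(int n) {
            if (n < 2) return false;
            if (n % 2 == 0) return n == 2;
            for (int i = 3; (long) i * i <= n; i += 2) {
                if (n % i == 0) return false;
            }
            return true;
        }
    }

    // e.g. nextPrime(1 << 20) == 1048583

A prime bucket count mostly pays off when the hash function mixes poorly; with a strong mixer, power-of-two sizing is the usual alternative.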

Done 2021-07-25

+
+
- Syntax for orgmode + GMI in kate (/)
+
+  Use /usr/share/kde4/apps/katepart/syntax/markdown.xml 
+
+

Done 2021-07-23

+
+
- Dictionary analysis in scraping (/)
+
+   It seems viable to estimate 
+   the language of a document 
+   based on the overlap with an
+   N-most-common-words dictionary
+   (see the sketch after this list). 
+   Threshold 0.05 ok?
+
-- English (/)
+
-- Swedish (/)
+
-- Latin (/)
+
+
- Clean up tests (/)
+
+
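As promised above, a minimal sketch of the dictionary-overlap heuristic: count what fraction of a document's words appear in an N-most-common-words list and compare that against the ~0.05 threshold. Class and method names are illustrative, not the actual language detector in the codebase:

    import java.util.Arrays;
    import java.util.Set;

    // Illustrative sketch of the dictionary-overlap heuristic described above.
    class DictionaryLanguageGuesser {
        private final Set<String> commonWords; // e.g. the N most common English words
        private final double threshold;        // ~0.05 per the note above

        DictionaryLanguageGuesser(Set<String> commonWords, double threshold) {
            this.commonWords = commonWords;
            this.threshold = threshold;
        }

        boolean looksLikeThisLanguage(String documentText) {
            String[] words = documentText.toLowerCase().split("\\W+");
            if (words.length == 0) return false;
            long hits = Arrays.stream(words).filter(commonWords::contains).count();
            return hits / (double) words.length >= threshold;
        }
    }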

Done 2021-07-22

+
+GZip Compression stats:
+
+   63% old
+   21% new
+
+
- Hash map (/)
+
-- Contiguous memory bins (/)
+
+
- Key Folding (/)
+
-- For strings (/)
+
-- For integers (/)
+ +
+
- Debian Desktop (/)
+
-- Docker (/)
+
-- Java 14 (/)
+
-- IntelliJ (/)
+
-- Code (/)
+
-- Gradle (/)
+
-- OrgMode (/)
+
+

Done 2021-07-21

+
+
- Bugfix: Domain Resolution (/)
+
+

Done 2021-07-20

+
+
- Index Changes (/)
+
-- Remove Junk Logging (/)
+
-- Split Query (/)
+
-- Implement in Frontend (/)
+
+
+
- Dictionary Service (/)
+
-- Add Index To Table (/)
+
-- Populate test db (/)
+
-- Build tests (/)
+
-- Integrate into frontend (/)
+
+
+
- Site Information (/)
+
-- Fetch (/)
+
-- 404 (/)
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/todo/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/index.gmi new file mode 100644 index 00000000..d98f10da --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/index.gmi @@ -0,0 +1,62 @@ + + + + + MEMEX - Todo + + + + + + +
+ +
+ + +
+
+

Todo

+
+This was a sandbox for testing todo functionality. It's not currently in use.
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/todo/todo.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/todo.gmi new file mode 100644 index 00000000..a78152cf --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/todo/todo.gmi @@ -0,0 +1,110 @@ + + + + + MEMEX - Todo + + + + + + +
+ +
+ + +
+
+ +

Todo

+
+
- finish MEMEX CSS
+
+
- add ability to tune term priority
+
+
+
- move browse feature out of search service
+
-- separate screenshots module (/)
+
+
- random feature in API (?)
+
+
- regenerate Encyclopedia with new dump
+
-- update stylesheet
+
-- update document database (?)
+
+
- different subdomains (operation Voight-Kampff)
+
-- global rate limit HTTPS, secure with no MITM
+
-- global rate limit HTTP for legacy systems
+
-- backup access via Cloudflare for rate-limit breaching events
+
+
- create a form for submitting domains
+
- special/annotations.gmi
+
+  Add ability to add remarks to files, esp.
+  pictures. 
+
- CSV support for MEMEX
+
-- Overhaul file types in general
+
+
- Merge MEMEX with rest of marginalia infrastructure
+
-- Gemini (/)
+
-- Website
+
--- Copy over pages from website (/)
+ +
-- Encyclopedia (?)
+
+

Backlog

+
+
- Break apart index service
+
-- Index Writer
+
-- Dictionary Service
+
-- Index Reader
+
-- Index Converter
+
+
- Persistent task links (what's a good scheme?) (?)
+
+
- New motherboard for server :-/
+ + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/astrolabe.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/astrolabe.gmi new file mode 100644 index 00000000..21477749 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/astrolabe.gmi @@ -0,0 +1,190 @@ + + + + + MEMEX - Topic: Astrolabe + + + + + + +
+ +
+ + +
+
+

Topic: Astrolabe

+
+ + + + + +
+
/commons/search-failure-modes.gmi
+
Failure Modes in Search - Topics
+ +
/links/fragments-old-web.gmi
+
Fragments of the Old Web - Topic
+ +
/log/01-astrolabe.gmi
+
The Astrolabe Part I: Lenscraft [2021-07-07] - Topics
+ +
/log/04-link-farms.gmi
+
On Link Farms [2021-07-14] - Topics
+ +
/log/06-optimization.gmi
+
Index Optimizations [2021-07-23] - Topics
+ +
/log/10-astrolabe-2-sampling-bias.gmi
+
The Astrolabe Part II: The Magic Power of Sampling Bias [2021-08-03] - Topic
+ +
/log/18-soaring-high.gmi
+
Soaring High [2021-09-02] - Topics
+ +
/log/19-website-discoverability-crisis.gmi
+
The Small Website Discoverability Crisis [2021-09-08] - Topic
+ +
/log/20-dot-com-link-farms.gmi
+
The Curious Case of the Dot-Com Link Farms [2021-09-09] - Topics
+ +
/log/21-new-solutions-old-problems.gmi
+
New Solutions Creating Old Problems [2021-09-14] - Topic
+ +
/log/22-against-the-flood.gmi
+
Against the Flood [2021-09-19] - Topic
+ +
/log/25-october-update.gmi
+
Astrolabe - The October Update [2021-10-01] - Topics
+ +
/log/26-personalized-pagerank.gmi
+
Experimenting with Personalized PageRank [2021-10-02] - Topic
+ +
/log/27-getting-with-the-times.gmi
+
Getting with the times [2021-10-06] - Topics
+ +
/log/28-web-browsing.gmi
+
Web Browsing [2021-10-09] - Topic
+ +
/log/29-botnet-ddos.gmi
+
The Mystery of the Ceaseless Botnet DDoS [2021-10-10] - Topics
+ +
/log/31-ngram-needles.gmi
+
Shaking N-gram needles from large haystacks [2021-10-22] - Topics
+ +
/log/37-keyword-extraction.gmi
+
A Jaunt Through Keyword Extraction [ 2021-11-11 ] - Topic
+ +
/log/38-old-and-new.gmi
+
Old and New [ 2021-11-12 ] - Topic
+ +
/log/41-search-result-relevance.gmi
+
Search Result Relevance [2021-12-10] - Topics
+ +
/log/44-discovery-and-design.gmi
+
Discovery and Design Considerations [ 2022-01-18 ] - Topic
+ +
/log/45-unfuck-internet-discoverability.gmi
+
Can we unfuck internet discoverability? [ 2022-02-04 ] - Topic
+ +
/log/46-anatomy-of-search-engine-spam.gmi
+
The Anatomy of Search Engine Spam [2022-02-07] - Topic
+ +
/log/49-marginalia-1-year.gmi
+
Marginalia Search: 1 year [ 2022-02-25 ] - Topic
+ +
/log/52-growing-pains.gmi
+
Growing Pains [ 2022-03-23 ] - Topic
+ +
/log/54-bargain-bin-btree.gmi
+
The Bargain Bin B-Tree [ 2022-04-07 ] - Topics
+ +
/log/55-lexicon-rubberduck.gmi
+
Lexicon Architectural Rubberducking [ 2022-04-11 ] - Topics
+ +
/log/56-uncertain-future.gmi
+
Uncertain Future For Marginalia Search [ 2022-04-28 ] - Topics
+ +
/log/58-marginalia-open-source.gmi
+
marginalia.nu goes open source [ 2022-05-27 ] - Topics
+ +
/log/59-anchor-text.gmi
+
Fun with Anchor Text Keywords [ 2022-06-23 ] - Topics
+ +
/log/62-marginaliacoin.gmi
+
Marginaliacoin, and hidden forums [ 2022-08-18 ] - Topic
+ +
/log/63-marginalia-crawler.gmi
+
The Evolution of Marginalia's crawling [ 2022-08-23 ] - Topic
+ +
/log/64-hundred-million.gmi
+
Marginalia's Index Reaches 100,000,000 Documents [ 2022-10-21 ] - Topics
+ +
/log/66-carbon-dating.gmi
+
Carbon Dating HTML [ 2022-10-27 ] - Topics
+ +
/log/69-creepy-website-similarity.gmi
+
Creepy Website Similarity [ 2022-12-26 ] - Topic
+ +
/log/70-faster-index-joins.gmi
+
Faster Index Joins [ 2023-01-03 ] - Topic
+ +
/log/74-marginalia-2-years.gmi
+
Marginalia Search: 2 years, big news [ 2023-02-26 ] - Topics
+ +
/projects/edge
+
Edge
+ +
/projects/edge/about.gmi
+
About search.marginalia.nu - Links
+ +
/projects/edge/design-notes.gmi
+
Notes on Designing a Search Engine - Links
+ +
/projects/edge/supporting.gmi
+
Support This Project - Topics
+ +
/projects/edge/top-20.gmi
+
Top 20 - Topic
+
+ + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/cooking.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/cooking.gmi new file mode 100644 index 00000000..fb7c2722 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/cooking.gmi @@ -0,0 +1,75 @@ + + + + + MEMEX - Topic: Cooking + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/index.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/index.gmi new file mode 100644 index 00000000..34e44af5 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/index.gmi @@ -0,0 +1,65 @@ + + + + + MEMEX - Topics + + + + + + +
+ +
+ + +
+
+

Topics

+
+A quick way of finding related notes. These "tags" are implemented using automatic backlinks.
+
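One plausible reading of "tags implemented using automatic backlinks": each note links to a /topic/*.gmi page, and the topic page is generated by inverting that link graph, so it lists every document that links to it. A hedged sketch of that inversion follows; the names are illustrative, not the memex's actual internals:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Illustrative sketch: invert the outgoing-link graph so each topic page
    // can list the documents that link to it.
    class BacklinkIndex {
        private final Map<String, List<String>> backlinks = new HashMap<>();

        void addDocument(String docPath, List<String> outgoingLinks) {
            for (String target : outgoingLinks) {
                backlinks.computeIfAbsent(target, k -> new ArrayList<>()).add(docPath);
            }
        }

        // e.g. backlinksTo("/topic/cooking.gmi") -> every note tagged with cooking
        List<String> backlinksTo(String path) {
            return backlinks.getOrDefault(path, List.of());
        }
    }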
+ + + + + + + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/moral-philosophy.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/moral-philosophy.gmi new file mode 100644 index 00000000..0bd819f1 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/moral-philosophy.gmi @@ -0,0 +1,87 @@ + + + + + MEMEX - Topic: Moral Philosophy + + + + + + +
+ +
+ + +
+
+

Topic: Moral Philosophy

+
+
+ If we labor to achieve something good, the labor fades but the good deed remains. If we derive pleasure from doing a bad deed, the pleasure fades, but the bad deed remains.
+
+- Musonius Rufus, possibly apocryphal
+
+ + + + + +
+
/commons/self-interest.gmi
+
On acting in self-interest - Topic
+ +
/log/11-dying-every-day.gmi
+
Dying, Every Day (Re: Last times) [2021-08-04] - Topic
+ +
/log/15-stages-of-being.gmi
+
Stages of Being [2021-08-23] - Topic
+ +
/log/16-cursed-motivation.gmi
+
Cursed Motivation [2021-08-27] - Topic
+ +
/log/34-internet-arguments.gmi
+
A Polemic Against Internet Arguments [2021-11-02] - Topics
+ +
/log/60-prescriptive-descriptions.gmi
+
On Prescriptive Descriptions [ 2022-07-14 ] - Topic
+
+ + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/nlnet.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/nlnet.gmi new file mode 100644 index 00000000..67219c1c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/nlnet.gmi @@ -0,0 +1,75 @@ + + + + + MEMEX - Topic: NLnet + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/platforms.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/platforms.gmi new file mode 100644 index 00000000..ed529c8c --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/platforms.gmi @@ -0,0 +1,75 @@ + + + + + MEMEX - Topic: Platforms + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/programming.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/programming.gmi new file mode 100644 index 00000000..91d93437 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/programming.gmi @@ -0,0 +1,102 @@ + + + + + MEMEX - Topic: Programming + + + + + + +
+ +
+ + +
+
+

Topic: Programming

+ + + + + +
+
/log/02-re-tests.gmi
+
Re: To unit test or not to unit test, that is the question [ 2021-07-08 ] - Topics
+ +
/log/06-optimization.gmi
+
Index Optimizations [2021-07-23] - Topics
+ +
/log/30-unintuitive-optimization.gmi
+
Unintuitive Optimization [2021-10-13] - Topics
+ +
/log/36-localized-programming-languages.gmi
+
Localized Programming Languages [ 2021-11-05 ] - Topics
+ +
/log/50-meditation-on-software-correctness.gmi
+
A meditation on correctness in software [ 2022-03-14 ] - Topics
+ +
/log/51-the-file-startup.gmi
+
The Static File Startup [ 2022-03-18 ] - Topics
+ +
/log/53-better-hard-drive-metaphor.gmi
+
Is There A Better Hard Drive Metaphor? [ 2022-04-03 ] - Topics
+ +
/log/55-lexicon-rubberduck.gmi
+
Lexicon Architectural Rubberducking [ 2022-04-11 ] - Topics
+ +
/log/57-dont-know-how-to-build-software.gmi
+
I don't know how to build software [ 2022-05-06 ] - Topic
+ +
/log/68-wizards-vs-sorcerers.gmi
+
On Wizards and Sorcerers [ 2022-12-23 ] - Topic
+ +
/log/69-creepy-website-similarity.gmi
+
Creepy Website Similarity [ 2022-12-26 ] - Topic
+ +
/log/70-faster-index-joins.gmi
+
Faster Index Joins [ 2023-01-03 ] - Topic
+ +
/projects/gemini-server.gmi
+
Gemini Server - Topics
+
+ + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/satire.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/satire.gmi new file mode 100644 index 00000000..58f42b70 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/satire.gmi @@ -0,0 +1,69 @@ + + + + + MEMEX - Topic: Satire + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/server.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/server.gmi new file mode 100644 index 00000000..23bfd348 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/server.gmi @@ -0,0 +1,69 @@ + + + + + MEMEX - Topic: Server + + + + + + +
+ +
+ + + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topic/web-design.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/web-design.gmi new file mode 100644 index 00000000..59cbdbde --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topic/web-design.gmi @@ -0,0 +1,96 @@ + + + + + MEMEX - Topic: Web Design + + + + + + +
+ +
+ + +
+
+

Topic: Web Design

+ + + + + +
+
/links/fragments-old-web.gmi
+
Fragments of the Old Web - Topic
+ +
/log/00-linkpocalypse.gmi
+
Thoughts on the linkpocalypse [2021-06-30] - Topics
+ +
/log/03-writing-for-reading.gmi
+
Writing for Reading [2021-07-12] - Topics
+ +
/log/07-local-backlinks.gmi
+
Local Backlinks [2021-07-26] - Topics
+ +
/log/08-whatever-happened-to-the-memex.gmi
+
Whatever happened to the Memex? [2021-07-28] - Topics
+ +
/log/13-static-html.gmi
+
Rendered static HTML [2021-08-13] - Topics
+ +
/log/19-website-discoverability-crisis.gmi
+
The Small Website Discoverability Crisis [2021-09-08] - Topic
+ +
/log/23-re-software-and-branding.gmi
+
Re: Software and Branding [2021-09-21] - Topics
+ +
/log/32-bot-apologetics.gmi
+
Bot Apologetics [2021-10-25] - Topics
+ +
/log/33-rude-guests.gmi
+
The Parable of A Rude Guest [2021-10-28] - Topics
+ +
/log/71-memex-design.gmi
+
Memex Design [ 2023-01-13 ] - 2.1 Backlinks are cool
+
+ + +
+ +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/topics.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/topics.gmi new file mode 100644 index 00000000..bc41adf9 --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/topics.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ + + + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/worklog.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/worklog.gmi new file mode 100644 index 00000000..0841243e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/worklog.gmi @@ -0,0 +1,48 @@ + + + + + MEMEX - + + + + + +
+ +
+ +
+
+

/worklog.gmi is gone

+

+Old and unused file. + +

+
+ + + +
+ + diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/writing-topics.gmi b/code/processes/converting-process/src/test/resources/memex-marginalia/writing-topics.gmi new file mode 100644 index 00000000..feccff5e --- /dev/null +++ b/code/processes/converting-process/src/test/resources/memex-marginalia/writing-topics.gmi @@ -0,0 +1,70 @@ + + + + + MEMEX - Topics to write about + + + + + + +
+ +
+ + +
+
+

Topics to write about

+
+
    +
  • How supposedly secure practices like containerization and ubiquitous encryption make software harder to inspect, and in turn make us more susceptible to unreliable software by forcing us into a degree of blind trust that is not necessary with supposedly less secure procedures.
  • +
  • Memex writeup
  • +
  • Memex image processing writeup
  • +
  • It's a bit strange, using a hypertext format as an application platform to build a hypertext system upon.
+https://encyclopedia.marginalia.nu/wiki/Inner-Platform_Effect
+
    +
  • The ethics of borrowing computer resources to present a web page
+https://encyclopedia.marginalia.nu/wiki/Parkinson's_law
+
    +
  • The inevitability of progress; contrast, the inevitability of decline
+ + + +
+ +
+ + diff --git a/code/processes/crawl-job-extractor-process/build.gradle b/code/processes/crawl-job-extractor-process/build.gradle index 1c4ef1c4..0aaecd62 100644 --- a/code/processes/crawl-job-extractor-process/build.gradle +++ b/code/processes/crawl-job-extractor-process/build.gradle @@ -19,10 +19,11 @@ application { tasks.distZip.enabled = false dependencies { + implementation project(':code:common:process') + implementation project(':code:common:model') implementation project(':code:common:service') implementation project(':code:process-models:crawling-model') - implementation project(':code:features-crawl:crawl-plan') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java index 5dcf0056..82213c45 100644 --- a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java +++ b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java @@ -3,7 +3,7 @@ package nu.marginalia.crawl; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.dbcommon.DomainBlacklistImpl; diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java index faa4e472..b3df8c94 100644 --- a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java +++ b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java @@ -1,6 +1,6 @@ package nu.marginalia.crawl; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.dbcommon.DomainBlacklistImpl; import nu.marginalia.service.module.DatabaseModule; diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java index f853173f..7622ecd0 100644 --- a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java +++ b/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java @@ -2,7 +2,7 @@ package nu.marginalia.crawl; import com.github.luben.zstd.ZstdOutputStream; import com.google.gson.Gson; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.model.gson.GsonFactory; import java.io.*; diff --git a/code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java index de7117ec..ad9700da 100644 --- a/code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java +++ 
b/code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java @@ -1,7 +1,7 @@ package nu.marginalia.crawl; -import nu.marginalia.crawl_plan.CrawlerSpecificationLoader; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 8116b5ec..e2a63284 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -19,6 +19,8 @@ application { tasks.distZip.enabled = false dependencies { + implementation project(':code:common:process') + implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') @@ -32,8 +34,6 @@ dependencies { implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-crawl:work-log') - implementation project(':code:features-crawl:crawl-plan') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 65843528..71d2731c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -2,11 +2,11 @@ package nu.marginalia.crawl; import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; -import nu.marginalia.work_log.WorkLog; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; +import nu.marginalia.process.log.WorkLog; +import plan.CrawlPlanLoader; +import plan.CrawlPlan; import nu.marginalia.crawling.io.CrawledDomainWriter; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.HttpFetcher; import okhttp3.ConnectionPool; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 262e0a1c..1d2c01f0 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import lombok.SneakyThrows; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.GeoIpBlocklist; diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java index b3a5919e..086529d6 100644 --- 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/CrawlPlanLoaderTest.java @@ -1,6 +1,6 @@ package nu.marginalia.crawling; -import nu.marginalia.crawl_plan.CrawlPlanLoader; +import plan.CrawlPlanLoader; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java index 4adc7565..34046445 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/WorkLogTest.java @@ -1,6 +1,6 @@ package nu.marginalia.crawling; -import nu.marginalia.work_log.WorkLog; +import nu.marginalia.process.log.WorkLog; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 73dd832b..10229111 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -3,7 +3,7 @@ package nu.marginalia.crawling.retreival; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.HttpFetcher; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlingSpecification; +import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; diff --git a/code/processes/experimental/build.gradle b/code/processes/experimental/build.gradle index e493ad7a..33399ae9 100644 --- a/code/processes/experimental/build.gradle +++ b/code/processes/experimental/build.gradle @@ -12,6 +12,8 @@ java { } dependencies { + implementation project(':code:common:process') + implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') @@ -26,7 +28,6 @@ dependencies { implementation project(':code:processes:converting-process') implementation project(':code:features-convert:adblock') implementation project(':code:features-convert:topic-detection') - implementation project(':code:features-crawl:crawl-plan') implementation libs.lombok annotationProcessor libs.lombok diff --git a/code/processes/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java b/code/processes/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java index bb5f9d6a..1473b663 100644 --- a/code/processes/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java +++ b/code/processes/experimental/src/main/java/nu/marginalia/experimental/AdblockTesterTool.java @@ -2,8 +2,8 @@ package nu.marginalia.experimental; import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.converting.processor.DocumentProcessor; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; +import plan.CrawlPlanLoader; +import 
plan.CrawlPlan; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import org.jsoup.Jsoup; diff --git a/code/processes/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java b/code/processes/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java index adb88edd..465d469d 100644 --- a/code/processes/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java +++ b/code/processes/experimental/src/main/java/nu/marginalia/experimental/ConverterLogicTestTool.java @@ -4,9 +4,9 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.converting.ConverterModule; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; -import nu.marginalia.converting.processor.logic.DomPruningFilter; +import plan.CrawlPlanLoader; +import plan.CrawlPlan; +import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; diff --git a/code/processes/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java b/code/processes/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java index 511c92e3..8c13c0db 100644 --- a/code/processes/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java +++ b/code/processes/experimental/src/main/java/nu/marginalia/experimental/CrawlDataExtractorTool.java @@ -3,8 +3,8 @@ package nu.marginalia.experimental; import lombok.SneakyThrows; import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.converting.processor.DocumentProcessor; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; +import plan.CrawlPlanLoader; +import plan.CrawlPlan; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.service.module.DatabaseModule; diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index be05fd8c..47ec6f59 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -18,6 +18,8 @@ application { tasks.distZip.enabled = false dependencies { + implementation project(':code:common:process') + implementation project(':code:api:index-api') implementation project(':code:common:model') implementation project(':code:common:config') @@ -26,7 +28,6 @@ dependencies { implementation project(':code:common:service-client') implementation project(':code:features-index:lexicon') implementation project(':code:features-index:index-journal') - implementation project(':code:features-crawl:work-log') implementation project(':code:libraries:language-processing') testImplementation project(':code:services-core:search-service') @@ -35,7 +36,6 @@ dependencies { implementation project(':code:process-models:converting-model') implementation project(':code:features-convert:keyword-extraction') - implementation project(':code:features-crawl:crawl-plan') implementation libs.lombok diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 4cac1f9d..eb04a06b 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -5,9 +5,9 @@ import com.google.inject.Inject; import com.google.inject.Injector; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.work_log.WorkLog; -import nu.marginalia.crawl_plan.CrawlPlanLoader; -import nu.marginalia.crawl_plan.CrawlPlan; +import nu.marginalia.process.log.WorkLog; +import plan.CrawlPlanLoader; +import plan.CrawlPlan; import nu.marginalia.loading.loader.IndexLoadKeywords; import nu.marginalia.loading.loader.Loader; import nu.marginalia.loading.loader.LoaderFactory; @@ -79,6 +79,7 @@ public class LoaderMain { stmt.execute("TRUNCATE TABLE EC_PAGE_DATA"); stmt.execute("TRUNCATE TABLE EC_URL"); stmt.execute("TRUNCATE TABLE EC_DOMAIN_LINK"); + stmt.execute("TRUNCATE TABLE DOMAIN_METADATA"); stmt.execute("SET FOREIGN_KEY_CHECKS = 1"); } catch (SQLException ex) { diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index 7e5f27c5..fe8c022e 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -5,7 +5,7 @@ import com.google.inject.AbstractModule; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; -import nu.marginalia.crawl_plan.CrawlPlan; +import plan.CrawlPlan; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index cd239289..dd627f85 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -2,7 +2,7 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.EdgeId; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/KeywordListChunker.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/KeywordListChunker.java index 23da66ec..a95e2342 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/KeywordListChunker.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/KeywordListChunker.java @@ -1,7 +1,7 @@ package nu.marginalia.loading.loader; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import java.util.ArrayList; import java.util.Collections; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 8bb2c1b0..66eea626 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -1,6 +1,6 @@ package nu.marginalia.loading.loader; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; @@ -21,6 +21,8 @@ public class Loader implements Interpreter { private final SqlLoadDomainLinks sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final SqlLoadDomainMetadata sqlLoadDomainMetadata; + private final IndexLoadKeywords indexLoadKeywords; private static final Logger logger = LoggerFactory.getLogger(Loader.class); @@ -39,6 +41,7 @@ public class Loader implements Interpreter { SqlLoadDomainLinks sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, SqlLoadProcessedDocument sqlLoadProcessedDocument, + SqlLoadDomainMetadata sqlLoadDomainMetadata, IndexLoadKeywords indexLoadKeywords) { data = new LoaderData(sizeHint); @@ -48,6 +51,7 @@ public class Loader implements Interpreter { this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; this.sqlLoadProcessedDocument = sqlLoadProcessedDocument; + this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; this.indexLoadKeywords = indexLoadKeywords; processedDocumentList = new ArrayList<>(sizeHint); @@ -128,6 +132,11 @@ public class Loader implements Interpreter { sqlLoadProcessedDomain.loadAlias(data, link); } + @Override + public void loadDomainMetadata(EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { + sqlLoadDomainMetadata.load(data, domain, knownUrls, goodUrls, visitedUrls); + } + public void finish() { // Some work needs to be processed out of order for the database relations to work out diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java index d4a24a9b..21435ac0 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java @@ -8,6 +8,7 @@ public class LoaderFactory { private final SqlLoadDomainLinks sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final SqlLoadDomainMetadata sqlLoadDomainMetadata; private final IndexLoadKeywords indexLoadKeywords; @Inject @@ -16,6 +17,7 @@ public class LoaderFactory { SqlLoadDomainLinks sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, SqlLoadProcessedDocument sqlLoadProcessedDocument, + SqlLoadDomainMetadata sqlLoadDomainMetadata, IndexLoadKeywords indexLoadKeywords) { this.sqlLoadUrls = sqlLoadUrls; @@ -23,10 +25,11 @@ public class LoaderFactory { this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; this.sqlLoadProcessedDocument = sqlLoadProcessedDocument; + this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; this.indexLoadKeywords = indexLoadKeywords; } public Loader create(int sizeHint) { - return new Loader(sizeHint, sqlLoadUrls, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, 
sqlLoadProcessedDocument, indexLoadKeywords); + return new Loader(sizeHint, sqlLoadUrls, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 724aa9cf..49cbd402 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -8,7 +8,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.keyword_extraction.model.DocumentKeywords; +import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java new file mode 100644 index 00000000..e276e40f --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadDomainMetadata.java @@ -0,0 +1,38 @@ +package nu.marginalia.loading.loader; + +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.inject.Inject; +import java.sql.SQLException; + +public class SqlLoadDomainMetadata { + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public SqlLoadDomainMetadata(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void load(LoaderData data, EdgeDomain domain, int knownUrls, int goodUrls, int visitedUrls) { + int domainId = data.getDomainId(domain); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,VISITED_URLS,GOOD_URLS) VALUES (?, ?, ?, ?) 
+ """ + )) + { + stmt.setInt(1, domainId); + stmt.setInt(2, knownUrls); + stmt.setInt(3, visitedUrls); + stmt.setInt(4, goodUrls); + stmt.executeUpdate(); + } catch (SQLException ex) { + logger.warn("SQL error inserting domains", ex); + } + } +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java new file mode 100644 index 00000000..21c56884 --- /dev/null +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/db/DbUpdateRanks.java @@ -0,0 +1,54 @@ +package nu.marginalia.index.db; + +import com.zaxxer.hikari.HikariDataSource; +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.inject.Inject; +import javax.inject.Singleton; +import java.sql.SQLException; + +@Singleton +public class DbUpdateRanks { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final HikariDataSource dataSource; + + @Inject + public DbUpdateRanks(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void execute(Int2IntOpenHashMap ranks) { + try (var conn = dataSource.getConnection(); + var resetStmt = conn.createStatement(); + var updateStmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) { + + resetStmt.executeUpdate("UPDATE EC_DOMAIN SET RANK=1"); + + int rankMax = ranks.size(); + int i = 0; + + for (var iter = ranks.int2IntEntrySet().fastIterator(); iter.hasNext(); i++) { + var entry = iter.next(); + + updateStmt.setDouble(1, entry.getIntValue() / (double) rankMax); + updateStmt.setInt(2, entry.getIntKey()); + updateStmt.addBatch(); + + if (i > 100) { + updateStmt.executeBatch(); + i = 0; + } + } + if (i > 0) { + updateStmt.executeBatch(); + } + } + catch (SQLException ex) { + logger.info("Failed to update ranks"); + } + } + + +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 11329c08..14e31fff 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -20,6 +20,10 @@ import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.LongPredicate; +/** This class delegates SearchIndexReader and deals with the stateful nature of the index, + * i.e. it may be possible to reconstruct the index and load a new set of data. 
+ * + */ @Singleton public class SearchIndex { @@ -131,11 +135,6 @@ public class SearchIndex { ); } - - public IndexQuery getDomainQuery(int wordId, IndexResultDomainDeduplicator localFilter) { - throw new UnsupportedOperationException(""); // TBI - } - /** Replaces the values of ids with their associated metadata, or 0L if absent */ public long[] getTermMetadata(int termId, long[] docs) { return indexReader.getMetadata(termId, docs); @@ -148,4 +147,12 @@ public class SearchIndex { public int getDomainId(long docId) { return indexReader.getDomainId(docId); } + + public int getTotalDocCount() { + return indexReader.totalDocCount(); + } + + public int getTermFrequency(int id) { + return (int) indexReader.numHits(id); + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index 7836e92c..792257af 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -86,4 +86,7 @@ public class SearchIndexReader { return forwardIndexReader.getDomainId(docId); } + public int totalDocCount() { + return forwardIndexReader.totalDocCount(); + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index af6f100d..d570d20a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -1,14 +1,25 @@ package nu.marginalia.index.results; import com.google.inject.Inject; +import gnu.trove.map.hash.TObjectIntHashMap; +import gnu.trove.set.hash.TLongHashSet; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; +import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.index.svc.SearchTermsService; + +import java.util.List; +import java.util.OptionalInt; public class IndexMetadataService { private final SearchIndex index; + private final SearchTermsService searchTermsService; @Inject - public IndexMetadataService(SearchIndex index) { + public IndexMetadataService(SearchIndex index, SearchTermsService searchTermsService) { this.index = index; + this.searchTermsService = searchTermsService; } public long getDocumentMetadata(long urlId) { @@ -23,4 +34,95 @@ public class IndexMetadataService { return index.getTermMetadata(termId, docIdsAll); } + public TermMetadata getTermMetadata(long[] docIdsAll, int[] termIdsList) { + var termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsList.length, 0.5f); + + for (int term : termIdsList) { + var metadata = getTermMetadata(term, docIdsAll); + + for (int i = 0; i < docIdsAll.length; i++) { + termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]); + } + } + + return new TermMetadata(termdocToMeta); + } + + public QuerySearchTerms getSearchTerms(List searchTermVariants) { + + IntArrayList termIdsList = new IntArrayList(); + + TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); + + for (var subquery : searchTermVariants) { + for (var term : subquery.searchTermsInclude) { + if 
(termToId.containsKey(term)) { + continue; + } + + var id = searchTermsService.lookUpWord(term); + if (id.isPresent()) { + termIdsList.add(id.getAsInt()); + termToId.put(term, id.getAsInt()); + } + } + } + + return new QuerySearchTerms(termToId, termIdsList.toIntArray()); + } + + public TLongHashSet getResultsWithPriorityTerms(List subqueries, long[] resultsArray) { + int[] priorityTermIds = + subqueries.stream() + .flatMap(sq -> sq.searchTermsPriority.stream()) + .distinct() + .map(searchTermsService::lookUpWord) + .filter(OptionalInt::isPresent) + .mapToInt(OptionalInt::getAsInt) + .toArray(); + + var ret = new TLongHashSet(resultsArray.length); + + for (int priorityTerm : priorityTermIds) { + long[] metadata = getTermMetadata(priorityTerm, resultsArray); + for (int i = 0; i < metadata.length; i++) { + if (metadata[i] != 0) ret.add(resultsArray[i]); + } + } + + return ret; + + + } + + public static class TermMetadata { + private final Long2LongOpenHashMap termdocToMeta; + + public TermMetadata(Long2LongOpenHashMap termdocToMeta) { + this.termdocToMeta = termdocToMeta; + } + + public long getTermMetadata(int termId, long docId) { + return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); + } + + } + + public static class QuerySearchTerms { + private final TObjectIntHashMap termToId; + public final int[] termIdsAll; + + public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll) { + this.termToId = termToId; + this.termIdsAll = termIdsAll; + } + + public int get(String searchTerm) { + return termToId.get(searchTerm); + } + } + + private static long termdocKey(int termId, long docId) { + return (docId << 32) | termId; + } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java index 265ba5dc..5dd1c9aa 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultDomainDeduplicator.java @@ -12,15 +12,6 @@ public class IndexResultDomainDeduplicator { this.limitByDomain = limitByDomain; } - public boolean test(long value) { - int ranking = (int) (value >>> 32); - if (ranking == Integer.MAX_VALUE) { - return true; - } - - return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; - } - public boolean test(SearchResultItem item) { final long key = item.deduplicationKey(); if (key == 0) diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 63abbcf7..a577f038 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -1,11 +1,8 @@ package nu.marginalia.index.results; import gnu.trove.list.TLongList; -import gnu.trove.map.hash.TObjectIntHashMap; import gnu.trove.set.hash.TLongHashSet; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; -import nu.marginalia.index.svc.SearchTermsService; +import nu.marginalia.index.client.model.results.SearchResultPreliminaryScore; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; import 
nu.marginalia.index.query.limit.QueryStrategy; @@ -15,63 +12,38 @@ import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.query.IndexQueryParams; import java.util.List; -import java.util.OptionalInt; public class IndexResultValuator { private final IndexMetadataService metadataService; private final List> searchTermVariants; private final IndexQueryParams queryParams; - private final int[] termIdsAll; - private final TLongHashSet resultsWithPriorityTerms; - private final TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); - private final TermMetadata termMetadata; + private final IndexMetadataService.TermMetadata termMetadata; + private final IndexMetadataService.QuerySearchTerms searchTerms; - public IndexResultValuator(SearchTermsService searchTermsSvc, - IndexMetadataService metadataService, + public IndexResultValuator(IndexMetadataService metadataService, TLongList results, List subqueries, - IndexQueryParams queryParams) { + IndexQueryParams queryParams + ) { + + final long[] resultsArray = results.toArray(); + this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.queryParams = queryParams; this.metadataService = metadataService; - IntArrayList termIdsList = new IntArrayList(); - - searchTermVariants.stream().flatMap(List::stream).distinct().forEach(term -> { - searchTermsSvc.lookUpWord(term).ifPresent(id -> { - termIdsList.add(id); - termToId.put(term, id); - }); - }); - - final long[] resultsArray = results.toArray(); - - termIdsAll = termIdsList.toArray(new int[0]); - termMetadata = new TermMetadata(resultsArray, termIdsAll); - - int[] priorityTermIds = - subqueries.stream() - .flatMap(sq -> sq.searchTermsPriority.stream()) - .distinct() - .map(searchTermsSvc::lookUpWord) - .filter(OptionalInt::isPresent) - .mapToInt(OptionalInt::getAsInt) - .toArray(); - - resultsWithPriorityTerms = new TLongHashSet(results.size()); - for (int priorityTerm : priorityTermIds) { - long[] metadata = metadataService.getTermMetadata(priorityTerm, resultsArray); - for (int i = 0; i < metadata.length; i++) { - if (metadata[i] != 0) resultsWithPriorityTerms.add(resultsArray[i]); - } - } - + this.searchTerms = metadataService.getSearchTerms(subqueries); + this.termMetadata = metadataService.getTermMetadata(results.toArray(), searchTerms.termIdsAll); + resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, resultsArray); } - public SearchResultItem evaluateResult(long id) { + private final int flagsFilterMask = + WordFlags.Title.asBit() | WordFlags.NamesWords.asBit() | WordFlags.Subjects.asBit() | WordFlags.TfIdfHigh.asBit(); + + public SearchResultItem calculatePreliminaryScore(long id) { SearchResultItem searchResult = new SearchResultItem(id); final long urlIdInt = searchResult.getUrlIdInt(); @@ -80,160 +52,113 @@ public class IndexResultValuator { long docMetadata = metadataService.getDocumentMetadata(urlIdInt); - double bestScore = 1000; + int maxPosCount = 0; + int maxBitMask = 0; + int maxFlagsCount = 0; + boolean hasSingleTermMatch = false; + for (int querySetId = 0; querySetId < searchTermVariants.size(); querySetId++) { - bestScore = Math.min(bestScore, - evaluateSubquery(searchResult, - docMetadata, - querySetId, - searchTermVariants.get(querySetId)) - ); + + var termList = searchTermVariants.get(querySetId); + + SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()]; + + for (int termIdx = 0; termIdx < termList.size(); termIdx++) 
{ + String searchTerm = termList.get(termIdx); + + long metadata = termMetadata.getTermMetadata( + searchTerms.get(searchTerm), + searchResult.getUrlIdInt() + ); + + var score = new SearchResultKeywordScore( + querySetId, + searchTerm, + metadata, + docMetadata, + resultsWithPriorityTerms.contains(searchResult.combinedId) + ); + + searchResult.keywordScores.add(score); + + termScoresForSet[termIdx] = score; + } + + if (!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) { + continue; + } + + int minFlagsCount = 8; + int minPosCount = 1000; + int cominedBitMask = ~0; + + for (var termScore : termScoresForSet) { + final int positionCount = Integer.bitCount(termScore.positions()); + final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask); + + minPosCount = Math.min(minPosCount, positionCount); + minFlagsCount = Math.min(minFlagsCount, flagCount); + cominedBitMask &= termScore.positions(); + } + + final int combinedBitmaskBitCount = Integer.bitCount(cominedBitMask); + + // Calculate the highest value (overall) of the lowest value (per set) of these search result importance measures + maxBitMask = Math.max(maxBitMask, combinedBitmaskBitCount); + maxPosCount = Math.max(maxPosCount, minPosCount); + maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount); + + hasSingleTermMatch |= (termScoresForSet.length == 1 && minPosCount != 0); } - if (resultsWithPriorityTerms.contains(id)) { - bestScore -= 50; - } + final boolean hasPriorityTerm = resultsWithPriorityTerms.contains(id); - searchResult.setScore(bestScore); + searchResult.setScore(new SearchResultPreliminaryScore( + hasSingleTermMatch, + hasPriorityTerm, + maxFlagsCount, + Math.min(4, maxPosCount), + Math.min(4, maxBitMask) + )); return searchResult; } - private double evaluateSubquery(SearchResultItem searchResult, - long docMetadata, - int querySetId, - List termList) - { - double setScore = 0; - int setSize = 0; + private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.AUTO || + queryStrategy == QueryStrategy.SENTENCE || + queryStrategy == QueryStrategy.TOPIC) { + return true; + } - for (int termIdx = 0; termIdx < termList.size(); termIdx++) { - String searchTerm = termList.get(termIdx); - - final int termId = termToId.get(searchTerm); - - long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt()); - - SearchResultKeywordScore score = new SearchResultKeywordScore( - querySetId, - searchTerm, - metadata, - docMetadata, - resultsWithPriorityTerms.contains(searchResult.combinedId) - ); - - searchResult.scores.add(score); - - setScore += score.termValue(); - - if (!filterRequired(metadata, queryParams.queryStrategy())) { - return 1000; + for (var keyword : termSet) { + if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) { + return false; } - - if (termIdx == 0) { - setScore += score.documentValue(); - } - - setSize++; - } - - setScore += calculateTermCoherencePenalty(searchResult.getUrlIdInt(), termToId, termList); - - return setScore/setSize; - } - - private boolean filterRequired(long metadata, QueryStrategy queryStrategy) { - if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordFlags.Site.isPresent(metadata); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordFlags.Subjects.isPresent(metadata); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return 
WordFlags.Title.isPresent(metadata); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordFlags.UrlPath.isPresent(metadata); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordFlags.UrlDomain.isPresent(metadata); } return true; } - private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { - long maskDirectGenerous = ~0; - long maskDirectRaw = ~0; - long maskAdjacent = ~0; - - final int flagBitMask = WordFlags.Title.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.Synthetic.asBit(); - - int termCount = 0; - double tfIdfSum = 1.; - - for (String term : termList) { - var meta = termMetadata.getTermMetadata(termToId.get(term), urlId); - long positions; - - if (meta == 0) { - return 1000; - } - - positions = WordMetadata.decodePositions(meta); - - maskDirectRaw &= positions; - - if (positions != 0 && !WordMetadata.hasAnyFlags(meta, flagBitMask)) { - maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); - maskDirectGenerous &= positions; - } - - termCount++; - tfIdfSum += WordMetadata.decodeTfidf(meta); + private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit()); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit()); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit()); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit()); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { + return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); } - double avgTfIdf = termCount / tfIdfSum; - - if (maskAdjacent == 0) { - return Math.min(5, Math.max(-2, 40 - 0.5 * avgTfIdf)); - } - - if (maskDirectGenerous == 0) { - return Math.min(5, Math.max(-1, 20 - 0.3 * avgTfIdf)); - } - - if (maskDirectRaw == 0) { - return Math.min(5, Math.max(-1, 15 - 0.2 * avgTfIdf)); - } - - return Long.numberOfTrailingZeros(maskDirectGenerous)/5. 
- Long.bitCount(maskDirectGenerous); + return true; } - class TermMetadata { - private final Long2LongOpenHashMap termdocToMeta; - - public TermMetadata(long[] docIdsAll, int[] termIdsList) { - termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsAll.length, 0.5f); - - for (int term : termIdsList) { - var metadata = metadataService.getTermMetadata(term, docIdsAll); - for (int i = 0; i < docIdsAll.length; i++) { - termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]); - } - } - - } - - public long getTermMetadata(int termId, long docId) { - return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); - } - } - - private long termdocKey(int termId, long docId) { - return (docId << 32) | termId; - } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index eaebeee7..2c0a23ff 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -5,14 +5,14 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; -import gnu.trove.set.hash.TLongHashSet; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; import io.prometheus.client.Histogram; +import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.results.SearchResultItem; +import nu.marginalia.index.client.model.results.SearchResultRankingContext; import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.index.client.model.query.SearchSpecification; -import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.index.SearchIndexSearchTerms; @@ -21,8 +21,6 @@ import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.index.results.IndexResultValuator; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.results.IndexResultDomainDeduplicator; -import nu.marginalia.index.query.IndexQueryParams; -import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.svc.searchset.SmallSearchSet; import nu.marginalia.model.gson.GsonFactory; import org.slf4j.Logger; @@ -34,10 +32,7 @@ import spark.Request; import spark.Response; import spark.Spark; -import java.util.ArrayList; -import java.util.List; - -import static java.util.Comparator.comparingDouble; +import java.util.*; @Singleton public class IndexQueryService { @@ -46,7 +41,6 @@ public class IndexQueryService { private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); - private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register(); private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register(); @@ -79,7 +73,8 @@ public class IndexQueryService { return wmsa_edge_index_query_time.time(() -> { var params = new SearchParameters(specsSet, getSearchSet(specsSet)); - List results = executeSearch(params); + SearchResultSet 
results = executeSearch(params); + logger.info(queryMarker, "Index Result Count: {}", results.size()); wmsa_edge_index_query_cost.set(params.getDataCost()); @@ -87,7 +82,7 @@ public class IndexQueryService { wmsa_edge_index_query_timeouts.inc(); } - return new SearchResultSet(results); + return results; }); } catch (HaltException ex) { @@ -104,7 +99,7 @@ public class IndexQueryService { // exists for test access SearchResultSet justQuery(SearchSpecification specsSet) { - return new SearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)))); + return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))); } private SearchSet getSearchSet(SearchSpecification specsSet) { @@ -115,12 +110,28 @@ public class IndexQueryService { return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier); } - private List executeSearch(SearchParameters params) { + private SearchResultSet executeSearch(SearchParameters params) { var resultIds = evaluateSubqueries(params); var resultItems = calculateResultScores(params, resultIds); - return selectBestResults(params, resultItems); + var bestResults = selectBestResults(params, resultItems); + + return new SearchResultSet(bestResults, createRankingContext(params.subqueries)); + } + + /* This information is routed back up the search service in order to calculate BM-25 + * accurately */ + private SearchResultRankingContext createRankingContext(List subqueries) { + final Map termToId = searchTermsSvc.getAllIncludeTerms(subqueries); + + int totalDocCount = index.getTotalDocCount(); + + final Map termFrequencies = new HashMap<>(termToId); + + termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); + + return new SearchResultRankingContext(totalDocCount, termFrequencies); } private TLongList evaluateSubqueries(SearchParameters params) { @@ -130,8 +141,6 @@ public class IndexQueryService { for (var sq : params.subqueries) { final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(sq); - - if (searchTerms.isEmpty()) { continue; } @@ -176,31 +185,29 @@ public class IndexQueryService { return results; } - private ArrayList calculateResultScores(SearchParameters params, TLongList results) { + private ArrayList calculateResultScores(SearchParameters params, TLongList resultIds) { - final var evaluator = new IndexResultValuator( - searchTermsSvc, - metadataService, - results, - params.subqueries, - params.queryParams); + final var evaluator = new IndexResultValuator(metadataService, resultIds, params.subqueries, params.queryParams); - ArrayList items = new ArrayList<>(results.size()); + ArrayList items = new ArrayList<>(resultIds.size()); - // Sorting the result ids results in better paging characteristics - results.sort(); + // Note, this is a pre-sorting the result IDs. 
This is a performance optimization, as it will cluster + // disk access to adjacent parts of the forward index when fetching metadata + // + // This is *not* where the actual search results are sorted + resultIds.sort(); - results.forEach(id -> { - var item = evaluator.evaluateResult(id); + resultIds.forEach(id -> { + var item = evaluator.calculatePreliminaryScore(id); - if (item.getScore() < 100) { + if (!item.getScore().isEmpty()) { items.add(item); } return true; }); - logger.info(queryMarker, "After filtering: {} -> {}", results.size(), items.size()); + logger.info(queryMarker, "After filtering: {} -> {}", resultIds.size(), items.size()); return items; @@ -210,7 +217,7 @@ public class IndexQueryService { var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); - results.sort(comparingDouble(SearchResultItem::getScore) + results.sort(Comparator.comparing(SearchResultItem::getScore).reversed() .thenComparingInt(SearchResultItem::getRanking) .thenComparingInt(SearchResultItem::getUrlIdInt)); @@ -240,58 +247,3 @@ public class IndexQueryService { } -class SearchParameters { - /** This is how many results matching the keywords we'll try to get - before evaluating them for the best result. */ - final int fetchSize; - final IndexSearchBudget budget; - final List subqueries; - final IndexQueryParams queryParams; - - final int limitByDomain; - final int limitTotal; - - // mutable: - - /** An estimate of how much data has been read */ - long dataCost = 0; - - /** A set of id:s considered during each subquery, - * for deduplication - */ - final TLongHashSet consideredUrlIds; - - public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) { - var limits = specsSet.queryLimits; - - this.fetchSize = limits.fetchSize(); - this.budget = new IndexSearchBudget(limits.timeoutMs()); - this.subqueries = specsSet.subqueries; - this.limitByDomain = limits.resultsByDomain(); - this.limitTotal = limits.resultsTotal(); - - this.consideredUrlIds = new TLongHashSet(fetchSize * 4); - - queryParams = new IndexQueryParams( - specsSet.quality, - specsSet.year, - specsSet.size, - specsSet.rank, - searchSet, - specsSet.queryStrategy); - } - - IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) { - return index.createQuery(terms, queryParams, consideredUrlIds::add); - } - - boolean hasTimeLeft() { - return budget.hasTimeLeft(); - } - - long getDataCost() { - return dataCost; - } - -} - diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java index ecb8bdba..512c735e 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexSearchSetsService.java @@ -7,7 +7,6 @@ import nu.marginalia.index.IndexServicesFactory; import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.ranking.ReversePageRank; import nu.marginalia.ranking.StandardPageRank; -import nu.marginalia.ranking.accumulator.RankingResultBitSetAccumulator; import nu.marginalia.ranking.accumulator.RankingResultHashMapAccumulator; import nu.marginalia.ranking.accumulator.RankingResultHashSetAccumulator; import nu.marginalia.ranking.data.RankingDomainFetcher; @@ -17,6 +16,7 @@ import nu.marginalia.index.svc.searchset.SearchSetAny; import nu.marginalia.index.config.RankingSettings; import 
nu.marginalia.ranking.DomainRankings; import nu.marginalia.index.client.model.query.SearchSetIdentifier; +import nu.marginalia.index.db.DbUpdateRanks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,6 +26,7 @@ import java.io.IOException; public class IndexSearchSetsService { private final Logger logger = LoggerFactory.getLogger(getClass()); private final RankingDomainFetcher rankingDomains; + private final DbUpdateRanks dbUpdateRanks; private final RankingDomainFetcher similarityDomains; private final RankingSettings rankingSettings; @@ -43,9 +44,11 @@ public class IndexSearchSetsService { public IndexSearchSetsService(RankingDomainFetcher rankingDomains, RankingDomainFetcherForSimilarityData similarityDomains, RankingSettings rankingSettings, - IndexServicesFactory servicesFactory) throws IOException { + IndexServicesFactory servicesFactory, + DbUpdateRanks dbUpdateRanks) throws IOException { this.rankingDomains = rankingDomains; + this.dbUpdateRanks = dbUpdateRanks; if (similarityDomains.hasData()) { this.similarityDomains = similarityDomains; @@ -95,6 +98,10 @@ public class IndexSearchSetsService { synchronized (this) { domainRankings = new DomainRankings(ranks); } + + // The EC_DOMAIN table has a field that reflects the rank, this needs to be set for search result ordering to + // make sense + dbUpdateRanks.execute(ranks); } @SneakyThrows diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java new file mode 100644 index 00000000..846f52c0 --- /dev/null +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchParameters.java @@ -0,0 +1,73 @@ +package nu.marginalia.index.svc; + +import gnu.trove.set.hash.TLongHashSet; +import nu.marginalia.index.client.model.query.SearchSpecification; +import nu.marginalia.index.client.model.query.SearchSubquery; +import nu.marginalia.index.index.SearchIndex; +import nu.marginalia.index.index.SearchIndexSearchTerms; +import nu.marginalia.index.query.IndexQuery; +import nu.marginalia.index.query.IndexQueryParams; +import nu.marginalia.index.query.IndexSearchBudget; +import nu.marginalia.index.searchset.SearchSet; + +import java.util.List; + +public class SearchParameters { + /** + * This is how many results matching the keywords we'll try to get + * before evaluating them for the best result. 
+ */ + final int fetchSize; + final IndexSearchBudget budget; + final List subqueries; + final IndexQueryParams queryParams; + + final int limitByDomain; + final int limitTotal; + + // mutable: + + /** + * An estimate of how much data has been read + */ + long dataCost = 0; + + /** + * A set of id:s considered during each subquery, + * for deduplication + */ + final TLongHashSet consideredUrlIds; + + public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) { + var limits = specsSet.queryLimits; + + this.fetchSize = limits.fetchSize(); + this.budget = new IndexSearchBudget(limits.timeoutMs()); + this.subqueries = specsSet.subqueries; + this.limitByDomain = limits.resultsByDomain(); + this.limitTotal = limits.resultsTotal(); + + this.consideredUrlIds = new TLongHashSet(fetchSize * 4); + + queryParams = new IndexQueryParams( + specsSet.quality, + specsSet.year, + specsSet.size, + specsSet.rank, + searchSet, + specsSet.queryStrategy); + } + + IndexQuery createIndexQuery(SearchIndex index, SearchIndexSearchTerms terms) { + return index.createQuery(terms, queryParams, consideredUrlIds::add); + } + + boolean hasTimeLeft() { + return budget.hasTimeLeft(); + } + + long getDataCost() { + return dataCost; + } + +} diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index 886bede5..d82a8c42 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -11,6 +11,9 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.OptionalInt; @Singleton @@ -37,6 +40,7 @@ public class SearchTermsService { includes.add(word.getAsInt()); } + for (var advice : request.searchTermsAdvice) { var word = lookUpWord(advice); if (word.isEmpty()) { @@ -64,4 +68,16 @@ public class SearchTermsService { } return OptionalInt.of(ret); } + + public Map getAllIncludeTerms(List subqueries) { + Map ret = new HashMap<>(); + + for (var subquery : subqueries) { + for (var include : subquery.searchTermsInclude) { + ret.computeIfAbsent(include, term -> lookUpWord(term).orElse(-1)); + } + } + + return ret; + } } diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle index 5715a8ff..56f59688 100644 --- a/code/services-core/search-service/build.gradle +++ b/code/services-core/search-service/build.gradle @@ -42,6 +42,7 @@ dependencies { implementation project(':code:features-search:screenshots') implementation project(':code:features-search:random-websites') implementation project(':code:features-search:query-parser') + implementation project(':code:features-search:result-ranking') implementation libs.lombok diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java index ecb21502..096b623d 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/SearchService.java @@ -25,7 +25,7 @@ import java.nio.charset.StandardCharsets; public class SearchService extends Service { private 
final WebsiteUrl websiteUrl; - private StaticResources staticResources; + private final StaticResources staticResources; private static final Logger logger = LoggerFactory.getLogger(SearchService.class); @@ -71,7 +71,7 @@ public class SearchService extends Service { Spark.exception(Exception.class, (e,p,q) -> { logger.error("Error during processing", e); - errorPageService.serveError(Context.fromRequest(p), q); + errorPageService.serveError(Context.fromRequest(p), p, q); }); Spark.awaitInitialization(); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java index 0ec971ef..3ce45b46 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/SearchProfile.java @@ -54,13 +54,13 @@ public enum SearchProfile { subquery.searchTermsPriority.add("js:false"); } if (this == PLAIN_TEXT) { - subquery.searchTermsInclude.add("format:plain"); + subquery.searchTermsAdvice.add("format:plain"); } if (this == FOOD) { - subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); + subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); } if (this == CRAFTS) { - subquery.searchTermsInclude.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); + subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_CRAFTS.getKeyword()); } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java index e1425ffd..d7ccde8f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java @@ -21,7 +21,6 @@ import nu.marginalia.search.model.SearchProfile; import nu.marginalia.search.query.model.SearchQuery; import nu.marginalia.search.query.model.UserSearchParameters; import nu.marginalia.language.WordPatterns; -import nu.marginalia.search.valuation.SearchResultValuator; import org.eclipse.jetty.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,9 +31,7 @@ import java.util.*; @Singleton public class QueryFactory { - private final EnglishDictionary englishDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); - private final SearchResultValuator searchResultValuator; private final DbNearDomainsQuery dbNearDomainsQuery; private static final int RETAIN_QUERY_VARIANT_COUNT = 5; @@ -48,11 +45,7 @@ public class QueryFactory { TermFrequencyDict dict, EnglishDictionary englishDictionary, NGramBloomFilter nGramBloomFilter, - SearchResultValuator searchResultValuator, DbNearDomainsQuery dbNearDomainsQuery) { - - this.englishDictionary = englishDictionary; - this.searchResultValuator = searchResultValuator; this.dbNearDomainsQuery = dbNearDomainsQuery; this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); @@ -70,11 +63,9 @@ public class QueryFactory { final var processedQuery = createQuery(getQueryPermutation(), params); final List subqueries = processedQuery.specs.subqueries; - for (var sq : subqueries) { - sq.setValue(searchResultValuator.preEvaluate(sq)); - } + // There used to be a piece of logic here that would try to figure out which one of these 
subqueries were the "best", + // it's gone for the moment, but it would be neat if it resurrected somehow - subqueries.sort(Comparator.comparing(SearchSubquery::getValue)); trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); return processedQuery; @@ -238,7 +229,7 @@ public class QueryFactory { int domainLimit; if (domain != null) { - domainLimit = 100; + domainLimit = 1000; } else { domainLimit = 2; } @@ -315,9 +306,6 @@ public class QueryFactory { if (!word.contains("_") && word.length() >= WordPatterns.MAX_WORD_LENGTH) { problems.add("Search term \"" + term.displayStr + "\" too long"); } - if (!word.contains("_") && !WordPatterns.wordPattern.matcher(word.replaceAll("[_:]","")).matches()) { - problems.add("The term \"" + term.displayStr + "\" contains characters that are not currently supported"); - } } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java index 7b1907b1..bc703fb2 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/results/SearchResultDecorator.java @@ -5,13 +5,16 @@ import gnu.trove.list.array.TIntArrayList; import gnu.trove.map.hash.TIntObjectHashMap; import it.unimi.dsi.fastutil.ints.Int2IntArrayMap; import nu.marginalia.bbpc.BrailleBlockPunchCards; +import nu.marginalia.index.client.model.results.SearchResultRankingContext; +import nu.marginalia.index.client.model.results.SearchResultSet; +import nu.marginalia.ranking.ResultValuator; import nu.marginalia.search.db.DbUrlDetailsQuery; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.search.model.UrlDetails; -import nu.marginalia.search.valuation.SearchResultValuator; +import nu.marginalia.search.query.model.SearchQuery; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -20,19 +23,20 @@ import java.util.List; public class SearchResultDecorator { private final DbUrlDetailsQuery dbUrlDetailsQuery; - private final SearchResultValuator valuator; + private final ResultValuator valuator; private final Logger logger = LoggerFactory.getLogger(getClass()); @Inject - public SearchResultDecorator(DbUrlDetailsQuery dbUrlDetailsQuery, SearchResultValuator valuator) { + public SearchResultDecorator(DbUrlDetailsQuery dbUrlDetailsQuery, + ResultValuator valuator) { this.dbUrlDetailsQuery = dbUrlDetailsQuery; this.valuator = valuator; } - public List getAllUrlDetails(List resultItems) { - TIntObjectHashMap detailsById = new TIntObjectHashMap<>(resultItems.size()); + public List getAllUrlDetails(SearchResultSet resultSet) { + TIntObjectHashMap detailsById = new TIntObjectHashMap<>(resultSet.size()); - EdgeIdList idList = resultItems.stream() + EdgeIdList idList = resultSet.results.stream() .mapToInt(SearchResultItem::getUrlIdInt) .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); @@ -42,10 +46,10 @@ public class SearchResultDecorator { detailsById.put(val.id, val); } - List retList = new ArrayList<>(resultItems.size()); + List retList = new ArrayList<>(resultSet.size()); TIntArrayList missedIds = new TIntArrayList(); - for (var resultItem : resultItems) { + for (var resultItem : resultSet.results) { var rankingId = resultItem.getRanking(); var 
uid = resultItem.getUrlId().id(); @@ -59,7 +63,7 @@ public class SearchResultDecorator { details.rankingId = rankingId; details.resultsFromSameDomain = resultItem.resultsFromDomain; - details.termScore = calculateTermScore(resultItem, details); + details.termScore = calculateTermScore(resultItem, details, resultSet.rankingContext); details.positions = getPositionsString(resultItem); details.resultItem = resultItem; @@ -75,7 +79,7 @@ public class SearchResultDecorator { private String getPositionsString(SearchResultItem resultItem) { Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8); - for (var score : resultItem.scores) { + for (var score : resultItem.keywordScores) { if (!score.isKeywordRegular()) { continue; } @@ -95,10 +99,14 @@ public class SearchResultDecorator { return a | b; } - private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) { + private double calculateTermScore(SearchResultItem resultItem, UrlDetails details, SearchResultRankingContext rankingContext) { final double statePenalty = (details.domainState == DomainIndexingState.SPECIAL) ? 1.25 : 0; - final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length()); + + final double value = valuator.calculateSearchResultValue(resultItem.keywordScores, + details.words, + details.title.length(), + rankingContext); return value + statePenalty; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java index 28951265..2826b9f0 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchApiQueryService.java @@ -62,7 +62,7 @@ public class SearchApiQueryService { ApiSearchResult convert(UrlDetails url) { List> details = new ArrayList<>(); if (url.resultItem != null) { - var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery)); + var bySet = url.resultItem.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery)); outer: for (var entries : bySet.values()) { diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java index 5e1ea6ae..03ae6297 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchErrorPageService.java @@ -3,25 +3,36 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; import nu.marginalia.client.Context; import nu.marginalia.index.client.IndexClient; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import spark.Request; import spark.Response; +import java.io.IOException; +import java.util.Map; + public class SearchErrorPageService { private final IndexClient indexClient; private final Logger logger = LoggerFactory.getLogger(getClass()); + private final MustacheRenderer renderer; @Inject - public SearchErrorPageService(IndexClient indexClient) { + public SearchErrorPageService(IndexClient indexClient, + RendererFactory rendererFactory) throws IOException { + + 
renderer = rendererFactory.renderer("search/error-page-search"); + this.indexClient = indexClient; } - public void serveError(Context ctx, Response rsp) { + public void serveError(Context ctx, Request request, Response rsp) { boolean isIndexUp = indexClient.isAlive(); try { if (!isIndexUp) { - rsp.body(renderError("The index is down", + rsp.body(renderError(request, "The index is down", """ The search index server appears to be down.

@@ -30,14 +41,14 @@ public class SearchErrorPageService { searches can't be served. """)); } else if (indexClient.isBlocked(ctx).blockingFirst()) { - rsp.body(renderError("The index is starting up", + rsp.body(renderError(request, "The index is starting up", """ The search index server appears to be in the process of starting up. This typically takes a few minutes. Be patient. """)); } else { - rsp.body(renderError("Error processing request", + rsp.body(renderError(request, "Error processing request", """ The search index appears to be up and running, so the problem may be related to some wider general error, or pertain to an error handling your query. @@ -46,81 +57,18 @@ public class SearchErrorPageService { } catch (Exception ex) { logger.warn("Error during rendering of error page", ex); - rsp.body(renderError("Error processing error", + rsp.body(renderError(request, "Error processing error", """ An error has occurred, additionally, an error occurred while handling that error -

- https://www.youtube.com/watch?v=dsx2vdn7gpY. - """)); } } - private String renderError(String title, String message) { - return """ - Error - """ - + title + - """ - """ - +message+ - """ - More Info - You may be able to find more information here: -
- """; + private String renderError(Request request, String title, String message) { + return renderer.render(Map.of("title", title, "message", message, + "profile", request.queryParamOrDefault("profile", ""), + "js", request.queryParamOrDefault("js", ""), + "query", request.queryParamOrDefault("query", "") + )); } } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 47a6a4f1..b73a76b3 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.model.results.SearchResultItem; import nu.marginalia.index.client.model.query.SearchSpecification; +import nu.marginalia.index.client.model.results.SearchResultSet; import nu.marginalia.search.model.PageScoreAdjustment; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.results.SearchResultDecorator; @@ -37,7 +38,7 @@ public class SearchQueryIndexService { } public List executeQuery(Context ctx, SearchQuery processedQuery) { - final List results = indexClient.query(ctx, processedQuery.specs); + final SearchResultSet results = indexClient.query(ctx, processedQuery.specs); List urlDetails = resultDecorator.getAllUrlDetails(results); diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java index 97151015..5d0e872f 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/svc/SearchQueryService.java @@ -61,7 +61,7 @@ public class SearchQueryService { } catch (Exception ex) { logger.error("Error", ex); - errorPageService.serveError(ctx, response); + errorPageService.serveError(ctx, request, response); } return ""; diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java deleted file mode 100644 index 045fd48f..00000000 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/valuation/SearchResultValuator.java +++ /dev/null @@ -1,359 +0,0 @@ -package nu.marginalia.search.valuation; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.index.client.model.results.SearchResultKeywordScore; -import nu.marginalia.index.client.model.query.SearchSubquery; -import nu.marginalia.language.WordPatterns; -import org.jetbrains.annotations.NotNull; - -import java.util.Arrays; -import java.util.EnumSet; -import java.util.Iterator; -import java.util.List; -import java.util.regex.Pattern; - -import static java.lang.Math.min; - -@Singleton -public class SearchResultValuator { - private final TermFrequencyDict dict; - - private static final Pattern separator = Pattern.compile("_"); - - private static final int MIN_LENGTH = 2000; - private static 
final int AVG_LENGTH = 5000; - private final int docCount; - - @Inject - public SearchResultValuator(TermFrequencyDict dict) { - this.dict = dict; - docCount = dict.docCount(); - } - - - public double preEvaluate(SearchSubquery sq) { - final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new); - - double termSum = 0.; - double factorSum = 0.; - - final double[] weights = getTermWeights(terms); - - for (int i = 0; i < terms.length; i++) { - final double factor = 1. / (1.0 + weights[i]); - - factorSum += factor; - termSum += factor; // fixme - - // This logic is the casualty of refactoring. It is intended to prioritize search queries - // according to sum-of-idf, but right now it uses many CPU cycles to always calculate the value 1. - } - - return termSum / factorSum; - } - - public double evaluateTerms(List rawScores, int length, int titleLength) { - int sets = 1 + rawScores.stream().mapToInt(SearchResultKeywordScore::subquery).max().orElse(0); - - double bestScore = 10; - double bestAllTermsFactor = 1.; - - final double priorityTermBonus; - - if (hasPriorityTerm(rawScores)) { - priorityTermBonus = 0.5; - } - else { - priorityTermBonus = 1; - } - - for (int set = 0; set <= sets; set++) { - SearchResultsKeywordSet keywordSet = createKeywordSet(rawScores, set); - - if (keywordSet == null) - continue; - - final double bm25Factor = getBM25(keywordSet, length); - - bestScore = min(bestScore, bm25Factor); - - bestAllTermsFactor = min(bestAllTermsFactor, getAllTermsFactorForSet(keywordSet, titleLength)); - - } - - return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus; - } - - private boolean hasPriorityTerm(List rawScores) { - return rawScores.stream() - .findAny() - .map(SearchResultKeywordScore::hasPriorityTerms) - .orElse(false); - } - - private double getBM25(SearchResultsKeywordSet keywordSet, int length) { - final double scalingFactor = 750.; - - final double wf1 = 0.7; - double k = 2; - - double sum = 0.; - - for (var keyword : keywordSet) { - double count = Math.min(255, Integer.bitCount(keyword.wordMetadata().positions())); - double wt = keyword.weight() * keyword.weight() / keywordSet.length(); - - final double invFreq = Math.log(1.0 + (docCount - wt + 0.5)/(wt + 0.5)); - - sum += invFreq * (count * (k + 1)) / (count + k * (1 - wf1 + wf1 * AVG_LENGTH/length)); - } - - return Math.sqrt(scalingFactor / sum); - } - - private double getAllTermsFactorForSet(SearchResultsKeywordSet set, int titleLength) { - double totalFactor = 1.; - - double totalWeight = 0; - for (var keyword : set) { - totalWeight += keyword.weight(); - } - - for (var keyword : set) { - totalFactor *= getAllTermsFactor(keyword, totalWeight, titleLength); - } - - if (set.keywords.length > 1) { - totalFactor = calculateTermCoherencePenalty(set, totalFactor); - } - else { - totalFactor = calculateSingleTermBonus(set, totalFactor); - } - - return totalFactor; - } - - private double calculateSingleTermBonus(SearchResultsKeywordSet set, double totalFactor) { - final var theKeyword = set.iterator().next(); - - final var wordMetadata = theKeyword.wordMetadata; - final int posCount = wordMetadata.positionCount(); - - if (wordMetadata.hasFlag(WordFlags.Title)) { - return totalFactor * 0.5; - } - else if (wordMetadata.hasFlag(WordFlags.Subjects)) { - return totalFactor * 0.6; - } - else if (wordMetadata.hasFlag(WordFlags.SiteAdjacent) && posCount > 0) { - return totalFactor * 0.65; - } - else if (wordMetadata.hasFlag(WordFlags.Site) && posCount > 0) { - return 
totalFactor * 0.7; - } - - if (wordMetadata.hasFlag(WordFlags.UrlDomain)) { - return totalFactor * 0.8; - } - else if (wordMetadata.hasFlag(WordFlags.UrlPath) && posCount > 2) - { - return totalFactor * 0.9; - } - - return totalFactor; - } - - private double calculateTermCoherencePenalty(SearchResultsKeywordSet keywordSet, double f) { - long maskDirect = ~0; - long maskAdjacent = ~0; - - byte excludeMask = (byte) (WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.Synthetic.asBit()); - - for (var keyword : keywordSet) { - var meta = keyword.wordMetadata; - long positions; - - if (meta.isEmpty()) { - return f; - } - - - positions = meta.positions(); - - maskAdjacent &= (positions | (positions << 1) | (positions >>> 1)); - if (positions != 0 && !WordMetadata.hasAnyFlags(meta.flags(), excludeMask)) - { - maskDirect &= positions; - } - } - - if (maskAdjacent == 0) { - return 2 * f; - } - - if (maskDirect == 0) { - return 1.25 * f; - } - - if (maskDirect != ~0L) { - double locationFactor = 0.5 + Math.max(0., - 0.5 * Long.numberOfTrailingZeros(maskDirect) / 16. - - Math.sqrt(Long.bitCount(maskDirect) - 1) / 3. - ); - - return f * locationFactor; - } - else { - return f; - } - } - - private double getAllTermsFactor(SearchResultsKeyword keyword, double totalWeight, int titleLength) { - double f = 1.; - - final double k = keyword.weight() / totalWeight; - - int posCount = keyword.wordMetadata.positionCount(); - - EnumSet flags = keyword.flags(); - - final boolean title = flags.contains(WordFlags.Title); - final boolean site = flags.contains(WordFlags.Site); - final boolean siteAdjacent = flags.contains(WordFlags.SiteAdjacent); - final boolean subject = flags.contains(WordFlags.Subjects); - final boolean names = flags.contains(WordFlags.NamesWords); - final boolean urlDomain = flags.contains(WordFlags.UrlDomain); - final boolean urlPath = flags.contains(WordFlags.UrlPath); - - if (title) { - if (titleLength <= 64) { - f *= Math.pow(0.5, k); - } - else if (titleLength < 96) { - f *= Math.pow(0.75, k); - } - else { // likely keyword stuffing if the title is this long - f *= Math.pow(0.9, k); - } - } - - if (posCount != 0) { - if (site) { - f *= Math.pow(0.75, k); - } else if (siteAdjacent) { - f *= Math.pow(0.8, k); - } - } - - if (subject) { - f *= Math.pow(0.8, k); - } - - if (urlDomain) { - f *= Math.pow(0.8, k); - } - else if (urlPath && posCount > 1) { - f *= Math.pow(0.9, k); - } - - if (!title && !subject && names) { - f *= Math.pow(0.9, k); - } - - return f; - } - - private double[] getTermWeights(SearchResultKeywordScore[] scores) { - double[] weights = new double[scores.length]; - - for (int i = 0; i < scores.length; i++) { - String[] parts = separator.split(scores[i].keyword); - double sumScore = 0.; - - int count = 0; - for (String part : parts) { - if (!WordPatterns.isStopWord(part)) { - sumScore += dict.getTermFreq(part); - count++; - } - } - if (count == 0) count = 1; - - weights[i] = Math.sqrt(sumScore)/count; - } - - return weights; - } - - - private double[] getTermWeights(String[] words) { - double[] weights = new double[words.length]; - - for (int i = 0; i < words.length; i++) { - String[] parts = separator.split(words[i]); - double sumScore = 0.; - - int count = 0; - for (String part : parts) { - if (!WordPatterns.isStopWord(part)) { - sumScore += dict.getTermFreq(part); - count++; - } - } - if (count == 0) count = 1; - - weights[i] = Math.sqrt(sumScore)/count; - } - - return weights; - } - - private SearchResultsKeywordSet createKeywordSet(List rawScores, int 
thisSet) { - SearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.subquery() == thisSet && !w.keyword.contains(":")).toArray(SearchResultKeywordScore[]::new); - if (scores.length == 0) { - return null; - } - final double[] weights = getTermWeights(scores); - - SearchResultsKeyword[] keywords = new SearchResultsKeyword[scores.length]; - for (int i = 0; i < scores.length; i++) { - keywords[i] = new SearchResultsKeyword(scores[i], weights[i]); - } - - return new SearchResultsKeywordSet(keywords); - - } - - - private record SearchResultsKeyword(SearchResultKeywordScore score, WordMetadata wordMetadata, double weight) { - public SearchResultsKeyword(SearchResultKeywordScore score, double weight) { - this(score, new WordMetadata(score.encodedWordMetadata()), weight); - } - - public int tfIdf() { - return wordMetadata.tfIdf(); - } - - public EnumSet flags() { - return wordMetadata.flagSet(); - } - } - - private record SearchResultsKeywordSet( - SearchResultsKeyword[] keywords) implements Iterable - { - @NotNull - @Override - public Iterator iterator() { - return Arrays.stream(keywords).iterator(); - } - - public int length() { - return keywords.length; - } - } -} diff --git a/code/services-core/search-service/src/main/resources/static/search/style-new.css b/code/services-core/search-service/src/main/resources/static/search/style-new.css index 090a7655..2c3488ad 100644 --- a/code/services-core/search-service/src/main/resources/static/search/style-new.css +++ b/code/services-core/search-service/src/main/resources/static/search/style-new.css @@ -157,16 +157,26 @@ ul.semantic-results a { margin-right: -1ch; margin-left: 1ch; } + .big .card { min-width: 40ch; } +.card.problems { + max-width: 40ch; +} + .card .info { flex-grow: 1; padding-left: 1ch; padding-right: 1ch; line-height: 1.6; } + +.card.problems .info { + margin-top: 1ch; +} + .card { flex-basis: 20ch; border: 2px #ccc; diff --git a/code/services-core/search-service/src/main/resources/templates/search/error-page-search.hdb b/code/services-core/search-service/src/main/resources/templates/search/error-page-search.hdb new file mode 100644 index 00000000..08bdf1e1 --- /dev/null +++ b/code/services-core/search-service/src/main/resources/templates/search/error-page-search.hdb @@ -0,0 +1,29 @@ + + + + + Marginalia Search - {{title}} + + + + + + + + + +{{>search/parts/search-header}} + +
+{{>search/parts/search-form}} + +
+
+

{{ title }}

+
{{{message}}}
+
+
+
+ +{{>search/parts/search-footer}} + diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java index 09fe52c3..1c36922f 100644 --- a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java +++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java @@ -9,7 +9,6 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.model.SearchProfile; import nu.marginalia.search.query.model.UserSearchParameters; -import nu.marginalia.search.valuation.SearchResultValuator; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -32,7 +31,6 @@ public class QueryFactoryTest { tfd, new EnglishDictionary(tfd), new NGramBloomFilter(lm), - new SearchResultValuator(tfd), null ); } diff --git a/settings.gradle b/settings.gradle index 6e014e19..2aec2434 100644 --- a/settings.gradle +++ b/settings.gradle @@ -23,16 +23,16 @@ include 'code:libraries:term-frequency-dict' include 'code:features-search:screenshots' include 'code:features-search:random-websites' include 'code:features-search:query-parser' +include 'code:features-search:result-ranking' include 'code:features-convert:adblock' include 'code:features-convert:pubdate' +include 'code:features-convert:summary-extraction' include 'code:features-convert:keyword-extraction' include 'code:features-convert:topic-detection' include 'code:features-crawl:crawl-blocklist' include 'code:features-crawl:link-parser' -include 'code:features-crawl:work-log' -include 'code:features-crawl:crawl-plan' include 'code:features-index:lexicon' include 'code:features-index:index-journal' @@ -51,6 +51,7 @@ include 'code:common:service' include 'code:common:config' include 'code:common:model' include 'code:common:renderer' +include 'code:common:process' include 'code:processes:crawl-job-extractor-process' @@ -68,6 +69,7 @@ include 'third-party:symspell' include 'third-party:rdrpostagger' include 'third-party:uppend' include 'third-party:openzim' +include 'third-party:count-min-sketch' include 'third-party:monkey-patch-opennlp' include 'other:memex' @@ -125,6 +127,8 @@ dependencyResolutionManagement { library('trove', 'net.sf.trove4j', 'trove4j').version('3.0.3') library('fastutil', 'it.unimi.dsi', 'fastutil').version('8.5.8') + library('hll', 'net.agkn', 'hll').version('1.6.0') + library('okhttp3','com.squareup.okhttp3','okhttp').version('4.10.0') library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15') diff --git a/third-party/README.md b/third-party/README.md index 577566bd..6c4b2b9f 100644 --- a/third-party/README.md +++ b/third-party/README.md @@ -10,7 +10,10 @@ or lack an artifact, or to override some default that is inappropriate for the t * [PorterStemmer](porterstemmer/) - LGPL3 * [Uppend](uppend/) - MIT * [OpenZIM](openzim/) - GPL-2.0 + +### Repackaged * [SymSpell](symspell/) - LGPL-3.0 +* [Count-Min-Sketch](count-min-sketch/) - Apache 2.0 ### Monkey Patched * [Stanford OpenNLP](monkey-patch-opennlp/) - Apache-2.0 diff --git a/third-party/count-min-sketch/build.gradle b/third-party/count-min-sketch/build.gradle new file mode 100644 index 00000000..de627417 --- /dev/null +++ b/third-party/count-min-sketch/build.gradle @@ -0,0 +1,16 @@ +plugins { + id 'java' +} + +java { + toolchain { + 
languageVersion.set(JavaLanguageVersion.of(17)) + } +} + +dependencies { +} + +test { + useJUnitPlatform() +} diff --git a/third-party/count-min-sketch/readme.md b/third-party/count-min-sketch/readme.md new file mode 100644 index 00000000..b3375be7 --- /dev/null +++ b/third-party/count-min-sketch/readme.md @@ -0,0 +1,5 @@ +# Count Min Sketch + +[Count-min-sketch](https://github.com/prasanthj/count-min-sketch/blob/master/src/main/java/com/github/prasanthj/cmsketch/CountMinSketch.java) - Apache-2.0 + +by prasanthj diff --git a/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/CountMinSketch.java b/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/CountMinSketch.java new file mode 100644 index 00000000..7822963b --- /dev/null +++ b/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/CountMinSketch.java @@ -0,0 +1,305 @@ +/** + * Copyright 2014 Prasanth Jayachandran + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.prasanthj.cmsketch; + +import java.nio.ByteBuffer; + +/** + * Count Min sketch is a probabilistic data structure for finding the frequency of events in a + * stream of data. The data structure accepts two parameters epsilon and delta: epsilon specifies + * the error in estimation and delta specifies the probability that the estimation is wrong (or the + * confidence interval). The default values are 1% estimation error (epsilon) and 99% confidence + * (1 - delta). Tuning these parameters increases or decreases the size of the count + * min sketch. The constructor also accepts width and depth parameters. The relationship between + * width and epsilon (error) is width = Math.ceil(Math.exp(1.0)/epsilon). In simpler terms, the + * smaller the error, the greater the width and hence the size of the count min sketch. + * The relationship between delta and depth is depth = Math.ceil(Math.log(1.0/delta)). In simpler + * terms, the greater the depth, the greater the confidence. + * The way it works is: to estimate the number of times a certain key is inserted (or appears in + * the stream), count min sketch uses pairwise independent hash functions to map the key to + * different locations in the count min sketch and increments the counters. + *

+ * For example, if width = 10 and depth = 4, let's assume the hashcodes + * for key "HELLO" using pairwise independent hash functions are 9812121, 6565512, 21312312, 8787008 + * respectively. Then the counters at the hashcode % width locations are incremented. + *

+ * 0 1 2 3 4 5 6 7 8 9 + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | + * --- --- --- --- --- --- --- --- --- --- + *

+ * Now, for a different key "WORLD", let the hashcodes be 23123123, 45354352, 8567453, 12312312. + * As we can see below, there is a collision for the 2nd hashcode. + *

+ * 0 1 2 3 4 5 6 7 8 9 + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | + * --- --- --- --- --- --- --- --- --- --- + * --- --- --- --- --- --- --- --- --- --- + * | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | + * --- --- --- --- --- --- --- --- --- --- + *

+ * Now, to get the estimated count for key "HELLO", the same process is repeated to find the + * value at each position, and the estimated count will be the minimum of all values (to account for + * hash collisions). + *

+ * estimatedCount("HELLO") = min(1, 2, 1, 1) + *

+ * so even if there are multiple hash collisions, the returned value will be the best estimate + * (upper bound) for the given key. The actual count can never be greater than this value. + */ +public class CountMinSketch { + // 1% estimation error with 1% probability (99% confidence) that the estimation breaks this limit + private static final float DEFAULT_DELTA = 0.01f; + private static final float DEFAULT_EPSILON = 0.01f; + private final int w; + private final int d; + private final int[][] multiset; + + public CountMinSketch() { + this(DEFAULT_DELTA, DEFAULT_EPSILON); + } + + public CountMinSketch(float delta, float epsilon) { + this.w = (int) Math.ceil(Math.exp(1.0) / epsilon); + this.d = (int) Math.ceil(Math.log(1.0 / delta)); + this.multiset = new int[d][w]; + } + + public CountMinSketch(int width, int depth) { + this.w = width; + this.d = depth; + this.multiset = new int[d][w]; + } + + private CountMinSketch(int width, int depth, int[][] ms) { + this.w = width; + this.d = depth; + this.multiset = ms; + } + + public int getWidth() { + return w; + } + + public int getDepth() { + return d; + } + + /** + * Returns the size in bytes after serialization. + * + * @return serialized size in bytes + */ + public long getSizeInBytes() { + return ((w * d) + 2) * (Integer.SIZE / 8); + } + + public void set(byte[] key) { + // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" + // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively + // implement a Bloom filter without any loss in the asymptotic false positive probability' + // The paper also proves that the same technique (using just 2 pairwise independent hash functions) + // can be used for Count-Min sketch. + + // Lets split up 64-bit hashcode into two 32-bit hashcodes and employ the technique mentioned + // in the above paper + long hash64 = Murmur3.hash64(key); + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + for (int i = 1; i <= d; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % w; + multiset[i - 1][pos] += 1; + } + } + + public void setString(String val) { + set(val.getBytes()); + } + + public void setByte(byte val) { + set(new byte[]{val}); + } + + public void setInt(int val) { + // puts int in little endian order + set(intToByteArrayLE(val)); + } + + + public void setLong(long val) { + // puts long in little endian order + set(longToByteArrayLE(val)); + } + + public void setFloat(float val) { + setInt(Float.floatToIntBits(val)); + } + + public void setDouble(double val) { + setLong(Double.doubleToLongBits(val)); + } + + private static byte[] intToByteArrayLE(int val) { + return new byte[]{(byte) (val >> 0), + (byte) (val >> 8), + (byte) (val >> 16), + (byte) (val >> 24)}; + } + + private static byte[] longToByteArrayLE(long val) { + return new byte[]{(byte) (val >> 0), + (byte) (val >> 8), + (byte) (val >> 16), + (byte) (val >> 24), + (byte) (val >> 32), + (byte) (val >> 40), + (byte) (val >> 48), + (byte) (val >> 56),}; + } + + public int getEstimatedCount(byte[] key) { + long hash64 = Murmur3.hash64(key); + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + int min = Integer.MAX_VALUE; + for (int i = 1; i <= d; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = 
~combinedHash; + } + int pos = combinedHash % w; + min = Math.min(min, multiset[i - 1][pos]); + } + + return min; + } + + public int getEstimatedCountString(String val) { + return getEstimatedCount(val.getBytes()); + } + + public int getEstimatedCountByte(byte val) { + return getEstimatedCount(new byte[]{val}); + } + + public int getEstimatedCountInt(int val) { + return getEstimatedCount(intToByteArrayLE(val)); + } + + public int getEstimatedCountLong(long val) { + return getEstimatedCount(longToByteArrayLE(val)); + } + + public int getEstimatedCountFloat(float val) { + return getEstimatedCountInt(Float.floatToIntBits(val)); + } + + public int getEstimatedCountDouble(double val) { + return getEstimatedCountLong(Double.doubleToLongBits(val)); + } + + /** + * Merge the give count min sketch with current one. Merge will throw RuntimeException if the + * provided CountMinSketch is not compatible with current one. + * + * @param that - the one to be merged + */ + public void merge(CountMinSketch that) { + if (that == null) { + return; + } + + if (this.w != that.w) { + throw new RuntimeException("Merge failed! Width of count min sketch do not match!" + + "this.width: " + this.getWidth() + " that.width: " + that.getWidth()); + } + + if (this.d != that.d) { + throw new RuntimeException("Merge failed! Depth of count min sketch do not match!" + + "this.depth: " + this.getDepth() + " that.depth: " + that.getDepth()); + } + + for (int i = 0; i < d; i++) { + for (int j = 0; j < w; j++) { + this.multiset[i][j] += that.multiset[i][j]; + } + } + } + + /** + * Serialize the count min sketch to byte array. The format of serialization is width followed by + * depth followed by integers in multiset from row1, row2 and so on.. + * + * @return serialized byte array + */ + public static byte[] serialize(CountMinSketch cms) { + long serializedSize = cms.getSizeInBytes(); + ByteBuffer bb = ByteBuffer.allocate((int) serializedSize); + bb.putInt(cms.getWidth()); + bb.putInt(cms.getDepth()); + for (int i = 0; i < cms.getDepth(); i++) { + for (int j = 0; j < cms.getWidth(); j++) { + bb.putInt(cms.multiset[i][j]); + } + } + bb.flip(); + return bb.array(); + } + + /** + * Deserialize the serialized count min sketch. + * + * @param serialized - serialized count min sketch + * @return deserialized count min sketch object + */ + public static CountMinSketch deserialize(byte[] serialized) { + ByteBuffer bb = ByteBuffer.allocate(serialized.length); + bb.put(serialized); + bb.flip(); + int width = bb.getInt(); + int depth = bb.getInt(); + int[][] multiset = new int[depth][width]; + for (int i = 0; i < depth; i++) { + for (int j = 0; j < width; j++) { + multiset[i][j] = bb.getInt(); + } + } + CountMinSketch cms = new CountMinSketch(width, depth, multiset); + return cms; + } +} diff --git a/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/Murmur3.java b/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/Murmur3.java new file mode 100644 index 00000000..9dab784e --- /dev/null +++ b/third-party/count-min-sketch/src/main/java/com/github/prasanthj/cmsketch/Murmur3.java @@ -0,0 +1,317 @@ +/** + * Copyright 2014 Prasanth Jayachandran + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.github.prasanthj.cmsketch; + +/** + * Murmur3 32 and 128 bit variants. + * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 + * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + */ +class Murmur3 { + // Constants for 32 bit variant + private static final int C1_32 = 0xcc9e2d51; + private static final int C2_32 = 0x1b873593; + private static final int R1_32 = 15; + private static final int R2_32 = 13; + private static final int M_32 = 5; + private static final int N_32 = 0xe6546b64; + + // Constants for 128 bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + private static final int DEFAULT_SEED = 0; + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static int hash32(byte[] data) { + return hash32(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. (default 0) + * @return - hashcode + */ + public static int hash32(byte[] data, int length, int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + int i_4 = i << 2; + int k = (data[i_4] & 0xff) + | ((data[i_4 + 1] & 0xff) << 8) + | ((data[i_4 + 2] & 0xff) << 16) + | ((data[i_4 + 3] & 0xff) << 24); + + // mix functions + k *= C1_32; + k = Integer.rotateLeft(k, R1_32); + k *= C2_32; + hash ^= k; + hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + } + + // tail + int idx = nblocks << 2; + int k1 = 0; + switch (length - idx) { + case 3: + k1 ^= data[idx + 2] << 16; + case 2: + k1 ^= data[idx + 1] << 8; + case 1: + k1 ^= data[idx]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + // finalization + hash ^= length; + hash ^= (hash >>> 16); + hash *= 0x85ebca6b; + hash ^= (hash >>> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >>> 16); + + return hash; + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static long hash64(byte[] data) { + return hash64(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode + */ + public static long hash64(byte[] data, int length, int seed) { + long hash = seed; + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int i8 = i << 3; + long k = ((long) data[i8] & 0xff) + | (((long) data[i8 + 1] & 0xff) << 8) + | (((long) data[i8 + 2] & 0xff) << 16) + | (((long) data[i8 + 3] & 0xff) << 24) + | (((long) data[i8 + 4] & 0xff) << 32) + | (((long) data[i8 + 5] & 0xff) << 40) + | (((long) data[i8 + 6] & 0xff) << 48) + | (((long) data[i8 + 7] & 0xff) << 56); + + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + } + + // tail + long k1 = 0; + int tailStart = nblocks << 3; + switch (length - tailStart) { + case 7: + k1 ^= ((long) data[tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= ((long) data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + } + + // finalization + hash ^= length; + hash = fmix64(hash); + + return hash; + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data) { + return hash128(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. (default is 0) + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data, int length, int seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int i16 = i << 4; + long k1 = ((long) data[i16] & 0xff) + | (((long) data[i16 + 1] & 0xff) << 8) + | (((long) data[i16 + 2] & 0xff) << 16) + | (((long) data[i16 + 3] & 0xff) << 24) + | (((long) data[i16 + 4] & 0xff) << 32) + | (((long) data[i16 + 5] & 0xff) << 40) + | (((long) data[i16 + 6] & 0xff) << 48) + | (((long) data[i16 + 7] & 0xff) << 56); + + long k2 = ((long) data[i16 + 8] & 0xff) + | (((long) data[i16 + 9] & 0xff) << 8) + | (((long) data[i16 + 10] & 0xff) << 16) + | (((long) data[i16 + 11] & 0xff) << 24) + | (((long) data[i16 + 12] & 0xff) << 32) + | (((long) data[i16 + 13] & 0xff) << 40) + | (((long) data[i16 + 14] & 0xff) << 48) + | (((long) data[i16 + 15] & 0xff) << 56); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + int tailStart = nblocks << 4; + switch (length - tailStart) { + case 15: + k2 ^= (long) (data[tailStart + 14] & 0xff) << 48; + case 14: + k2 ^= (long) (data[tailStart + 13] & 0xff) << 40; + case 13: + k2 ^= (long) (data[tailStart + 12] & 0xff) << 32; + case 12: + k2 ^= (long) (data[tailStart + 11] & 0xff) << 24; + case 11: + k2 ^= (long) (data[tailStart + 10] & 0xff) << 16; + case 10: + k2 ^= (long) (data[tailStart + 9] & 0xff) << 8; + case 9: + k2 ^= (long) (data[tailStart + 8] & 0xff); + k2 *= C2; + k2 = Long.rotateLeft(k2, 
R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= (long) (data[tailStart + 7] & 0xff) << 56; + case 7: + k1 ^= (long) (data[tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= (long) (data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= (long) (data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= (long) (data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= (long) (data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= (long) (data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= (long) (data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[]{h1, h2}; + } + + private static long fmix64(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} \ No newline at end of file
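
As a companion to the vendored class above, here is a minimal usage sketch of CountMinSketch that only calls methods added in this diff; the demo class name and the term strings are made up for illustration and are not part of the Marginalia code.

import com.github.prasanthj.cmsketch.CountMinSketch;

public class CountMinSketchDemo {
    public static void main(String[] args) {
        // delta = 0.01, epsilon = 0.01 are the class defaults: depth = ceil(ln(1/0.01)) = 5
        // rows and width = ceil(e/0.01) = 272 counters per row.
        CountMinSketch sketch = new CountMinSketch(0.01f, 0.01f);

        // Count a small stream of terms; repeated keys bump the same counters again.
        for (String term : new String[] {"hello", "world", "hello", "hello"}) {
            sketch.setString(term);
        }

        // Estimates are upper bounds: never below the true count, possibly above it
        // when hash collisions occur.
        System.out.println(sketch.getEstimatedCountString("hello"));  // at least 3
        System.out.println(sketch.getEstimatedCountString("absent")); // usually 0

        // Serialization writes width, depth, then the depth x width counters row by row.
        byte[] bytes = CountMinSketch.serialize(sketch);
        CountMinSketch copy = CountMinSketch.deserialize(bytes);

        // Merging sums counters element-wise; merging the original into its copy
        // doubles every estimate.
        copy.merge(sketch);
        System.out.println(copy.getEstimatedCountString("hello"));    // at least 6
    }
}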
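Murmur3 above is declared package-private, so it is only reachable from within com.github.prasanthj.cmsketch. A small sketch of its three entry points, assuming a hypothetical demo class placed in that same package:

package com.github.prasanthj.cmsketch;

import java.nio.charset.StandardCharsets;

class Murmur3Demo {
    public static void main(String[] args) {
        byte[] data = "HELLO".getBytes(StandardCharsets.UTF_8);

        // 32-bit variant: a single int hashcode.
        int h32 = Murmur3.hash32(data);

        // 64-bit variant, described in the Javadoc as essentially the MSB 8 bytes of
        // the 128-bit variant; this is the value CountMinSketch.set() splits into two
        // 32-bit halves.
        long h64 = Murmur3.hash64(data);

        // 128-bit variant: two longs.
        long[] h128 = Murmur3.hash128(data);

        System.out.printf("%08x %016x %016x%016x%n", h32, h64, h128[0], h128[1]);
    }
}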