diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
index 7528a4eb..099d2da8 100644
--- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
+++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java
@@ -3,9 +3,7 @@ package nu.marginalia.model.idx;
import java.util.EnumSet;
public enum DocumentFlags {
- /** Simple processing was done, this document should be de-prioritized as a search result */
- Simple,
-
+ UnusedBit1,
PlainText,
UnusedBit2,
UnusedBit3,
diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
index 49d64002..af0f5073 100644
--- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
+++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
@@ -67,7 +67,7 @@ public class DocumentKeywordExtractor {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
- if (WordPatterns.hasWordQualities(flatWord)) {
+ if (!flatWord.isBlank()) {
wordsBuilder.add(flatWord, metadata.getMetadataForWord(word.stemmed));
}
}
diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java
index 6b3540f0..bdc87f29 100644
--- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java
+++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/KeywordExtractor.java
@@ -220,7 +220,7 @@ public class KeywordExtractor {
}
String word = sentence.constructWordFromSpan(w);
- if (word.isBlank() || !WordPatterns.filter(word)) return false;
+ if (word.isBlank() || !WordPatterns.isNotJunkWord(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;
diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java
index f5f1fc22..979af42d 100644
--- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java
+++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryBuilder.java
@@ -12,11 +12,6 @@ public class IndexJournalEntryBuilder {
this.documentMeta = documentMeta;
}
- public IndexJournalEntryBuilder capacity(int size) {
- items.ensureCapacity(size);
- return this;
- }
-
public IndexJournalEntryBuilder add(long wordId, long metadata) {
items.add(wordId);
diff --git a/code/features-search/query-parser/build.gradle b/code/features-search/query-parser/build.gradle
index edf02741..c738a144 100644
--- a/code/features-search/query-parser/build.gradle
+++ b/code/features-search/query-parser/build.gradle
@@ -13,12 +13,13 @@ java {
}
dependencies {
implementation project(':code:libraries:language-processing')
- implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:common:config')
implementation project(':code:common:model')
+ implementation project(':third-party:porterstemmer')
+
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
@@ -26,6 +27,7 @@ dependencies {
implementation libs.bundles.handlebars
implementation libs.trove
+ implementation libs.guice
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java b/code/features-search/query-parser/src/main/java/nu/marginalia/language/EnglishDictionary.java
similarity index 99%
rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java
rename to code/features-search/query-parser/src/main/java/nu/marginalia/language/EnglishDictionary.java
index d96c0666..0afd3625 100644
--- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/statistics/EnglishDictionary.java
+++ b/code/features-search/query-parser/src/main/java/nu/marginalia/language/EnglishDictionary.java
@@ -1,4 +1,4 @@
-package nu.marginalia.language.statistics;
+package nu.marginalia.language;
import com.google.inject.Inject;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
diff --git a/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java b/code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/DenseBitMap.java
similarity index 97%
rename from code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java
rename to code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/DenseBitMap.java
index a69576cc..ca5666b1 100644
--- a/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/DenseBitMap.java
+++ b/code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/DenseBitMap.java
@@ -1,4 +1,4 @@
-package nu.marginalia.ngram_bloom_filter;
+package nu.marginalia.ngrams;
import java.io.IOException;
import java.nio.ByteBuffer;
diff --git a/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java b/code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/NGramBloomFilter.java
similarity index 97%
rename from code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java
rename to code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/NGramBloomFilter.java
index 85af1367..89a6d9cf 100644
--- a/code/libraries/ngram-bloom-filter/src/main/java/nu/marginalia/ngram_bloom_filter/NGramBloomFilter.java
+++ b/code/features-search/query-parser/src/main/java/nu/marginalia/ngrams/NGramBloomFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.ngram_bloom_filter;
+package nu.marginalia.ngrams;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.common.hash.HashFunction;
diff --git a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java
index 6acdaed4..eb4abd79 100644
--- a/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java
+++ b/code/features-search/query-parser/src/main/java/nu/marginalia/query_parser/QueryVariants.java
@@ -6,9 +6,9 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.LanguageModels;
import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.language.statistics.EnglishDictionary;
+import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.language.sentence.SentenceExtractor;
-import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
diff --git a/code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/ngrams/DenseBitMapTest.java
similarity index 97%
rename from code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java
rename to code/features-search/query-parser/src/test/java/nu/marginalia/ngrams/DenseBitMapTest.java
index 783b2ca9..d2db16b6 100644
--- a/code/libraries/ngram-bloom-filter/src/test/java/nu/marginalia/ngram_bloom_filter/DenseBitMapTest.java
+++ b/code/features-search/query-parser/src/test/java/nu/marginalia/ngrams/DenseBitMapTest.java
@@ -1,4 +1,4 @@
-package nu.marginalia.ngram_bloom_filter;
+package nu.marginalia.ngrams;
import org.junit.jupiter.api.Test;
diff --git a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java
index cd9a61eb..8cc38312 100644
--- a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java
+++ b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/BodyQueryParserTest.java
@@ -1,8 +1,8 @@
package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
-import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.language.EnglishDictionary;
+import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.token.TokenType;
import nu.marginalia.util.TestLanguageModels;
diff --git a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java
index e67a6940..d82976e9 100644
--- a/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java
+++ b/code/features-search/query-parser/src/test/java/nu/marginalia/query_parser/QueryVariantsTest.java
@@ -1,8 +1,8 @@
package nu.marginalia.query_parser;
import nu.marginalia.LanguageModels;
-import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.language.EnglishDictionary;
+import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;
diff --git a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java
index c9d8dd00..685118d4 100644
--- a/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java
+++ b/code/features-search/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java
@@ -41,7 +41,7 @@ class TermCoherenceFactorTest {
assertEquals(0, termCoherenceFactor.calculate(allPositionsSet));
}
- @Test
+ @Test @SuppressWarnings("unchecked")
public void testLowPosMatches() {
var allPositionsSet = createSet(
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
@@ -53,7 +53,7 @@ class TermCoherenceFactorTest {
assertEquals(1.0, termCoherenceFactor.bitPositionFactor(mask), 0.01);
}
- @Test
+ @Test @SuppressWarnings("unchecked")
public void testHiPosMatches() {
var allPositionsSet = createSet(
List.of(28, 29, 30, 31), List.of(28, 29, 30, 31)
diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java
index 39483338..6758fdae 100644
--- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/WordPatterns.java
@@ -8,8 +8,6 @@ import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
/** Regular expression patterns for deciding which words are eligible to be keywords.
*
se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
-// LanguageFilter lf = new LanguageFilter();
-//
-// TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
-//
-// ForkJoinPool fjp = new ForkJoinPool(24);
-// AtomicInteger docCount = new AtomicInteger();
-//
-// for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
-//
-// if (domain.doc == null)
-// continue;
-//
-// fjp.execute(() -> {
-//
-// TLongHashSet words = new TLongHashSet(10_000);
-//
-// for (var doc : domain.doc) {
-//
-// if (doc.documentBody == null)
-// continue;
-// docCount.incrementAndGet();
-//
-// Document parsed = Jsoup.parse(doc.documentBody.decode());
-// parsed.body().filter(new DomPruningFilter(0.5));
-//
-// DocumentLanguageData dld = se.get().extractSentences(parsed);
-//
-// if (lf.dictionaryAgreement(dld) < 0.1) {
-// return;
-// }
-//
-// for (var sent : dld.sentences) {
-// for (var word : sent) {
-// words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
-// }
-// }
-//
-// synchronized (counts) {
-// words.forEach(w -> {
-// counts.adjustOrPutValue(w, 1, 1);
-// return true;
-// });
-// }
-//
-// words.clear();
-// }
-//
-// System.out.println(domain.domain + "\t" + counts.size());
-// });
-//
-//
-// }
-//
-// fjp.shutdown();
-// fjp.awaitTermination(10, TimeUnit.DAYS);
-//
-// try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
-// synchronized (counts) {
-// counts.put(DOC_COUNT_KEY, docCount.get());
-//
-// counts.forEachEntry((hash, cnt) -> {
-// try {
-// dos.writeLong(hash);
-// dos.writeLong(cnt);
-// } catch (IOException e) {
-// throw new RuntimeException(e);
-// }
-// return true;
-// });
-// }
-// }
-//
-// System.out.println(docCount.get());
-// }
+ /** Get the term frequency for the string s */
+ public long getTermFreq(String s) {
+ return wordRates.get(getStringHash(s));
+ }
+
+ /** Get the term frequency for the already stemmed string s */
+ public long getTermFreqStemmed(String s) {
+ return wordRates.get(longHash(s.getBytes()));
+ }
+
+ /** Get the term frequency for the already stemmed and already hashed value 'hash' */
+ public long getTermFreqHash(long hash) {
+ return wordRates.get(hash);
+ }
public static long getStringHash(String s) {
if (s.indexOf(' ') >= 0 || s.indexOf('_') >= 0) {
@@ -156,17 +93,11 @@ public class TermFrequencyDict {
}
}
- public long getTermFreqHash(long hash) {
- return wordRates.get(hash);
- }
- public long getTermFreq(String s) {
- return wordRates.get(getStringHash(s));
- }
- public long getTermFreqStemmed(String s) {
- return wordRates.get(longHash(s.getBytes()));
- }
-
- // If this ever changes, we need to re-generate the term frequency dictionary
+ /** The hashing function used by TermFrequencyDict
+ *
+ * If this function changes its behavior in any way,
+ * it is necessary to re-generate the dictionary.
+ */
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;
diff --git a/code/process-models/converting-model/readme.md b/code/process-models/converting-model/readme.md
index 59d89f2e..feaae4b3 100644
--- a/code/process-models/converting-model/readme.md
+++ b/code/process-models/converting-model/readme.md
@@ -1,4 +1,49 @@
# Converting Models
Contains models shared by the [converting-process](../../processes/converting-process/) and
-[loading-process](../../processes/loading-process/).
\ No newline at end of file
+[loading-process](../../processes/loading-process/).
+
+## Design
+
+The two processes communicate through a file-based protocol. The converter serializes [instructions](src/main/java/nu/marginalia/converting/instruction/Instruction.java)
+to file, which are deserialized by the loader and fed into an [interpreter](src/main/java/nu/marginalia/converting/instruction/Interpreter.java).
+
+The instructions implement a visitor pattern.
+
+Conceptually the pattern can be thought of as a bit like remote function calls over file,
+or a crude instructions-based programming language.
+
+This
+
+```java
+producer.foo("cat");
+producer.bar("milk", "eggs", "bread");
+```
+
+translates, through this paradigm, to this:
+
+```
+(producer)
+writeInstruction(DoFoo("cat"))
+writeInstruction(DoBar("milk", "eggs", "bread"))
+
+(consumer)
+while read instruction:
+ interpreter.apply(instruction)
+
+(Interpreter)
+doFoo(animal):
+ ...
+doBar(ingredients):
+ ...
+
+(doFoo)
+DoFoo(animal):
+ apply(interpreter):
+ interpreter.foo(animal)
+
+(doBar)
+DoBar(ingredients):
+ apply(interpreter):
+ interpreter.bar(ingredients)
+```
diff --git a/code/process-models/crawling-model/readme.md b/code/process-models/crawling-model/readme.md
index 3b63d74e..3f6d02a6 100644
--- a/code/process-models/crawling-model/readme.md
+++ b/code/process-models/crawling-model/readme.md
@@ -7,6 +7,7 @@ Contains models shared by the [crawling-process](../../processes/crawling-proces
* [CrawledDocument](src/main/java/nu/marginalia/crawling/model/CrawledDocument.java)
* [CrawledDomain](src/main/java/nu/marginalia/crawling/model/CrawledDomain.java)
+* [CrawlingSpecification](src/main/java/nu/marginalia/crawling/model/spec/CrawlingSpecification.java)
### Marshalling
* [CrawledDomainReader](src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java)
diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java
similarity index 98%
rename from code/libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java
rename to code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java
index b4ba2793..dd375fad 100644
--- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/LanguageFilter.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/language/LanguageFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.language;
+package nu.marginalia.converting.language;
import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
index a725be0a..3edd9f80 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDocument.java
@@ -29,7 +29,7 @@ public class ProcessedDocument {
if (details == null)
return false;
- return !details.metadata.hasFlag(DocumentFlags.Simple);
+ return true;
}
public OptionalDouble quality() {
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
index 4ccfdafe..fa7fd118 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
@@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.language.LanguageFilter;
+import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
index faf5fbb0..afedf88e 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
@@ -4,6 +4,7 @@ package nu.marginalia.converting;
import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.bigstring.BigString;
+import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
@@ -22,8 +23,7 @@ import static org.junit.jupiter.api.Assertions.*;
public class ConvertingIntegrationTest {
-
- DomainProcessor domainProcessor;
+ private DomainProcessor domainProcessor;
@BeforeEach
public void setUp() {
@@ -60,7 +60,22 @@ public class ConvertingIntegrationTest {
ret.documents.forEach(doc -> {
resultsByStatusCount.merge(doc.state, 1, Integer::sum);
});
- assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 5);
+
+ assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
+
+ for (var doc : ret.documents) {
+
+ if (!doc.isProcessedFully()) {
+ continue;
+ }
+
+ var details = doc.details;
+
+ assertTrue(details.title.length() > 4);
+ assertTrue(details.description.length() > 4);
+ assertEquals(HtmlStandard.HTML5, details.standard);
+
+ }
}
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java
similarity index 93%
rename from code/processes/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java
rename to code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java
index 694810ba..f37c0cb5 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/LanguageFilterTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/language/LanguageFilterTest.java
@@ -1,6 +1,5 @@
-package nu.marginalia.crawling;
+package nu.marginalia.converting.language;
-import nu.marginalia.language.LanguageFilter;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
diff --git a/code/processes/converting-process/src/test/resources/memex-marginalia/readme.md b/code/processes/converting-process/src/test/resources/memex-marginalia/readme.md
new file mode 100644
index 00000000..71e29df8
--- /dev/null
+++ b/code/processes/converting-process/src/test/resources/memex-marginalia/readme.md
@@ -0,0 +1,3 @@
+# Test Data
+
+This is a snapshot of memex.marginalia.nu from 2023-03-17.
\ No newline at end of file
diff --git a/code/processes/crawl-job-extractor-process/readme.md b/code/processes/crawl-job-extractor-process/readme.md
deleted file mode 100644
index d8f3e98e..00000000
--- a/code/processes/crawl-job-extractor-process/readme.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Crawl Job Extractor
-
-The crawl job extractor creates a file containing a list of domains
-along with known URLs. This is consumed by the [crawling-process](../crawling-process).
\ No newline at end of file
diff --git a/code/processes/readme.md b/code/processes/readme.md
index ce86db9a..44c29a1e 100644
--- a/code/processes/readme.md
+++ b/code/processes/readme.md
@@ -1,22 +1,20 @@
# Processes
-## 1. Crawl Job Extractor
-
-The [crawl-job-extractor-process](crawl-job-extractor-process/) creates a crawl job specification
-based on the content in the database.
-
-## 2. Crawl Process
+## 1. Crawl Process
The [crawling-process](crawling-process/) fetches website contents and saves them
as compressed JSON models described in [crawling-model](../process-models/crawling-model/).
-## 3. Converting Process
+The operation is specified by a crawl job specification. This is generated by [tools/crawl-job-extractor](../tools/crawl-job-extractor/)
+based on the content in the database.
+
+## 2. Converting Process
The [converting-process](converting-process/) reads crawl data from the crawling step and
processes them, extracting keywords and metadata and saves them as compressed JSON models
described in [converting-model](../process-models/converting-model/).
-## 4. Loading Process
+## 3. Loading Process
The [loading-process](loading-process/) reads the processed data and creates an index journal
and lexicon, and loads domains and addresses into the MariaDB-database.
diff --git a/code/readme.md b/code/readme.md
index a889e507..60609310 100644
--- a/code/readme.md
+++ b/code/readme.md
@@ -21,11 +21,15 @@ You'll find a short description in each module of what it does and how it relate
Processes are batch jobs that deal with data retrieval, processing and loading.
* [processes](processes/)
-* * [crawl-job-extractor](processes/crawl-job-extractor-process)
* * [crawling-process](processes/crawling-process)
* * [converting-process](processes/converting-process)
* * [loading-process](processes/loading-process)
+#### Tools
+
+* * [crawl-job-extractor](tools/crawl-job-extractor)
+* * [term-frequency-extractor](tools/term-frequency-extractor)
+
### Features
Features are relatively stand-alone components that serve some part of the domain. They aren't domain-independent,
diff --git a/code/services-core/search-service/build.gradle b/code/services-core/search-service/build.gradle
index 56f59688..60717341 100644
--- a/code/services-core/search-service/build.gradle
+++ b/code/services-core/search-service/build.gradle
@@ -29,7 +29,6 @@ dependencies {
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
- implementation project(':code:libraries:ngram-bloom-filter')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:api:assistant-api')
diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java
index d7ccde8f..50804e11 100644
--- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java
+++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java
@@ -8,8 +8,8 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
-import nu.marginalia.language.statistics.EnglishDictionary;
-import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.language.EnglishDictionary;
+import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.query_parser.QueryParser;
import nu.marginalia.query_parser.QueryPermutation;
diff --git a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java
index 1c36922f..84377154 100644
--- a/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java
+++ b/code/services-core/search-service/src/test/java/nu/marginalia/search/query/QueryFactoryTest.java
@@ -2,9 +2,9 @@ package nu.marginalia.search.query;
import nu.marginalia.WmsaHome;
import nu.marginalia.index.query.limit.SpecificationLimitType;
-import nu.marginalia.language.statistics.EnglishDictionary;
+import nu.marginalia.language.EnglishDictionary;
import nu.marginalia.index.client.model.query.SearchSpecification;
-import nu.marginalia.ngram_bloom_filter.NGramBloomFilter;
+import nu.marginalia.ngrams.NGramBloomFilter;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.search.command.SearchJsParameter;
import nu.marginalia.search.model.SearchProfile;
diff --git a/code/processes/crawl-job-extractor-process/build.gradle b/code/tools/crawl-job-extractor/build.gradle
similarity index 96%
rename from code/processes/crawl-job-extractor-process/build.gradle
rename to code/tools/crawl-job-extractor/build.gradle
index 0aaecd62..07ba17f5 100644
--- a/code/processes/crawl-job-extractor-process/build.gradle
+++ b/code/tools/crawl-job-extractor/build.gradle
@@ -31,7 +31,7 @@ dependencies {
implementation libs.bundles.mariadb
implementation libs.guice
- implementation libs.gson
+ implementation libs.bundles.gson
implementation libs.zstd
testImplementation libs.bundles.slf4j.test
diff --git a/code/tools/crawl-job-extractor/readme.md b/code/tools/crawl-job-extractor/readme.md
new file mode 100644
index 00000000..ea242ba2
--- /dev/null
+++ b/code/tools/crawl-job-extractor/readme.md
@@ -0,0 +1,6 @@
+# Crawl Job Extractor
+
+The crawl job extractor creates a file containing a list of domains
+along with known URLs.
+
+This is consumed by [processes/crawling-process](../../processes/crawling-process).
\ No newline at end of file
diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
similarity index 100%
rename from code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
rename to code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
similarity index 100%
rename from code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
rename to code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
diff --git a/code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java
similarity index 100%
rename from code/processes/crawl-job-extractor-process/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java
rename to code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobSpecWriter.java
diff --git a/code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java b/code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java
similarity index 100%
rename from code/processes/crawl-job-extractor-process/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java
rename to code/tools/crawl-job-extractor/src/test/java/nu/marginalia/crawl/CrawlJobSpecWriterTest.java
diff --git a/code/tools/term-frequency-extractor/build.gradle b/code/tools/term-frequency-extractor/build.gradle
new file mode 100644
index 00000000..966c3c4a
--- /dev/null
+++ b/code/tools/term-frequency-extractor/build.gradle
@@ -0,0 +1,62 @@
+plugins { // Gradle plugins used to build the term-frequency-extractor tool
+ id 'java'
+ id "io.freefair.lombok" version "5.3.3.3"
+ id 'application' // backs the application { } block below
+
+ id 'jvm-test-suite'
+}
+
+java {
+ toolchain {
+ languageVersion.set(JavaLanguageVersion.of(17)) // build with a Java 17 toolchain
+ }
+}
+
+application {
+ mainClass = 'nu.marginalia.tools.TermFrequencyExtractor' // class whose main() is launched
+ applicationName = 'term-frequency-extractor'
+}
+
+tasks.distZip.enabled = false // do not produce the zip distribution
+
+dependencies {
+ implementation project(':third-party:rdrpostagger')
+ implementation project(':third-party:porterstemmer')
+ implementation project(':third-party:monkey-patch-opennlp')
+ implementation project(':code:common:model')
+ implementation project(':code:common:config')
+ implementation project(':code:common:process')
+ implementation project(':code:libraries:language-processing')
+ implementation project(':code:libraries:term-frequency-dict')
+ implementation project(':code:libraries:big-string')
+ implementation project(':code:processes:converting-process')
+ implementation project(':code:process-models:crawling-model')
+
+ implementation libs.lombok
+ annotationProcessor libs.lombok
+ implementation libs.bundles.slf4j
+ implementation libs.notnull
+
+ implementation libs.guice
+ implementation libs.jsoup
+ implementation libs.trove
+ implementation libs.fastutil
+
+ implementation libs.bundles.nlp
+ implementation libs.commons.lang3
+
+ testImplementation libs.bundles.slf4j.test
+ testImplementation libs.bundles.junit
+ testImplementation libs.mockito
+}
+
+
+test {
+ useJUnitPlatform()
+}
+
+task fastTests(type: Test) { // additional Test task that leaves out tests tagged "slow"
+ useJUnitPlatform {
+ excludeTags "slow"
+ }
+}
diff --git a/code/tools/term-frequency-extractor/readme.md b/code/tools/term-frequency-extractor/readme.md
new file mode 100644
index 00000000..dde1dff8
--- /dev/null
+++ b/code/tools/term-frequency-extractor/readme.md
@@ -0,0 +1,16 @@
+# Term Frequency Extractor
+
+Generates a term frequency dictionary file from a batch of crawl data.
+
+Usage:
+
+```shell
+PATH_TO_SAMPLES=run/samples/crawl-s
+export JAVA_OPTS=-Dcrawl.rootDirRewrite=/crawl:${PATH_TO_SAMPLES}
+
+term-frequency-extractor ${PATH_TO_SAMPLES}/plan.yaml out.dat
+```
+
+## See Also
+
+* [libraries/term-frequency-dict](../../libraries/term-frequency-dict)
\ No newline at end of file
diff --git a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
new file mode 100644
index 00000000..ece6a507
--- /dev/null
+++ b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
@@ -0,0 +1,142 @@
+package nu.marginalia.tools;
+
+import gnu.trove.map.hash.TLongIntHashMap;
+import gnu.trove.set.hash.TLongHashSet;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.language.LanguageFilter;
+import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.sentence.SentenceExtractor;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import plan.CrawlPlanLoader;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static nu.marginalia.term_frequency_dict.TermFrequencyDict.DOC_COUNT_KEY;
+import static nu.marginalia.term_frequency_dict.TermFrequencyDict.longHash;
+
+/**
+ * Command line tool that builds a term frequency dictionary from a batch of crawl data.
+ *
+ * <p>Each distinct stemmed word of a document is counted once for that document. The
+ * output file is a sequence of (hash, count) pairs, each written as two longs, plus one
+ * entry stored under {@code DOC_COUNT_KEY} holding the number of counted documents.
+ */
+public class TermFrequencyExtractor {
+
+    /**
+     * Reads the crawl plan, counts terms on a worker pool and writes the dictionary.
+     *
+     * @param args exactly two arguments: the crawl plan ({@code plan.yaml}) and the output file
+     * @throws IOException if the output file cannot be written
+     * @throws InterruptedException if interrupted while waiting for the workers to finish
+     */
+    public static void main(String... args) throws IOException, InterruptedException {
+        if (args.length != 2) {
+            System.err.println("Expected arguments: plan.yaml out-file");
+            return;
+        }
+
+        String outFile = args[1];
+
+        var plan = new CrawlPlanLoader().load(Path.of(args[0]));
+
+        // Each worker thread gets its own SentenceExtractor instance.
+        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
+        LanguageFilter lf = new LanguageFilter();
+
+        // Shared by all workers; every access goes through synchronized (counts).
+        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
+
+        ForkJoinPool fjp = new ForkJoinPool(24);
+        AtomicInteger docCount = new AtomicInteger();
+
+        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
+
+            if (domain.doc == null)
+                continue;
+
+            fjp.execute(() -> {
+
+                // Hashes of the distinct words in the document currently being processed.
+                TLongHashSet words = new TLongHashSet(10_000);
+
+                for (var doc : domain.doc) {
+
+                    if (doc.documentBody == null)
+                        continue;
+
+                    Document parsed = Jsoup.parse(doc.documentBody.decode());
+                    parsed.body().filter(new DomPruningFilter(0.5));
+
+                    DocumentLanguageData dld = se.get().extractSentences(parsed);
+
+                    // Skip only this document. A return here would leave the lambda and
+                    // silently drop every remaining document of the domain.
+                    if (lf.dictionaryAgreement(dld) < 0.1) {
+                        continue;
+                    }
+
+                    // Counted after the filter so the total matches the documents
+                    // that actually contribute to the term counts.
+                    docCount.incrementAndGet();
+
+                    for (var sent : dld.sentences) {
+                        for (var word : sent) {
+                            words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
+                        }
+                    }
+
+                    synchronized (counts) {
+                        words.forEach(w -> {
+                            counts.adjustOrPutValue(w, 1, 1);
+                            return true;
+                        });
+                    }
+
+                    words.clear();
+                }
+
+                // Read the size under the same lock the writers hold.
+                final int distinctTerms;
+                synchronized (counts) {
+                    distinctTerms = counts.size();
+                }
+                System.out.println(domain.domain + "\t" + distinctTerms);
+            });
+
+        }
+
+        fjp.shutdown();
+        fjp.awaitTermination(10, TimeUnit.DAYS);
+
+        // Buffered: writeLong would otherwise hit the file stream once per value.
+        try (var dos = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(Path.of(outFile))))) {
+            synchronized (counts) {
+                counts.put(DOC_COUNT_KEY, docCount.get());
+
+                counts.forEachEntry((hash, cnt) -> {
+                    try {
+                        dos.writeLong(hash);
+                        dos.writeLong(cnt);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                    return true;
+                });
+            }
+        }
+
+        System.out.println(docCount.get());
+    }
+
+}
diff --git a/other/memex/build.gradle b/other/memex/build.gradle
deleted file mode 100644
index 5e422b57..00000000
--- a/other/memex/build.gradle
+++ /dev/null
@@ -1,242 +0,0 @@
-plugins {
- id 'java'
- id "io.freefair.lombok" version "5.3.3.3"
-
- id "me.champeau.jmh" version "0.6.6"
- id "de.undercouch.download" version "5.1.0"
-
- id 'jvm-test-suite'
-}
-
-repositories {
- mavenLocal()
- maven { url "https://artifactory.cronapp.io/public-release/" }
- maven { url "https://repo1.maven.org/maven2/" }
- maven { url "https://www2.ph.ed.ac.uk/maven2/" }
- maven { url "https://jitpack.io/" }
- exclusiveContent {
- forRepository {
- maven {
- url = uri("https://jitpack.io")
- }
- }
- filter {
- // Only use JitPack for the `gson-record-type-adapter-factory` library
- includeModule("com.github.Marcono1234", "gson-record-type-adapter-factory")
- }
- }
-}
-
-sourceSets {
- e2eTest {
- java {
- java {
- compileClasspath += main.output + test.output
- runtimeClasspath += main.output + test.output
- srcDir file('src/e2e/java')
- }
- resources.srcDir file('src/e2e/resources')
- }
- }
- jmh {
- java {
- java {
- compileClasspath += main.output + test.output
- runtimeClasspath += main.output + test.output
- srcDir file('src/jmh/java')
- }
- resources.srcDir file('src/jmh/resources')
- }
- }
-}
-
-java {
- toolchain {
- languageVersion.set(JavaLanguageVersion.of(17))
- }
-}
-jmhJar {
- zip64 true
-}
-dependencies {
- implementation project(':code:common:service')
- implementation project(':code:common:config')
- implementation project(':code:common:service-discovery')
- implementation project(':code:common:service-client')
-
- implementation 'org.projectlombok:lombok:1.18.24'
- implementation 'org.jetbrains:annotations:20.1.0'
- annotationProcessor 'org.projectlombok:lombok:1.18.24'
-
- implementation 'com.github.jknack:handlebars:4.3.1'
- implementation 'com.github.jknack:handlebars-markdown:4.2.1'
-
- implementation group: 'com.google.code.gson', name: 'gson', version: '2.9.0'
- implementation 'io.reactivex.rxjava3:rxjava:3.1.5'
- implementation "com.sparkjava:spark-core:2.9.3"
- implementation 'com.opencsv:opencsv:5.6'
-
- implementation group: 'org.apache.logging.log4j', name: 'log4j-api', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.17.2'
- implementation group: 'org.apache.logging.log4j', name: 'log4j-slf4j-impl', version: '2.17.2'
-
- implementation 'org.slf4j:slf4j-api:1.7.36'
- testImplementation 'org.slf4j:slf4j-jdk14:2.0.3'
-
- implementation 'com.google.guava:guava:31.1-jre'
- implementation 'com.google.inject:guice:5.1.0'
- implementation 'com.github.jnr:jnr-ffi:2.2.12'
- implementation 'org.apache.httpcomponents:httpcore:4.4.15'
- implementation 'org.apache.httpcomponents:httpclient:4.5.13'
-
- implementation group: 'com.h2database', name: 'h2', version: '2.1.210'
-
- implementation 'org.jsoup:jsoup:1.15.3'
-
- implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.6'
- implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
-
- implementation 'com.zaxxer:HikariCP:5.0.1'
-
- implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
- implementation 'io.prometheus:simpleclient:0.16.0'
- implementation 'io.prometheus:simpleclient_servlet:0.16.0'
- implementation 'io.prometheus:simpleclient_httpserver:0.16.0'
- implementation 'io.prometheus:simpleclient_hotspot:0.16.0'
- implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
-
- implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
-
- implementation 'com.github.luben:zstd-jni:1.5.2-2'
- implementation 'org.lz4:lz4-java:1.8.0'
-
- implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
- implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'
-
- implementation 'org.imgscalr:imgscalr-lib:4.2'
- implementation 'org.jclarion:image4j:0.7'
-
- implementation 'commons-net:commons-net:3.8.0'
- implementation 'org.eclipse.jgit:org.eclipse.jgit:5.12.0.202106070339-r'
- implementation 'org.eclipse.jgit:org.eclipse.jgit.ssh.jsch:5.12.0.202106070339-r'
- implementation 'com.jcraft:jsch:0.1.55'
-
- implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
- implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'
-
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
- testImplementation 'org.mockito:mockito-junit-jupiter:4.5.1'
- testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
- testCompileOnly 'org.projectlombok:lombok:1.18.24'
- testImplementation 'org.projectlombok:lombok:1.18.24'
- testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
-
- testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.5.1'
-
- testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
- testImplementation 'org.testcontainers:mariadb:1.17.4'
- testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
-
- e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
- e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
- e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
- e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
- e2eTestImplementation 'org.testcontainers:nginx:1.17.4'
- e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.2"
- e2eTestImplementation 'org.testcontainers:selenium:1.17.4'
- e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.5.3'
- e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
-
-
- implementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.5.3'
- implementation 'org.seleniumhq.selenium:selenium-java:4.5.3'
- implementation 'org.sejda.imageio:webp-imageio:0.1.6'
-
- jmh 'org.openjdk.jmh:jmh-core:1.35'
- jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
-
- implementation 'net.agkn:hll:1.6.0'
-
-}
-
-configurations {
- e2eTestImplementation.extendsFrom(testImplementation)
-
-}
-
-test {
- maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
- maxHeapSize = "8G"
- useJUnitPlatform()
-}
-
-task fastTests(type: Test) {
- maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
- maxHeapSize = "8G"
- useJUnitPlatform {
- excludeTags "slow"
- }
-}
-
-task e2eTest(type: Test) {
- maxParallelForks = 1
- forkEvery = 1
- maxHeapSize = "8G"
- dependsOn ':shadowJar'
- dependsOn 'downloadTestData'
- dependsOn 'downloadRDRModelData'
- dependsOn 'downloadSentenceModelData'
- dependsOn 'downloadTokenModelData'
- dependsOn 'downloadTermFreqData'
- dependsOn 'IP2LocationFile'
-
- classpath = sourceSets.e2eTest.runtimeClasspath
- testClassesDirs = sourceSets.e2eTest.output.classesDirs
- useJUnitPlatform {
- includeTags "e2e"
- }
-}
-
-task downloadTestData(type: Download) {
- src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim'
- dest file('data/test/wikipedia_en_100_nopic.zim')
- overwrite false
-}
-
-task downloadRDRModelData(type: Download) {
- src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT',
- 'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR'])
- dest file('data/models/')
- overwrite false
-}
-
-task downloadSentenceModelData(type: Download) {
- src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin'
- dest file('data/models/opennlp-sentence.bin')
- overwrite false
-}
-task downloadTokenModelData(type: Download) {
- src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin'
- dest file('data/models/opennlp-tokens.bin')
- overwrite false
-}
-task downloadIP2LocationFile(type: Download) {
- src 'https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP'
- dest file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP')
- overwrite false
-}
-task IP2LocationFile(type: Copy) {
- dependsOn 'downloadIP2LocationFile'
- def zipFile = file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP')
- def outputDir = file("data/models/IP2LOC")
-
- from zipTree(zipFile)
- into outputDir
-}
-
-task downloadTermFreqData(type: Download) {
- src 'https://downloads.marginalia.nu/model/tfreq-new-algo3.bin'
- dest file('data/models/tfreq-new-algo3.bin')
- overwrite false
-}
-
diff --git a/other/memex/lombok.config b/other/memex/lombok.config
deleted file mode 100644
index 6aa51d71..00000000
--- a/other/memex/lombok.config
+++ /dev/null
@@ -1,2 +0,0 @@
-# This file is generated by the 'io.freefair.lombok' Gradle plugin
-config.stopBubbling = true
diff --git a/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java b/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java
deleted file mode 100644
index 25ef5662..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/MemexServiceDescriptors.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package nu.marginalia.memex;
-
-import nu.marginalia.memex.auth.AuthMain;
-import nu.marginalia.service.descriptor.ServiceDescriptor;
-import nu.marginalia.service.descriptor.ServiceDescriptors;
-import nu.marginalia.service.id.ServiceId;
-
-import java.util.List;
-
-public class MemexServiceDescriptors {
- public static ServiceDescriptors descriptors = new ServiceDescriptors(
- List.of(
- new ServiceDescriptor(ServiceId.Other_Memex, 5030),
- new ServiceDescriptor (ServiceId.Other_Auth, 5003)));
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java
deleted file mode 100644
index e0ad33f5..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthConfigurationModule.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package nu.marginalia.memex.auth;
-
-import com.google.inject.AbstractModule;
-import com.google.inject.name.Names;
-import nu.marginalia.service.descriptor.HostsFile;
-
-import java.nio.file.Path;
-
-public class AuthConfigurationModule extends AbstractModule {
- public void configure() {
- bind(Path.class).annotatedWith(Names.named("password-file")).toInstance(Path.of("/var/lib/wmsa/password.dat"));
- bind(HostsFile.class).toInstance(new HostsFile());
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java
deleted file mode 100644
index e997d777..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthMain.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package nu.marginalia.memex.auth;
-
-import com.google.inject.Guice;
-import com.google.inject.Inject;
-import com.google.inject.Injector;
-import nu.marginalia.memex.MemexServiceDescriptors;
-import nu.marginalia.service.MainClass;
-import nu.marginalia.service.id.ServiceId;
-import nu.marginalia.service.module.ConfigurationModule;
-import nu.marginalia.service.server.Initialization;
-
-public class AuthMain extends MainClass {
-
- @Inject
- public AuthMain(AuthService service) {
- }
-
- public static void main(String... args) {
- MainClass.init(ServiceId.Other_Auth, args);
-
- Injector injector = Guice.createInjector(
- new AuthConfigurationModule(),
- new ConfigurationModule(MemexServiceDescriptors.descriptors, ServiceId.Other_Auth));
- injector.getInstance(AuthMain.class);
- injector.getInstance(Initialization.class).setReady();
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java b/other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java
deleted file mode 100644
index 6f9ef59d..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/auth/AuthService.java
+++ /dev/null
@@ -1,118 +0,0 @@
-package nu.marginalia.memex.auth;
-
-import com.google.inject.Inject;
-import com.google.inject.name.Named;
-import nu.marginalia.client.Context;
-import nu.marginalia.memex.auth.model.LoginFormModel;
-import nu.marginalia.memex.renderer.MustacheRenderer;
-import nu.marginalia.memex.renderer.RendererFactory;
-import nu.marginalia.service.server.Initialization;
-import nu.marginalia.service.server.MetricsServer;
-import nu.marginalia.service.server.RateLimiter;
-import nu.marginalia.service.server.Service;
-import org.apache.http.HttpStatus;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import spark.Request;
-import spark.Response;
-import spark.Spark;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.UUID;
-
-import static spark.Spark.*;
-
-public class AuthService extends Service {
-
- private final Logger logger = LoggerFactory.getLogger(getClass());
- private String password;
-
- private final RateLimiter rateLimiter = RateLimiter.forLogin();
- private final MustacheRenderer loginFormRenderer;
-
- @Inject
- public AuthService(@Named("service-host") String ip,
- @Named("service-port") Integer port,
- @Named("password-file") Path topSecretPasswordFile,
- RendererFactory rendererFactory,
- Initialization initialization,
- MetricsServer metricsServer) throws IOException {
-
- super(ip, port, initialization, metricsServer);
-
- password = initPassword(topSecretPasswordFile);
-
- loginFormRenderer = rendererFactory.renderer("auth/login");
-
- Spark.path("public/api", () -> {
- before((req, rsp) -> {
- logger.info("{} {}", req.requestMethod(), req.pathInfo());
- });
-
- post("/login", this::login);
- get("/login", this::loginForm);
- });
- Spark.path("api", () -> {
- get("/is-logged-in", this::isLoggedIn);
- });
- }
-
- private String initPassword(Path topSecretPasswordFile) {
- if (Files.exists(topSecretPasswordFile)) {
- try {
- return Files.readString(topSecretPasswordFile);
- } catch (IOException e) {
- logger.error("Could not read password from file " + topSecretPasswordFile, e);
- }
- }
- logger.error("Setting random password");
- return UUID.randomUUID().toString();
- }
-
- private Object loginForm(Request request, Response response) {
- String redir = Objects.requireNonNull(request.queryParams("redirect"));
- String service = Objects.requireNonNull(request.queryParams("service"));
-
- return loginFormRenderer.render(new LoginFormModel(service, redir));
- }
-
- private Object login(Request request, Response response) {
- var redir = Objects.requireNonNullElse(request.queryParams("redirect"), "/");
-
- if (isLoggedIn(request, response)) {
- response.redirect(redir);
- return "";
- }
-
- if (!rateLimiter.isAllowed(Context.fromRequest(request))) {
- Spark.halt(429, "Too many requests");
- return null;
- }
-
- if (Objects.equals(password, request.queryParams("password"))) {
- request.session(true).attribute("logged-in", true);
- response.redirect(redir);
- return "";
- }
-
- response.status(HttpStatus.SC_FORBIDDEN);
- return "Bad password!
";
- }
-
- public boolean isLoggedIn(Request request, Response response) {
- var session = request.session(false);
-
- if (null == session) {
- return false;
- }
-
- return Optional.ofNullable(session.attribute("logged-in"))
- .map(Boolean.class::cast)
- .orElse(false);
- }
-
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java b/other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java
deleted file mode 100644
index f2d68667..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/auth/client/AuthClient.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package nu.marginalia.memex.auth.client;
-
-import com.google.gson.GsonBuilder;
-import com.google.inject.Inject;
-import io.reactivex.rxjava3.core.Observable;
-import nu.marginalia.WmsaHome;
-import nu.marginalia.client.AbstractDynamicClient;
-import nu.marginalia.client.Context;
-import nu.marginalia.service.descriptor.ServiceDescriptors;
-import nu.marginalia.service.id.ServiceId;
-import org.apache.http.HttpStatus;
-import spark.Request;
-import spark.Response;
-import spark.Spark;
-
-import java.net.URLEncoder;
-import java.nio.charset.StandardCharsets;
-import java.util.concurrent.TimeUnit;
-
-
-public class AuthClient extends AbstractDynamicClient {
- @Inject
- public AuthClient(ServiceDescriptors descriptors) {
- super(descriptors.forId(ServiceId.Other_Auth), WmsaHome.getHostsFile(), new GsonBuilder()::create);
- }
-
- public Observable isLoggedIn(Context ctx) {
- return get(ctx, "/api/is-logged-in").map(Boolean::parseBoolean);
- }
-
- public void redirectToLoginIfUnauthenticated(String domain, Request req, Response rsp) {
- if (!isLoggedIn(Context.fromRequest(req)).timeout(1, TimeUnit.SECONDS).blockingFirst()) {
- rsp.redirect(req.headers("X-Extern-Domain") + "/auth/login?service="+domain
- +"&redirect="+ URLEncoder.encode(req.headers("X-Extern-Url"), StandardCharsets.UTF_8));
- Spark.halt();
- }
- }
-
-
- public void requireLogIn(Context ctx) {
- if (!isLoggedIn(ctx).timeout(1, TimeUnit.SECONDS).blockingFirst()) {
- Spark.halt(HttpStatus.SC_FORBIDDEN);
- }
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java b/other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java
deleted file mode 100644
index f31876e0..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/auth/model/LoginFormModel.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package nu.marginalia.memex.auth.model;
-
-import lombok.AllArgsConstructor;
-import lombok.Getter;
-
-@Getter @AllArgsConstructor
-public class LoginFormModel {
- public final String service;
- public final String redirect;
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java
deleted file mode 100644
index 2b879a10..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/BadBotList.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package nu.marginalia.memex.gemini;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.net.InetAddress;
-import java.util.HashSet;
-import java.util.Set;
-
-public class BadBotList {
- private final Set shitlist = new HashSet<>();
- public static final BadBotList INSTANCE = new BadBotList();
- private final Logger logger = LoggerFactory.getLogger(getClass().getSimpleName());
-
- private BadBotList() {}
-
- public boolean isAllowed(InetAddress address) {
- return !shitlist.contains(address);
- }
-
- public boolean isQueryPermitted(InetAddress address, String query) {
- if (isBadQuery(query)) {
- logger.info("Banning {}", address);
- shitlist.add(address);
- return false;
- }
- return true;
- }
-
- private boolean isBadQuery(String query) {
- if (query.startsWith("GET")) {
- return true;
- }
- if (query.startsWith("OPTIONS")) {
- return true;
- }
- if (query.contains("mstshash")) {
- return true;
- }
-
- return false;
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java
deleted file mode 100644
index 2d269332..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiConfigurationModule.java
+++ /dev/null
@@ -1,17 +0,0 @@
-package nu.marginalia.memex.gemini;
-
-import com.google.inject.AbstractModule;
-import com.google.inject.name.Names;
-
-import java.nio.file.Path;
-
-public class GeminiConfigurationModule extends AbstractModule {
- public void configure() {
- bind(Path.class).annotatedWith(Names.named("gemini-server-root")).toInstance(Path.of("/var/lib/wmsa/memex-gmi"));
- bind(Path.class).annotatedWith(Names.named("gemini-cert-file")).toInstance(Path.of("/var/lib/wmsa/gemini/crypto.jks"));
- bind(Path.class).annotatedWith(Names.named("gemini-cert-password-file")).toInstance(Path.of("/var/lib/wmsa/gemini/password.dat"));
- bind(Integer.class).annotatedWith(Names.named("gemini-server-port")).toInstance(1965);
-
- }
-
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java
deleted file mode 100644
index d5c6db9c..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiService.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package nu.marginalia.memex.gemini;
-
-public interface GeminiService {
- String DEFAULT_FILENAME = "index.gmi";
-
- void run();
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java
deleted file mode 100644
index 33fcffb2..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceDummy.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package nu.marginalia.memex.gemini;
-
-import com.google.inject.Singleton;
-
-@Singleton
-public class GeminiServiceDummy implements GeminiService {
- @Override
- public void run() {
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java
deleted file mode 100644
index 27b956d9..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/GeminiServiceImpl.java
+++ /dev/null
@@ -1,164 +0,0 @@
-package nu.marginalia.memex.gemini;
-
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import com.google.inject.name.Named;
-import nu.marginalia.memex.gemini.io.GeminiConnection;
-import nu.marginalia.memex.gemini.io.GeminiSSLSetUp;
-import nu.marginalia.memex.gemini.io.GeminiStatusCode;
-import nu.marginalia.memex.gemini.io.GeminiUserException;
-import nu.marginalia.memex.gemini.plugins.BareStaticPagePlugin;
-import nu.marginalia.memex.gemini.plugins.Plugin;
-import nu.marginalia.memex.gemini.plugins.SearchPlugin;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.net.ssl.SSLException;
-import javax.net.ssl.SSLServerSocket;
-import javax.net.ssl.SSLServerSocketFactory;
-import javax.net.ssl.SSLSocket;
-import java.io.IOException;
-import java.net.URI;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Optional;
-import java.util.concurrent.Executor;
-import java.util.concurrent.Executors;
-
-@Singleton
-public class GeminiServiceImpl implements GeminiService {
-
- public final Path serverRoot;
-
- private final Logger logger = LoggerFactory.getLogger(getClass().getSimpleName());
- private final Executor pool = Executors.newFixedThreadPool(32);
- private final SSLServerSocket serverSocket;
-
- private final Plugin[] plugins;
- private final BadBotList badBotList = BadBotList.INSTANCE;
-
- @Inject
- public GeminiServiceImpl(@Named("gemini-server-root") Path serverRoot,
- @Named("gemini-server-port") Integer port,
- GeminiSSLSetUp sslSetUp,
- BareStaticPagePlugin pagePlugin,
- SearchPlugin searchPlugin) throws Exception {
- this.serverRoot = serverRoot;
- logger.info("Setting up crypto");
- final SSLServerSocketFactory socketFactory = sslSetUp.getServerSocketFactory();
-
- serverSocket = (SSLServerSocket) socketFactory.createServerSocket(port /* 1965 */);
- serverSocket.setEnabledCipherSuites(socketFactory.getSupportedCipherSuites());
- serverSocket.setEnabledProtocols(new String[] {"TLSv1.3", "TLSv1.2"});
-
- logger.info("Verifying setup");
- if (!Files.exists(this.serverRoot)) {
- logger.error("Could not find SERVER_ROOT {}", this.serverRoot);
- System.exit(255);
- }
-
- plugins = new Plugin[] {
- pagePlugin,
- searchPlugin
- };
- }
-
- @Override
- public void run() {
- logger.info("Awaiting connections");
-
- try {
- for (;;) {
- SSLSocket connection = (SSLSocket) serverSocket.accept();
- connection.setSoTimeout(10_000);
-
- if (!badBotList.isAllowed(connection.getInetAddress())) {
- connection.close();
- } else {
- pool.execute(() -> serve(connection));
- }
- }
- }
- catch (IOException ex) {
- logger.error("IO Exception in gemini server", ex);
- }
- }
-
- private void serve(SSLSocket socket) {
- final GeminiConnection connection;
- try {
- connection = new GeminiConnection(socket);
- }
- catch (IOException ex) {
- logger.error("Failed to create connection object", ex);
- return;
- }
-
- try {
- handleRequest(connection);
- }
- catch (GeminiUserException ex) {
- errorResponse(connection, ex.getMessage());
- }
- catch (SSLException ex) {
- logger.error(connection.getAddress() + " SSL error");
- connection.close();
- }
- catch (Exception ex) {
- errorResponse(connection, "Error");
- logger.error(connection.getAddress(), ex);
- }
- finally {
- connection.close();
- }
- }
-
- private void errorResponse(GeminiConnection connection, String message) {
- if (connection.isConnected()) {
- try {
- logger.error("=> " + connection.getAddress(), message);
- connection.writeStatusLine(GeminiStatusCode.ERROR_PERMANENT, message);
- }
- catch (IOException ex) {
- logger.error("Exception while sending error", ex);
- }
- }
- }
-
- private void handleRequest(GeminiConnection connection) throws Exception {
-
- final String address = connection.getAddress();
- logger.info("Connect: " + address);
-
- final Optional maybeUri = connection.readUrl();
- if (maybeUri.isEmpty()) {
- logger.info("Done: {}", address);
- return;
- }
-
- final URI uri = maybeUri.get();
- logger.info("Request {}", uri);
-
- if (!uri.getScheme().equals("gemini")) {
- throw new GeminiUserException("Unsupported protocol");
- }
-
- servePage(connection, uri);
- logger.info("Done: {}", address);
- }
-
- private void servePage(GeminiConnection connection, URI url) throws IOException {
- String path = url.getPath();
-
- for (Plugin p : plugins) {
- if (p.serve(url, connection)) {
- return;
- }
- }
-
- logger.error("FileNotFound {}", path);
- connection.writeStatusLine(GeminiStatusCode.ERROR_TEMPORARY, "No such file");
- }
-
-
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java
deleted file mode 100644
index 27d2a2a9..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/client/GeminiClient.java
+++ /dev/null
@@ -1,130 +0,0 @@
-package nu.marginalia.memex.gemini.client;
-
-import javax.net.ssl.SSLContext;
-import javax.net.ssl.SSLSocketFactory;
-import javax.net.ssl.TrustManager;
-import javax.net.ssl.X509TrustManager;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URI;
-import java.nio.charset.StandardCharsets;
-import java.security.cert.X509Certificate;
-
-/** Unstable code! */
-public class GeminiClient {
-
- private final SSLSocketFactory socketFactory;
-
- // Create a trust manager that does not validate anything
- public static final TrustManager[] trustAllCerts = new TrustManager[]{
- new X509TrustManager() {
- @Override
- public void checkClientTrusted(X509Certificate[] chain,
- String authType) {
- }
-
- @Override
- public void checkServerTrusted(X509Certificate[] chain,
- String authType) {
- }
-
- @Override
- public X509Certificate[] getAcceptedIssuers() {
- return new X509Certificate[0];
- }
- }
- };
-
-
- public static SSLSocketFactory buildSocketFactory() throws Exception {
- // Install the all-trusting trust manager
- final SSLContext sslContext = SSLContext.getInstance("SSL");
- sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
-
- return sslContext.getSocketFactory();
- }
-
- public GeminiClient() throws Exception {
- socketFactory = buildSocketFactory();
- }
-
- public Response get(URI uri) throws IOException {
-
- final int port = uri.getPort() == -1 ? 1965 : uri.getPort();
- final String host = uri.getHost();
- var requestString = String.format("%s\r\n", uri).getBytes(StandardCharsets.UTF_8);
-
- try (var socket = socketFactory.createSocket(host, port)) {
- socket.setSoTimeout(10_000);
- socket.getOutputStream().write(requestString);
-
- var is = socket.getInputStream();
- String statusLine = new GeminiInput(is).get();
-
- int code = Integer.parseInt(statusLine.substring(0,2));
- String meta = statusLine.substring(3);
-
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- is.transferTo(baos);
-
- return new Response(code, meta, baos.toByteArray());
- }
-
- }
-
- public static class Response {
- public final int code;
- public final String meta;
- public final byte[] data;
-
- Response(int code, String meta, byte[] data) {
- this.code = code;
- this.meta = meta;
- this.data = data;
- }
- }
-
-
- public static class GeminiInput {
- private final InputStream is;
- private final byte[] buffer = new byte[1024];
- private int idx;
-
- final String result;
-
- public GeminiInput(InputStream is) throws IOException {
- this.is = is;
-
- for (idx = 0; idx < buffer.length; idx++) {
- if (hasEndOfLine()) {
- result = new String(buffer, 0, idx-2, StandardCharsets.UTF_8);
- return;
- }
-
- readCharacter();
- }
-
- throw new RuntimeException("String too long");
- }
-
- public String get() {
- return result;
- }
-
- private void readCharacter() throws IOException {
- int rb = is.read();
- if (-1 == rb) {
- throw new RuntimeException("URL incomplete (no CR LF)");
- }
- buffer[idx] = (byte) rb;
- }
-
- public boolean hasEndOfLine() {
- return idx > 2
- && buffer[idx - 1] == (byte) '\n'
- && buffer[idx - 2] == (byte) '\r';
- }
-
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java
deleted file mode 100644
index 692b65ce..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/Gemtext.java
+++ /dev/null
@@ -1,53 +0,0 @@
-package nu.marginalia.memex.gemini.gmi;
-
-import lombok.Getter;
-import nu.marginalia.memex.gemini.gmi.line.AbstractGemtextLine;
-import nu.marginalia.memex.gemini.gmi.parser.GemtextParser;
-import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer;
-import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
-import nu.marginalia.memex.memex.model.MemexNodeUrl;
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
-
-
-@Getter
-public class Gemtext {
- private final AbstractGemtextLine[] lines;
- private final MemexNodeUrl url;
-
- public Gemtext(MemexNodeUrl url, String[] lines, MemexNodeHeadingId headingRoot) {
- this.lines = GemtextParser.parse(lines, headingRoot);
- this.url = url;
- }
- public Gemtext(MemexNodeUrl url, String[] lines) {
- this.lines = GemtextParser.parse(lines, new MemexNodeHeadingId(0));
- this.url = url;
- }
-
- public String render(GemtextRenderer renderer) {
- return Arrays.stream(lines).map(renderer::renderLine).collect(Collectors.joining());
- }
-
- public void render(GemtextRenderer renderer, Writer w) throws IOException {
- for (var line : lines) {
- w.write(renderer.renderLine(line));
- w.write('\n');
- }
- }
-
- public Stream stream() {
- return Arrays.stream(lines);
- }
-
- public AbstractGemtextLine get(int idx) {
- return lines[idx];
- }
- public int size() {
- return lines.length;
- }
-
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java
deleted file mode 100644
index 2beb1772..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDatabase.java
+++ /dev/null
@@ -1,71 +0,0 @@
-package nu.marginalia.memex.gemini.gmi;
-
-import com.google.common.collect.Sets;
-import nu.marginalia.memex.gemini.gmi.line.GemtextLineVisitorAdapter;
-import nu.marginalia.memex.gemini.gmi.line.GemtextLink;
-import nu.marginalia.memex.memex.model.MemexNodeUrl;
-import nu.marginalia.memex.memex.model.MemexUrl;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-
-public class GemtextDatabase extends Gemtext {
- public final Map links;
-
- public GemtextDatabase(MemexNodeUrl url, String[] lines) {
- super(url, lines);
-
- links = new HashMap<>();
- for (int i = 0; i < size(); i++) {
- int linkIdx = i;
-
- get(i).visit(new GemtextLineVisitorAdapter<>() {
- @Override
- public Object visit(GemtextLink g) {
- links.put(g.getUrl().toString(), linkIdx);
- return null;
- }
- });
- }
- }
-
- public Set keys() {
- return links.keySet();
- }
-
- public Optional getLinkData(MemexUrl url) {
- Integer idx = links.get(url.getUrl());
- if (idx != null) {
- return
- Optional.of(get(idx).mapLink(GemtextLink::getTitle).orElse(""));
- }
- return Optional.empty();
- }
-
-
- public static GemtextDatabase of(MemexNodeUrl url, String[] lines) {
- return new GemtextDatabase(url, lines);
- }
-
- public static GemtextDatabase of(MemexNodeUrl url, Path file) throws IOException {
- try (var s = Files.lines(file)) {
- return new GemtextDatabase(url, s.toArray(String[]::new));
- }
- }
-
- public Set difference(GemtextDatabase other) {
- Set differences = new HashSet<>();
-
- Sets.difference(keys(), other.keys()).stream().map(MemexNodeUrl::new).forEach(differences::add);
-
- Sets.intersection(keys(), other.keys())
- .stream()
- .map(MemexNodeUrl::new)
- .filter(url -> !Objects.equals(getLinkData(url), other.getLinkData(url)))
- .forEach(differences::add);
-
- return differences;
- }
-}
diff --git a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java b/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java
deleted file mode 100644
index 8e347c6f..00000000
--- a/other/memex/src/main/java/nu/marginalia/memex/gemini/gmi/GemtextDocument.java
+++ /dev/null
@@ -1,163 +0,0 @@
-package nu.marginalia.memex.gemini.gmi;
-
-import lombok.Getter;
-import nu.marginalia.memex.gemini.gmi.renderer.GemtextRenderer;
-import nu.marginalia.memex.gemini.gmi.renderer.GemtextRendererFactory;
-import nu.marginalia.memex.gemini.gmi.line.*;
-import nu.marginalia.memex.memex.model.MemexNodeHeadingId;
-import nu.marginalia.memex.memex.model.MemexNodeTaskId;
-import nu.marginalia.memex.memex.model.MemexNodeUrl;
-import nu.marginalia.memex.memex.model.MemexTaskState;
-import org.apache.commons.lang3.tuple.Pair;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-@Getter
-public class GemtextDocument extends Gemtext {
- private final Map headings;
- private final Map> headingsByName;
- private final Set pragmas;
- private final List tasks;
-
- private final String title;
- private final String date;
- private final List links;
- private final int hashCode;
-
- private static final Pattern datePattern = Pattern.compile(".*(\\d{4}-\\d{2}-\\d{2}).*");
- private static final GemtextRenderer rawRenderer = new GemtextRendererFactory().gemtextRendererAsIs();
-
- public GemtextDocument(MemexNodeUrl url, String[] lines, MemexNodeHeadingId headingRoot) {
- super(url, lines, headingRoot);
-
- this.hashCode = Arrays.hashCode(lines);
-
- GemtextDataExtractor extractor = new GemtextDataExtractor();
-
- Arrays.stream(this.getLines()).forEach(extractor::take);
-
- this.headings = extractor.getHeadings();
- this.links = extractor.getLinks();
- this.title = Objects.requireNonNullElse(extractor.getTitle(), url.getUrl());
- this.pragmas = extractor.getPragmas();
- this.headingsByName = extractor.getHeadingsByName();
- this.tasks = extractor.getTasks();
- this.date = extractor.getDate();
- }
-
- public String getHeadingForElement(AbstractGemtextLine line) {
- return headings.getOrDefault(line.getHeading(), "");
- }
-
- public List getSection(MemexNodeHeadingId headingId) {
- return stream()
- .filter(line -> line.getHeading().isChildOf(headingId))
- .collect(Collectors.toList());
- }
-
- public String getSectionGemtext(MemexNodeHeadingId headingId) {
- if (headingId.equals(new MemexNodeHeadingId(0))) {
- return stream()
- .map(rawRenderer::renderLine)
- .collect(Collectors.joining("\n"));
- }
-
- return stream()
- .filter(line -> line.getHeading().isChildOf(headingId))
- .map(rawRenderer::renderLine)
- .collect(Collectors.joining("\n"));
- }
-
- public Map> getOpenTopTasks() {
- return tasks.stream()
- .filter(task -> MemexTaskState.TODO.equals(task.getState())
- || MemexTaskState.URGENT.equals(task.getState()))
- .filter(task -> task.getId().level() == 1)
- .collect(Collectors.toMap(GemtextTask::getId, task -> Pair.of(task.getTask(), task.getState())));
- }
-
- public static GemtextDocument of(MemexNodeUrl url, String... lines) {
- return new GemtextDocument(url, lines, new MemexNodeHeadingId(0));
- }
-
- public static GemtextDocument of(MemexNodeUrl url, Path file) throws IOException {
- try (var s = Files.lines(file)) {
- return new GemtextDocument(url, s.toArray(String[]::new), new MemexNodeHeadingId(0));
- }
- }
-
- public boolean isIndex() {
- return getUrl().getFilename().equals("index.gmi");
- }
-
- @Override
- public int hashCode() {
- return hashCode;
- }
-
- public Optional getHeading(MemexNodeHeadingId heading) {
- return Optional.ofNullable(headings.get(heading));
- }
-
- public Optional getHeadingByName(MemexNodeHeadingId parent, String name) {
- var headings = headingsByName.get(name);
- if (null == headings) {
- return Optional.empty();
- }
- return headings.stream().filter(heading -> heading.isChildOf(parent)).findAny();
- }
-
- @Getter
- private static class GemtextDataExtractor extends GemtextLineVisitorAdapter