diff --git a/code/common/config/src/main/java/nu/marginalia/WmsaHome.java b/code/common/config/src/main/java/nu/marginalia/WmsaHome.java index 0f5f6598..b1bc3512 100644 --- a/code/common/config/src/main/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/src/main/java/nu/marginalia/WmsaHome.java @@ -77,7 +77,11 @@ public class WmsaHome { home.resolve("model/lid.176.ftz")); } + public static Path getAtagsPath() { + return getHomePath().resolve("data/atags.parquet"); + } private static final boolean debugMode = Boolean.getBoolean("wmsa-debug"); + public static boolean isDebug() { return debugMode; } diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java index dec7437e..dc627715 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordFlags.java @@ -31,14 +31,15 @@ public enum WordFlags { SiteAdjacent, /** Keyword appears in URL path - * */ UrlPath, /** Keyword appears in domain name - * */ - UrlDomain + UrlDomain, + + /** Word appears in an external link */ + ExternalLink ; public int asBit() { diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java index cc07379c..1f1add44 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/WordMetadata.java @@ -12,14 +12,12 @@ import java.util.Set; * @param flags word flags (see {@link WordFlags}) */ public record WordMetadata(long positions, - byte flags) { + int flags) { - // Bottom 8 bits are used for flags - - public static final long FLAGS_MASK = 0xFFL; - - public static final int POSITIONS_SHIFT = 8; - public static final long POSITIONS_MASK = 0xFF_FFFF_FFFF_FFFFL; + public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1; + public static final int POSITIONS_COUNT = 64 - WordFlags.values().length; + public static final int POSITIONS_SHIFT = WordFlags.values().length; + public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT; @@ -30,7 +28,7 @@ public record WordMetadata(long positions, public WordMetadata(long value) { this( ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), - (byte) (value & FLAGS_MASK) + (int)(value & FLAGS_MASK) ); } @@ -40,9 +38,9 @@ public record WordMetadata(long positions, this(positions, encodeFlags(flags)); } - private static byte encodeFlags(Set flags) { - byte ret = 0; - for (var flag : flags) { ret |= (byte) flag.asBit(); } + private static int encodeFlags(Set flags) { + int ret = 0; + for (var flag : flags) { ret |= flag.asBit(); } return ret; } @@ -69,7 +67,7 @@ public record WordMetadata(long positions, public long encode() { long ret = 0; - ret |= Byte.toUnsignedLong(flags); + ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK; ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT; return ret; diff --git a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java index 79258915..6de3179b 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/WordMetadataTest.java @@ -17,14 +17,15 @@ class WordMetadataTest { verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class))); verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class))); verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, some bits", new WordMetadata(0x7f7f7f7f7f7f7fL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, all bits", new WordMetadata( 0xffffffffffffffL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("All flags, all bits", new WordMetadata( 0xffffffffffffffL, EnumSet.allOf(WordFlags.class))); + verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class))); + verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class))); + verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class))); System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class))); System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class))); System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class))); System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1)); - System.out.println(new WordMetadata(0xffffffffffffffL, (byte)0)); + System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); + System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64)); System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64)); System.out.println(BrailleBlockPunchCards.printBits(131973L, 64)); diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle new file mode 100644 index 00000000..122046f8 --- /dev/null +++ b/code/features-convert/anchor-keywords/build.gradle @@ -0,0 +1,33 @@ +plugins { + id 'java' + id "de.undercouch.download" version "5.1.0" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:config') + implementation project(':code:common:model') + implementation project(':code:common:db') + implementation project(':code:common:process') + implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:libraries:language-processing') + + + implementation libs.bundles.slf4j + implementation libs.guice + implementation libs.bundles.mariadb + implementation libs.duckdb + implementation libs.notnull + implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java new file mode 100644 index 00000000..524caaba --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -0,0 +1,74 @@ +package nu.marginalia.atags; + +import com.google.inject.Inject; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeUrl; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.*; + +public class AnchorTextKeywords { + private final KeywordExtractor keywordExtractor; + private final SentenceExtractor sentenceExtractor; + private final Set stopList; + @Inject + public AnchorTextKeywords(KeywordExtractor keywordExtractor, + SentenceExtractor sentenceExtractor) + { + this.keywordExtractor = keywordExtractor; + this.sentenceExtractor = sentenceExtractor; + + stopList = readStoplist(); + } + + private Set readStoplist() { + Set ret = new HashSet<>(); + + try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("atags-stop-list"), + "Could not load word frequency table"); + var br = new BufferedReader(new InputStreamReader(resource)) + ) { + while (true) { + String s = br.readLine(); + + if (s == null) break; + if (s.isBlank()) continue; + + ret.add(s.trim()); + } + } catch (IOException e) { + e.printStackTrace(); + } + return ret; + } + + public List getAnchorTextKeywords(DomainLinks links, EdgeUrl url) { + var keywordsRaw = links.forUrl(url); + + // Extract and count keywords from anchor text + Map wordsWithCount = new HashMap<>(); + for (var keyword : keywordsRaw) { + if (stopList.contains(keyword.text().toLowerCase())) + continue; + + var sentence = sentenceExtractor.extractSentence(keyword.text()); + for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) { + wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum); + } + } + + // Filter out keywords that appear infrequently + final List keywords = new ArrayList<>(wordsWithCount.size()); + for (var wordEntry : wordsWithCount.entrySet()) { + if (wordEntry.getValue() > 2) { + keywords.add(wordEntry.getKey()); + } + } + + return keywords; + } +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java new file mode 100644 index 00000000..bee75337 --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/DomainLinks.java @@ -0,0 +1,40 @@ +package nu.marginalia.atags.model; + +import nu.marginalia.model.EdgeUrl; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class DomainLinks { + private final Map> links; + + public DomainLinks() { + links = Map.of(); + } + + public DomainLinks(List linksForDomain) { + links = linksForDomain. + stream() + .collect(Collectors.groupingBy(LinkWithText::url, + Collectors.mapping(LinkWithText::toLink, Collectors.toList()))); + } + + public List getUrls() { + return new ArrayList<>(links.keySet()); + } + + public List forUrl(EdgeUrl url) { + String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); + return links.getOrDefault(key, List.of()); + } + + @Override + public String toString() { + return "DomainLinks{" + + "links=" + links + + '}'; + } +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/Link.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/Link.java new file mode 100644 index 00000000..1c76469f --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/Link.java @@ -0,0 +1,4 @@ +package nu.marginalia.atags.model; + +public record Link(String source, String text) { +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/LinkWithText.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/LinkWithText.java new file mode 100644 index 00000000..784580fc --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/model/LinkWithText.java @@ -0,0 +1,7 @@ +package nu.marginalia.atags.model; + +public record LinkWithText(String url, String text, String source) { + public Link toLink() { + return new Link(source, text); + } +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsImpl.java new file mode 100644 index 00000000..9004b2ac --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsImpl.java @@ -0,0 +1,93 @@ +package nu.marginalia.atags.source; + +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.atags.model.LinkWithText; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class AnchorTagsImpl implements AnchorTagsSource { + private final Connection duckdbConnection; + private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class); + public AnchorTagsImpl(Path atagsPath, + List relevantDomains) + throws SQLException + { + duckdbConnection = DriverManager.getConnection("jdbc:duckdb:"); + + logger.info("Loading atags from " + atagsPath); + + try (var stmt = duckdbConnection.createStatement()) { + // Insert the domains into a temporary table, then use that to filter the atags table + + stmt.executeUpdate("create table domains (domain varchar)"); + try (var ps = duckdbConnection.prepareStatement("insert into domains values (?)")) { + for (var domain : relevantDomains) { + ps.setString(1, domain.toString()); + ps.executeUpdate(); + } + } + + // Project the atags table down to only the relevant domains. This looks like an SQL injection + // vulnerability if you're a validation tool, but the string comes from a trusted source. + stmt.executeUpdate(""" + create table atags as + select * from '%s' + where dest in (select * from domains) + """.formatted(atagsPath.toAbsolutePath())); + + // Free up the memory used by the domains table + stmt.executeUpdate("drop table domains"); + + // Create an index on the dest column to speed up queries + stmt.executeUpdate("create index atags_dest on atags(dest)"); + + // This is probably not necessary + if (!duckdbConnection.getAutoCommit()) { + duckdbConnection.commit(); + } + } + + logger.info("Finished loading!"); + + } + + @Override + public DomainLinks getAnchorTags(EdgeDomain domain) { + List links = new ArrayList<>(); + + try (var ps = duckdbConnection.prepareStatement(""" + select + unnest(text) as 'text', + unnest(url) as 'url', + unnest(source) as 'source' + from atags + where dest = ? + """)) + { + ps.setString(1, domain.toString()); + var rs = ps.executeQuery(); + while (rs.next()) { + links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source"))); + } + return new DomainLinks(links); + } + catch (SQLException ex) { + logger.warn("Failed to get atags for " + domain, ex); + } + + return new DomainLinks(); + } + + @Override + public void close() throws Exception { + duckdbConnection.close(); + } +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSource.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSource.java new file mode 100644 index 00000000..d0dbb9e6 --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSource.java @@ -0,0 +1,10 @@ +package nu.marginalia.atags.source; + +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.model.EdgeDomain; + +public interface AnchorTagsSource extends AutoCloseable { + DomainLinks getAnchorTags(EdgeDomain domain); + + default void close() throws Exception {} +} diff --git a/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java new file mode 100644 index 00000000..0b8596bd --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java @@ -0,0 +1,73 @@ +package nu.marginalia.atags.source; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.ProcessConfiguration; +import nu.marginalia.WmsaHome; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class AnchorTagsSourceFactory { + private final Path atagsPath; + private final int nodeId; + private final HikariDataSource dataSource; + private static final Logger logger = LoggerFactory.getLogger(AnchorTagsSourceFactory.class); + @Inject + public AnchorTagsSourceFactory(HikariDataSource dataSource, + ProcessConfiguration config) + { + this.dataSource = dataSource; + this.atagsPath = WmsaHome.getAtagsPath(); + this.nodeId = config.node(); + } + + public AnchorTagsSource create() throws SQLException { + if (!Files.exists(atagsPath)) + return dummy(); + + List relevantDomains = getRelevantDomains(); + + if (relevantDomains.isEmpty()) + return dummy(); + + return new AnchorTagsImpl(atagsPath, relevantDomains); + } + + private AnchorTagsSource dummy() { + return x -> new DomainLinks(); + } + + // Only get domains that are assigned to this node. This reduces the amount of data + // that needs to be loaded into the duckdb instance to a more manageable level, and keeps + // the memory footprint of the service down. + private List getRelevantDomains() { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT DOMAIN_NAME + FROM WMSA_prod.EC_DOMAIN + WHERE NODE_AFFINITY = ? + """)) + { + stmt.setInt(1, nodeId); + var rs = stmt.executeQuery(); + var ret = new ArrayList(); + while (rs.next()) { + ret.add(new EdgeDomain(rs.getString(1))); + } + return ret; + } catch (Exception e) { + logger.warn("Failed to get relevant domains for node id " + nodeId, e); + return List.of(); + } + } + + +} diff --git a/code/features-convert/anchor-keywords/src/main/resources/atags-stop-list b/code/features-convert/anchor-keywords/src/main/resources/atags-stop-list new file mode 100644 index 00000000..71d0131c --- /dev/null +++ b/code/features-convert/anchor-keywords/src/main/resources/atags-stop-list @@ -0,0 +1,147 @@ +[-] +facebook +website +twitter +link +instagram +read more +visit website +amazon +youtube +linkedin +learn more +go to album +more +share +sign up +log in +buy now +1 +here. +. +italiano +privacy policy +register +2 +buy tickets +book now +view +more info +apply now +play pause +report +contact us +view article +home +online +d +3 +[link] +hier +read +reports +view website +said +[1] +reported +wordpress +announced +4 +fr +donate +contact +news +more information +en +apply +via +post +comments +register now +listen +read full review +details +register here +visit site +privacy +5 +shop now +[2] +info +help +play +de +this +* +terms of use +directions +url +open +p +photo +careers +find out more +login +view original post +0 +about +live demo +blogger +6 +www +subscribe +view details +watch +read more... +view profile +download now +wrote +t +get started +gallery +7 +preview +visit +terms of service +a +email +html +view more +view this tutor +spanish +8 +permalink +read article +results +demo +rss +[4] +about us +part 2 +click here. +get tickets +visit websitewebsite +es +says +pin ++ +watch now +listen now +writes +part 1 +clicking here +page +link here +i +[5] +profile +it +sign in +11 +french +donate now +home page +order now +12 +more... +these \ No newline at end of file diff --git a/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java new file mode 100644 index 00000000..d585ff7c --- /dev/null +++ b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/atags/DomainAnchorTagsImplTest.java @@ -0,0 +1,46 @@ +package nu.marginalia.atags; + +import nu.marginalia.atags.source.AnchorTagsImpl; +import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.util.TestLanguageModels; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; + +class DomainAnchorTagsImplTest { + + @Test + void getAnchorTags() { + Path atagsPath = Path.of("/home/vlofgren/atags.parquet"); + try (var domainAnchorTags = new AnchorTagsImpl( + atagsPath, List.of(new EdgeDomain("www.chiark.greenend.org.uk")) + )) { + var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk")); + + System.out.println(tags); + System.out.println(tags.getUrls()); + System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/"))); + System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/"))); + System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt"))); + + var atagsKeywords = new AnchorTextKeywords( + new KeywordExtractor(), + new SentenceExtractor( + TestLanguageModels.getLanguageModels() + ) + ); + System.out.println( + atagsKeywords.getAnchorTextKeywords(tags, new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/")) + ); + } catch (SQLException e) { + throw new RuntimeException(e); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} \ No newline at end of file diff --git a/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/util/TestLanguageModels.java b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/util/TestLanguageModels.java new file mode 100644 index 00000000..5efd2025 --- /dev/null +++ b/code/features-convert/anchor-keywords/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -0,0 +1,38 @@ +package nu.marginalia.util; + +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; + +public class TestLanguageModels { + private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model"); + + public static Path getLanguageModelsPath() { + final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME")) + .map(Path::of) + .orElse(LANGUAGE_MODELS_DEFAULT); + + if (!Files.isDirectory(languageModelsHome)) { + throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md"); + } + return languageModelsHome; + } + + public static LanguageModels getLanguageModels() { + + var languageModelsHome = getLanguageModelsPath(); + + return new LanguageModels( + languageModelsHome.resolve("ngrams.bin"), + languageModelsHome.resolve("tfreq-new-algo3.bin"), + languageModelsHome.resolve("opennlp-sentence.bin"), + languageModelsHome.resolve("English.RDR"), + languageModelsHome.resolve("English.DICT"), + languageModelsHome.resolve("opennlp-tokens.bin"), + languageModelsHome.resolve("lid.176.ftz") + ); + } +} diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java index 5e91b12c..b402c9f6 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java @@ -4,12 +4,13 @@ import com.google.inject.Inject; import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.model.idx.WordMetadata; /** Generates a position bitmask for each word in a document */ public class KeywordPositionBitmask { private final Object2LongOpenHashMap positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f); - private final static int positionWidth = 56; - private final static long positionBitmask = (1L << positionWidth) - 1; + private final static int positionWidth = WordMetadata.POSITIONS_COUNT; + private final static long positionBitmask = WordMetadata.POSITIONS_MASK; private static final int unmodulatedPortion = 16; @Inject diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 6ce80372..aadb893d 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -2,7 +2,6 @@ package nu.marginalia.keyword.model; import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import lombok.Getter; -import lombok.ToString; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; @@ -76,6 +75,14 @@ public class DocumentKeywordsBuilder { newWords.forEach(word -> words.putIfAbsent(word, meta)); } + public void addAnchorTerms(List keywords) { + long meta = WordFlags.Title.asBit() + | WordFlags.ExternalLink.asBit() + | (1L << WordMetadata.POSITIONS_SHIFT); + + keywords.forEach(word -> words.mergeLong(word, meta, (a, b) -> a|b)); + } + public List getWordsWithAnyFlag(long flags) { List ret = new ArrayList<>(); @@ -103,4 +110,5 @@ public class DocumentKeywordsBuilder { return sb.append(']').toString(); } + } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 7d355d6e..54ad3010 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -53,9 +53,6 @@ public class ForwardIndexReader { data = loadData(dataFile); } - public void selfTest() { - - } private static TLongIntHashMap loadIds(Path idsFile) throws IOException { try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) { assert idsArray.size() < Integer.MAX_VALUE; diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java index b1c9462e..31ac7d2c 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/ResultValuator.java @@ -11,6 +11,9 @@ import nu.marginalia.ranking.factors.*; import com.google.inject.Inject; import com.google.inject.Singleton; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.ArrayList; import java.util.List; @@ -18,7 +21,7 @@ import static java.lang.Math.min; @Singleton public class ResultValuator { - final static double scalingFactor = 250.; + final static double scalingFactor = 500.; private final Bm25Factor bm25Factor; private final TermCoherenceFactor termCoherenceFactor; @@ -28,6 +31,8 @@ public class ResultValuator { private final ThreadLocal> listPool = ThreadLocal.withInitial(ValuatorListPool::new); + private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); + @Inject public ResultValuator(Bm25Factor bm25Factor, TermCoherenceFactor termCoherenceFactor, @@ -46,7 +51,6 @@ public class ResultValuator { var threadListPool = listPool.get(); int sets = numberOfSets(scores); - double bestScore = 10; long documentMetadata = documentMetadata(scores); int features = htmlFeatures(scores); @@ -56,7 +60,7 @@ public class ResultValuator { int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); int quality = DocumentMetadata.decodeQuality(documentMetadata); int size = DocumentMetadata.decodeSize(documentMetadata); - int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality); + int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size); int topology = DocumentMetadata.decodeTopology(documentMetadata); int year = DocumentMetadata.decodeYear(documentMetadata); @@ -85,7 +89,8 @@ public class ResultValuator { + flagsPenalty + priorityTermBonus.calculate(scores); - for (int set = 0; set <= sets; set++) { + double bestScore = 10; + for (int set = 0; set < sets; set++) { ResultKeywordSet keywordSet = createKeywordSet(threadListPool, scores, set); if (keywordSet.isEmpty() || keywordSet.hasNgram()) @@ -95,8 +100,7 @@ public class ResultValuator { final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx); final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx); - double nonNormalizedScore = bm25 + bm25p + tcf + overallPart; - double score = normalize(nonNormalizedScore, keywordSet.length()); + double score = normalize(bm25 + bm25p + tcf + overallPart); bestScore = min(bestScore, score); @@ -116,7 +120,7 @@ public class ResultValuator { } } - private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) { + private int flagsPenalty(int featureFlags, long docFlags, int size) { // Short-circuit for index-service, which does not have the feature flags if (featureFlags == 0) @@ -203,11 +207,11 @@ public class ResultValuator { return 1 + maxSet; } - public static double normalize(double value, int setSize) { + public static double normalize(double value) { if (value < 0) value = 0; - return Math.sqrt((1.0 + scalingFactor) / (1.0 + value / Math.max(1., setSize))); + return Math.sqrt((1.0 + scalingFactor) / (1.0 + value)); } } diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java index fe35eccf..a03eeeff 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/Bm25Factor.java @@ -43,6 +43,8 @@ public class Bm25Factor { | WordFlags.SiteAdjacent.asBit() | WordFlags.UrlPath.asBit() | WordFlags.UrlDomain.asBit() + | WordFlags.ExternalLink.asBit() + | WordFlags.Title.asBit() | WordFlags.Subjects.asBit(); for (var keyword : keywordSet.keywords()) { @@ -50,6 +52,7 @@ public class Bm25Factor { int freq = ctx.priorityFrequency(keyword.keyword); + // note we override b to zero for priority terms as they are independent of document length sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); } diff --git a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java index 482676d3..54964dc1 100644 --- a/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java +++ b/code/features-index/result-ranking/src/main/java/nu/marginalia/ranking/factors/TermCoherenceFactor.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.factors; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.ResultKeywordSet; /** Rewards documents where terms appear frequently within the same sentences @@ -15,16 +16,14 @@ public class TermCoherenceFactor { double bitsSetFactor(long mask) { final int bitsSetInMask = Long.bitCount(mask); - return Math.pow(bitsSetInMask/56., 0.25); + return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25); } long combinedMask(ResultKeywordSet keywordSet) { - long mask = 0xFF_FFFF_FFFF_FFFFL; + long mask = WordMetadata.POSITIONS_MASK; for (var keyword : keywordSet.keywords()) { - long positions = keyword.positions(); - - mask &= positions; + mask &= keyword.positions(); } return mask; diff --git a/code/features-index/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java b/code/features-index/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java index a981ba80..b4f455f4 100644 --- a/code/features-index/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java +++ b/code/features-index/result-ranking/src/test/java/nu/marginalia/ranking/factors/TermCoherenceFactorTest.java @@ -17,7 +17,7 @@ class TermCoherenceFactorTest { @Test public void testAllBitsSet() { var allPositionsSet = createSet( - 0xFF_FFFF_FFFF_FFFFL, 0xFF_FFFF_FFFF_FFFFL + WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK ); long mask = termCoherenceFactor.combinedMask(allPositionsSet); diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index acbddf7e..faa952fb 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -49,6 +49,7 @@ dependencies { implementation project(':code:process-models:crawling-model') implementation project(':code:features-convert:adblock') + implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-convert:topic-detection') implementation project(':code:features-convert:pubdate') implementation project(':code:features-convert:keyword-extraction') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 79cb7444..e16d1afe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -2,6 +2,9 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; import lombok.SneakyThrows; +import nu.marginalia.atags.AnchorTextKeywords; +import nu.marginalia.atags.source.AnchorTagsSource; +import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; import nu.marginalia.crawling.io.SerializableCrawlDataStream; @@ -16,11 +19,14 @@ import nu.marginalia.model.crawl.HtmlFeature; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.sql.SQLException; import java.util.*; public class DomainProcessor { private final DocumentProcessor documentProcessor; private final SiteWords siteWords; + private final AnchorTagsSource anchorTagsSource; + private final AnchorTextKeywords anchorTextKeywords; private final LshDocumentDeduplicator documentDeduplicator; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -28,11 +34,15 @@ public class DomainProcessor { @Inject public DomainProcessor(DocumentProcessor documentProcessor, SiteWords siteWords, - LshDocumentDeduplicator documentDeduplicator) + AnchorTagsSourceFactory anchorTagsSourceFactory, + AnchorTextKeywords anchorTextKeywords, + LshDocumentDeduplicator documentDeduplicator) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; + this.anchorTextKeywords = anchorTextKeywords; this.documentDeduplicator = documentDeduplicator; + this.anchorTagsSource = anchorTagsSourceFactory.create(); } @SneakyThrows @@ -76,19 +86,26 @@ public class DomainProcessor { List terms = new ArrayList<>(); terms.add("ip:"+ip); - if (cookies) + if (cookies) { terms.add(HtmlFeature.COOKIES.getKeyword()); + } + + var atags = anchorTagsSource.getAnchorTags(ret.domain); for (var document : ret.documents) { if (document.details == null) continue; - if (cookies) + if (cookies) { document.details.features.add(HtmlFeature.COOKIES); + } document.words.addAllSyntheticTerms(terms); - } + document.words.addAnchorTerms( + anchorTextKeywords.getAnchorTextKeywords(atags, document.url) + ); + } documentDeduplicator.deduplicate(ret.documents); calculateStatistics(ret); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 8419687e..e0691471 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -53,7 +53,7 @@ public class SideloaderProcessing { ret.details = details.details(); ret.details.metadata = ret.details.metadata - .withSize(size, Math.max(0, 255 - url.length())); + .withSize(size, Math.max(0, 32 - url.length()) / 4); ret.url = new EdgeUrl(url); ret.state = UrlIndexingState.OK; ret.stateReason = "SIDELOAD"; diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java index 4471e4d1..b23c5cc1 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTestModule.java @@ -5,6 +5,7 @@ import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.ConverterDomainTypes; +import nu.marginalia.service.module.ServiceConfiguration; import org.mockito.Mockito; public class ConvertingIntegrationTestModule extends AbstractModule { @@ -13,6 +14,9 @@ public class ConvertingIntegrationTestModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); + bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( + null, 1, "localhost", 0, 0, null + )); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class)); diff --git a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java index 42a2f888..3e93dafd 100644 --- a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java @@ -146,6 +146,7 @@ public class IndexConstructorMain { | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.Site.asBit() + | WordFlags.ExternalLink.asBit() | WordFlags.SiteAdjacent.asBit(); return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index e16e7283..e19d3809 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -53,14 +53,14 @@ public class IndexResultValuator { } private final long flagsFilterMask = - WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit(); + WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); @Nullable - public SearchResultItem calculatePreliminaryScore(long id) { + public SearchResultItem calculatePreliminaryScore(long combinedId) { - final long docId = UrlIdCodec.removeRank(id); + final long docId = UrlIdCodec.removeRank(combinedId); - if (!termMetadataForDocuments.testCoherence(id, searchTerms.coherences)) + if (!termMetadataForDocuments.testCoherence(combinedId, searchTerms.coherences)) return null; long docMetadata = metadataService.getDocumentMetadata(docId); @@ -70,7 +70,7 @@ public class IndexResultValuator { boolean anyAllSynthetic = false; int maxPositionsSet = 0; - SearchResultItem searchResult = new SearchResultItem(id, + SearchResultItem searchResult = new SearchResultItem(combinedId, searchTermVariants.stream().mapToInt(List::size).sum()); for (int querySetId = 0; @@ -133,7 +133,7 @@ public class IndexResultValuator { rankingContext); searchResult.setScore(new SearchResultPreliminaryScore( - resultsWithPriorityTerms.contains(id), + resultsWithPriorityTerms.contains(combinedId), score )); diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index e308b8e5..44494949 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -23,10 +23,12 @@ dependencies { implementation project(':third-party:rdrpostagger') implementation project(':third-party:porterstemmer') implementation project(':third-party:monkey-patch-opennlp') + implementation project(':code:common:db') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:process') implementation project(':code:common:service') + implementation project(':code:common:service-discovery') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:big-string') @@ -36,6 +38,7 @@ dependencies { implementation project(':third-party:commons-codec') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:adblock') + implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-convert:topic-detection') implementation project(':code:features-convert:keyword-extraction') @@ -49,6 +52,7 @@ dependencies { implementation libs.bundles.nlp implementation libs.commons.lang3 + implementation libs.bundles.mariadb testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 9997ce71..97df4a39 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -22,6 +22,7 @@ public class ExperimentRunnerMain { "test", TestExperiment.class, "adblock", AdblockExperiment.class, "topic", TopicExperiment.class, + "atags", AtagsExperiment.class, "sentence-statistics", SentenceStatisticsExperiment.class, "site-statistics", SiteStatisticsExperiment.class, "export-atags", ExportExternalLinksExperiment.class, diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java index 52bfa035..4e61ffc4 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java @@ -5,6 +5,7 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import java.io.IOException; +import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AtagsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AtagsExperiment.java new file mode 100644 index 00000000..d08ec90f --- /dev/null +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AtagsExperiment.java @@ -0,0 +1,52 @@ +package nu.marginalia.tools.experiments; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import lombok.SneakyThrows; +import nu.marginalia.ProcessConfiguration; +import nu.marginalia.atags.AnchorTextKeywords; +import nu.marginalia.atags.source.AnchorTagsSource; +import nu.marginalia.atags.source.AnchorTagsSourceFactory; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.tools.LegacyExperiment; + +import java.sql.SQLException; + +public class AtagsExperiment extends LegacyExperiment { + + + private final AnchorTextKeywords keywords; + private final AnchorTagsSource source; + + @Inject + public AtagsExperiment(AnchorTextKeywords keywords, HikariDataSource dataSource) throws SQLException { + this.keywords = keywords; + this.source = new AnchorTagsSourceFactory(dataSource, new ProcessConfiguration(null, 1, null)) + .create(); + + } + + @Override + @SneakyThrows + public boolean process(CrawledDomain domain) { + var atags = source.getAnchorTags(new EdgeDomain(domain.domain)); + for (var doc : domain.doc) { + if (doc.documentBody == null) + continue; + + var newKeywords = keywords.getAnchorTextKeywords(atags, new EdgeUrl(doc.url)); + if (!newKeywords.isEmpty()) { + System.out.println(newKeywords + " " + doc.url); + } + } + return true; + } + + @Override + @SneakyThrows + public void onFinish() { + source.close(); + } +} diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java index 12c88215..98c11e7f 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java @@ -1,22 +1,11 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; -import nu.marginalia.WmsaHome; -import nu.marginalia.adblock.GoogleAnwersSpamDetector; import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.converting.processor.DocumentProcessor; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.tools.Experiment; -import nu.marginalia.topic.RecipeDetector; -import nu.marginalia.topic.TextileCraftDetector; -import nu.marginalia.topic.WoodworkingDetector; -import org.jsoup.Jsoup; import java.util.Comparator; diff --git a/settings.gradle b/settings.gradle index b6c330a2..952acd9c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -31,6 +31,7 @@ include 'code:features-qs:query-parser' include 'code:features-index:result-ranking' include 'code:features-convert:adblock' +include 'code:features-convert:anchor-keywords' include 'code:features-convert:stackexchange-xml' include 'code:features-convert:pubdate' include 'code:features-convert:summary-extraction' @@ -149,7 +150,7 @@ dependencyResolutionManagement { library('fastutil', 'it.unimi.dsi', 'fastutil').version('8.5.8') library('hll', 'net.agkn', 'hll').version('1.6.0') - + library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1') library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0') library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')