Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-22 20:48:59 +00:00)
Initial Commit: Anchor Tags
* Added new (optional) model file in $WMSA_HOME/data/atags.parquet
* Converter gets a component for creating a projection of its domains onto the full atags parquet file
* New WordFlag: ExternalLink
* These terms are, for now, also flagged as title words
* Fixed a bug where Title words aliased with UrlDomain words
* Fixed a bug in the encyclopedia sideloader that gave everything too high a topology ranking
This commit is contained in:
parent 30ca5046b5
commit 0152004c42
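
As a minimal sketch of how the new pieces fit together, condensed from the DomainProcessor changes further down (names are taken from the diffs; setup and error handling omitted):

    // A sketch, not the verbatim wiring -- see the DomainProcessor hunks below.
    AnchorTagsSource source = anchorTagsSourceFactory.create();  // DuckDB-backed, or a dummy if atags.parquet is absent
    DomainLinks atags = source.getAnchorTags(domain);            // external links pointing into this domain

    for (var document : documents) {
        // anchor texts that survive the stop list and frequency filter become
        // searchable terms, flagged Title | ExternalLink
        document.words.addAnchorTerms(
                anchorTextKeywords.getAnchorTextKeywords(atags, document.url));
    }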
@@ -77,7 +77,11 @@ public class WmsaHome {
                 home.resolve("model/lid.176.ftz"));
     }
 
+    public static Path getAtagsPath() {
+        return getHomePath().resolve("data/atags.parquet");
+    }
+
     private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
 
     public static boolean isDebug() {
         return debugMode;
     }
@@ -31,14 +31,15 @@ public enum WordFlags {
     SiteAdjacent,
 
     /** Keyword appears in URL path
     *
     */
     UrlPath,
 
     /** Keyword appears in domain name
     *
     */
-    UrlDomain
+    UrlDomain,
+
+    /** Word appears in an external link */
+    ExternalLink
     ;
 
     public int asBit() {
@@ -12,14 +12,12 @@ import java.util.Set;
  * @param flags word flags (see {@link WordFlags})
  */
 public record WordMetadata(long positions,
-                           byte flags) {
+                           int flags) {
 
-    // Bottom 8 bits are used for flags
-    public static final long FLAGS_MASK = 0xFFL;
-    public static final int POSITIONS_SHIFT = 8;
-    public static final long POSITIONS_MASK = 0xFF_FFFF_FFFF_FFFFL;
+    public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
+    public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
+    public static final int POSITIONS_SHIFT = WordFlags.values().length;
+    public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;
 
@@ -30,7 +28,7 @@ public record WordMetadata(long positions,
     public WordMetadata(long value) {
         this(
                 ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
-                (byte) (value & FLAGS_MASK)
+                (int)(value & FLAGS_MASK)
         );
     }
 
@@ -40,9 +38,9 @@ public record WordMetadata(long positions,
         this(positions, encodeFlags(flags));
     }
 
-    private static byte encodeFlags(Set<WordFlags> flags) {
-        byte ret = 0;
-        for (var flag : flags) { ret |= (byte) flag.asBit(); }
+    private static int encodeFlags(Set<WordFlags> flags) {
+        int ret = 0;
+        for (var flag : flags) { ret |= flag.asBit(); }
         return ret;
     }
 
@@ -69,7 +67,7 @@ public record WordMetadata(long positions,
     public long encode() {
         long ret = 0;
 
-        ret |= Byte.toUnsignedLong(flags);
+        ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
         ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;
 
         return ret;
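A note on the derived constants, inferred from the test values below rather than stated in the diff: the 0x3f_ffff_ffff_ffffL constants imply 54 position bits, i.e. ten WordFlags values once ExternalLink is added (an assumption, since the full enum is not shown here):

    // Inferred layout sketch -- assumes WordFlags.values().length == 10
    //   FLAGS_MASK      = (1L << 10) - 1 = 0x3FF
    //   POSITIONS_SHIFT = 10
    //   POSITIONS_COUNT = 64 - 10 = 54
    //   POSITIONS_MASK  = ~0L >>> 10 = 0x3F_FFFF_FFFF_FFFFL
    // encode() thus packs a word as [ 54 position bits | 10 flag bits ]:
    long encoded = new WordMetadata(0x3f_ffff_ffff_ffffL, WordFlags.Title.asBit()).encode();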
@@ -17,14 +17,15 @@ class WordMetadataTest {
     verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
     verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
     verifyCodec("No flags", new WordMetadata(0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
-    verifyCodec("No flags, some bits", new WordMetadata(0x7f7f7f7f7f7f7fL, EnumSet.noneOf(WordFlags.class)));
-    verifyCodec("No flags, all bits", new WordMetadata(0xffffffffffffffL, EnumSet.noneOf(WordFlags.class)));
-    verifyCodec("All flags, all bits", new WordMetadata(0xffffffffffffffL, EnumSet.allOf(WordFlags.class)));
+    verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
+    verifyCodec("No flags, all bits", new WordMetadata(0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
+    verifyCodec("All flags, all bits", new WordMetadata(0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
     System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
     System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
     System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
-    System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
-    System.out.println(new WordMetadata(0xffffffffffffffL, (byte)0));
+    System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
+    System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
     System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
     System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
     System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
code/features-convert/anchor-keywords/build.gradle (new file, 33 lines)
@@ -0,0 +1,33 @@
plugins {
    id 'java'
    id "de.undercouch.download" version "5.1.0"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:common:process')
    implementation project(':code:features-convert:keyword-extraction')
    implementation project(':code:libraries:language-processing')

    implementation libs.bundles.slf4j
    implementation libs.guice
    implementation libs.bundles.mariadb
    implementation libs.duckdb
    implementation libs.notnull
    implementation libs.jsoup

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
nu/marginalia/atags/AnchorTextKeywords.java (new file)
@@ -0,0 +1,74 @@

package nu.marginalia.atags;

import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

public class AnchorTextKeywords {
    private final KeywordExtractor keywordExtractor;
    private final SentenceExtractor sentenceExtractor;
    private final Set<String> stopList;

    @Inject
    public AnchorTextKeywords(KeywordExtractor keywordExtractor,
                              SentenceExtractor sentenceExtractor)
    {
        this.keywordExtractor = keywordExtractor;
        this.sentenceExtractor = sentenceExtractor;

        stopList = readStoplist();
    }

    private Set<String> readStoplist() {
        Set<String> ret = new HashSet<>();

        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("atags-stop-list"),
                "Could not load word frequency table");
             var br = new BufferedReader(new InputStreamReader(resource))
        ) {
            while (true) {
                String s = br.readLine();

                if (s == null) break;
                if (s.isBlank()) continue;

                ret.add(s.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return ret;
    }

    public List<String> getAnchorTextKeywords(DomainLinks links, EdgeUrl url) {
        var keywordsRaw = links.forUrl(url);

        // Extract and count keywords from anchor text
        Map<String, Integer> wordsWithCount = new HashMap<>();
        for (var keyword : keywordsRaw) {
            if (stopList.contains(keyword.text().toLowerCase()))
                continue;

            var sentence = sentenceExtractor.extractSentence(keyword.text());
            for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) {
                wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum);
            }
        }

        // Filter out keywords that appear infrequently
        final List<String> keywords = new ArrayList<>(wordsWithCount.size());
        for (var wordEntry : wordsWithCount.entrySet()) {
            if (wordEntry.getValue() > 2) {
                keywords.add(wordEntry.getKey());
            }
        }

        return keywords;
    }
}
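Note the threshold in the final loop: getValue() > 2 means a keyword must occur in at least three anchor texts pointing at the URL before it is kept, so one-off link texts never become search terms.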
nu/marginalia/atags/model/DomainLinks.java (new file)
@@ -0,0 +1,40 @@

package nu.marginalia.atags.model;

import nu.marginalia.model.EdgeUrl;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class DomainLinks {
    private final Map<String, List<Link>> links;

    public DomainLinks() {
        links = Map.of();
    }

    public DomainLinks(List<LinkWithText> linksForDomain) {
        links = linksForDomain.stream()
                .collect(Collectors.groupingBy(LinkWithText::url,
                        Collectors.mapping(LinkWithText::toLink, Collectors.toList())));
    }

    public List<String> getUrls() {
        return new ArrayList<>(links.keySet());
    }

    public List<Link> forUrl(EdgeUrl url) {
        String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
        return links.getOrDefault(key, List.of());
    }

    @Override
    public String toString() {
        return "DomainLinks{" +
                "links=" + links +
                '}';
    }
}
nu/marginalia/atags/model/Link.java (new file)
@@ -0,0 +1,4 @@

package nu.marginalia.atags.model;

public record Link(String source, String text) {
}
nu/marginalia/atags/model/LinkWithText.java (new file)
@@ -0,0 +1,7 @@

package nu.marginalia.atags.model;

public record LinkWithText(String url, String text, String source) {
    public Link toLink() {
        return new Link(source, text);
    }
}
nu/marginalia/atags/source/AnchorTagsImpl.java (new file)
@@ -0,0 +1,93 @@

package nu.marginalia.atags.source;

import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.LinkWithText;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class AnchorTagsImpl implements AnchorTagsSource {
    private final Connection duckdbConnection;
    private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);

    public AnchorTagsImpl(Path atagsPath,
                          List<EdgeDomain> relevantDomains)
            throws SQLException
    {
        duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");

        logger.info("Loading atags from " + atagsPath);

        try (var stmt = duckdbConnection.createStatement()) {
            // Insert the domains into a temporary table, then use that to filter the atags table
            stmt.executeUpdate("create table domains (domain varchar)");
            try (var ps = duckdbConnection.prepareStatement("insert into domains values (?)")) {
                for (var domain : relevantDomains) {
                    ps.setString(1, domain.toString());
                    ps.executeUpdate();
                }
            }

            // Project the atags table down to only the relevant domains.  This looks like an SQL injection
            // vulnerability if you're a validation tool, but the string comes from a trusted source.
            stmt.executeUpdate("""
                    create table atags as
                    select * from '%s'
                    where dest in (select * from domains)
                    """.formatted(atagsPath.toAbsolutePath()));

            // Free up the memory used by the domains table
            stmt.executeUpdate("drop table domains");

            // Create an index on the dest column to speed up queries
            stmt.executeUpdate("create index atags_dest on atags(dest)");

            // This is probably not necessary
            if (!duckdbConnection.getAutoCommit()) {
                duckdbConnection.commit();
            }
        }

        logger.info("Finished loading!");
    }

    @Override
    public DomainLinks getAnchorTags(EdgeDomain domain) {
        List<LinkWithText> links = new ArrayList<>();

        try (var ps = duckdbConnection.prepareStatement("""
                select
                    unnest(text) as 'text',
                    unnest(url) as 'url',
                    unnest(source) as 'source'
                from atags
                where dest = ?
                """))
        {
            ps.setString(1, domain.toString());
            var rs = ps.executeQuery();
            while (rs.next()) {
                links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
            }
            return new DomainLinks(links);
        }
        catch (SQLException ex) {
            logger.warn("Failed to get atags for " + domain, ex);
        }

        return new DomainLinks();
    }

    @Override
    public void close() throws Exception {
        duckdbConnection.close();
    }
}
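A hedged usage sketch of the class above (the path and domain are illustrative; the test file further down exercises the same calls against real data):

    // Illustrative only -- mirrors DomainAnchorTagsImplTest below.
    try (AnchorTagsSource source = new AnchorTagsImpl(
            Path.of("/some/path/atags.parquet"),           // hypothetical path
            List.of(new EdgeDomain("example.com"))))       // hypothetical domain
    {
        DomainLinks links = source.getAnchorTags(new EdgeDomain("example.com"));
        System.out.println(links.getUrls());
    }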
nu/marginalia/atags/source/AnchorTagsSource.java (new file)
@@ -0,0 +1,10 @@

package nu.marginalia.atags.source;

import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.model.EdgeDomain;

public interface AnchorTagsSource extends AutoCloseable {
    DomainLinks getAnchorTags(EdgeDomain domain);

    default void close() throws Exception {}
}
nu/marginalia/atags/source/AnchorTagsSourceFactory.java (new file)
@@ -0,0 +1,73 @@

package nu.marginalia.atags.source;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

public class AnchorTagsSourceFactory {
    private final Path atagsPath;
    private final int nodeId;
    private final HikariDataSource dataSource;
    private static final Logger logger = LoggerFactory.getLogger(AnchorTagsSourceFactory.class);

    @Inject
    public AnchorTagsSourceFactory(HikariDataSource dataSource,
                                   ProcessConfiguration config)
    {
        this.dataSource = dataSource;
        this.atagsPath = WmsaHome.getAtagsPath();
        this.nodeId = config.node();
    }

    public AnchorTagsSource create() throws SQLException {
        if (!Files.exists(atagsPath))
            return dummy();

        List<EdgeDomain> relevantDomains = getRelevantDomains();

        if (relevantDomains.isEmpty())
            return dummy();

        return new AnchorTagsImpl(atagsPath, relevantDomains);
    }

    private AnchorTagsSource dummy() {
        return x -> new DomainLinks();
    }

    // Only get domains that are assigned to this node.  This reduces the amount of data
    // that needs to be loaded into the duckdb instance to a more manageable level, and keeps
    // the memory footprint of the service down.
    private List<EdgeDomain> getRelevantDomains() {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_NAME
                     FROM WMSA_prod.EC_DOMAIN
                     WHERE NODE_AFFINITY = ?
                     """))
        {
            stmt.setInt(1, nodeId);
            var rs = stmt.executeQuery();
            var ret = new ArrayList<EdgeDomain>();
            while (rs.next()) {
                ret.add(new EdgeDomain(rs.getString(1)));
            }
            return ret;
        } catch (Exception e) {
            logger.warn("Failed to get relevant domains for node id " + nodeId, e);
            return List.of();
        }
    }
}
atags-stop-list (new resource file, 147 lines)
@@ -0,0 +1,147 @@

[-]
facebook
website
twitter
link
instagram
read more
visit website
amazon
youtube
linkedin
learn more
go to album
more
share
sign up
log in
buy now
1
here.
.
italiano
privacy policy
register
2
buy tickets
book now
view
more info
apply now
play pause
report
contact us
view article
home
online
d
3
[link]
hier
read
reports
view website
said
[1]
reported
wordpress
announced
4
fr
donate
contact
news
more information
en
apply
via
post
comments
register now
listen
read full review
details
register here
visit site
privacy
5
shop now
[2]
info
help
play
de
this
*
terms of use
directions
url
open
p
photo
careers
find out more
login
view original post
0
about
live demo
blogger
6
www
subscribe
view details
watch
read more...
view profile
download now
wrote
t
get started
gallery
7
preview
visit
terms of service
a
email
html
view more
view this tutor
spanish
8
permalink
read article
results
demo
rss
[4]
about us
part 2
click here.
get tickets
visit websitewebsite
es
says
pin
+
watch now
listen now
writes
part 1
clicking here
page
link here
i
[5]
profile
it
sign in
11
french
donate now
home page
order now
12
more...
these
nu/marginalia/atags/DomainAnchorTagsImplTest.java (new file)
@@ -0,0 +1,46 @@

package nu.marginalia.atags;

import nu.marginalia.atags.source.AnchorTagsImpl;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;

class DomainAnchorTagsImplTest {

    @Test
    void getAnchorTags() {
        Path atagsPath = Path.of("/home/vlofgren/atags.parquet");
        try (var domainAnchorTags = new AnchorTagsImpl(
                atagsPath, List.of(new EdgeDomain("www.chiark.greenend.org.uk"))
        )) {
            var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk"));

            System.out.println(tags);
            System.out.println(tags.getUrls());
            System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/")));
            System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
            System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));

            var atagsKeywords = new AnchorTextKeywords(
                    new KeywordExtractor(),
                    new SentenceExtractor(
                            TestLanguageModels.getLanguageModels()
                    )
            );
            System.out.println(
                    atagsKeywords.getAnchorTextKeywords(tags, new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/"))
            );
        } catch (SQLException e) {
            throw new RuntimeException(e);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}
nu/marginalia/util/TestLanguageModels.java (new file)
@@ -0,0 +1,38 @@

package nu.marginalia.util;

import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;

public class TestLanguageModels {
    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");

    public static Path getLanguageModelsPath() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                .map(Path::of)
                .orElse(LANGUAGE_MODELS_DEFAULT);

        if (!Files.isDirectory(languageModelsHome)) {
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
        }
        return languageModelsHome;
    }

    public static LanguageModels getLanguageModels() {
        var languageModelsHome = getLanguageModelsPath();

        return new LanguageModels(
                languageModelsHome.resolve("ngrams.bin"),
                languageModelsHome.resolve("tfreq-new-algo3.bin"),
                languageModelsHome.resolve("opennlp-sentence.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("opennlp-tokens.bin"),
                languageModelsHome.resolve("lid.176.ftz")
        );
    }
}
@@ -4,12 +4,13 @@ import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
 import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.idx.WordMetadata;
 
 /** Generates a position bitmask for each word in a document */
 public class KeywordPositionBitmask {
     private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
-    private final static int positionWidth = 56;
-    private final static long positionBitmask = (1L << positionWidth) - 1;
+    private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
+    private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
     private static final int unmodulatedPortion = 16;
 
     @Inject
@@ -2,7 +2,6 @@ package nu.marginalia.keyword.model;
 
 import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 
@@ -76,6 +75,14 @@ public class DocumentKeywordsBuilder {
         newWords.forEach(word -> words.putIfAbsent(word, meta));
     }
 
+    public void addAnchorTerms(List<String> keywords) {
+        long meta = WordFlags.Title.asBit()
+                  | WordFlags.ExternalLink.asBit()
+                  | (1L << WordMetadata.POSITIONS_SHIFT);
+
+        keywords.forEach(word -> words.mergeLong(word, meta, (a, b) -> a|b));
+    }
+
     public List<String> getWordsWithAnyFlag(long flags) {
         List<String> ret = new ArrayList<>();
 
@@ -103,4 +110,5 @@ public class DocumentKeywordsBuilder {
         return sb.append(']').toString();
     }
 }
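A short gloss on addAnchorTerms, under the bit layout inferred earlier:

    // meta combines two flag bits with the lowest position bit:
    //   WordFlags.Title.asBit() | WordFlags.ExternalLink.asBit()  -- flag bits
    //   (1L << WordMetadata.POSITIONS_SHIFT)                      -- position bit 0
    // mergeLong with (a, b) -> a | b ORs this into whatever metadata the word
    // already carries, so anchor terms accumulate flags rather than overwrite them.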
@@ -53,9 +53,6 @@ public class ForwardIndexReader {
         data = loadData(dataFile);
     }
 
-    public void selfTest() {
-
-    }
     private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
         try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
             assert idsArray.size() < Integer.MAX_VALUE;
@@ -11,6 +11,9 @@ import nu.marginalia.ranking.factors.*;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.List;
 
@@ -18,7 +21,7 @@ import static java.lang.Math.min;
 
 @Singleton
 public class ResultValuator {
-    final static double scalingFactor = 250.;
+    final static double scalingFactor = 500.;
 
     private final Bm25Factor bm25Factor;
     private final TermCoherenceFactor termCoherenceFactor;
@@ -28,6 +31,8 @@ public class ResultValuator {
     private final ThreadLocal<ValuatorListPool<SearchResultKeywordScore>> listPool =
             ThreadLocal.withInitial(ValuatorListPool::new);
 
+    private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class);
+
     @Inject
     public ResultValuator(Bm25Factor bm25Factor,
                           TermCoherenceFactor termCoherenceFactor,
@@ -46,7 +51,6 @@ public class ResultValuator {
         var threadListPool = listPool.get();
         int sets = numberOfSets(scores);
 
-        double bestScore = 10;
 
         long documentMetadata = documentMetadata(scores);
         int features = htmlFeatures(scores);
@@ -56,7 +60,7 @@ public class ResultValuator {
         int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
         int quality = DocumentMetadata.decodeQuality(documentMetadata);
         int size = DocumentMetadata.decodeSize(documentMetadata);
-        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
         int topology = DocumentMetadata.decodeTopology(documentMetadata);
         int year = DocumentMetadata.decodeYear(documentMetadata);
 
@@ -85,7 +89,8 @@ public class ResultValuator {
                 + flagsPenalty
                 + priorityTermBonus.calculate(scores);
 
-        for (int set = 0; set <= sets; set++) {
+        double bestScore = 10;
+        for (int set = 0; set < sets; set++) {
             ResultKeywordSet keywordSet = createKeywordSet(threadListPool, scores, set);
 
             if (keywordSet.isEmpty() || keywordSet.hasNgram())
@@ -95,8 +100,7 @@ public class ResultValuator {
             final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
             final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);
 
-            double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
-            double score = normalize(nonNormalizedScore, keywordSet.length());
+            double score = normalize(bm25 + bm25p + tcf + overallPart);
 
             bestScore = min(bestScore, score);
 
@@ -116,7 +120,7 @@ public class ResultValuator {
         }
     }
 
-    private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
+    private int flagsPenalty(int featureFlags, long docFlags, int size) {
 
         // Short-circuit for index-service, which does not have the feature flags
         if (featureFlags == 0)
@@ -203,11 +207,11 @@ public class ResultValuator {
         return 1 + maxSet;
     }
 
-    public static double normalize(double value, int setSize) {
+    public static double normalize(double value) {
         if (value < 0)
             value = 0;
 
-        return Math.sqrt((1.0 + scalingFactor) / (1.0 + value / Math.max(1., setSize)));
+        return Math.sqrt((1.0 + scalingFactor) / (1.0 + value));
     }
 }
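A worked read of the normalize() change: the per-keyword-set divisor is gone and scalingFactor doubles from 250 to 500, so:

    // normalize(v) = sqrt((1 + 500) / (1 + v)); bestScore keeps the minimum, so lower is better
    // normalize(0)    ~= 22.4   (a result with no useful signal)
    // normalize(500)  ~=  1.0
    // normalize(2000) ~=  0.5   (strong bm25/coherence signals push the score down)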
@@ -43,6 +43,8 @@ public class Bm25Factor {
             | WordFlags.SiteAdjacent.asBit()
             | WordFlags.UrlPath.asBit()
             | WordFlags.UrlDomain.asBit()
+            | WordFlags.ExternalLink.asBit()
+            | WordFlags.Title.asBit()
             | WordFlags.Subjects.asBit();
 
         for (var keyword : keywordSet.keywords()) {
@@ -50,6 +52,7 @@ public class Bm25Factor {
 
             int freq = ctx.priorityFrequency(keyword.keyword);
 
+            // note we override b to zero for priority terms as they are independent of document length
             sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
         }
@@ -1,5 +1,6 @@
 package nu.marginalia.ranking.factors;
 
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.ranking.ResultKeywordSet;
 
 /** Rewards documents where terms appear frequently within the same sentences
@@ -15,16 +16,14 @@ public class TermCoherenceFactor {
     double bitsSetFactor(long mask) {
         final int bitsSetInMask = Long.bitCount(mask);
 
-        return Math.pow(bitsSetInMask/56., 0.25);
+        return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
     }
 
     long combinedMask(ResultKeywordSet keywordSet) {
-        long mask = 0xFF_FFFF_FFFF_FFFFL;
+        long mask = WordMetadata.POSITIONS_MASK;
 
         for (var keyword : keywordSet.keywords()) {
-            long positions = keyword.positions();
-
-            mask &= positions;
+            mask &= keyword.positions();
         }
 
         return mask;
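For intuition, with POSITIONS_COUNT = 54 under the ten-flag layout assumed earlier:

    // bitsSetFactor examples:
    //   all position bits shared across terms: pow(54/54., 0.25)  = 1.00
    //   half the bits shared:                  pow(0.5, 0.25)    ~= 0.84
    //   one bit shared:                        pow(1/54., 0.25)  ~= 0.37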
@@ -17,7 +17,7 @@ class TermCoherenceFactorTest {
     @Test
     public void testAllBitsSet() {
         var allPositionsSet = createSet(
-                0xFF_FFFF_FFFF_FFFFL, 0xFF_FFFF_FFFF_FFFFL
+                WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
         );
 
         long mask = termCoherenceFactor.combinedMask(allPositionsSet);
@@ -49,6 +49,7 @@ dependencies {
     implementation project(':code:process-models:crawling-model')
 
     implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-convert:topic-detection')
     implementation project(':code:features-convert:pubdate')
     implementation project(':code:features-convert:keyword-extraction')
@@ -2,6 +2,9 @@ package nu.marginalia.converting.processor;
 
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
+import nu.marginalia.atags.AnchorTextKeywords;
+import nu.marginalia.atags.source.AnchorTagsSource;
+import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.logic.links.LinkGraph;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
@@ -16,11 +19,14 @@ import nu.marginalia.model.crawl.HtmlFeature;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.sql.SQLException;
 import java.util.*;
 
 public class DomainProcessor {
     private final DocumentProcessor documentProcessor;
     private final SiteWords siteWords;
+    private final AnchorTagsSource anchorTagsSource;
+    private final AnchorTextKeywords anchorTextKeywords;
     private final LshDocumentDeduplicator documentDeduplicator;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -28,11 +34,15 @@ public class DomainProcessor {
     @Inject
     public DomainProcessor(DocumentProcessor documentProcessor,
                            SiteWords siteWords,
-                           LshDocumentDeduplicator documentDeduplicator)
+                           AnchorTagsSourceFactory anchorTagsSourceFactory,
+                           AnchorTextKeywords anchorTextKeywords,
+                           LshDocumentDeduplicator documentDeduplicator) throws SQLException
     {
         this.documentProcessor = documentProcessor;
         this.siteWords = siteWords;
+        this.anchorTextKeywords = anchorTextKeywords;
         this.documentDeduplicator = documentDeduplicator;
+        this.anchorTagsSource = anchorTagsSourceFactory.create();
     }
 
     @SneakyThrows
@@ -76,19 +86,26 @@ public class DomainProcessor {
 
         List<String> terms = new ArrayList<>();
         terms.add("ip:"+ip);
-        if (cookies)
+        if (cookies) {
             terms.add(HtmlFeature.COOKIES.getKeyword());
+        }
+
+        var atags = anchorTagsSource.getAnchorTags(ret.domain);
 
         for (var document : ret.documents) {
            if (document.details == null)
                continue;
 
-            if (cookies)
+            if (cookies) {
                document.details.features.add(HtmlFeature.COOKIES);
+            }
 
            document.words.addAllSyntheticTerms(terms);
-        }
+
+            document.words.addAnchorTerms(
+                    anchorTextKeywords.getAnchorTextKeywords(atags, document.url)
+            );
+        }
         documentDeduplicator.deduplicate(ret.documents);
         calculateStatistics(ret);
@@ -53,7 +53,7 @@ public class SideloaderProcessing {
 
         ret.details = details.details();
         ret.details.metadata = ret.details.metadata
-                .withSize(size, Math.max(0, 255 - url.length()));
+                .withSize(size, Math.max(0, 32 - url.length()) / 4);
         ret.url = new EdgeUrl(url);
         ret.state = UrlIndexingState.OK;
         ret.stateReason = "SIDELOAD";
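This is the encyclopedia-sideloader topology fix from the commit message; working the second withSize argument through for a 20-character URL:

    // before: Math.max(0, 255 - 20)     = 235  (near the top of the topology scale)
    // after:  Math.max(0, 32 - 20) / 4  = 3    (modest; URLs of 32+ characters get 0)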
@@ -5,6 +5,7 @@ import com.google.inject.name.Names;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.processor.ConverterDomainTypes;
+import nu.marginalia.service.module.ServiceConfiguration;
 import org.mockito.Mockito;
 
 public class ConvertingIntegrationTestModule extends AbstractModule {
@@ -13,6 +14,9 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
     bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
     bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
     bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
+    bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
+            null, 1, "localhost", 0, 0, null
+    ));
 
     bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
     bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));
@@ -146,6 +146,7 @@ public class IndexConstructorMain {
             | WordFlags.UrlDomain.asBit()
             | WordFlags.UrlPath.asBit()
             | WordFlags.Site.asBit()
+            | WordFlags.ExternalLink.asBit()
             | WordFlags.SiteAdjacent.asBit();
 
         return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
@@ -53,14 +53,14 @@ public class IndexResultValuator {
     }
 
     private final long flagsFilterMask =
-            WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit();
+            WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
 
     @Nullable
-    public SearchResultItem calculatePreliminaryScore(long id) {
+    public SearchResultItem calculatePreliminaryScore(long combinedId) {
 
-        final long docId = UrlIdCodec.removeRank(id);
+        final long docId = UrlIdCodec.removeRank(combinedId);
 
-        if (!termMetadataForDocuments.testCoherence(id, searchTerms.coherences))
+        if (!termMetadataForDocuments.testCoherence(combinedId, searchTerms.coherences))
             return null;
 
         long docMetadata = metadataService.getDocumentMetadata(docId);
@@ -70,7 +70,7 @@ public class IndexResultValuator {
         boolean anyAllSynthetic = false;
         int maxPositionsSet = 0;
 
-        SearchResultItem searchResult = new SearchResultItem(id,
+        SearchResultItem searchResult = new SearchResultItem(combinedId,
                 searchTermVariants.stream().mapToInt(List::size).sum());
 
         for (int querySetId = 0;
@@ -133,7 +133,7 @@ public class IndexResultValuator {
                 rankingContext);
 
         searchResult.setScore(new SearchResultPreliminaryScore(
-                resultsWithPriorityTerms.contains(id),
+                resultsWithPriorityTerms.contains(combinedId),
                 score
         ));
@@ -23,10 +23,12 @@ dependencies {
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
     implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:db')
     implementation project(':code:common:model')
     implementation project(':code:common:config')
+    implementation project(':code:common:process')
     implementation project(':code:common:service')
     implementation project(':code:common:service-discovery')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')
     implementation project(':code:libraries:big-string')
@@ -36,6 +38,7 @@ dependencies {
     implementation project(':third-party:commons-codec')
     implementation project(':code:features-crawl:link-parser')
     implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:anchor-keywords')
     implementation project(':code:features-convert:topic-detection')
     implementation project(':code:features-convert:keyword-extraction')
 
@@ -49,6 +52,7 @@ dependencies {
 
     implementation libs.bundles.nlp
     implementation libs.commons.lang3
+    implementation libs.bundles.mariadb
 
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
@@ -22,6 +22,7 @@ public class ExperimentRunnerMain {
             "test", TestExperiment.class,
             "adblock", AdblockExperiment.class,
             "topic", TopicExperiment.class,
+            "atags", AtagsExperiment.class,
             "sentence-statistics", SentenceStatisticsExperiment.class,
             "site-statistics", SiteStatisticsExperiment.class,
             "export-atags", ExportExternalLinksExperiment.class,
@@ -5,6 +5,7 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 
 import java.io.IOException;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
nu/marginalia/tools/experiments/AtagsExperiment.java (new file)
@@ -0,0 +1,52 @@

package nu.marginalia.tools.experiments;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.tools.LegacyExperiment;

import java.sql.SQLException;

public class AtagsExperiment extends LegacyExperiment {

    private final AnchorTextKeywords keywords;
    private final AnchorTagsSource source;

    @Inject
    public AtagsExperiment(AnchorTextKeywords keywords, HikariDataSource dataSource) throws SQLException {
        this.keywords = keywords;
        this.source = new AnchorTagsSourceFactory(dataSource, new ProcessConfiguration(null, 1, null))
                .create();
    }

    @Override
    @SneakyThrows
    public boolean process(CrawledDomain domain) {
        var atags = source.getAnchorTags(new EdgeDomain(domain.domain));
        for (var doc : domain.doc) {
            if (doc.documentBody == null)
                continue;

            var newKeywords = keywords.getAnchorTextKeywords(atags, new EdgeUrl(doc.url));
            if (!newKeywords.isEmpty()) {
                System.out.println(newKeywords + " " + doc.url);
            }
        }
        return true;
    }

    @Override
    @SneakyThrows
    public void onFinish() {
        source.close();
    }
}
@@ -1,22 +1,11 @@
 package nu.marginalia.tools.experiments;
 
 import com.google.inject.Inject;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.adblock.GoogleAnwersSpamDetector;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DocumentProcessor;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.crawling.io.SerializableCrawlDataStream;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.tools.Experiment;
 import nu.marginalia.topic.RecipeDetector;
 import nu.marginalia.topic.TextileCraftDetector;
 import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.Jsoup;
 
 import java.util.Comparator;
@@ -31,6 +31,7 @@ include 'code:features-qs:query-parser'
 include 'code:features-index:result-ranking'
 
 include 'code:features-convert:adblock'
+include 'code:features-convert:anchor-keywords'
 include 'code:features-convert:stackexchange-xml'
 include 'code:features-convert:pubdate'
 include 'code:features-convert:summary-extraction'
@@ -149,7 +150,7 @@ dependencyResolutionManagement {
         library('fastutil', 'it.unimi.dsi', 'fastutil').version('8.5.8')
 
         library('hll', 'net.agkn', 'hll').version('1.6.0')
 
+        library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
         library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')
 
         library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')