Anchor Tags: Initial Commit

* Added a new (optional) model file at $WMSA_HOME/data/atags.parquet
* The converter gets a component that projects its domains onto the full atags parquet file
* New WordFlag: ExternalLink
* For now, these terms are also flagged as title words (see the sketch just below)
* Fixed a bug where Title words were aliased with UrlDomain words
* Fixed a bug in the encyclopedia sideloader that gave everything too high topology ranking
Viktor Lofgren 2023-11-04 14:24:17 +01:00
parent 30ca5046b5
commit 0152004c42
34 changed files with 715 additions and 63 deletions
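A minimal sketch (not part of the commit, assuming the nu.marginalia.model.idx classes from this commit are on the classpath) of the metadata an anchor-text term ends up with; it mirrors the value DocumentKeywordsBuilder.addAnchorTerms assembles further down: the Title and ExternalLink flags plus a single synthetic position bit.

import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

class AnchorTermMetadataSketch {
    public static void main(String[] args) {
        // Same combination as DocumentKeywordsBuilder.addAnchorTerms:
        // Title + ExternalLink, plus one synthetic position bit
        long meta = WordFlags.Title.asBit()
                  | WordFlags.ExternalLink.asBit()
                  | (1L << WordMetadata.POSITIONS_SHIFT);

        System.out.println(new WordMetadata(meta));
    }
}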

View File

@ -77,7 +77,11 @@ public class WmsaHome {
home.resolve("model/lid.176.ftz"));
}
public static Path getAtagsPath() {
return getHomePath().resolve("data/atags.parquet");
}
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
public static boolean isDebug() {
return debugMode;
}

View File

@ -31,14 +31,15 @@ public enum WordFlags {
SiteAdjacent,
/** Keyword appears in URL path
*
*/
UrlPath,
/** Keyword appears in domain name
*
*/
UrlDomain
UrlDomain,
/** Word appears in an external link */
ExternalLink
;
public int asBit() {

View File

@ -12,14 +12,12 @@ import java.util.Set;
* @param flags word flags (see {@link WordFlags})
*/
public record WordMetadata(long positions,
byte flags) {
int flags) {
// Bottom 8 bits are used for flags
public static final long FLAGS_MASK = 0xFFL;
public static final int POSITIONS_SHIFT = 8;
public static final long POSITIONS_MASK = 0xFF_FFFF_FFFF_FFFFL;
public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1;
public static final int POSITIONS_COUNT = 64 - WordFlags.values().length;
public static final int POSITIONS_SHIFT = WordFlags.values().length;
public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT;
@ -30,7 +28,7 @@ public record WordMetadata(long positions,
public WordMetadata(long value) {
this(
((value >>> POSITIONS_SHIFT) & POSITIONS_MASK),
(byte) (value & FLAGS_MASK)
(int)(value & FLAGS_MASK)
);
}
@ -40,9 +38,9 @@ public record WordMetadata(long positions,
this(positions, encodeFlags(flags));
}
private static byte encodeFlags(Set<WordFlags> flags) {
byte ret = 0;
for (var flag : flags) { ret |= (byte) flag.asBit(); }
private static int encodeFlags(Set<WordFlags> flags) {
int ret = 0;
for (var flag : flags) { ret |= flag.asBit(); }
return ret;
}
@ -69,7 +67,7 @@ public record WordMetadata(long positions,
public long encode() {
long ret = 0;
ret |= Byte.toUnsignedLong(flags);
ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK;
ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT;
return ret;

View File

@ -17,14 +17,15 @@ class WordMetadataTest {
verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class)));
verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, some bits", new WordMetadata(0x7f7f7f7f7f7f7fL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, all bits", new WordMetadata( 0xffffffffffffffL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("All flags, all bits", new WordMetadata( 0xffffffffffffffL, EnumSet.allOf(WordFlags.class)));
verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class)));
verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class)));
System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1));
System.out.println(new WordMetadata(0xffffffffffffffL, (byte)0));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64));
System.out.println(BrailleBlockPunchCards.printBits(131973L, 64));
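A quick sanity check on the new bit layout (a sketch, not part of the commit, assuming the updated WordMetadata/WordFlags above): the 0x3f_ffff_ffff_ffff masks in the test suggest ten flags and 54 position bits, with the constants now derived from WordFlags.values().length instead of being hard-coded.

import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

class WordMetadataLayoutCheck {
    public static void main(String[] args) {
        System.out.println("flag count:      " + WordFlags.values().length);
        System.out.println("positions shift: " + WordMetadata.POSITIONS_SHIFT);
        System.out.printf("flags mask:      %x%n", WordMetadata.FLAGS_MASK);
        System.out.printf("positions mask:  %x%n", WordMetadata.POSITIONS_MASK);
    }
}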

View File

@ -0,0 +1,33 @@
plugins {
id 'java'
id "de.undercouch.download" version "5.1.0"
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(21))
}
}
dependencies {
implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:process')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:libraries:language-processing')
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.bundles.mariadb
implementation libs.duckdb
implementation libs.notnull
implementation libs.jsoup
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
}

View File

@ -0,0 +1,74 @@
package nu.marginalia.atags;
import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
public class AnchorTextKeywords {
private final KeywordExtractor keywordExtractor;
private final SentenceExtractor sentenceExtractor;
private final Set<String> stopList;
@Inject
public AnchorTextKeywords(KeywordExtractor keywordExtractor,
SentenceExtractor sentenceExtractor)
{
this.keywordExtractor = keywordExtractor;
this.sentenceExtractor = sentenceExtractor;
stopList = readStoplist();
}
private Set<String> readStoplist() {
Set<String> ret = new HashSet<>();
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("atags-stop-list"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
) {
while (true) {
String s = br.readLine();
if (s == null) break;
if (s.isBlank()) continue;
ret.add(s.trim());
}
} catch (IOException e) {
e.printStackTrace();
}
return ret;
}
public List<String> getAnchorTextKeywords(DomainLinks links, EdgeUrl url) {
var keywordsRaw = links.forUrl(url);
// Extract and count keywords from anchor text
Map<String, Integer> wordsWithCount = new HashMap<>();
for (var keyword : keywordsRaw) {
if (stopList.contains(keyword.text().toLowerCase()))
continue;
var sentence = sentenceExtractor.extractSentence(keyword.text());
for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) {
wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum);
}
}
// Filter out keywords that appear infrequently
final List<String> keywords = new ArrayList<>(wordsWithCount.size());
for (var wordEntry : wordsWithCount.entrySet()) {
if (wordEntry.getValue() > 2) {
keywords.add(wordEntry.getKey());
}
}
return keywords;
}
}

View File

@ -0,0 +1,40 @@
package nu.marginalia.atags.model;
import nu.marginalia.model.EdgeUrl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class DomainLinks {
private final Map<String, List<Link>> links;
public DomainLinks() {
links = Map.of();
}
public DomainLinks(List<LinkWithText> linksForDomain) {
links = linksForDomain.
stream()
.collect(Collectors.groupingBy(LinkWithText::url,
Collectors.mapping(LinkWithText::toLink, Collectors.toList())));
}
public List<String> getUrls() {
return new ArrayList<>(links.keySet());
}
public List<Link> forUrl(EdgeUrl url) {
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
return links.getOrDefault(key, List.of());
}
@Override
public String toString() {
return "DomainLinks{" +
"links=" + links +
'}';
}
}

View File

@ -0,0 +1,4 @@
package nu.marginalia.atags.model;
public record Link(String source, String text) {
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.atags.model;
public record LinkWithText(String url, String text, String source) {
public Link toLink() {
return new Link(source, text);
}
}

View File

@ -0,0 +1,93 @@
package nu.marginalia.atags.source;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.model.LinkWithText;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
public class AnchorTagsImpl implements AnchorTagsSource {
private final Connection duckdbConnection;
private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);
public AnchorTagsImpl(Path atagsPath,
List<EdgeDomain> relevantDomains)
throws SQLException
{
duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");
logger.info("Loading atags from " + atagsPath);
try (var stmt = duckdbConnection.createStatement()) {
// Insert the domains into a temporary table, then use that to filter the atags table
stmt.executeUpdate("create table domains (domain varchar)");
try (var ps = duckdbConnection.prepareStatement("insert into domains values (?)")) {
for (var domain : relevantDomains) {
ps.setString(1, domain.toString());
ps.executeUpdate();
}
}
// Project the atags table down to only the relevant domains. This looks like an SQL injection
// vulnerability if you're a validation tool, but the string comes from a trusted source.
stmt.executeUpdate("""
create table atags as
select * from '%s'
where dest in (select * from domains)
""".formatted(atagsPath.toAbsolutePath()));
// Free up the memory used by the domains table
stmt.executeUpdate("drop table domains");
// Create an index on the dest column to speed up queries
stmt.executeUpdate("create index atags_dest on atags(dest)");
// This is probably not necessary
if (!duckdbConnection.getAutoCommit()) {
duckdbConnection.commit();
}
}
logger.info("Finished loading!");
}
@Override
public DomainLinks getAnchorTags(EdgeDomain domain) {
List<LinkWithText> links = new ArrayList<>();
try (var ps = duckdbConnection.prepareStatement("""
select
unnest(text) as 'text',
unnest(url) as 'url',
unnest(source) as 'source'
from atags
where dest = ?
"""))
{
ps.setString(1, domain.toString());
var rs = ps.executeQuery();
while (rs.next()) {
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
}
return new DomainLinks(links);
}
catch (SQLException ex) {
logger.warn("Failed to get atags for " + domain, ex);
}
return new DomainLinks();
}
@Override
public void close() throws Exception {
duckdbConnection.close();
}
}
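For poking at the parquet file outside the converter, a small exploratory sketch (not part of the commit) that uses the same in-memory DuckDB connection (jdbc:duckdb:) as AnchorTagsImpl; it assumes the DuckDB JDBC driver is on the classpath, and the relative path data/atags.parquet is made up, so point it at $WMSA_HOME/data/atags.parquet or wherever the file lives.

import java.sql.DriverManager;

class AtagsParquetPeek {
    public static void main(String[] args) throws Exception {
        try (var conn = DriverManager.getConnection("jdbc:duckdb:");
             var stmt = conn.createStatement();
             // DuckDB can query the parquet file directly by path
             var rs = stmt.executeQuery("""
                     select dest, count(*) as links
                     from 'data/atags.parquet'
                     group by dest
                     order by links desc
                     limit 10
                     """))
        {
            while (rs.next()) {
                System.out.println(rs.getString("dest") + "\t" + rs.getLong("links"));
            }
        }
    }
}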

View File

@ -0,0 +1,10 @@
package nu.marginalia.atags.source;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.model.EdgeDomain;
public interface AnchorTagsSource extends AutoCloseable {
DomainLinks getAnchorTags(EdgeDomain domain);
default void close() throws Exception {}
}

View File

@ -0,0 +1,73 @@
package nu.marginalia.atags.source;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
public class AnchorTagsSourceFactory {
private final Path atagsPath;
private final int nodeId;
private final HikariDataSource dataSource;
private static final Logger logger = LoggerFactory.getLogger(AnchorTagsSourceFactory.class);
@Inject
public AnchorTagsSourceFactory(HikariDataSource dataSource,
ProcessConfiguration config)
{
this.dataSource = dataSource;
this.atagsPath = WmsaHome.getAtagsPath();
this.nodeId = config.node();
}
public AnchorTagsSource create() throws SQLException {
if (!Files.exists(atagsPath))
return dummy();
List<EdgeDomain> relevantDomains = getRelevantDomains();
if (relevantDomains.isEmpty())
return dummy();
return new AnchorTagsImpl(atagsPath, relevantDomains);
}
private AnchorTagsSource dummy() {
return x -> new DomainLinks();
}
// Only get domains that are assigned to this node. This reduces the amount of data
// that needs to be loaded into the duckdb instance to a more manageable level, and keeps
// the memory footprint of the service down.
private List<EdgeDomain> getRelevantDomains() {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT DOMAIN_NAME
FROM WMSA_prod.EC_DOMAIN
WHERE NODE_AFFINITY = ?
"""))
{
stmt.setInt(1, nodeId);
var rs = stmt.executeQuery();
var ret = new ArrayList<EdgeDomain>();
while (rs.next()) {
ret.add(new EdgeDomain(rs.getString(1)));
}
return ret;
} catch (Exception e) {
logger.warn("Failed to get relevant domains for node id " + nodeId, e);
return List.of();
}
}
}

View File

@ -0,0 +1,147 @@
[-]
facebook
website
twitter
link
instagram
read more
visit website
amazon
youtube
linkedin
learn more
go to album
more
share
sign up
log in
buy now
1
here.
.
italiano
privacy policy
register
2
buy tickets
book now
view
more info
apply now
play pause
report
contact us
view article
home
online
d
3
[link]
hier
read
reports
view website
said
[1]
reported
wordpress
announced
4
fr
donate
contact
news
more information
en
apply
via
post
comments
register now
listen
read full review
details
register here
visit site
privacy
5
shop now
[2]
info
help
play
de
this
*
terms of use
directions
url
open
p
photo
careers
find out more
login
view original post
0
about
live demo
blogger
6
www
subscribe
view details
watch
read more...
view profile
download now
wrote
t
get started
gallery
7
preview
visit
terms of service
a
email
html
view more
view this tutor
spanish
8
permalink
read article
results
demo
rss
[4]
about us
part 2
click here.
get tickets
visit websitewebsite
es
says
pin
+
watch now
listen now
writes
part 1
clicking here
page
link here
i
[5]
profile
it
sign in
11
french
donate now
home page
order now
12
more...
these

View File

@ -0,0 +1,46 @@
package nu.marginalia.atags;
import nu.marginalia.atags.source.AnchorTagsImpl;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.Test;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;
class DomainAnchorTagsImplTest {
@Test
void getAnchorTags() {
Path atagsPath = Path.of("/home/vlofgren/atags.parquet");
try (var domainAnchorTags = new AnchorTagsImpl(
atagsPath, List.of(new EdgeDomain("www.chiark.greenend.org.uk"))
)) {
var tags = domainAnchorTags.getAnchorTags(new EdgeDomain("www.chiark.greenend.org.uk"));
System.out.println(tags);
System.out.println(tags.getUrls());
System.out.println(tags.forUrl(new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/putty/")));
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putty/")));
System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt")));
var atagsKeywords = new AnchorTextKeywords(
new KeywordExtractor(),
new SentenceExtractor(
TestLanguageModels.getLanguageModels()
)
);
System.out.println(
atagsKeywords.getAnchorTextKeywords(tags, new EdgeUrl("https://www.chiark.greenend.org.uk/~sgtatham/"))
);
} catch (SQLException e) {
throw new RuntimeException(e);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,38 @@
package nu.marginalia.util;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
public class TestLanguageModels {
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
public static Path getLanguageModelsPath() {
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
.map(Path::of)
.orElse(LANGUAGE_MODELS_DEFAULT);
if (!Files.isDirectory(languageModelsHome)) {
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
}
return languageModelsHome;
}
public static LanguageModels getLanguageModels() {
var languageModelsHome = getLanguageModelsPath();
return new LanguageModels(
languageModelsHome.resolve("ngrams.bin"),
languageModelsHome.resolve("tfreq-new-algo3.bin"),
languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
);
}
}

View File

@ -4,12 +4,13 @@ import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.WordMetadata;
/** Generates a position bitmask for each word in a document */
public class KeywordPositionBitmask {
private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
private final static int positionWidth = 56;
private final static long positionBitmask = (1L << positionWidth) - 1;
private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
private static final int unmodulatedPortion = 16;
@Inject

View File

@ -2,7 +2,6 @@ package nu.marginalia.keyword.model;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
@ -76,6 +75,14 @@ public class DocumentKeywordsBuilder {
newWords.forEach(word -> words.putIfAbsent(word, meta));
}
public void addAnchorTerms(List<String> keywords) {
long meta = WordFlags.Title.asBit()
| WordFlags.ExternalLink.asBit()
| (1L << WordMetadata.POSITIONS_SHIFT);
keywords.forEach(word -> words.mergeLong(word, meta, (a, b) -> a|b));
}
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();
@ -103,4 +110,5 @@ public class DocumentKeywordsBuilder {
return sb.append(']').toString();
}
}

View File

@ -53,9 +53,6 @@ public class ForwardIndexReader {
data = loadData(dataFile);
}
public void selfTest() {
}
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
assert idsArray.size() < Integer.MAX_VALUE;

View File

@ -11,6 +11,9 @@ import nu.marginalia.ranking.factors.*;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
@ -18,7 +21,7 @@ import static java.lang.Math.min;
@Singleton
public class ResultValuator {
final static double scalingFactor = 250.;
final static double scalingFactor = 500.;
private final Bm25Factor bm25Factor;
private final TermCoherenceFactor termCoherenceFactor;
@ -28,6 +31,8 @@ public class ResultValuator {
private final ThreadLocal<ValuatorListPool<SearchResultKeywordScore>> listPool =
ThreadLocal.withInitial(ValuatorListPool::new);
private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class);
@Inject
public ResultValuator(Bm25Factor bm25Factor,
TermCoherenceFactor termCoherenceFactor,
@ -46,7 +51,6 @@ public class ResultValuator {
var threadListPool = listPool.get();
int sets = numberOfSets(scores);
double bestScore = 10;
long documentMetadata = documentMetadata(scores);
int features = htmlFeatures(scores);
@ -56,7 +60,7 @@ public class ResultValuator {
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
int quality = DocumentMetadata.decodeQuality(documentMetadata);
int size = DocumentMetadata.decodeSize(documentMetadata);
int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size, quality);
int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
int topology = DocumentMetadata.decodeTopology(documentMetadata);
int year = DocumentMetadata.decodeYear(documentMetadata);
@ -85,7 +89,8 @@ public class ResultValuator {
+ flagsPenalty
+ priorityTermBonus.calculate(scores);
for (int set = 0; set <= sets; set++) {
double bestScore = 10;
for (int set = 0; set < sets; set++) {
ResultKeywordSet keywordSet = createKeywordSet(threadListPool, scores, set);
if (keywordSet.isEmpty() || keywordSet.hasNgram())
@ -95,8 +100,7 @@ public class ResultValuator {
final double bm25 = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx);
final double bm25p = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx);
double nonNormalizedScore = bm25 + bm25p + tcf + overallPart;
double score = normalize(nonNormalizedScore, keywordSet.length());
double score = normalize(bm25 + bm25p + tcf + overallPart);
bestScore = min(bestScore, score);
@ -116,7 +120,7 @@ public class ResultValuator {
}
}
private int flagsPenalty(int featureFlags, long docFlags, int size, double quality) {
private int flagsPenalty(int featureFlags, long docFlags, int size) {
// Short-circuit for index-service, which does not have the feature flags
if (featureFlags == 0)
@ -203,11 +207,11 @@ public class ResultValuator {
return 1 + maxSet;
}
public static double normalize(double value, int setSize) {
public static double normalize(double value) {
if (value < 0)
value = 0;
return Math.sqrt((1.0 + scalingFactor) / (1.0 + value / Math.max(1., setSize)));
return Math.sqrt((1.0 + scalingFactor) / (1.0 + value));
}
}
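A tiny numeric check of the simplified normalization (a sketch, not part of the commit; scalingFactor = 500 as in the hunk above): with the per-keyword-set-size division gone, the result depends only on the raw value.

class NormalizeCheck {
    static final double scalingFactor = 500.;

    // Mirrors ResultValuator.normalize after this change
    static double normalize(double value) {
        if (value < 0)
            value = 0;
        return Math.sqrt((1.0 + scalingFactor) / (1.0 + value));
    }

    public static void main(String[] args) {
        System.out.println(normalize(0));    // ~22.4, the largest possible result
        System.out.println(normalize(100));  // ~2.23
        System.out.println(normalize(500));  // 1.0
    }
}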

View File

@ -43,6 +43,8 @@ public class Bm25Factor {
| WordFlags.SiteAdjacent.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.UrlDomain.asBit()
| WordFlags.ExternalLink.asBit()
| WordFlags.Title.asBit()
| WordFlags.Subjects.asBit();
for (var keyword : keywordSet.keywords()) {
@ -50,6 +52,7 @@ public class Bm25Factor {
int freq = ctx.priorityFrequency(keyword.keyword);
// note we override b to zero for priority terms as they are independent of document length
sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.ranking.factors;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.ResultKeywordSet;
/** Rewards documents where terms appear frequently within the same sentences
@ -15,16 +16,14 @@ public class TermCoherenceFactor {
double bitsSetFactor(long mask) {
final int bitsSetInMask = Long.bitCount(mask);
return Math.pow(bitsSetInMask/56., 0.25);
return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25);
}
long combinedMask(ResultKeywordSet keywordSet) {
long mask = 0xFF_FFFF_FFFF_FFFFL;
long mask = WordMetadata.POSITIONS_MASK;
for (var keyword : keywordSet.keywords()) {
long positions = keyword.positions();
mask &= positions;
mask &= keyword.positions();
}
return mask;

View File

@ -17,7 +17,7 @@ class TermCoherenceFactorTest {
@Test
public void testAllBitsSet() {
var allPositionsSet = createSet(
0xFF_FFFF_FFFF_FFFFL, 0xFF_FFFF_FFFF_FFFFL
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
);
long mask = termCoherenceFactor.combinedMask(allPositionsSet);

View File

@ -49,6 +49,7 @@ dependencies {
implementation project(':code:process-models:crawling-model')
implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:features-convert:topic-detection')
implementation project(':code:features-convert:pubdate')
implementation project(':code:features-convert:keyword-extraction')

View File

@ -2,6 +2,9 @@ package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.logic.links.LinkGraph;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
@ -16,11 +19,14 @@ import nu.marginalia.model.crawl.HtmlFeature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.*;
public class DomainProcessor {
private final DocumentProcessor documentProcessor;
private final SiteWords siteWords;
private final AnchorTagsSource anchorTagsSource;
private final AnchorTextKeywords anchorTextKeywords;
private final LshDocumentDeduplicator documentDeduplicator;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -28,11 +34,15 @@ public class DomainProcessor {
@Inject
public DomainProcessor(DocumentProcessor documentProcessor,
SiteWords siteWords,
LshDocumentDeduplicator documentDeduplicator)
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
LshDocumentDeduplicator documentDeduplicator) throws SQLException
{
this.documentProcessor = documentProcessor;
this.siteWords = siteWords;
this.anchorTextKeywords = anchorTextKeywords;
this.documentDeduplicator = documentDeduplicator;
this.anchorTagsSource = anchorTagsSourceFactory.create();
}
@SneakyThrows
@ -76,19 +86,26 @@ public class DomainProcessor {
List<String> terms = new ArrayList<>();
terms.add("ip:"+ip);
if (cookies)
if (cookies) {
terms.add(HtmlFeature.COOKIES.getKeyword());
}
var atags = anchorTagsSource.getAnchorTags(ret.domain);
for (var document : ret.documents) {
if (document.details == null)
continue;
if (cookies)
if (cookies) {
document.details.features.add(HtmlFeature.COOKIES);
}
document.words.addAllSyntheticTerms(terms);
}
document.words.addAnchorTerms(
anchorTextKeywords.getAnchorTextKeywords(atags, document.url)
);
}
documentDeduplicator.deduplicate(ret.documents);
calculateStatistics(ret);

View File

@ -53,7 +53,7 @@ public class SideloaderProcessing {
ret.details = details.details();
ret.details.metadata = ret.details.metadata
.withSize(size, Math.max(0, 255 - url.length()));
.withSize(size, Math.max(0, 32 - url.length()) / 4);
ret.url = new EdgeUrl(url);
ret.state = UrlIndexingState.OK;
ret.stateReason = "SIDELOAD";
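A back-of-the-envelope comparison for the sideloader fix (a sketch, not part of the commit; the URL is made up, and it is an assumption that the second withSize argument feeds the topology component the commit message says was too high): the old formula produced values around 200 for typical encyclopedia URLs, while the new one caps out at 8 and yields 0 for anything longer than 32 characters.

class SideloadSizeArgCheck {
    public static void main(String[] args) {
        String url = "https://encyclopedia.marginalia.nu/article/Example"; // hypothetical, 50 chars
        System.out.println(Math.max(0, 255 - url.length()));     // old formula: 205
        System.out.println(Math.max(0, 32 - url.length()) / 4);  // new formula: 0
    }
}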

View File

@ -5,6 +5,7 @@ import com.google.inject.name.Names;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.service.module.ServiceConfiguration;
import org.mockito.Mockito;
public class ConvertingIntegrationTestModule extends AbstractModule {
@ -13,6 +14,9 @@ public class ConvertingIntegrationTestModule extends AbstractModule {
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(250);
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
null, 1, "localhost", 0, 0, null
));
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
bind(ConverterDomainTypes.class).toInstance(Mockito.mock(ConverterDomainTypes.class));

View File

@ -146,6 +146,7 @@ public class IndexConstructorMain {
| WordFlags.UrlDomain.asBit()
| WordFlags.UrlPath.asBit()
| WordFlags.Site.asBit()
| WordFlags.ExternalLink.asBit()
| WordFlags.SiteAdjacent.asBit();
return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);

View File

@ -53,14 +53,14 @@ public class IndexResultValuator {
}
private final long flagsFilterMask =
WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit();
WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable
public SearchResultItem calculatePreliminaryScore(long id) {
public SearchResultItem calculatePreliminaryScore(long combinedId) {
final long docId = UrlIdCodec.removeRank(id);
final long docId = UrlIdCodec.removeRank(combinedId);
if (!termMetadataForDocuments.testCoherence(id, searchTerms.coherences))
if (!termMetadataForDocuments.testCoherence(combinedId, searchTerms.coherences))
return null;
long docMetadata = metadataService.getDocumentMetadata(docId);
@ -70,7 +70,7 @@ public class IndexResultValuator {
boolean anyAllSynthetic = false;
int maxPositionsSet = 0;
SearchResultItem searchResult = new SearchResultItem(id,
SearchResultItem searchResult = new SearchResultItem(combinedId,
searchTermVariants.stream().mapToInt(List::size).sum());
for (int querySetId = 0;
@ -133,7 +133,7 @@ public class IndexResultValuator {
rankingContext);
searchResult.setScore(new SearchResultPreliminaryScore(
resultsWithPriorityTerms.contains(id),
resultsWithPriorityTerms.contains(combinedId),
score
));

View File

@ -23,10 +23,12 @@ dependencies {
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:process')
implementation project(':code:common:service')
implementation project(':code:common:service-discovery')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:big-string')
@ -36,6 +38,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:adblock')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:features-convert:topic-detection')
implementation project(':code:features-convert:keyword-extraction')
@ -49,6 +52,7 @@ dependencies {
implementation libs.bundles.nlp
implementation libs.commons.lang3
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@ -22,6 +22,7 @@ public class ExperimentRunnerMain {
"test", TestExperiment.class,
"adblock", AdblockExperiment.class,
"topic", TopicExperiment.class,
"atags", AtagsExperiment.class,
"sentence-statistics", SentenceStatisticsExperiment.class,
"site-statistics", SiteStatisticsExperiment.class,
"export-atags", ExportExternalLinksExperiment.class,

View File

@ -5,6 +5,7 @@ import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

View File

@ -0,0 +1,52 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.tools.LegacyExperiment;
import java.sql.SQLException;
public class AtagsExperiment extends LegacyExperiment {
private final AnchorTextKeywords keywords;
private final AnchorTagsSource source;
@Inject
public AtagsExperiment(AnchorTextKeywords keywords, HikariDataSource dataSource) throws SQLException {
this.keywords = keywords;
this.source = new AnchorTagsSourceFactory(dataSource, new ProcessConfiguration(null, 1, null))
.create();
}
@Override
@SneakyThrows
public boolean process(CrawledDomain domain) {
var atags = source.getAnchorTags(new EdgeDomain(domain.domain));
for (var doc : domain.doc) {
if (doc.documentBody == null)
continue;
var newKeywords = keywords.getAnchorTextKeywords(atags, new EdgeUrl(doc.url));
if (!newKeywords.isEmpty()) {
System.out.println(newKeywords + " " + doc.url);
}
}
return true;
}
@Override
@SneakyThrows
public void onFinish() {
source.close();
}
}

View File

@ -1,22 +1,11 @@
package nu.marginalia.tools.experiments;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentProcessor;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.tools.Experiment;
import nu.marginalia.topic.RecipeDetector;
import nu.marginalia.topic.TextileCraftDetector;
import nu.marginalia.topic.WoodworkingDetector;
import org.jsoup.Jsoup;
import java.util.Comparator;

View File

@ -31,6 +31,7 @@ include 'code:features-qs:query-parser'
include 'code:features-index:result-ranking'
include 'code:features-convert:adblock'
include 'code:features-convert:anchor-keywords'
include 'code:features-convert:stackexchange-xml'
include 'code:features-convert:pubdate'
include 'code:features-convert:summary-extraction'
@ -149,7 +150,7 @@ dependencyResolutionManagement {
library('fastutil', 'it.unimi.dsi', 'fastutil').version('8.5.8')
library('hll', 'net.agkn', 'hll').version('1.6.0')
library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')
library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')