Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
Experiments in keyword extraction
This commit is contained in:
parent 4516b23f90
commit e1b3477115
LinkKeywordExtractorMain (package nu.marginalia.wmsa.edge.converting)
@@ -1,215 +1,193 @@

The class changes from a Guice-injected converter component into a static command-line tool with three modes (crawl, so, wiki). The old instance fields and anchor-text methods (processDocument, getLinkText, getLinkTextByImgAltTag, the SentenceExtractor-based processAnchor, isInterestingAnchorText, isInterestingAnchorLink, and the murmur3/DenseBitMap check in isNewKeywordForLink) are deleted from this file; they reappear, nearly verbatim, in the new AnchorTextExtractor class in the next hunk, with the sentence extraction replaced by a split on the anchor-noise pattern and output routed through a TSV writer instead of System.out. The file after the change:

package nu.marginalia.wmsa.edge.converting;

import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;

public class LinkKeywordExtractorMain {
    private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);

    public static void main(String... args) throws IOException, InterruptedException {

        if (args.length < 2) {
            System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
            System.exit(0);
        }

        String command = args[0];
        var plan = new CrawlPlanLoader().load(Path.of(args[1]));

        switch (command) {
            case "crawl": getKeywordsFromCrawl(plan); break;
            case "so": getKeywordsFromSo(plan, args[2]); break;
            case "wiki": getKeywordsFromWiki(plan, args[2]); break;
            default: System.err.println("Unrecognized command");
        }
    }

    private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
        HashSet<String> crawledDomains = new HashSet<>();
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> { crawledDomains.add(spec.domain); });

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
                    && !domain.contains("wiki")
                    && !domain.contains("isni")
                    && !domain.contains("wiktionary"),
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
                anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
            }).join();
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");

        HashSet<String> crawledDomains = new HashSet<>();
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> crawledDomains.add(spec.domain));

        crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
        crawledDomains.remove("jsbin.com");
        crawledDomains.remove("codepad.org");

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
                anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
            }).join();
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");

        HashSet<String> crawledDomains = new HashSet<>();
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> crawledDomains.add(spec.domain));

        List<String> fileNames = new ArrayList<>();

        logger.info("Replaying crawl log");
        WorkLog.readLog(plan.crawl.getLogFile(),
                entry -> fileNames.add(entry.path()));

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            logger.info("Reading files");
            for (var fn : fileNames) {
                CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
                var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
                if (crawledDomain.doc == null) continue;

                System.out.println("# " + crawledDomain.domain);

                for (var doc : crawledDomain.doc) {
                    if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
                        anchorTextExtractor.processDocument(doc.url, doc.documentBody);
                    }
                }
            }
        }
    }

    private static class UrlKeywordTsvWriter implements AutoCloseable {
        private final OutputStream stream;

        UrlKeywordTsvWriter(Path outputFile) throws IOException {
            this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
        }

        void write(EdgeUrl url, String keyword) {
            try {
                stream.write(url.toString().getBytes());
                stream.write('\t');
                stream.write(keyword.getBytes());
                stream.write('\n');
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close() throws IOException {
            stream.close();
        }
    }
}
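The de-duplication step in isNewKeywordForLink (removed from this file, kept unchanged in AnchorTextExtractor below) is in effect a single-hash Bloom filter over (url, keyword) pairs: murmur3-hash both strings, XOR the two hashes, clear the sign bit, and test-and-set one bit in a large bitmap, accepting occasional false positives in exchange for never materializing the full set of pairs. A minimal, self-contained sketch of that idea, using java.util.BitSet and a made-up capacity in place of the project's 2 GB DenseBitMap:

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class KeywordLinkDeduper {
    private final HashFunction hashFunction = Hashing.murmur3_128();
    private final BitSet seen;        // stand-in for DenseBitMap; not sized for billions of bits
    private final long capacityBits;

    KeywordLinkDeduper(int capacityBits) {
        this.seen = new BitSet(capacityBits);
        this.capacityBits = capacityBits;
    }

    // Returns true the first time a (url, keyword) pair is offered; may return false
    // for a pair that was never seen (a Bloom-filter style false positive).
    boolean isNew(String url, String keyword) {
        long hash = hashFunction.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;              // drop the sign bit so the index is non-negative
        int bit = (int) (hash % capacityBits);
        boolean alreadySet = seen.get(bit);
        seen.set(bit);
        return !alreadySet;
    }
}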
AnchorTextExtractor (package nu.marginalia.wmsa.edge.converting.atags), new file
@@ -0,0 +1,149 @@

package nu.marginalia.wmsa.edge.converting.atags;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AnchorTextExtractor {
    private final Predicate<String> includeDomainPredicate;
    private final Predicate<EdgeUrl> includeUrlPredicate;
    private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;

    private final LinkParser linkParser = new LinkParser();

    private final HashFunction hashFunction = Hashing.murmur3_128();

    // This bit map is used as a bloom filter to deduplicate url-keyword combinations
    // false positives are expected, but that's an acceptable trade-off to not have to deal with
    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

    public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                               Predicate<EdgeUrl> includeUrlPredicate,
                               BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
        this.includeDomainPredicate = includeDomainPredicate;
        this.includeUrlPredicate = includeUrlPredicate;
        this.linkKeywordConsumer = linkKeywordConsumer;
    }

    @SneakyThrows
    public void processDocument(String docUrl, String documentBody) {
        final Document processed = Jsoup.parse(documentBody);
        final EdgeUrl documentUrl = new EdgeUrl(docUrl);

        for (var link : processed.getElementsByTag("a")) {
            if (link.hasAttr("href")) {
                String href = link.attr("href");
                String text = getLinkText(link);

                processAnchor(documentUrl, href, text);
            }
        }
    }

    private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");

    private String getLinkText(Element link) {
        String text = link.text();

        if (link.text().isBlank()) {
            for (var img: link.getElementsByTag("img")) {
                if (img.hasAttr("alt")) {
                    text = img.attr("alt");
                    break;
                }
            }
        }

        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
    }

    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
        if (!isInterestingAnchorText(text)) {
            return;
        }
        if (href.contains("?")) {
            return;
        }

        var optLinkUrl = linkParser.parseLink(documentUrl, href);
        if (optLinkUrl.isEmpty()) return;

        var linkUrl = optLinkUrl.get();

        if (!isInterestingAnchorLink(linkUrl)) {
            return;
        }

        for (String word: anchorTextNoise.split(text)) {
            if (WordPatterns.isStopWord(word))
                continue;

            word = word.toLowerCase();
            if (!WordPatterns.filter(word))
                continue;

            if (!linkUrl.domain.equals(documentUrl.domain)) {
                if (isNewKeywordForLink(word, linkUrl.toString())) {
                    linkKeywordConsumer.accept(linkUrl, word);
                }
            }
        }
    }

    // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

    private boolean isInterestingAnchorText(String text) {
        if (text.isBlank()) return false;
        if (text.length() > 32) return false;

        // Google loves questions, and so does SEO spammers
        if (text.endsWith("?")) return false;

        if (text.startsWith("http:") || text.startsWith("https:")) return false;

        if (looksLikeAnURL.test(text)) return false;

        return switch (text) {
            case "this", "here", "click", "click here", "download", "source" -> false;
            default -> true;
        };
    }

    private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
        if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
            return false;
        }

        if (!includeUrlPredicate.test(linkUrl)) {
            return false;
        }

        return includeDomainPredicate.test(linkUrl.domain.toString());
    }

    private boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
        hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

        // Remove sign bit because we don't want a negative index in deduplicateHashBitset
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;

        return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
    }
}
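AnchorTextExtractor leans on project classes (EdgeUrl, LinkParser, WordPatterns, DenseBitMap), so it cannot be run on its own. A stripped-down sketch of the same anchor walk using only jsoup, with the link filtering, stop-word checks and de-duplication left out and the HTML string a made-up example, would look roughly like this:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import java.util.regex.Pattern;

class AnchorTextDemo {
    // Same noise pattern as the extractor: collapse whitespace, quotes and parentheses
    private static final Pattern NOISE = Pattern.compile("[ \t\n\"()“”]+");

    public static void main(String[] args) {
        String html = "<p>See the <a href=\"https://example.com/manual\">Reference Manual</a> and "
                + "<a href=\"https://example.com/logo\"><img src=\"logo.png\" alt=\"project logo\"></a></p>";

        for (Element link : Jsoup.parse(html).getElementsByTag("a")) {
            if (!link.hasAttr("href")) continue;

            String text = link.text();
            if (text.isBlank()) {
                // Fall back to the alt text of an embedded image, as getLinkText() does
                for (Element img : link.getElementsByTag("img")) {
                    if (img.hasAttr("alt")) {
                        text = img.attr("alt");
                        break;
                    }
                }
            }
            text = NOISE.matcher(text.toLowerCase()).replaceAll(" ").trim();

            // One (href, word) pair per remaining word; the real class resolves the href
            // against the document URL and hands the pair to a consumer instead
            for (String word : text.split(" ")) {
                if (!word.isBlank()) {
                    System.out.println(link.attr("href") + "\t" + word);
                }
            }
        }
    }
}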
ZIMReader (package org.openzim.ZIMTypes)
@@ -18,20 +18,20 @@

The import block is reordered; third-party imports come first and java.* imports last. No functional change:

package org.openzim.ZIMTypes;

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
import org.tukaani.xz.SingleXZInputStream;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

/**
 * @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader

The getArticleData(DirectoryEntry) method is deleted from ZIMReader, with nothing added in its place. The removed method located an article's cluster, branched on the cluster's compression byte (0/1 raw, 4 XZ/LZMA2, 5 Zstd) and returned the requested blob as a String:

    }

    public String getArticleData(DirectoryEntry mainEntry) throws IOException {

        byte[] buffer = new byte[8];

        if (mainEntry != null) {

            // Check what kind of an entry was mainEnrty
            if (mainEntry.getClass() == ArticleEntry.class) {

                // Cast to ArticleEntry
                ArticleEntry article = (ArticleEntry) mainEntry;

                // Get the cluster and blob numbers from the article
                long clusterNumber = article.getClusterNumber();
                int blobNumber = article.getBlobnumber();

                // Move to the cluster entry in the clusterPtrPos
                mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);

                // Read the location of the cluster
                long clusterPos = mReader
                        .readEightLittleEndianBytesValue(buffer);

                // Move to the cluster
                mReader.seek(clusterPos);

                // Read the first byte, for compression information
                int compressionType = mReader.read();

                // Reference declaration
                SingleXZInputStream xzReader = null;
                int firstOffset, numberOfBlobs, offset1,
                        offset2,
                        location,
                        differenceOffset;

                ByteArrayOutputStream baos;

                // Check the compression type that was read
                switch (compressionType) {

                // TODO: Read uncompressed data directly
                case 0:
                case 1:

                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating

                    // Read the first offset
                    mReader.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if (blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(mReader, location);
                        mReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    mReader.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(mReader,
                            (offset1 - 4 * (blobNumber + 2)));

                    mReader.read(buffer, 0, differenceOffset);

                    return new String(buffer);

                // LZMA2 compressed data
                case 4:

                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    xzReader = new SingleXZInputStream(mReader, 4194304);

                    // Read the first offset
                    xzReader.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if(blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(xzReader, location);
                        xzReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    xzReader.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(xzReader,
                            (offset1 - 4 * (blobNumber + 2)));

                    xzReader.read(buffer, 0, differenceOffset);
                    return new String(buffer);

                case 5:
                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));

                    // Read the first offset
                    zstdInputStream.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if(blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(zstdInputStream, location);
                        zstdInputStream.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    zstdInputStream.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(zstdInputStream,
                            (offset1 - 4 * (blobNumber + 2)));

                    zstdInputStream.read(buffer, 0, differenceOffset);

                    return new String(buffer);

                default:
                    System.err.print("What is compression = " + compressionType);

                }

            }
        }

        return null;
    }

The context around the deleted method is unchanged:

    public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
            throws IOException {
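The offset arithmetic in the deleted method is easy to lose among the three near-identical switch branches. A sketch of just that arithmetic, under the same assumptions the removed code makes (the cluster body begins with firstOffset / 4 little-endian int32 blob offsets measured from the start of that table, and blob n spans offsets[n] to offsets[n + 1]); DataInputStream is a stand-in for the project's reader and Utilities helpers, and the stream is assumed to be positioned at the start of the offset table, i.e. just after the compression byte or at the start of the decompressed data:

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

class ClusterBlobs {
    // Reads blob `blobNumber` from a stream positioned at the start of the cluster's
    // blob-offset table, mirroring the arithmetic of the removed getArticleData().
    static byte[] readBlob(InputStream in, int blobNumber) throws IOException {
        DataInputStream din = new DataInputStream(in);

        int firstOffset = Integer.reverseBytes(din.readInt()); // offset of blob 0 (little-endian on disk)
        int offset1;
        if (blobNumber == 0) {
            offset1 = firstOffset;
        } else {
            din.skipBytes((blobNumber - 1) * 4);               // advance to table entry `blobNumber`
            offset1 = Integer.reverseBytes(din.readInt());     // start of blob n
        }
        int offset2 = Integer.reverseBytes(din.readInt());     // start of blob n + 1

        // 4 * (blobNumber + 2) bytes of the table have been consumed so far, and the
        // offsets are measured from the table start, so this skip lands on blob n.
        din.skipBytes(offset1 - 4 * (blobNumber + 2));

        byte[] blob = new byte[offset2 - offset1];
        din.readFully(blob);
        return blob;
    }
}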