mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Experiments in keyword extraction
This commit is contained in:
parent 4516b23f90
commit e1b3477115
@@ -1,81 +1,152 @@
package nu.marginalia.wmsa.edge.converting;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);

public static void main(String... args) throws IOException {
public static void main(String... args) throws IOException, InterruptedException {

if (args.length != 1) {
System.err.println("Arguments: crawl-plan.yaml");
if (args.length < 2) {
System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
System.exit(0);
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));

Injector injector = Guice.createInjector(
new ConverterModule(plan)
);
String command = args[0];
var plan = new CrawlPlanLoader().load(Path.of(args[1]));

injector.getInstance(LinkKeywordExtractorMain.class);
switch (command) {
case "crawl": getKeywordsFromCrawl(plan); break;
case "so": getKeywordsFromSo(plan, args[2]); break;
case "wiki": getKeywordsFromWiki(plan, args[2]); break;
default: System.err.println("Unrecognized command");
}

private final HashSet<String> crawledDomains = new HashSet<>();
private final List<String> fileNames = new ArrayList<>();
private final LinkParser linkParser = new LinkParser();
private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
}

private final HashFunction hashFunction = Hashing.murmur3_128();
private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {

// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

@Inject
public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
HashSet<String> crawledDomains = new HashSet<>();
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> { crawledDomains.add(spec.domain); });

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
&& !domain.contains("wiki")
&& !domain.contains("isni")
&& !domain.contains("wiktionary"),
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}

}

private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");

HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));

crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
crawledDomains.remove("jsbin.com");
crawledDomains.remove("codepad.org");

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}

public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {

TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");

HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));

List<String> fileNames = new ArrayList<>();

logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path()));

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

logger.info("Reading files");
for (var fn : fileNames) {
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
@@ -85,131 +156,38 @@ public class LinkKeywordExtractorMain {
System.out.println("# " + crawledDomain.domain);

for (var doc : crawledDomain.doc) {
try {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
processDocument(doc.url, doc.documentBody);
}
}
catch (URISyntaxException ex) {
// This Shouldn't Happen (TM) as the URL that we're failing to process
// is expected to have already been parsed by this code successfully
// in the process of getting here.
//
// But also, if it does happen, it's no big deal

logger.warn("Bad URL format", ex);
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
}
}
}
}

private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);

for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);

processAnchor(documentUrl, href, text);
}
}
}

private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
private static class UrlKeywordTsvWriter implements AutoCloseable {

private String getLinkText(Element link) {
String text = link.text();
private final OutputStream stream;

if (link.text().isBlank()) {
text = getLinkTextByImgAltTag(link);
UrlKeywordTsvWriter(Path outputFile) throws IOException {
this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
}

return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}

private String getLinkTextByImgAltTag(Element link) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
return img.attr("alt");
void write(EdgeUrl url, String keyword) {
try {
stream.write(url.toString().getBytes());
stream.write('\t');
stream.write(keyword.getBytes());
stream.write('\n');
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return "";
}

private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}

var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;

var linkUrl = optLinkUrl.get();

if (!isInterestingAnchorLink(linkUrl)) {
return;
}

DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
for (var sent : languageData.sentences) {
for (var wordPos : sent) {
if (wordPos.isStopWord())
continue;

String word = wordPos.wordLowerCase();

if (!WordPatterns.filter(word))
continue;

if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
System.out.println(linkUrl + "\t" + word);
@Override
public void close() throws IOException {
stream.close();
}
}
}
}
}

// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;

// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;

if (text.startsWith("http:") || text.startsWith("https:")) return false;

if (looksLikeAnURL.test(text)) return false;

return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}

private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}

return crawledDomains.contains(linkUrl.domain.toString());
}

private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;

hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;

return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}

}
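
The deduplicateHashBitset used by isNewKeywordForLink above acts like a single-hash Bloom filter: each (url, keyword) pair is hashed once, the corresponding bit is set, and the pair only counts as new if that bit was previously clear, accepting occasional false positives instead of a full deduplication pass. A minimal self-contained sketch of that idea, with a plain long[] standing in for DenseBitMap (the bitmap size here is an arbitrary choice for illustration):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;

// Illustrative sketch only: a single-hash, Bloom-filter-style deduplicator for (url, keyword) pairs.
// A plain long[] plays the role that DenseBitMap plays in the code above; the size is made up.
class KeywordDedupSketch {
    private final HashFunction hashFunction = Hashing.murmur3_128();
    private final long[] bits = new long[1 << 24];            // 2^24 longs = 2^30 bits (~128 MiB)
    private final long capacityBits = (long) bits.length * 64;

    // Returns true the first time a (url, keyword) pair is seen; a false positive
    // occasionally drops a genuinely new pair, which is the accepted trade-off.
    boolean isNew(String url, String keyword) {
        long hash = hashFunction.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;                       // clear the sign bit, as in isNewKeywordForLink
        long bitIndex = hash % capacityBits;
        int word = (int) (bitIndex >>> 6);
        long mask = 1L << (bitIndex & 63);
        boolean wasSet = (bits[word] & mask) != 0;
        bits[word] |= mask;
        return !wasSet;
    }
}
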
@@ -0,0 +1,149 @@
package nu.marginalia.wmsa.edge.converting.atags;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AnchorTextExtractor {
private final Predicate<String> includeDomainPredicate;
private final Predicate<EdgeUrl> includeUrlPredicate;
private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;

private final LinkParser linkParser = new LinkParser();

private final HashFunction hashFunction = Hashing.murmur3_128();

// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
Predicate<EdgeUrl> includeUrlPredicate,
BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
this.includeDomainPredicate = includeDomainPredicate;
this.includeUrlPredicate = includeUrlPredicate;
this.linkKeywordConsumer = linkKeywordConsumer;
}

@SneakyThrows
public void processDocument(String docUrl, String documentBody) {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);

for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);

processAnchor(documentUrl, href, text);
}
}
}

private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");

private String getLinkText(Element link) {
String text = link.text();

if (link.text().isBlank()) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
text = img.attr("alt");
break;
}
}
}

return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}

private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
if (href.contains("?")) {
return;
}

var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;

var linkUrl = optLinkUrl.get();

if (!isInterestingAnchorLink(linkUrl)) {
return;
}

for (String word: anchorTextNoise.split(text)) {
if (WordPatterns.isStopWord(word))
continue;

word = word.toLowerCase();
if (!WordPatterns.filter(word))
continue;

if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
}
}
}

// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;

// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;

if (text.startsWith("http:") || text.startsWith("https:")) return false;

if (looksLikeAnURL.test(text)) return false;

return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}

private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}

if (!includeUrlPredicate.test(linkUrl)) {
return false;
}

return includeDomainPredicate.test(linkUrl.domain.toString());
}

private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;

hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;

return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
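
For context, AnchorTextExtractor is driven entirely through the three constructor arguments shown above: a domain filter, a URL filter, and a consumer that receives each (link URL, keyword) pair. A hypothetical wiring example (the domain set, URLs, and HTML snippet below are invented for illustration; the real callers live in LinkKeywordExtractorMain):

import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;

import java.util.Set;

class AnchorTextExtractorDemo {
    public static void main(String[] args) {
        // Hypothetical set of domains considered "crawled"; in the real tool this
        // comes from the crawl specification.
        Set<String> crawledDomains = Set.of("example.com", "blog.example.org");

        AnchorTextExtractor extractor = new AnchorTextExtractor(
                crawledDomains::contains,                     // only keep links pointing into crawled domains
                url -> true,                                  // no per-URL filter in this sketch
                (url, keyword) -> System.out.println(url + "\t" + keyword));

        // Feed it one document; anchor texts of external links are split into
        // keywords and handed to the consumer above.
        extractor.processDocument("https://example.com/start",
                "<a href=\"https://blog.example.org/post\">static site generators</a>");
    }
}
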
@@ -18,20 +18,20 @@

package org.openzim.ZIMTypes;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.tukaani.xz.SingleXZInputStream;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
import org.tukaani.xz.SingleXZInputStream;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

/**
* @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader {

}

public String getArticleData(DirectoryEntry mainEntry) throws IOException {

byte[] buffer = new byte[8];

if (mainEntry != null) {

// Check what kind of an entry was mainEntry
if (mainEntry.getClass() == ArticleEntry.class) {

// Cast to ArticleEntry
ArticleEntry article = (ArticleEntry) mainEntry;

// Get the cluster and blob numbers from the article
long clusterNumber = article.getClusterNumber();
int blobNumber = article.getBlobnumber();

// Move to the cluster entry in the clusterPtrPos
mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);

// Read the location of the cluster
long clusterPos = mReader
.readEightLittleEndianBytesValue(buffer);

// Move to the cluster
mReader.seek(clusterPos);

// Read the first byte, for compression information
int compressionType = mReader.read();

// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;

ByteArrayOutputStream baos;

// Check the compression type that was read
switch (compressionType) {

// TODO: Read uncompressed data directly
case 0:
case 1:

// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating

// Read the first offset
mReader.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if (blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(mReader, location);
mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(mReader,
(offset1 - 4 * (blobNumber + 2)));

mReader.read(buffer, 0, differenceOffset);

return new String(buffer);

// LZMA2 compressed data
case 4:

// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
xzReader = new SingleXZInputStream(mReader, 4194304);

// Read the first offset
xzReader.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(xzReader,
(offset1 - 4 * (blobNumber + 2)));

xzReader.read(buffer, 0, differenceOffset);
return new String(buffer);

case 5:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));

// Read the first offset
zstdInputStream.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(zstdInputStream, location);
zstdInputStream.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

zstdInputStream.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(zstdInputStream,
(offset1 - 4 * (blobNumber + 2)));

zstdInputStream.read(buffer, 0, differenceOffset);

return new String(buffer);

default:
System.err.print("What is compression = " + compressionType);

}

}
}

return null;

}

public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
throws IOException {
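
For reference, the blob lookup that getArticleData performs on a decompressed cluster comes down to a small amount of offset arithmetic: the first 4-byte little-endian value is the offset of blob 0 (so the pointer table holds firstOffset / 4 entries), blob n spans [offset[n], offset[n+1]), and since 4 * (n + 2) bytes of the table have already been consumed once both offsets are read, the stream is skipped forward by offset[n] - 4 * (n + 2) before reading the blob body. A standalone sketch of just that arithmetic (helper names are invented; it assumes the stream is already positioned at the start of the decompressed cluster body):

import java.io.IOException;
import java.io.InputStream;

class ZimBlobSketch {

    // Reads blob `blobNumber` from a stream positioned at the start of a decompressed
    // ZIM cluster body, i.e. at the beginning of the blob pointer table.
    static byte[] readBlob(InputStream cluster, int blobNumber) throws IOException {
        int firstOffset = readFourLittleEndian(cluster);        // offset of blob 0 = size of the pointer table
        int numberOfBlobs = firstOffset / 4;
        if (blobNumber >= numberOfBlobs)
            throw new IllegalArgumentException("blob number out of range");

        int offset1;
        if (blobNumber == 0) {
            offset1 = firstOffset;
        } else {
            cluster.skipNBytes((long) (blobNumber - 1) * 4);    // skip table entries 1 .. blobNumber-1
            offset1 = readFourLittleEndian(cluster);            // start of blob blobNumber
        }
        int offset2 = readFourLittleEndian(cluster);            // start of the following blob

        byte[] blob = new byte[offset2 - offset1];
        cluster.skipNBytes(offset1 - 4L * (blobNumber + 2));    // 4 * (blobNumber + 2) table bytes already consumed
        cluster.readNBytes(blob, 0, blob.length);
        return blob;
    }

    private static int readFourLittleEndian(InputStream in) throws IOException {
        byte[] b = in.readNBytes(4);
        return (b[0] & 0xFF) | (b[1] & 0xFF) << 8 | (b[2] & 0xFF) << 16 | (b[3] & 0xFF) << 24;
    }
}
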