Experiments in keyword extraction

vlofgren 2022-06-23 17:02:28 +02:00
parent 4516b23f90
commit e1b3477115
3 changed files with 292 additions and 357 deletions
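In outline, the commit turns the keyword-extraction experiment into a small command-line tool with three input modes (crawl data, Stack Overflow posts, Wikipedia articles) and moves the anchor-text logic into a new reusable AnchorTextExtractor class. A rough invocation sketch, pieced together from the argument handling and the hard-coded output path below; the data-file names on the second and third lines are placeholders, not names taken from the commit:

// java ... LinkKeywordExtractorMain crawl crawl-plan.yaml
// java ... LinkKeywordExtractorMain so   crawl-plan.yaml so-posts-dump.xml      (placeholder data file)
// java ... LinkKeywordExtractorMain wiki crawl-plan.yaml wikipedia-articles.zim (placeholder data file)
// Each mode writes url<TAB>keyword rows to links.tsv in the working directory.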

nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java

@@ -1,215 +1,193 @@
package nu.marginalia.wmsa.edge.converting;
import com.google.common.hash.HashFunction;
import gnu.trove.set.hash.TIntHashSet;
import com.google.common.hash.Hashing;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
public static void main(String... args) throws IOException {
public static void main(String... args) throws IOException, InterruptedException {
if (args.length != 1) {
if (args.length < 2) {
System.err.println("Arguments: crawl-plan.yaml");
System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
System.exit(0);
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
Injector injector = Guice.createInjector(
new ConverterModule(plan)
);
String command = args[0];
var plan = new CrawlPlanLoader().load(Path.of(args[1]));
switch (command) {
case "crawl": getKeywordsFromCrawl(plan); break;
case "so": getKeywordsFromSo(plan, args[2]); break;
case "wiki": getKeywordsFromWiki(plan, args[2]); break;
default: System.err.println("Unrecognized command");
}
injector.getInstance(LinkKeywordExtractorMain.class);
}
private final HashSet<String> crawledDomains = new HashSet<>();
private final List<String> fileNames = new ArrayList<>();
private final LinkParser linkParser = new LinkParser();
private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
private final HashFunction hashFunction = Hashing.murmur3_128();
// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
HashSet<String> crawledDomains = new HashSet<>();
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> { crawledDomains.add(spec.domain); });
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
&& !domain.contains("wiki")
&& !domain.contains("isni")
&& !domain.contains("wiktionary"),
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}
private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
@Inject
public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
logger.info("Loading input spec"); logger.info("Loading input spec");
HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
crawledDomains.remove("jsbin.com");
crawledDomains.remove("codepad.org");
try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);
new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}
public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);
logger.info("Loading input spec");
HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));
List<String> fileNames = new ArrayList<>();
logger.info("Replaying crawl log"); logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(), WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path())); entry -> fileNames.add(entry.path()));
logger.info("Reading files"); try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
for (var fn : fileNames) { AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); url -> crawledUrls.contains(url.toString().hashCode()),
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); output::write);
if (crawledDomain.doc == null) continue;
System.out.println("# " + crawledDomain.domain); logger.info("Reading files");
for (var fn : fileNames) {
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
if (crawledDomain.doc == null) continue;
for (var doc : crawledDomain.doc) { System.out.println("# " + crawledDomain.domain);
try {
for (var doc : crawledDomain.doc) {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
processDocument(doc.url, doc.documentBody); anchorTextExtractor.processDocument(doc.url, doc.documentBody);
}
}
catch (URISyntaxException ex) {
// This Shouldn't Happen (TM) as the URL that we're failing to process
// is expected to have already been parsed by this code successfully
// in the process of getting here.
//
// But also, if it does happen, it's no big deal
logger.warn("Bad URL format", ex);
}
}
}
}
private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);
for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);
processAnchor(documentUrl, href, text);
}
}
}
private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
private String getLinkText(Element link) {
String text = link.text();
if (link.text().isBlank()) {
text = getLinkTextByImgAltTag(link);
}
return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}
private String getLinkTextByImgAltTag(Element link) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
return img.attr("alt");
}
}
return "";
}
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
var linkUrl = optLinkUrl.get();
if (!isInterestingAnchorLink(linkUrl)) {
return;
}
DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
for (var sent : languageData.sentences) {
for (var wordPos : sent) {
if (wordPos.isStopWord())
continue;
String word = wordPos.wordLowerCase();
if (!WordPatterns.filter(word))
continue;
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
System.out.println(linkUrl + "\t" + word);
}
}
}
}
}
}
// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private static class UrlKeywordTsvWriter implements AutoCloseable {
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
private final OutputStream stream;
private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;
// Google loves questions, and so does SEO spammers
if (text.endsWith("?")) return false;
UrlKeywordTsvWriter(Path outputFile) throws IOException {
this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
if (text.startsWith("http:") || text.startsWith("https:")) return false;
if (looksLikeAnURL.test(text)) return false;
return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}
private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
} }
return crawledDomains.contains(linkUrl.domain.toString());
void write(EdgeUrl url, String keyword) {
try {
stream.write(url.toString().getBytes());
stream.write('\t');
stream.write(keyword.getBytes());
stream.write('\n');
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
stream.close();
}
}
private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
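A side note on the URL filter that all three new code paths share: the set of known-good URLs is held as raw String.hashCode() values in a Trove TIntHashSet rather than as strings, so each lookup is a single int probe and memory stays bounded; occasional hash collisions are accepted. A minimal sketch of the same idea, with a placeholder input file instead of the hard-coded path used above:

import gnu.trove.set.hash.TIntHashSet;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class UrlHashFilter {
    public static void main(String[] args) throws IOException {
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        // One URL per line; "seen-urls.txt" is a placeholder for the hard-coded list above.
        Files.lines(Path.of("seen-urls.txt"))
             .mapToInt(String::hashCode)
             .forEach(crawledUrls::add);

        // Approximate membership test: a collision may answer true for an unseen URL.
        boolean maybeSeen = crawledUrls.contains("https://example.org/docs".hashCode());
        System.out.println(maybeSeen);
    }
}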

nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java (new file)

@@ -0,0 +1,149 @@
package nu.marginalia.wmsa.edge.converting.atags;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class AnchorTextExtractor {
private final Predicate<String> includeDomainPredicate;
private final Predicate<EdgeUrl> includeUrlPredicate;
private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;
private final LinkParser linkParser = new LinkParser();
private final HashFunction hashFunction = Hashing.murmur3_128();
// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
Predicate<EdgeUrl> includeUrlPredicate,
BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
this.includeDomainPredicate = includeDomainPredicate;
this.includeUrlPredicate = includeUrlPredicate;
this.linkKeywordConsumer = linkKeywordConsumer;
}
@SneakyThrows
public void processDocument(String docUrl, String documentBody) {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);
for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);
processAnchor(documentUrl, href, text);
}
}
}
private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");
private String getLinkText(Element link) {
String text = link.text();
if (link.text().isBlank()) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
text = img.attr("alt");
break;
}
}
}
return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}
private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
if (href.contains("?")) {
return;
}
var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;
var linkUrl = optLinkUrl.get();
if (!isInterestingAnchorLink(linkUrl)) {
return;
}
for (String word: anchorTextNoise.split(text)) {
if (WordPatterns.isStopWord(word))
continue;
word = word.toLowerCase();
if (!WordPatterns.filter(word))
continue;
if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
}
}
}
// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();
private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;
// Google loves questions, and so does SEO spammers
if (text.endsWith("?")) return false;
if (text.startsWith("http:") || text.startsWith("https:")) return false;
if (looksLikeAnURL.test(text)) return false;
return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}
private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}
if (!includeUrlPredicate.test(linkUrl)) {
return false;
}
return includeDomainPredicate.test(linkUrl.domain.toString());
}
private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;
hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();
// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;
return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
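To make the new class's contract concrete: it is driven entirely by the two predicates and the BiConsumer passed to the constructor, and processDocument() does the parsing, filtering and deduplication internally. A minimal sketch, not code from the commit; the URLs, the HTML snippet and the println sink are made up for illustration, and the commit itself wires output::write from UrlKeywordTsvWriter instead:

import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;

class AnchorTextExtractorDemo {
    public static void main(String[] args) {
        AnchorTextExtractor extractor = new AnchorTextExtractor(
                domain -> true,   // accept every destination domain (stand-in for crawledDomains::contains)
                url -> true,      // accept every destination URL (stand-in for the TIntHashSet lookup)
                (url, keyword) -> System.out.println(url + "\t" + keyword));

        // Cross-domain link with short, non-URL-looking anchor text, so the filters above let it through
        // (assuming WordPatterns keeps plain dictionary words).
        extractor.processDocument("https://example.com/start",
                "<p>See the <a href=\"https://example.org/docs/intro\">beginner guide</a>.</p>");
        // Should print something like: https://example.org/docs/intro<TAB>beginner, then ...<TAB>guide
    }
}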

org/openzim/ZIMTypes/ZIMReader.java

@@ -18,20 +18,20 @@
package org.openzim.ZIMTypes;
import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;
import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.tukaani.xz.SingleXZInputStream;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
import org.tukaani.xz.SingleXZInputStream;
import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;
/**
* @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader {
}
public String getArticleData(DirectoryEntry mainEntry) throws IOException {
byte[] buffer = new byte[8];
if (mainEntry != null) {
// Check what kind of an entry mainEntry was
if (mainEntry.getClass() == ArticleEntry.class) {
// Cast to ArticleEntry
ArticleEntry article = (ArticleEntry) mainEntry;
// Get the cluster and blob numbers from the article
long clusterNumber = article.getClusterNumber();
int blobNumber = article.getBlobnumber();
// Move to the cluster entry in the clusterPtrPos
mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);
// Read the location of the cluster
long clusterPos = mReader
.readEightLittleEndianBytesValue(buffer);
// Move to the cluster
mReader.seek(clusterPos);
// Read the first byte, for compression information
int compressionType = mReader.read();
// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;
ByteArrayOutputStream baos;
// Check the compression type that was read
switch (compressionType) {
// TODO: Read uncompressed data directly
case 0:
case 1:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
// Read the first offset
mReader.read(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
if (blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4;
Utilities.skipFully(mReader, location);
mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(mReader,
(offset1 - 4 * (blobNumber + 2)));
mReader.read(buffer, 0, differenceOffset);
return new String(buffer);
// LZMA2 compressed data
case 4:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
xzReader = new SingleXZInputStream(mReader, 4194304);
// Read the first offset
xzReader.read(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(xzReader,
(offset1 - 4 * (blobNumber + 2)));
xzReader.read(buffer, 0, differenceOffset);
return new String(buffer);
case 5:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];
// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));
// Read the first offset
zstdInputStream.read(buffer);
// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);
// The number of blobs
numberOfBlobs = firstOffset / 4;
// The blobNumber has to be lesser than the numberOfBlobs
assert blobNumber < numberOfBlobs;
if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {
location = (blobNumber - 1) * 4;
Utilities.skipFully(zstdInputStream, location);
zstdInputStream.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}
zstdInputStream.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);
differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];
Utilities.skipFully(zstdInputStream,
(offset1 - 4 * (blobNumber + 2)));
zstdInputStream.read(buffer, 0, differenceOffset);
return new String(buffer);
default:
System.err.print("What is compression = " + compressionType);
}
}
}
return null;
}
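All three compression branches of the removed getArticleData() repeat the same offset arithmetic: the cluster body begins with a table of 4-byte little-endian blob offsets, the first entry doubles as the table size, and a blob is the byte range between two consecutive entries. The following condensed restatement is a sketch only, not code from the commit; readLE4 and skipAll stand in for Utilities.toFourLittleEndianInteger and Utilities.skipFully, and the stream is assumed to be positioned just past the cluster's compression byte:

// Sketch: read blob `blobNumber` from a cluster stream that has already been decompressed
// (or is uncompressed) and is positioned at the start of the blob-offset table.
static byte[] readBlob(java.io.InputStream in, int blobNumber) throws java.io.IOException {
    byte[] word = new byte[4];

    in.read(word);
    int firstOffset = readLE4(word);              // offset of blob 0, also 4 * numberOfBlobs
    int numberOfBlobs = firstOffset / 4;
    assert blobNumber < numberOfBlobs;

    int offset1;
    if (blobNumber == 0) {
        offset1 = firstOffset;
    } else {
        skipAll(in, (blobNumber - 1) * 4L);       // advance to the table entry for this blob
        in.read(word);
        offset1 = readLE4(word);
    }

    in.read(word);
    int offset2 = readLE4(word);                  // start of the following blob

    byte[] blob = new byte[offset2 - offset1];
    skipAll(in, offset1 - 4L * (blobNumber + 2)); // (blobNumber + 2) table entries consumed so far
    in.read(blob, 0, blob.length);                // single read, mirroring the original code
    return blob;
}

// Little-endian int, equivalent in spirit to Utilities.toFourLittleEndianInteger
static int readLE4(byte[] b) {
    return (b[0] & 0xFF) | (b[1] & 0xFF) << 8 | (b[2] & 0xFF) << 16 | (b[3] & 0xFF) << 24;
}

// Skip exactly n bytes, equivalent in spirit to Utilities.skipFully
static void skipAll(java.io.InputStream in, long n) throws java.io.IOException {
    while (n > 0) {
        long skipped = in.skip(n);
        if (skipped <= 0) throw new java.io.EOFException();
        n -= skipped;
    }
}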
public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
throws IOException {