From e1b34771156a6f2c64ad71993a35213e6424129f Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Thu, 23 Jun 2022 17:02:28 +0200
Subject: [PATCH] Experiments in keyword extraction

---
 .../converting/LinkKeywordExtractorMain.java  | 294 ++++++++----------
 .../converting/atags/AnchorTextExtractor.java | 149 +++++++++
 .../java/org/openzim/ZIMTypes/ZIMReader.java  | 206 +-----------
 3 files changed, 292 insertions(+), 357 deletions(-)
 create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java

diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
index 570c47b5..792dac6f 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java
@@ -1,215 +1,193 @@
 package nu.marginalia.wmsa.edge.converting;
 
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.google.inject.Guice;
-import com.google.inject.Inject;
-import com.google.inject.Injector;
-import nu.marginalia.util.DenseBitMap;
-import nu.marginalia.util.language.WordPatterns;
-import nu.marginalia.util.language.processing.SentenceExtractor;
-import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
+import gnu.trove.set.hash.TIntHashSet;
+import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
 import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
 import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
 import nu.marginalia.wmsa.edge.crawling.WorkLog;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
+import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
+import java.io.OutputStream;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
-import java.util.function.Predicate;
-import java.util.regex.Pattern;
 
 public class LinkKeywordExtractorMain {
     private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);
 
-    public static void main(String... args) throws IOException {
+    public static void main(String... args) throws IOException, InterruptedException {
 
-        if (args.length != 1) {
-            System.err.println("Arguments: crawl-plan.yaml");
+        if (args.length < 2) {
+            System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
             System.exit(0);
         }
 
-        var plan = new CrawlPlanLoader().load(Path.of(args[0]));
-        Injector injector = Guice.createInjector(
-                new ConverterModule(plan)
-        );
+        String command = args[0];
+        var plan = new CrawlPlanLoader().load(Path.of(args[1]));
+
+        switch (command) {
+            case "crawl": getKeywordsFromCrawl(plan); break;
+            case "so": getKeywordsFromSo(plan, args[2]); break;
+            case "wiki": getKeywordsFromWiki(plan, args[2]); break;
+            default: System.err.println("Unrecognized command");
+        }
 
-        injector.getInstance(LinkKeywordExtractorMain.class);
     }
 
-    private final HashSet<String> crawledDomains = new HashSet<>();
-    private final List<String> fileNames = new ArrayList<>();
-    private final LinkParser linkParser = new LinkParser();
-    private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
+    private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
 
-    private final HashFunction hashFunction = Hashing.murmur3_128();
-    // This bit map is used as a bloom filter to deduplicate url-keyword combinations
-    // false positives are expected, but that's an acceptable trade-off to not have to deal with
-    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
-    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
+        HashSet<String> crawledDomains = new HashSet<>();
+        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
+
+        logger.info("Loading URLs");
+        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
+                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
+                .mapToInt(String::hashCode)
+                .forEach(crawledUrls::add);
+
+        logger.info("Loading input spec");
+        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
+                spec -> { crawledDomains.add(spec.domain); });
+
+        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
+            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
+                    && !domain.contains("wiki")
+                    && !domain.contains("isni")
+                    && !domain.contains("wiktionary"),
+                    url -> crawledUrls.contains(url.toString().hashCode()),
+                    output::write);
+
+            new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
+                anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
+            }).join();
+        }
+        catch (IOException ex) {
+            ex.printStackTrace();
+        }
+
+
+
+    }
+
+    private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
+        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);
+
+        logger.info("Loading URLs");
+        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
+                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
+                .mapToInt(String::hashCode)
+                .forEach(crawledUrls::add);
 
-    @Inject
-    public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
         logger.info("Loading input spec");
+        HashSet<String> crawledDomains = new HashSet<>();
         CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                 spec -> crawledDomains.add(spec.domain));
 
+        crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
+        crawledDomains.remove("jsbin.com");
+        crawledDomains.remove("codepad.org");
+
+
+        try (var output = new 
UrlKeywordTsvWriter(Path.of("links.tsv"))) { + AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, + url -> crawledUrls.contains(url.toString().hashCode()), + output::write); + + new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> { + anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody); + }).join(); + } + catch (IOException ex) { + ex.printStackTrace(); + } + } + + + public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException { + + TIntHashSet crawledUrls = new TIntHashSet(50_000_000); + + logger.info("Loading URLs"); + Files.lines(Path.of("/home/vlofgren/good-urls3.txt")) + .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange")) + .mapToInt(String::hashCode) + .forEach(crawledUrls::add); + + + logger.info("Loading input spec"); + + HashSet crawledDomains = new HashSet<>(); + CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(), + spec -> crawledDomains.add(spec.domain)); + + List fileNames = new ArrayList<>(); + logger.info("Replaying crawl log"); WorkLog.readLog(plan.crawl.getLogFile(), entry -> fileNames.add(entry.path())); - logger.info("Reading files"); - for (var fn : fileNames) { - CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); - var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); - if (crawledDomain.doc == null) continue; + try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) { + AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains, + url -> crawledUrls.contains(url.toString().hashCode()), + output::write); - System.out.println("# " + crawledDomain.domain); + logger.info("Reading files"); + for (var fn : fileNames) { + CrawledDomainReader crawledDomainReader = new CrawledDomainReader(); + var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn)); + if (crawledDomain.doc == null) continue; - for (var doc : crawledDomain.doc) { - try { + System.out.println("# " + crawledDomain.domain); + + for (var doc : crawledDomain.doc) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { - processDocument(doc.url, doc.documentBody); - } - } - catch (URISyntaxException ex) { - // This Shouldn't Happen (TM) as the URL that we're failing to process - // is expected to have already been parsed by this code successfully - // in the process of getting here. 
- // - // But also, if it does happen, it's no big deal - - logger.warn("Bad URL format", ex); - } - } - } - } - - - private void processDocument(String docUrl, String documentBody) throws URISyntaxException { - final Document processed = Jsoup.parse(documentBody); - final EdgeUrl documentUrl = new EdgeUrl(docUrl); - - for (var link : processed.getElementsByTag("a")) { - if (link.hasAttr("href")) { - String href = link.attr("href"); - String text = getLinkText(link); - - processAnchor(documentUrl, href, text); - } - } - } - - private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+"); - - private String getLinkText(Element link) { - String text = link.text(); - - if (link.text().isBlank()) { - text = getLinkTextByImgAltTag(link); - } - - return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); - } - - private String getLinkTextByImgAltTag(Element link) { - for (var img: link.getElementsByTag("img")) { - if (img.hasAttr("alt")) { - return img.attr("alt"); - } - } - return ""; - } - - private void processAnchor(EdgeUrl documentUrl, String href, String text) { - if (!isInterestingAnchorText(text)) { - return; - } - - var optLinkUrl = linkParser.parseLink(documentUrl, href); - if (optLinkUrl.isEmpty()) return; - - var linkUrl = optLinkUrl.get(); - - if (!isInterestingAnchorLink(linkUrl)) { - return; - } - - DocumentLanguageData languageData = sentenceExtractor.extractSentences(text); - for (var sent : languageData.sentences) { - for (var wordPos : sent) { - if (wordPos.isStopWord()) - continue; - - String word = wordPos.wordLowerCase(); - - if (!WordPatterns.filter(word)) - continue; - - if (!linkUrl.domain.equals(documentUrl.domain)) { - if (isNewKeywordForLink(word, linkUrl.toString())) { - System.out.println(linkUrl + "\t" + word); + anchorTextExtractor.processDocument(doc.url, doc.documentBody); } } } } + } - // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine - private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + private static class UrlKeywordTsvWriter implements AutoCloseable { - private boolean isInterestingAnchorText(String text) { - if (text.isBlank()) return false; - if (text.length() > 32) return false; + private final OutputStream stream; - // Google loves questions, and so does SEO spammers - if (text.endsWith("?")) return false; - - if (text.startsWith("http:") || text.startsWith("https:")) return false; - - if (looksLikeAnURL.test(text)) return false; - - return switch (text) { - case "this", "here", "click", "click here", "download", "source" -> false; - default -> true; - }; - } - - private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { - if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { - return false; + UrlKeywordTsvWriter(Path outputFile) throws IOException { + this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile())); } - return crawledDomains.contains(linkUrl.domain.toString()); + void write(EdgeUrl url, String keyword) { + try { + stream.write(url.toString().getBytes()); + stream.write('\t'); + stream.write(keyword.getBytes()); + stream.write('\n'); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws IOException { + stream.close(); + } } - private boolean isNewKeywordForLink(String href, String text) { - long hash = 0; - - hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); - hash ^= 
hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); - - // Remove sign bit because we don't want a negative index in deduplicateHashBitset - hash &= 0x7FFF_FFFF_FFFF_FFFFL; - - return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java new file mode 100644 index 00000000..c96fd400 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/atags/AnchorTextExtractor.java @@ -0,0 +1,149 @@ +package nu.marginalia.wmsa.edge.converting.atags; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import lombok.SneakyThrows; +import nu.marginalia.util.DenseBitMap; +import nu.marginalia.util.language.WordPatterns; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.nio.charset.StandardCharsets; +import java.util.function.BiConsumer; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class AnchorTextExtractor { + private final Predicate includeDomainPredicate; + private final Predicate includeUrlPredicate; + private final BiConsumer linkKeywordConsumer; + + private final LinkParser linkParser = new LinkParser(); + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + // This bit map is used as a bloom filter to deduplicate url-keyword combinations + // false positives are expected, but that's an acceptable trade-off to not have to deal with + // de-duplicating billions of shuffled (url, word) tuples on limited hardware + private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS); + + public AnchorTextExtractor(Predicate includeDomainPredicate, + Predicate includeUrlPredicate, + BiConsumer linkKeywordConsumer) { + this.includeDomainPredicate = includeDomainPredicate; + this.includeUrlPredicate = includeUrlPredicate; + this.linkKeywordConsumer = linkKeywordConsumer; + } + + @SneakyThrows + public void processDocument(String docUrl, String documentBody) { + final Document processed = Jsoup.parse(documentBody); + final EdgeUrl documentUrl = new EdgeUrl(docUrl); + + for (var link : processed.getElementsByTag("a")) { + if (link.hasAttr("href")) { + String href = link.attr("href"); + String text = getLinkText(link); + + processAnchor(documentUrl, href, text); + } + } + } + + private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+"); + + private String getLinkText(Element link) { + String text = link.text(); + + if (link.text().isBlank()) { + for (var img: link.getElementsByTag("img")) { + if (img.hasAttr("alt")) { + text = img.attr("alt"); + break; + } + } + } + + return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim(); + } + + private void processAnchor(EdgeUrl documentUrl, String href, String text) { + if (!isInterestingAnchorText(text)) { + return; + } + if (href.contains("?")) { + return; + } + + var optLinkUrl = linkParser.parseLink(documentUrl, href); + if (optLinkUrl.isEmpty()) return; + + var linkUrl = optLinkUrl.get(); + + if (!isInterestingAnchorLink(linkUrl)) { + return; + } + + for (String word: anchorTextNoise.split(text)) { + if (WordPatterns.isStopWord(word)) + continue; + + word = word.toLowerCase(); + 
if (!WordPatterns.filter(word)) + continue; + + if (!linkUrl.domain.equals(documentUrl.domain)) { + if (isNewKeywordForLink(word, linkUrl.toString())) { + linkKeywordConsumer.accept(linkUrl, word); + } + } + } + } + + // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine + private final Predicate looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate(); + + private boolean isInterestingAnchorText(String text) { + if (text.isBlank()) return false; + if (text.length() > 32) return false; + + // Google loves questions, and so does SEO spammers + if (text.endsWith("?")) return false; + + if (text.startsWith("http:") || text.startsWith("https:")) return false; + + if (looksLikeAnURL.test(text)) return false; + + return switch (text) { + case "this", "here", "click", "click here", "download", "source" -> false; + default -> true; + }; + } + + private boolean isInterestingAnchorLink(EdgeUrl linkUrl) { + if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) { + return false; + } + + if (!includeUrlPredicate.test(linkUrl)) { + return false; + } + + return includeDomainPredicate.test(linkUrl.domain.toString()); + } + + private boolean isNewKeywordForLink(String href, String text) { + long hash = 0; + + hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong(); + hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong(); + + // Remove sign bit because we don't want a negative index in deduplicateHashBitset + hash &= 0x7FFF_FFFF_FFFF_FFFFL; + + return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality); + } +} diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index d97c3c73..7706e8d1 100644 --- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -18,20 +18,20 @@ package org.openzim.ZIMTypes; -import java.io.*; -import java.util.*; -import java.util.function.BiConsumer; -import java.util.function.Consumer; -import java.util.function.Predicate; - import com.github.luben.zstd.RecyclingBufferPool; import com.github.luben.zstd.ZstdInputStream; import lombok.AllArgsConstructor; import lombok.Getter; import org.jetbrains.annotations.NotNull; -import org.tukaani.xz.SingleXZInputStream; import org.openzim.util.RandomAcessFileZIMInputStream; import org.openzim.util.Utilities; +import org.tukaani.xz.SingleXZInputStream; + +import java.io.*; +import java.util.*; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Predicate; /** * @author Arunesh Mathur @@ -401,198 +401,6 @@ public class ZIMReader { } - public String getArticleData(DirectoryEntry mainEntry) throws IOException { - - byte[] buffer = new byte[8]; - - if (mainEntry != null) { - - // Check what kind of an entry was mainEnrty - if (mainEntry.getClass() == ArticleEntry.class) { - - // Cast to ArticleEntry - ArticleEntry article = (ArticleEntry) mainEntry; - - // Get the cluster and blob numbers from the article - long clusterNumber = article.getClusterNumber(); - int blobNumber = article.getBlobnumber(); - - // Move to the cluster entry in the clusterPtrPos - mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8); - - // Read the location of the cluster - long clusterPos = mReader - .readEightLittleEndianBytesValue(buffer); - - // Move to the cluster - mReader.seek(clusterPos); - 
- // Read the first byte, for compression information - int compressionType = mReader.read(); - - // Reference declaration - SingleXZInputStream xzReader = null; - int firstOffset, numberOfBlobs, offset1, - offset2, - location, - differenceOffset; - - ByteArrayOutputStream baos; - - // Check the compression type that was read - switch (compressionType) { - - // TODO: Read uncompressed data directly - case 0: - case 1: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - - // Read the first offset - mReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - - if (blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(mReader, location); - mReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - mReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(mReader, - (offset1 - 4 * (blobNumber + 2))); - - mReader.read(buffer, 0, differenceOffset); - - return new String(buffer); - - // LZMA2 compressed data - case 4: - - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - xzReader = new SingleXZInputStream(mReader, 4194304); - - // Read the first offset - xzReader.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(xzReader, location); - xzReader.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - xzReader.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(xzReader, - (offset1 - 4 * (blobNumber + 2))); - - xzReader.read(buffer, 0, differenceOffset); - return new String(buffer); - - case 5: - // Read the first 4 bytes to find out the number of artciles - buffer = new byte[4]; - - // Create a dictionary with size 40MiB, the zimlib uses this - // size while creating - var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader)); - - // Read the first offset - zstdInputStream.read(buffer); - - // The first four bytes are the offset of the zeroth blob - firstOffset = Utilities - .toFourLittleEndianInteger(buffer); - - // The number of blobs - numberOfBlobs = firstOffset / 4; - - // The blobNumber has to be lesser than the numberOfBlobs - assert blobNumber < numberOfBlobs; - - if(blobNumber == 0) { - // The first offset is what we read earlier - offset1 = firstOffset; - } else { - - location = (blobNumber - 1) * 4; - Utilities.skipFully(zstdInputStream, 
location); - zstdInputStream.read(buffer); - offset1 = Utilities.toFourLittleEndianInteger(buffer); - } - - zstdInputStream.read(buffer); - offset2 = Utilities.toFourLittleEndianInteger(buffer); - - differenceOffset = offset2 - offset1; - buffer = new byte[differenceOffset]; - - Utilities.skipFully(zstdInputStream, - (offset1 - 4 * (blobNumber + 2))); - - zstdInputStream.read(buffer, 0, differenceOffset); - - return new String(buffer); - - default: - System.err.print("What is compression = " + compressionType); - - } - - } - } - - return null; - - } - public DirectoryEntry getDirectoryInfoAtTitlePosition(long position) throws IOException {
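
Note (reviewer sketch, not part of the patch): the de-duplication that the new AnchorTextExtractor.isNewKeywordForLink() performs XORs two murmur3_128 hashes and uses one huge bitmap as a single-hash bloom filter, so each (url, keyword) pair is emitted at most once without keeping billions of tuples in memory. Below is a minimal, self-contained Java sketch of that idea; the class name is invented for the example, java.util.BitSet stands in for the project's DenseBitMap, the capacity is far smaller than MAX_CAPACITY_2GB_16BN_ITEMS, and Guava is assumed to be on the classpath.

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class UrlKeywordDeduplicator {
    private final HashFunction hashFunction = Hashing.murmur3_128();

    // Illustration-sized bitmap; the patch uses a DenseBitMap with roughly 16 billion slots.
    private static final int CAPACITY = 1 << 24;
    private final BitSet seen = new BitSet(CAPACITY);

    // Returns true the first time a (url, keyword) pair is observed, false on repeats.
    // It can also return false for a pair never seen before: bloom-filter false positives
    // are accepted in exchange for the fixed memory footprint.
    boolean isNewKeywordForLink(String url, String keyword) {
        long hash = hashFunction.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();

        hash &= 0x7FFF_FFFF_FFFF_FFFFL;       // clear the sign bit so the index is non-negative
        int index = (int) (hash % CAPACITY);

        boolean alreadySeen = seen.get(index);
        seen.set(index);
        return !alreadySeen;
    }

    public static void main(String[] args) {
        var dedup = new UrlKeywordDeduplicator();
        System.out.println(dedup.isNewKeywordForLink("https://www.example.com/", "marginalia")); // true
        System.out.println(dedup.isNewKeywordForLink("https://www.example.com/", "marginalia")); // false
    }
}

The same trade-off is stated in the patch's own comment: a false positive only costs a dropped keyword, which is acceptable compared to de-duplicating billions of shuffled (url, word) tuples on limited hardware.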