Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
Experiments in keyword extraction
This commit is contained in:
parent 4516b23f90
commit e1b3477115
LinkKeywordExtractorMain (package nu.marginalia.wmsa.edge.converting)
@@ -1,215 +1,193 @@

The class changes from a Guice-injected converter component into a static command-line tool with three modes (crawl, so, wiki). The old instance fields and anchor-text methods (processDocument, getLinkText, getLinkTextByImgAltTag, the SentenceExtractor-based processAnchor, isInterestingAnchorText, isInterestingAnchorLink, and the murmur3/DenseBitMap check in isNewKeywordForLink) are deleted from this file; they reappear, nearly verbatim, in the new AnchorTextExtractor class in the next hunk, with the sentence extraction replaced by a split on the anchor-noise pattern and output routed through a TSV writer instead of System.out. The file after the change:

package nu.marginalia.wmsa.edge.converting;

import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;

public class LinkKeywordExtractorMain {
    private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);

    public static void main(String... args) throws IOException, InterruptedException {

        if (args.length < 2) {
            System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
            System.exit(0);
        }

        String command = args[0];
        var plan = new CrawlPlanLoader().load(Path.of(args[1]));

        switch (command) {
            case "crawl": getKeywordsFromCrawl(plan); break;
            case "so": getKeywordsFromSo(plan, args[2]); break;
            case "wiki": getKeywordsFromWiki(plan, args[2]); break;
            default: System.err.println("Unrecognized command");
        }
    }

    private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
        HashSet<String> crawledDomains = new HashSet<>();
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> { crawledDomains.add(spec.domain); });

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
                    && !domain.contains("wiki")
                    && !domain.contains("isni")
                    && !domain.contains("wiktionary"),
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
                anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
            }).join();
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");

        HashSet<String> crawledDomains = new HashSet<>();
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> crawledDomains.add(spec.domain));

        crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
        crawledDomains.remove("jsbin.com");
        crawledDomains.remove("codepad.org");

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
                anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
            }).join();
        }
        catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {
        TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

        logger.info("Loading URLs");
        Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
                .filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
                .mapToInt(String::hashCode)
                .forEach(crawledUrls::add);

        logger.info("Loading input spec");

        HashSet<String> crawledDomains = new HashSet<>();
        CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
                spec -> crawledDomains.add(spec.domain));

        List<String> fileNames = new ArrayList<>();

        logger.info("Replaying crawl log");
        WorkLog.readLog(plan.crawl.getLogFile(),
                entry -> fileNames.add(entry.path()));

        try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
            AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
                    url -> crawledUrls.contains(url.toString().hashCode()),
                    output::write);

            logger.info("Reading files");
            for (var fn : fileNames) {
                CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
                var crawledDomain = crawledDomainReader.read(plan.getCrawledFilePath(fn));
                if (crawledDomain.doc == null) continue;

                System.out.println("# " + crawledDomain.domain);

                for (var doc : crawledDomain.doc) {
                    if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
                        anchorTextExtractor.processDocument(doc.url, doc.documentBody);
                    }
                }
            }
        }
    }

    private static class UrlKeywordTsvWriter implements AutoCloseable {
        private final OutputStream stream;

        UrlKeywordTsvWriter(Path outputFile) throws IOException {
            this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
        }

        void write(EdgeUrl url, String keyword) {
            try {
                stream.write(url.toString().getBytes());
                stream.write('\t');
                stream.write(keyword.getBytes());
                stream.write('\n');
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void close() throws IOException {
            stream.close();
        }
    }
}
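The de-duplication step in isNewKeywordForLink (removed from this file, kept unchanged in AnchorTextExtractor below) is in effect a single-hash Bloom filter over (url, keyword) pairs: murmur3-hash both strings, XOR the two hashes, clear the sign bit, and test-and-set one bit in a large bitmap, accepting occasional false positives in exchange for never materializing the full set of pairs. A minimal, self-contained sketch of that idea, using java.util.BitSet and a made-up capacity in place of the project's 2 GB DenseBitMap:

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;

class KeywordLinkDeduper {
    private final HashFunction hashFunction = Hashing.murmur3_128();
    private final BitSet seen;        // stand-in for DenseBitMap; not sized for billions of bits
    private final long capacityBits;

    KeywordLinkDeduper(int capacityBits) {
        this.seen = new BitSet(capacityBits);
        this.capacityBits = capacityBits;
    }

    // Returns true the first time a (url, keyword) pair is offered; may return false
    // for a pair that was never seen (a Bloom-filter style false positive).
    boolean isNew(String url, String keyword) {
        long hash = hashFunction.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;              // drop the sign bit so the index is non-negative
        int bit = (int) (hash % capacityBits);
        boolean alreadySet = seen.get(bit);
        seen.set(bit);
        return !alreadySet;
    }
}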
AnchorTextExtractor (package nu.marginalia.wmsa.edge.converting.atags), new file
@@ -0,0 +1,149 @@

package nu.marginalia.wmsa.edge.converting.atags;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AnchorTextExtractor {
    private final Predicate<String> includeDomainPredicate;
    private final Predicate<EdgeUrl> includeUrlPredicate;
    private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;

    private final LinkParser linkParser = new LinkParser();

    private final HashFunction hashFunction = Hashing.murmur3_128();

    // This bit map is used as a bloom filter to deduplicate url-keyword combinations
    // false positives are expected, but that's an acceptable trade-off to not have to deal with
    // de-duplicating billions of shuffled (url, word) tuples on limited hardware
    private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

    public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                               Predicate<EdgeUrl> includeUrlPredicate,
                               BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
        this.includeDomainPredicate = includeDomainPredicate;
        this.includeUrlPredicate = includeUrlPredicate;
        this.linkKeywordConsumer = linkKeywordConsumer;
    }

    @SneakyThrows
    public void processDocument(String docUrl, String documentBody) {
        final Document processed = Jsoup.parse(documentBody);
        final EdgeUrl documentUrl = new EdgeUrl(docUrl);

        for (var link : processed.getElementsByTag("a")) {
            if (link.hasAttr("href")) {
                String href = link.attr("href");
                String text = getLinkText(link);

                processAnchor(documentUrl, href, text);
            }
        }
    }

    private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");

    private String getLinkText(Element link) {
        String text = link.text();

        if (link.text().isBlank()) {
            for (var img: link.getElementsByTag("img")) {
                if (img.hasAttr("alt")) {
                    text = img.attr("alt");
                    break;
                }
            }
        }

        return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
    }

    private void processAnchor(EdgeUrl documentUrl, String href, String text) {
        if (!isInterestingAnchorText(text)) {
            return;
        }
        if (href.contains("?")) {
            return;
        }

        var optLinkUrl = linkParser.parseLink(documentUrl, href);
        if (optLinkUrl.isEmpty()) return;

        var linkUrl = optLinkUrl.get();

        if (!isInterestingAnchorLink(linkUrl)) {
            return;
        }

        for (String word: anchorTextNoise.split(text)) {
            if (WordPatterns.isStopWord(word))
                continue;

            word = word.toLowerCase();
            if (!WordPatterns.filter(word))
                continue;

            if (!linkUrl.domain.equals(documentUrl.domain)) {
                if (isNewKeywordForLink(word, linkUrl.toString())) {
                    linkKeywordConsumer.accept(linkUrl, word);
                }
            }
        }
    }

    // This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
    private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

    private boolean isInterestingAnchorText(String text) {
        if (text.isBlank()) return false;
        if (text.length() > 32) return false;

        // Google loves questions, and so does SEO spammers
        if (text.endsWith("?")) return false;

        if (text.startsWith("http:") || text.startsWith("https:")) return false;

        if (looksLikeAnURL.test(text)) return false;

        return switch (text) {
            case "this", "here", "click", "click here", "download", "source" -> false;
            default -> true;
        };
    }

    private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
        if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
            return false;
        }

        if (!includeUrlPredicate.test(linkUrl)) {
            return false;
        }

        return includeDomainPredicate.test(linkUrl.domain.toString());
    }

    private boolean isNewKeywordForLink(String href, String text) {
        long hash = 0;

        hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
        hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

        // Remove sign bit because we don't want a negative index in deduplicateHashBitset
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;

        return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
    }
}
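AnchorTextExtractor leans on project classes (EdgeUrl, LinkParser, WordPatterns, DenseBitMap), so it cannot be run on its own. A stripped-down sketch of the same anchor walk using only jsoup, with the link filtering, stop-word checks and de-duplication left out and the HTML string a made-up example, would look roughly like this:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import java.util.regex.Pattern;

class AnchorTextDemo {
    // Same noise pattern as the extractor: collapse whitespace, quotes and parentheses
    private static final Pattern NOISE = Pattern.compile("[ \t\n\"()“”]+");

    public static void main(String[] args) {
        String html = "<p>See the <a href=\"https://example.com/manual\">Reference Manual</a> and "
                + "<a href=\"https://example.com/logo\"><img src=\"logo.png\" alt=\"project logo\"></a></p>";

        for (Element link : Jsoup.parse(html).getElementsByTag("a")) {
            if (!link.hasAttr("href")) continue;

            String text = link.text();
            if (text.isBlank()) {
                // Fall back to the alt text of an embedded image, as getLinkText() does
                for (Element img : link.getElementsByTag("img")) {
                    if (img.hasAttr("alt")) {
                        text = img.attr("alt");
                        break;
                    }
                }
            }
            text = NOISE.matcher(text.toLowerCase()).replaceAll(" ").trim();

            // One (href, word) pair per remaining word; the real class resolves the href
            // against the document URL and hands the pair to a consumer instead
            for (String word : text.split(" ")) {
                if (!word.isBlank()) {
                    System.out.println(link.attr("href") + "\t" + word);
                }
            }
        }
    }
}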
ZIMReader (package org.openzim.ZIMTypes)
@@ -18,20 +18,20 @@

The import block is reordered; third-party imports come first and java.* imports last. No functional change:

package org.openzim.ZIMTypes;

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
import org.tukaani.xz.SingleXZInputStream;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

/**
 * @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader

The getArticleData(DirectoryEntry) method is deleted from ZIMReader, with nothing added in its place. The removed method located an article's cluster, branched on the cluster's compression byte (0/1 raw, 4 XZ/LZMA2, 5 Zstd) and returned the requested blob as a String:

    }

    public String getArticleData(DirectoryEntry mainEntry) throws IOException {

        byte[] buffer = new byte[8];

        if (mainEntry != null) {

            // Check what kind of an entry was mainEnrty
            if (mainEntry.getClass() == ArticleEntry.class) {

                // Cast to ArticleEntry
                ArticleEntry article = (ArticleEntry) mainEntry;

                // Get the cluster and blob numbers from the article
                long clusterNumber = article.getClusterNumber();
                int blobNumber = article.getBlobnumber();

                // Move to the cluster entry in the clusterPtrPos
                mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);

                // Read the location of the cluster
                long clusterPos = mReader
                        .readEightLittleEndianBytesValue(buffer);

                // Move to the cluster
                mReader.seek(clusterPos);

                // Read the first byte, for compression information
                int compressionType = mReader.read();

                // Reference declaration
                SingleXZInputStream xzReader = null;
                int firstOffset, numberOfBlobs, offset1,
                        offset2,
                        location,
                        differenceOffset;

                ByteArrayOutputStream baos;

                // Check the compression type that was read
                switch (compressionType) {

                // TODO: Read uncompressed data directly
                case 0:
                case 1:

                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating

                    // Read the first offset
                    mReader.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if (blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(mReader, location);
                        mReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    mReader.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(mReader,
                            (offset1 - 4 * (blobNumber + 2)));

                    mReader.read(buffer, 0, differenceOffset);

                    return new String(buffer);

                // LZMA2 compressed data
                case 4:

                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    xzReader = new SingleXZInputStream(mReader, 4194304);

                    // Read the first offset
                    xzReader.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if(blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(xzReader, location);
                        xzReader.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    xzReader.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(xzReader,
                            (offset1 - 4 * (blobNumber + 2)));

                    xzReader.read(buffer, 0, differenceOffset);
                    return new String(buffer);

                case 5:
                    // Read the first 4 bytes to find out the number of artciles
                    buffer = new byte[4];

                    // Create a dictionary with size 40MiB, the zimlib uses this
                    // size while creating
                    var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));

                    // Read the first offset
                    zstdInputStream.read(buffer);

                    // The first four bytes are the offset of the zeroth blob
                    firstOffset = Utilities
                            .toFourLittleEndianInteger(buffer);

                    // The number of blobs
                    numberOfBlobs = firstOffset / 4;

                    // The blobNumber has to be lesser than the numberOfBlobs
                    assert blobNumber < numberOfBlobs;

                    if(blobNumber == 0) {
                        // The first offset is what we read earlier
                        offset1 = firstOffset;
                    } else {
                        location = (blobNumber - 1) * 4;
                        Utilities.skipFully(zstdInputStream, location);
                        zstdInputStream.read(buffer);
                        offset1 = Utilities.toFourLittleEndianInteger(buffer);
                    }

                    zstdInputStream.read(buffer);
                    offset2 = Utilities.toFourLittleEndianInteger(buffer);

                    differenceOffset = offset2 - offset1;
                    buffer = new byte[differenceOffset];

                    Utilities.skipFully(zstdInputStream,
                            (offset1 - 4 * (blobNumber + 2)));

                    zstdInputStream.read(buffer, 0, differenceOffset);

                    return new String(buffer);

                default:
                    System.err.print("What is compression = " + compressionType);

                }

            }
        }

        return null;
    }

The context around the deleted method is unchanged:

    public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
            throws IOException {
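The offset arithmetic in the deleted method is easy to lose among the three near-identical switch branches. A sketch of just that arithmetic, under the same assumptions the removed code makes (the cluster body begins with firstOffset / 4 little-endian int32 blob offsets measured from the start of that table, and blob n spans offsets[n] to offsets[n + 1]); DataInputStream is a stand-in for the project's reader and Utilities helpers, and the stream is assumed to be positioned at the start of the offset table, i.e. just after the compression byte or at the start of the decompressed data:

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

class ClusterBlobs {
    // Reads blob `blobNumber` from a stream positioned at the start of the cluster's
    // blob-offset table, mirroring the arithmetic of the removed getArticleData().
    static byte[] readBlob(InputStream in, int blobNumber) throws IOException {
        DataInputStream din = new DataInputStream(in);

        int firstOffset = Integer.reverseBytes(din.readInt()); // offset of blob 0 (little-endian on disk)
        int offset1;
        if (blobNumber == 0) {
            offset1 = firstOffset;
        } else {
            din.skipBytes((blobNumber - 1) * 4);               // advance to table entry `blobNumber`
            offset1 = Integer.reverseBytes(din.readInt());     // start of blob n
        }
        int offset2 = Integer.reverseBytes(din.readInt());     // start of blob n + 1

        // 4 * (blobNumber + 2) bytes of the table have been consumed so far, and the
        // offsets are measured from the table start, so this skip lands on blob n.
        din.skipBytes(offset1 - 4 * (blobNumber + 2));

        byte[] blob = new byte[offset2 - offset1];
        din.readFully(blob);
        return blob;
    }
}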