mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Experiments in keyword extraction
This commit is contained in:
parent 4516b23f90
commit e1b3477115
@@ -1,81 +1,152 @@
package nu.marginalia.wmsa.edge.converting;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import nu.marginalia.wmsa.edge.crawling.CrawledDomainReader;
import nu.marginalia.wmsa.edge.crawling.CrawlerSpecificationLoader;
import nu.marginalia.wmsa.edge.crawling.WorkLog;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.integration.stackoverflow.StackOverflowPostsReader;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.regex.Pattern;
public class LinkKeywordExtractorMain {
private static final Logger logger = LoggerFactory.getLogger(LinkKeywordExtractorMain.class);

public static void main(String... args) throws IOException {
public static void main(String... args) throws IOException, InterruptedException {

if (args.length != 1) {
System.err.println("Arguments: crawl-plan.yaml");
if (args.length < 2) {
System.err.println("Arguments: [crawl|so|wiki] crawl-plan.yaml [data]");
System.exit(0);
}
var plan = new CrawlPlanLoader().load(Path.of(args[0]));

Injector injector = Guice.createInjector(
new ConverterModule(plan)
);
String command = args[0];
var plan = new CrawlPlanLoader().load(Path.of(args[1]));

injector.getInstance(LinkKeywordExtractorMain.class);
switch (command) {
case "crawl": getKeywordsFromCrawl(plan); break;
case "so": getKeywordsFromSo(plan, args[2]); break;
case "wiki": getKeywordsFromWiki(plan, args[2]); break;
default: System.err.println("Unrecognized command");
}

private final HashSet<String> crawledDomains = new HashSet<>();
private final List<String> fileNames = new ArrayList<>();
private final LinkParser linkParser = new LinkParser();
private final SentenceExtractor sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
}

private final HashFunction hashFunction = Hashing.murmur3_128();
private static void getKeywordsFromWiki(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {

// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

@Inject
public LinkKeywordExtractorMain(EdgeCrawlPlan plan) throws IOException {
HashSet<String> crawledDomains = new HashSet<>();
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> { crawledDomains.add(spec.domain); });

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(domain -> crawledDomains.contains(domain)
&& !domain.contains("wiki")
&& !domain.contains("isni")
&& !domain.contains("wiktionary"),
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

new WikipediaReader(arg, new EdgeDomain("invalid.example"), article -> {
anchorTextExtractor.processDocument(article.getUrl().toString(), article.body);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}

}

private static void getKeywordsFromSo(EdgeCrawlPlan plan, String arg) throws IOException, InterruptedException {
TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");

HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));

crawledDomains.remove("jsfiddle.net"); // like 30% of SO's links go here
crawledDomains.remove("jsbin.com");
crawledDomains.remove("codepad.org");

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

new StackOverflowPostsReader(arg, new EdgeDomain("invalid.example"), post -> {
anchorTextExtractor.processDocument(post.getUrl().toString(), post.fullBody);
}).join();
}
catch (IOException ex) {
ex.printStackTrace();
}
}

public static void getKeywordsFromCrawl(EdgeCrawlPlan plan) throws IOException {

TIntHashSet crawledUrls = new TIntHashSet(50_000_000);

logger.info("Loading URLs");
Files.lines(Path.of("/home/vlofgren/good-urls3.txt"))
.filter(url -> !url.contains("stackoverflow") && !url.contains("stackexchange"))
.mapToInt(String::hashCode)
.forEach(crawledUrls::add);

logger.info("Loading input spec");

HashSet<String> crawledDomains = new HashSet<>();
CrawlerSpecificationLoader.readInputSpec(plan.getJobSpec(),
spec -> crawledDomains.add(spec.domain));

List<String> fileNames = new ArrayList<>();

logger.info("Replaying crawl log");
WorkLog.readLog(plan.crawl.getLogFile(),
entry -> fileNames.add(entry.path()));

try (var output = new UrlKeywordTsvWriter(Path.of("links.tsv"))) {
AnchorTextExtractor anchorTextExtractor = new AnchorTextExtractor(crawledDomains::contains,
url -> crawledUrls.contains(url.toString().hashCode()),
output::write);

logger.info("Reading files");
for (var fn : fileNames) {
CrawledDomainReader crawledDomainReader = new CrawledDomainReader();
@@ -85,131 +156,38 @@ public class LinkKeywordExtractorMain {
System.out.println("# " + crawledDomain.domain);

for (var doc : crawledDomain.doc) {
try {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
processDocument(doc.url, doc.documentBody);
}
}
catch (URISyntaxException ex) {
// This Shouldn't Happen (TM) as the URL that we're failing to process
// is expected to have already been parsed by this code successfully
// in the process of getting here.
//
// But also, if it does happen, it's no big deal

logger.warn("Bad URL format", ex);
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
}
}
}
}

private void processDocument(String docUrl, String documentBody) throws URISyntaxException {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);

for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);

processAnchor(documentUrl, href, text);
}
}
}

private final Pattern anchorTextNoise = Pattern.compile("[\\s\"()“”:]+");
private static class UrlKeywordTsvWriter implements AutoCloseable {

private String getLinkText(Element link) {
String text = link.text();
private final OutputStream stream;

if (link.text().isBlank()) {
text = getLinkTextByImgAltTag(link);
UrlKeywordTsvWriter(Path outputFile) throws IOException {
this.stream = new BufferedOutputStream(new FileOutputStream(outputFile.toFile()));
}

return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}

private String getLinkTextByImgAltTag(Element link) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
return img.attr("alt");
void write(EdgeUrl url, String keyword) {
try {
stream.write(url.toString().getBytes());
stream.write('\t');
stream.write(keyword.getBytes());
stream.write('\n');
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return "";
}

private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}

var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;

var linkUrl = optLinkUrl.get();

if (!isInterestingAnchorLink(linkUrl)) {
return;
}

DocumentLanguageData languageData = sentenceExtractor.extractSentences(text);
for (var sent : languageData.sentences) {
for (var wordPos : sent) {
if (wordPos.isStopWord())
continue;

String word = wordPos.wordLowerCase();

if (!WordPatterns.filter(word))
continue;

if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
System.out.println(linkUrl + "\t" + word);
@Override
public void close() throws IOException {
stream.close();
}
}
}
}
}

// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;

// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;

if (text.startsWith("http:") || text.startsWith("https:")) return false;

if (looksLikeAnURL.test(text)) return false;

return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}

private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}

return crawledDomains.contains(linkUrl.domain.toString());
}

private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;

hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;

return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}

}
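
The deduplicateHashBitset used by isNewKeywordForLink above acts like a single-hash Bloom filter: each (url, keyword) pair is hashed once, the corresponding bit is set, and the pair only counts as new if that bit was previously clear, accepting occasional false positives instead of a full deduplication pass. A minimal self-contained sketch of that idea, with a plain long[] standing in for DenseBitMap (the bitmap size here is an arbitrary choice for illustration):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import java.nio.charset.StandardCharsets;

// Illustrative sketch only: a single-hash, Bloom-filter-style deduplicator for (url, keyword) pairs.
// A plain long[] plays the role that DenseBitMap plays in the code above; the size is made up.
class KeywordDedupSketch {
    private final HashFunction hashFunction = Hashing.murmur3_128();
    private final long[] bits = new long[1 << 24];            // 2^24 longs = 2^30 bits (~128 MiB)
    private final long capacityBits = (long) bits.length * 64;

    // Returns true the first time a (url, keyword) pair is seen; a false positive
    // occasionally drops a genuinely new pair, which is the accepted trade-off.
    boolean isNew(String url, String keyword) {
        long hash = hashFunction.hashString(url, StandardCharsets.UTF_8).asLong()
                  ^ hashFunction.hashString(keyword, StandardCharsets.UTF_8).asLong();
        hash &= 0x7FFF_FFFF_FFFF_FFFFL;                       // clear the sign bit, as in isNewKeywordForLink
        long bitIndex = hash % capacityBits;
        int word = (int) (bitIndex >>> 6);
        long mask = 1L << (bitIndex & 63);
        boolean wasSet = (bits[word] & mask) != 0;
        bits[word] |= mask;
        return !wasSet;
    }
}
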
@@ -0,0 +1,149 @@
package nu.marginalia.wmsa.edge.converting.atags;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.nio.charset.StandardCharsets;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class AnchorTextExtractor {
private final Predicate<String> includeDomainPredicate;
private final Predicate<EdgeUrl> includeUrlPredicate;
private final BiConsumer<EdgeUrl, String> linkKeywordConsumer;

private final LinkParser linkParser = new LinkParser();

private final HashFunction hashFunction = Hashing.murmur3_128();

// This bit map is used as a bloom filter to deduplicate url-keyword combinations
// false positives are expected, but that's an acceptable trade-off to not have to deal with
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
Predicate<EdgeUrl> includeUrlPredicate,
BiConsumer<EdgeUrl, String> linkKeywordConsumer) {
this.includeDomainPredicate = includeDomainPredicate;
this.includeUrlPredicate = includeUrlPredicate;
this.linkKeywordConsumer = linkKeywordConsumer;
}

@SneakyThrows
public void processDocument(String docUrl, String documentBody) {
final Document processed = Jsoup.parse(documentBody);
final EdgeUrl documentUrl = new EdgeUrl(docUrl);

for (var link : processed.getElementsByTag("a")) {
if (link.hasAttr("href")) {
String href = link.attr("href");
String text = getLinkText(link);

processAnchor(documentUrl, href, text);
}
}
}

private final Pattern anchorTextNoise = Pattern.compile("[ \t\n\"()“”]+");

private String getLinkText(Element link) {
String text = link.text();

if (link.text().isBlank()) {
for (var img: link.getElementsByTag("img")) {
if (img.hasAttr("alt")) {
text = img.attr("alt");
break;
}
}
}

return anchorTextNoise.matcher(text.toLowerCase()).replaceAll(" ").trim();
}

private void processAnchor(EdgeUrl documentUrl, String href, String text) {
if (!isInterestingAnchorText(text)) {
return;
}
if (href.contains("?")) {
return;
}

var optLinkUrl = linkParser.parseLink(documentUrl, href);
if (optLinkUrl.isEmpty()) return;

var linkUrl = optLinkUrl.get();

if (!isInterestingAnchorLink(linkUrl)) {
return;
}

for (String word: anchorTextNoise.split(text)) {
if (WordPatterns.isStopWord(word))
continue;

word = word.toLowerCase();
if (!WordPatterns.filter(word))
continue;

if (!linkUrl.domain.equals(documentUrl.domain)) {
if (isNewKeywordForLink(word, linkUrl.toString())) {
linkKeywordConsumer.accept(linkUrl, word);
}
}
}
}

// This pattern doesn't need to perfectly capture all anchor texts that are URLs, if it gets 95% that's fine
private final Predicate<String> looksLikeAnURL = Pattern.compile("(\\p{Alpha}+://)?[\\p{Alnum}.]+(/[^/]+)+").asMatchPredicate();

private boolean isInterestingAnchorText(String text) {
if (text.isBlank()) return false;
if (text.length() > 32) return false;

// Google loves questions, and so do SEO spammers
if (text.endsWith("?")) return false;

if (text.startsWith("http:") || text.startsWith("https:")) return false;

if (looksLikeAnURL.test(text)) return false;

return switch (text) {
case "this", "here", "click", "click here", "download", "source" -> false;
default -> true;
};
}

private boolean isInterestingAnchorLink(EdgeUrl linkUrl) {
if (!(linkUrl.proto.endsWith("http") || linkUrl.proto.equals("https"))) {
return false;
}

if (!includeUrlPredicate.test(linkUrl)) {
return false;
}

return includeDomainPredicate.test(linkUrl.domain.toString());
}

private boolean isNewKeywordForLink(String href, String text) {
long hash = 0;

hash ^= hashFunction.hashString(href, StandardCharsets.UTF_8).asLong();
hash ^= hashFunction.hashString(text, StandardCharsets.UTF_8).asLong();

// Remove sign bit because we don't want a negative index in deduplicateHashBitset
hash &= 0x7FFF_FFFF_FFFF_FFFFL;

return !deduplicateHashBitset.set(hash % deduplicateHashBitset.cardinality);
}
}
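
For context, AnchorTextExtractor is driven entirely through the three constructor arguments shown above: a domain filter, a URL filter, and a consumer that receives each (link URL, keyword) pair. A hypothetical wiring example (the domain set, URLs, and HTML snippet below are invented for illustration; the real callers live in LinkKeywordExtractorMain):

import nu.marginalia.wmsa.edge.converting.atags.AnchorTextExtractor;

import java.util.Set;

class AnchorTextExtractorDemo {
    public static void main(String[] args) {
        // Hypothetical set of domains considered "crawled"; in the real tool this
        // comes from the crawl specification.
        Set<String> crawledDomains = Set.of("example.com", "blog.example.org");

        AnchorTextExtractor extractor = new AnchorTextExtractor(
                crawledDomains::contains,                     // only keep links pointing into crawled domains
                url -> true,                                  // no per-URL filter in this sketch
                (url, keyword) -> System.out.println(url + "\t" + keyword));

        // Feed it one document; anchor texts of external links are split into
        // keywords and handed to the consumer above.
        extractor.processDocument("https://example.com/start",
                "<a href=\"https://blog.example.org/post\">static site generators</a>");
    }
}
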
@@ -18,20 +18,20 @@

package org.openzim.ZIMTypes;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

import com.github.luben.zstd.RecyclingBufferPool;
import com.github.luben.zstd.ZstdInputStream;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.tukaani.xz.SingleXZInputStream;
import org.openzim.util.RandomAcessFileZIMInputStream;
import org.openzim.util.Utilities;
import org.tukaani.xz.SingleXZInputStream;

import java.io.*;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;

/**
* @author Arunesh Mathur
@@ -401,198 +401,6 @@ public class ZIMReader {

}

public String getArticleData(DirectoryEntry mainEntry) throws IOException {

byte[] buffer = new byte[8];

if (mainEntry != null) {

// Check what kind of an entry was mainEntry
if (mainEntry.getClass() == ArticleEntry.class) {

// Cast to ArticleEntry
ArticleEntry article = (ArticleEntry) mainEntry;

// Get the cluster and blob numbers from the article
long clusterNumber = article.getClusterNumber();
int blobNumber = article.getBlobnumber();

// Move to the cluster entry in the clusterPtrPos
mReader.seek(mFile.getClusterPtrPos() + clusterNumber * 8);

// Read the location of the cluster
long clusterPos = mReader
.readEightLittleEndianBytesValue(buffer);

// Move to the cluster
mReader.seek(clusterPos);

// Read the first byte, for compression information
int compressionType = mReader.read();

// Reference declaration
SingleXZInputStream xzReader = null;
int firstOffset, numberOfBlobs, offset1,
offset2,
location,
differenceOffset;

ByteArrayOutputStream baos;

// Check the compression type that was read
switch (compressionType) {

// TODO: Read uncompressed data directly
case 0:
case 1:

// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating

// Read the first offset
mReader.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if (blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(mReader, location);
mReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

mReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(mReader,
(offset1 - 4 * (blobNumber + 2)));

mReader.read(buffer, 0, differenceOffset);

return new String(buffer);

// LZMA2 compressed data
case 4:

// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
xzReader = new SingleXZInputStream(mReader, 4194304);

// Read the first offset
xzReader.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(xzReader, location);
xzReader.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

xzReader.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(xzReader,
(offset1 - 4 * (blobNumber + 2)));

xzReader.read(buffer, 0, differenceOffset);
return new String(buffer);

case 5:
// Read the first 4 bytes to find out the number of articles
buffer = new byte[4];

// Create a dictionary with size 40MiB, the zimlib uses this
// size while creating
var zstdInputStream = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(mReader));

// Read the first offset
zstdInputStream.read(buffer);

// The first four bytes are the offset of the zeroth blob
firstOffset = Utilities
.toFourLittleEndianInteger(buffer);

// The number of blobs
numberOfBlobs = firstOffset / 4;

// The blobNumber has to be less than the numberOfBlobs
assert blobNumber < numberOfBlobs;

if(blobNumber == 0) {
// The first offset is what we read earlier
offset1 = firstOffset;
} else {

location = (blobNumber - 1) * 4;
Utilities.skipFully(zstdInputStream, location);
zstdInputStream.read(buffer);
offset1 = Utilities.toFourLittleEndianInteger(buffer);
}

zstdInputStream.read(buffer);
offset2 = Utilities.toFourLittleEndianInteger(buffer);

differenceOffset = offset2 - offset1;
buffer = new byte[differenceOffset];

Utilities.skipFully(zstdInputStream,
(offset1 - 4 * (blobNumber + 2)));

zstdInputStream.read(buffer, 0, differenceOffset);

return new String(buffer);

default:
System.err.print("What is compression = " + compressionType);

}

}
}

return null;

}

public DirectoryEntry getDirectoryInfoAtTitlePosition(long position)
throws IOException {
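
For reference, the blob lookup that getArticleData performs on a decompressed cluster comes down to a small amount of offset arithmetic: the first 4-byte little-endian value is the offset of blob 0 (so the pointer table holds firstOffset / 4 entries), blob n spans [offset[n], offset[n+1]), and since 4 * (n + 2) bytes of the table have already been consumed once both offsets are read, the stream is skipped forward by offset[n] - 4 * (n + 2) before reading the blob body. A standalone sketch of just that arithmetic (helper names are invented; it assumes the stream is already positioned at the start of the decompressed cluster body):

import java.io.IOException;
import java.io.InputStream;

class ZimBlobSketch {

    // Reads blob `blobNumber` from a stream positioned at the start of a decompressed
    // ZIM cluster body, i.e. at the beginning of the blob pointer table.
    static byte[] readBlob(InputStream cluster, int blobNumber) throws IOException {
        int firstOffset = readFourLittleEndian(cluster);        // offset of blob 0 = size of the pointer table
        int numberOfBlobs = firstOffset / 4;
        if (blobNumber >= numberOfBlobs)
            throw new IllegalArgumentException("blob number out of range");

        int offset1;
        if (blobNumber == 0) {
            offset1 = firstOffset;
        } else {
            cluster.skipNBytes((long) (blobNumber - 1) * 4);    // skip table entries 1 .. blobNumber-1
            offset1 = readFourLittleEndian(cluster);            // start of blob blobNumber
        }
        int offset2 = readFourLittleEndian(cluster);            // start of the following blob

        byte[] blob = new byte[offset2 - offset1];
        cluster.skipNBytes(offset1 - 4L * (blobNumber + 2));    // 4 * (blobNumber + 2) table bytes already consumed
        cluster.readNBytes(blob, 0, blob.length);
        return blob;
    }

    private static int readFourLittleEndian(InputStream in) throws IOException {
        byte[] b = in.readNBytes(4);
        return (b[0] & 0xFF) | (b[1] & 0xFF) << 8 | (b[2] & 0xFF) << 16 | (b[3] & 0xFF) << 24;
    }
}
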