From a433bbbe454971c9f5308f221bf68e525dd06b40 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 24 Sep 2023 19:39:27 +0200 Subject: [PATCH] (converter) Fix rare sentence extractor bug It was caused by non-thread safe concurrent memory access in SentenceExtractor. --- .../language/sentence/SentenceExtractor.java | 5 +++++ .../ThreadLocalSentenceExtractorProvider.java | 19 +++++++++++++++++++ .../plugin/HtmlDocumentProcessorPlugin.java | 9 ++++++--- .../sideload/SideloadSourceFactory.java | 10 +++++----- .../sideload/SideloaderProcessing.java | 6 ++++-- .../StackexchangeSideloader.java | 10 +++++----- 6 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/ThreadLocalSentenceExtractorProvider.java diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java index 88a73148..4cbdaf29 100644 --- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -22,6 +22,11 @@ import java.io.IOException; import java.io.InputStream; import java.util.*; +/** This class still isn't thread safe!! If you use it concurrently, it won't throw exceptions, + * it will just haunt your code and cause unpredictable mysterious errors. + * + * Use {@link ThreadLocalSentenceExtractorProvider} instead to avoid falling into the twilight zone! + */ public class SentenceExtractor { private SentenceDetectorME sentenceDetector; diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/ThreadLocalSentenceExtractorProvider.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/ThreadLocalSentenceExtractorProvider.java new file mode 100644 index 00000000..6ea76e01 --- /dev/null +++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/sentence/ThreadLocalSentenceExtractorProvider.java @@ -0,0 +1,19 @@ +package nu.marginalia.language.sentence; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.LanguageModels; + +@Singleton +public class ThreadLocalSentenceExtractorProvider { + private final ThreadLocal sentenceExtractorThreadLocal; + + @Inject + public ThreadLocalSentenceExtractorProvider(LanguageModels languageModels) { + sentenceExtractorThreadLocal = ThreadLocal.withInitial(() -> new SentenceExtractor(languageModels)); + } + + public SentenceExtractor get() { + return sentenceExtractorThreadLocal.get(); + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index c51e9690..0e8ed9bb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.processor.logic.links.FileLinks; import nu.marginalia.converting.processor.logic.links.LinkProcessor; import nu.marginalia.converting.processor.plugin.specialization.*; import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; @@ -44,7 +45,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private final Logger logger = LoggerFactory.getLogger(getClass()); private final double minDocumentQuality; - private final SentenceExtractor sentenceExtractor; private final FeatureExtractor featureExtractor; private final TitleExtractor titleExtractor; private final DocumentKeywordExtractor keywordExtractor; @@ -59,6 +59,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private static final LinkParser linkParser = new LinkParser(); private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); + private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final HtmlProcessorSpecializations htmlProcessorSpecializations; @Inject @@ -73,13 +74,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin DocumentLengthLogic documentLengthLogic, MetaRobotsTag metaRobotsTag, DocumentGeneratorExtractor documentGeneratorExtractor, + ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, HtmlProcessorSpecializations specializations) { super(languageFilter); this.documentLengthLogic = documentLengthLogic; this.minDocumentQuality = minDocumentQuality; - this.sentenceExtractor = sentenceExtractor; this.featureExtractor = featureExtractor; this.titleExtractor = titleExtractor; @@ -88,6 +89,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin this.metaRobotsTag = metaRobotsTag; this.documentGeneratorExtractor = documentGeneratorExtractor; + this.sentenceExtractorProvider = sentenceExtractorProvider; this.htmlProcessorSpecializations = specializations; } @@ -122,7 +124,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); } - DocumentLanguageData dld = sentenceExtractor.extractSentences(specialization.prune(doc)); + DocumentLanguageData dld = + sentenceExtractorProvider.get().extractSentences(specialization.prune(doc)); checkDocumentLanguage(dld); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 18db740e..76f4de9c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -6,7 +6,7 @@ import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader; import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import java.io.IOException; import java.nio.file.Files; @@ -17,19 +17,19 @@ import java.util.Collection; public class SideloadSourceFactory { private final Gson gson; private final SideloaderProcessing sideloaderProcessing; - private final SentenceExtractor sentenceExtractor; + private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor documentKeywordExtractor; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; @Inject public SideloadSourceFactory(Gson gson, SideloaderProcessing sideloaderProcessing, - SentenceExtractor sentenceExtractor, + ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, DocumentKeywordExtractor documentKeywordExtractor, DirtreeSideloaderFactory dirtreeSideloaderFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; - this.sentenceExtractor = sentenceExtractor; + this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; } @@ -48,7 +48,7 @@ public class SideloadSourceFactory { return dirs .filter(Files::isRegularFile) .filter(f -> f.toFile().getName().endsWith(".db")) - .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractor, documentKeywordExtractor)) + .map(dbFile -> new StackexchangeSideloader(dbFile, sentenceExtractorProvider, documentKeywordExtractor)) .toList(); } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 96df9b1f..8419687e 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -8,7 +8,6 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import java.net.URISyntaxException; import java.time.LocalDateTime; @@ -23,7 +22,10 @@ public class SideloaderProcessing { this.htmlProcessorPlugin = htmlProcessorPlugin; } - public ProcessedDocument processDocument(String url, String body, List extraKeywords, int size) throws URISyntaxException { + public ProcessedDocument processDocument(String url, + String body, + List extraKeywords, + int size) throws URISyntaxException { var crawledDoc = new CrawledDocument( "encyclopedia.marginalia.nu", url, diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index a1b1ffac..8fe840fe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -5,7 +5,7 @@ import nu.marginalia.converting.model.*; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; @@ -29,7 +29,7 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.TimeUnit; public class StackexchangeSideloader implements SideloadSource { - private final SentenceExtractor sentenceExtractor; + private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor keywordExtractor; private final String domainName; @@ -37,12 +37,12 @@ public class StackexchangeSideloader implements SideloadSource { @SneakyThrows public StackexchangeSideloader(Path pathToDbFile, - SentenceExtractor sentenceExtractor, + ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, DocumentKeywordExtractor keywordExtractor ) { this.dbFile = pathToDbFile; this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile); - this.sentenceExtractor = sentenceExtractor; + this.sentenceExtractorProvider = sentenceExtractorProvider; this.keywordExtractor = keywordExtractor; } @@ -109,7 +109,7 @@ public class StackexchangeSideloader implements SideloadSource { var url = new EdgeUrl(fullUrl); var doc = Jsoup.parse(fullHtml.toString()); - var dld = sentenceExtractor.extractSentences(doc); + var dld = sentenceExtractorProvider.get().extractSentences(doc); ret.url = url; ret.words = keywordExtractor.extractKeywords(dld, url);