mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
commit 285e657f68
Merge branch 'master' into term-positions

# Conflicts:
#	code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
#	code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java

@@ -12,6 +12,7 @@ import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
+import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
@@ -72,6 +73,8 @@ public class CrawlerMain extends ProcessMainClass {
     private final int node;
     private final SimpleBlockingThreadPool pool;
 
+    private final DomainLocks domainLocks = new DomainLocks();
+
     private final Map<String, String> processingIds = new ConcurrentHashMap<>();
 
     private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
@@ -272,10 +275,16 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
+            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain));
+
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
                  CrawlDataReference reference = getReference())
             {
+                // acquire the domain lock to prevent other threads from crawling the same domain,
+                // we release it at the end of the task to let them go ahead
+                Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
+                domainLock.acquire();
                 Thread.currentThread().setName("crawling:" + domain);
 
                 var domainLinks = anchorTagsSource.getAnchorTags(domain);
@@ -306,6 +315,9 @@ public class CrawlerMain extends ProcessMainClass {
                 logger.error("Error fetching domain " + domain, e);
             }
             finally {
+                // release the domain lock to permit other threads to crawl subdomains of this domain
+                domainLock.release();
+
                 // We don't need to double-count these; it's also kept int he workLog
                 processingIds.remove(domain);
                 Thread.currentThread().setName("[idle]");
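The three CrawlerMain hunks above wire a per-domain semaphore around each crawl task: the task takes a permit before it starts fetching and returns it in the finally block. Below is a minimal sketch of that acquire/try/finally pattern, reusing the DomainLocks and EdgeDomain types shown later in this diff; the enclosing method and its arguments are illustrative, not the actual CrawlerMain code.

    import java.util.concurrent.Semaphore;

    class DomainLockUsageSketch {
        private final DomainLocks domainLocks = new DomainLocks();

        void crawlOneDomain(String domain) throws InterruptedException {
            // one semaphore per top-level domain; its permit count bounds concurrent crawlers
            Semaphore domainLock = domainLocks.getSemaphore(new EdgeDomain(domain));

            domainLock.acquire();        // wait here until a permit for this domain is free
            try {
                // ... probe, fetch and record the domain ...
            }
            finally {
                domainLock.release();    // always return the permit, even if the crawl failed
            }
        }
    }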
code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java

@@ -28,6 +28,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.concurrent.TimeUnit;
 
 public class CrawlerRetreiver implements AutoCloseable {
 
@@ -93,6 +94,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                 new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
 
         try {
+            // Sleep a bit to avoid hammering the server with requests, we just probed it
+            TimeUnit.SECONDS.sleep(1);
+
+            // Fetch the domain
             return crawlDomain(oldCrawlData, probeResult, domainLinks);
         }
         catch (Exception ex) {
@@ -123,14 +128,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
-        sniffRootDocument(rootUrl);
+        delayTimer.waitFetchDelay(0); // initial delay after robots.txt
+        sniffRootDocument(rootUrl, delayTimer);
+        delayTimer.waitFetchDelay(0); // delay after sniffing
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
 
         if (recrawled > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
-            crawlFrontier.increaseDepth(1.5);
+            crawlFrontier.increaseDepth(1.5, 2500);
         }
 
         // Add external links to the crawl frontier
@@ -196,13 +203,28 @@ public class CrawlerRetreiver implements AutoCloseable {
         return fetchedCount;
     }
 
-    private void sniffRootDocument(EdgeUrl rootUrl) {
+    private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
         try {
             logger.debug("Configuring link filter");
 
             var url = rootUrl.withPathAndParam("/", null);
 
-            var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
+            HttpFetchResult result = null;
+
+            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+                try {
+                    result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
+                    break;
+                }
+                catch (RateLimitException ex) {
+                    timer.waitRetryDelay(ex);
+                }
+                catch (Exception ex) {
+                    logger.warn("Failed to fetch {}", url, ex);
+                    result = new HttpFetchResult.ResultException(ex);
+                }
+            }
+
             if (!(result instanceof HttpFetchResult.ResultOk ok))
                 return;
 
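The rewritten fetch above bounds the number of retries for the root document: on an HTTP 429 it waits out the retry delay and tries again, while any other failure is captured as a ResultException. Below is a self-contained sketch of the same bounded-retry idea, with a hypothetical fetch() and RateLimitedException standing in for the fetcher API used in the diff; the retry limit of 2 and the sleep-based backoff are illustrative, the real code uses HTTP_429_RETRY_LIMIT and the crawl-delay timer.

    class BoundedRetrySketch {
        static class RateLimitedException extends Exception { }   // stand-in for RateLimitException

        private static final int RETRY_LIMIT = 2;                 // illustrative value

        String fetchWithRetries(String url) throws InterruptedException {
            for (int i = 0; i <= RETRY_LIMIT; i++) {
                try {
                    return fetch(url);                             // hypothetical fetch, throws on HTTP 429
                }
                catch (RateLimitedException ex) {
                    Thread.sleep(1_000L * (i + 1));                // back off, then try again
                }
                catch (Exception ex) {
                    return null;                                   // non-retryable failure: give up
                }
            }
            return null;                                           // retries exhausted
        }

        String fetch(String url) throws Exception {                // placeholder implementation
            throw new RateLimitedException();
        }
    }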
DomainCrawlFrontier.java

@@ -54,8 +54,14 @@ public class DomainCrawlFrontier {
      * than the number of already visited documents, the base depth will be adjusted
      * to the visited count first.
      */
-    public void increaseDepth(double depthIncreaseFactor) {
-        depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor);
+    public void increaseDepth(double depthIncreaseFactor,
+                              int maxDepthIncreaseAbsolute
+    ) {
+        int base = Math.max(visited.size(), depth);
+
+        int scaledUp = (int)(base * depthIncreaseFactor);
+
+        depth = Math.min(base + maxDepthIncreaseAbsolute, scaledUp);
     }
 
     public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
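In effect the crawl depth can still grow multiplicatively, but never by more than a fixed absolute amount per call. A worked example of the new formula, using the factor and cap from the call site shown earlier (1.5 and 2500); the visited/depth values are illustrative:

    depth' = min(base + maxDepthIncreaseAbsolute, base * depthIncreaseFactor),  where base = max(visited, depth)

    small site:  visited = 1 000,   depth = 800,    factor = 1.5, cap = 2 500
                 base = 1 000,   scaledUp = 1 500,   depth' = min(3 500, 1 500)     = 1 500    (relative growth applies)

    large site:  visited = 100 000, depth = 90 000,  factor = 1.5, cap = 2 500
                 base = 100 000, scaledUp = 150 000, depth' = min(102 500, 150 000) = 102 500  (absolute cap applies)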
DomainLocks.java (new file)

@@ -0,0 +1,45 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.model.EdgeDomain;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Semaphore;
+
+/** Holds lock objects for each domain, to prevent multiple threads from
+ * crawling the same domain at the same time.
+ */
+public class DomainLocks {
+    // The locks are stored in a map, with the domain name as the key. This map will grow
+    // relatively big, but should be manageable since the number of domains is limited to
+    // a few hundred thousand typically.
+    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
+
+    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
+     * and may be held by another thread. The caller is responsible for locking and releasing the lock.
+     */
+    public Semaphore getSemaphore(EdgeDomain domain) {
+        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+    }
+
+    private Semaphore defaultPermits(String topDomain) {
+        if (topDomain.equals("wordpress.com"))
+            return new Semaphore(16);
+        if (topDomain.equals("blogspot.com"))
+            return new Semaphore(8);
+
+        if (topDomain.equals("neocities.org"))
+            return new Semaphore(4);
+        if (topDomain.equals("github.io"))
+            return new Semaphore(4);
+
+        if (topDomain.equals("substack.com")) {
+            return new Semaphore(1);
+        }
+        if (topDomain.endsWith(".edu")) {
+            return new Semaphore(1);
+        }
+
+        return new Semaphore(2);
+    }
+}
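Because the map is keyed on the top-level domain, every subdomain of a shared host resolves to the same semaphore, which is what makes the larger permit counts for the blog hosts meaningful. A brief illustration, assuming EdgeDomain reduces a host name to its registered domain (so that the topDomain of "alice.wordpress.com" is "wordpress.com"); the host names are examples only:

    DomainLocks locks = new DomainLocks();

    // same Semaphore instance for both (keyed on "wordpress.com", 16 permits),
    // so up to 16 wordpress.com blogs may be crawled concurrently
    Semaphore a = locks.getSemaphore(new EdgeDomain("alice.wordpress.com"));
    Semaphore b = locks.getSemaphore(new EdgeDomain("bob.wordpress.com"));

    // an ordinary domain falls through to the default of two permits
    Semaphore c = locks.getSemaphore(new EdgeDomain("www.example.com"));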
CrawlerRevisitor.java

@@ -38,6 +38,7 @@ public class CrawlerRevisitor {
         int recrawled = 0;
         int retained = 0;
         int errors = 0;
+        int skipped = 0;
 
         for (;;) {
             if (errors > 20) {
@@ -84,9 +85,32 @@
             }
 
 
-            if (recrawled > 5
-             && retained > 0.9 * recrawled
-             && Math.random() < 0.9)
+            double skipProb;
+
+            // calculate the probability of skipping this document based on the
+            // fraction of documents that haven't changed
+            if (recrawled > 0) {
+                skipProb = (double) retained / recrawled;
+
+                // If we've crawled a lot of documents, we'll be more conservative
+                // in trying to recrawl documents, to avoid hammering the server too much;
+                // in the case of a large change, we'll eventually catch it anyway
+
+                if (skipped + recrawled > 10_000) {
+                    skipProb = Math.clamp(skipProb, 0.75, 0.99);
+                } else if (skipped + recrawled > 1000) {
+                    skipProb = Math.clamp(skipProb, 0.5, 0.99);
+                } else {
+                    skipProb = Math.clamp(skipProb, 0, 0.95);
+                }
+
+            } else {
+                // If we haven't recrawled anything yet, we'll be more aggressive
+                // in trying to recrawl documents
+                skipProb = 0.25;
+            }
+
+            if (Math.random() < skipProb) //
             {
                 // Since it looks like most of these documents haven't changed,
                 // we'll load the documents directly; but we do this in a random
@@ -103,6 +127,8 @@
                         doc.documentBody,
                         new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
                 );
+
+                skipped++;
             }
             else {
                 // GET the document with the stored document as a reference
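With the skipped counter in place, the revisitor now derives the probability of replaying a stored document from the observed fraction of unchanged documents, clamped more tightly as the number of processed documents grows. A worked example with illustrative counts (not taken from the diff):

    retained = 180 unchanged out of recrawled = 200   ->  skipProb = 180 / 200 = 0.90
    skipped + recrawled = 250 (<= 1 000)              ->  clamp to [0, 0.95]   ->  skipProb stays 0.90

    So roughly 90% of the remaining reference documents are replayed from the old crawl data instead of
    being re-fetched. Once skipped + recrawled exceeds 10 000, the lower bound rises to 0.75, so at least
    three quarters of documents are skipped regardless of how much appears to have changed.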
SearchOperator.java

@@ -6,11 +6,14 @@ import lombok.SneakyThrows;
 import nu.marginalia.WebsiteUrl;
 import nu.marginalia.api.math.MathClient;
 import nu.marginalia.api.searchquery.QueryClient;
-import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.db.DbDomainQueries;
 import nu.marginalia.api.searchquery.model.query.QueryResponse;
+import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.search.command.SearchParameters;
-import nu.marginalia.search.model.*;
+import nu.marginalia.search.model.ClusteredUrlDetails;
+import nu.marginalia.search.model.DecoratedSearchResults;
+import nu.marginalia.search.model.SearchFilters;
+import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchQueryIndexService;
 import nu.marginalia.search.svc.SearchUnitConversionService;
 import org.apache.logging.log4j.util.Strings;
@@ -65,9 +68,10 @@ public class SearchOperator {
     }
 
     public List<UrlDetails> doSiteSearch(String domain,
+                                         int domainId,
                                          int count) {
 
-        var queryParams = paramFactory.forSiteSearch(domain, count);
+        var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
         var queryResponse = queryClient.search(queryParams);
 
         return searchQueryService.getResultsFromQuery(queryResponse);
SearchQueryParamFactory.java

@@ -1,12 +1,12 @@
 package nu.marginalia.search;
 
-import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
+import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
-import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.search.command.SearchParameters;
 
 import java.util.List;
@@ -42,7 +42,7 @@ public class SearchQueryParamFactory {
 
     }
 
-    public QueryParams forSiteSearch(String domain, int count) {
+    public QueryParams forSiteSearch(String domain, int domainId, int count) {
         return new QueryParams("site:"+domain,
                 null,
                 List.of(),
@@ -53,7 +53,7 @@ public class SearchQueryParamFactory {
                 SpecificationLimit.none(),
                 SpecificationLimit.none(),
                 SpecificationLimit.none(),
-                List.of(),
+                List.of(domainId),
                 new QueryLimits(count, count, 100, 512),
                 SearchSetIdentifier.NONE.name(),
                 QueryStrategy.AUTO,
SearchSiteInfoService.java

@@ -5,13 +5,13 @@ import nu.marginalia.api.domains.DomainInfoClient;
 import nu.marginalia.api.domains.model.DomainInformation;
 import nu.marginalia.api.domains.model.SimilarDomain;
 import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.feedlot.model.FeedItems;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.renderer.MustacheRenderer;
 import nu.marginalia.renderer.RendererFactory;
 import nu.marginalia.screenshot.ScreenshotService;
 import nu.marginalia.search.SearchOperator;
-import nu.marginalia.feedlot.FeedlotClient;
 import nu.marginalia.search.model.UrlDetails;
 import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
 import org.slf4j.Logger;
@@ -153,7 +153,7 @@ public class SearchSiteInfoService {
             linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
         }
 
-        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, 5);
+        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId,5);
         if (!sampleResults.isEmpty()) {
             url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
         }
@@ -195,9 +195,10 @@ public class SearchSiteInfoService {
     }
 
     private Docs listDocs(String domainName) {
+        int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
         return new Docs(domainName,
                 domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
-                searchOperator.doSiteSearch(domainName, 100));
+                searchOperator.doSiteSearch(domainName, domainId, 100));
     }
 
     public record Docs(Map<String, Boolean> view,
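Taken together, the search-side hunks thread the numeric domain id from SearchSiteInfoService down into the query parameters, so a site search is no longer expressed only through the textual "site:" term. A condensed view of the call chain as it stands after this diff, with signatures as shown above:

    int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);

    List<UrlDetails> docs = searchOperator.doSiteSearch(domainName, domainId, 100);
    //   -> paramFactory.forSiteSearch(domain, domainId, count)
    //        -> new QueryParams("site:" + domain, ..., List.of(domainId), ...)   // domainId replaces the former empty list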
model download script

@@ -66,8 +66,8 @@ fi
 
 download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
-download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
-download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
+download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
+download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
 download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin a2650796c77968b1bd9db0d7c01e3150
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569