mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Refactor crawler and add special logic for some platforms
* Break apart CrawlerRetreiver
* Break apart HttpFetcher into an interface and impl for testing sanity
* Add special logic for Lemmy, Mediawiki and Discourse to not waste requests on paths that aren't interesting.
This commit is contained in:
parent
5abaf13192
commit
ed373eef61
@ -2,13 +2,14 @@ package nu.marginalia.crawl;
|
||||
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import plan.CrawlPlanLoader;
|
||||
import plan.CrawlPlan;
|
||||
import nu.marginalia.crawling.io.CrawledDomainWriter;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import okhttp3.ConnectionPool;
|
||||
import okhttp3.Dispatcher;
|
||||
import okhttp3.internal.Util;
|
||||
@ -102,8 +103,8 @@ public class CrawlerMain implements AutoCloseable {
|
||||
if (workLog.isJobFinished(specification.id))
|
||||
return;
|
||||
|
||||
HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
|
||||
|
||||
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
|
||||
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
|
||||
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
|
||||
|
||||
|
@ -3,11 +3,12 @@ package nu.marginalia.crawl.retreival;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.*;
|
||||
import nu.marginalia.ip_blocklist.GeoIpBlocklist;
|
||||
import nu.marginalia.ip_blocklist.IpBlockList;
|
||||
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@ -20,10 +21,9 @@ import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
@ -32,16 +32,18 @@ public class CrawlerRetreiver {
|
||||
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
|
||||
private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
|
||||
|
||||
private static final int MAX_ERRORS = 10;
|
||||
private static final int MAX_ERRORS = 20;
|
||||
|
||||
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
|
||||
private final HttpFetcher fetcher;
|
||||
|
||||
private final HashSet<String> visited;
|
||||
private final HashSet<String> known;
|
||||
|
||||
/** Flag to indicate that the crawler should slow down, e.g. from 429s */
|
||||
private boolean slowDown = false;
|
||||
|
||||
private final int depth;
|
||||
|
||||
/** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */
|
||||
private boolean testFlagIgnoreDelay = false;
|
||||
|
||||
private final String id;
|
||||
private final String domain;
|
||||
private final Consumer<SerializableCrawlData> crawledDomainWriter;
|
||||
@ -50,118 +52,120 @@ public class CrawlerRetreiver {
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
|
||||
|
||||
private static final HashFunction hashMethod = Hashing.murmur3_128(0);
|
||||
private static final IpBlockList ipBlocklist;
|
||||
private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
|
||||
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
|
||||
|
||||
private static final DomainProber domainProber = new DomainProber();
|
||||
private final DomainCrawlFrontier crawlFrontier;
|
||||
|
||||
|
||||
int errorCount = 0;
|
||||
|
||||
static {
|
||||
try {
|
||||
ipBlocklist = new IpBlockList(new GeoIpBlocklist());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
|
||||
this.fetcher = fetcher;
|
||||
visited = new HashSet<>((int)(specs.urls.size() * 1.5));
|
||||
known = new HashSet<>(specs.urls.size() * 10);
|
||||
|
||||
depth = specs.crawlDepth;
|
||||
id = specs.id;
|
||||
domain = specs.domain;
|
||||
|
||||
crawledDomainWriter = writer;
|
||||
|
||||
for (String urlStr : specs.urls) {
|
||||
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
||||
}
|
||||
this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
|
||||
|
||||
if (queue.peek() != null) {
|
||||
var fst = queue.peek();
|
||||
var fst = crawlFrontier.peek();
|
||||
if (fst != null) {
|
||||
|
||||
// Ensure the index page is always crawled
|
||||
var root = fst.withPathAndParam("/", null);
|
||||
if (known.add(root.toString()))
|
||||
queue.addFirst(root);
|
||||
if (crawlFrontier.addKnown(root))
|
||||
crawlFrontier.addFirst(root);
|
||||
}
|
||||
else {
|
||||
addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
|
||||
addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
|
||||
// We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS
|
||||
crawlFrontier.addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
|
||||
crawlFrontier.addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
|
||||
}
|
||||
}
|
||||
|
||||
public CrawlerRetreiver withNoDelay() {
|
||||
testFlagIgnoreDelay = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int fetch() {
|
||||
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain);
|
||||
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
|
||||
|
||||
if (probeResult.isPresent()) {
|
||||
crawledDomainWriter.accept(probeResult.get());
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
if (probeResult instanceof DomainProber.ProbeResultOk) {
|
||||
return crawlDomain();
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<CrawledDomain> probeDomainForProblems(String domain) {
|
||||
EdgeUrl fst = queue.peek();
|
||||
// handle error cases for probe
|
||||
|
||||
var ip = findIp(domain);
|
||||
|
||||
if (fst == null) {
|
||||
logger.warn("No URLs for domain {}", domain);
|
||||
|
||||
return Optional.of(CrawledDomain.builder()
|
||||
.crawlerStatus(CrawlerDomainStatus.ERROR.name())
|
||||
.crawlerStatusDesc("No known URLs")
|
||||
if (probeResult instanceof DomainProber.ProbeResultError err) {
|
||||
crawledDomainWriter.accept(
|
||||
CrawledDomain.builder()
|
||||
.crawlerStatus(err.status().name())
|
||||
.crawlerStatusDesc(err.desc())
|
||||
.id(id)
|
||||
.domain(domain)
|
||||
.build());
|
||||
.ip(ip)
|
||||
.build()
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ipBlocklist.isAllowed(fst.domain)) {
|
||||
return Optional.of(CrawledDomain.builder()
|
||||
.crawlerStatus(CrawlerDomainStatus.BLOCKED.name())
|
||||
if (probeResult instanceof DomainProber.ProbeResultRedirect redirect) {
|
||||
crawledDomainWriter.accept(
|
||||
CrawledDomain.builder()
|
||||
.crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
|
||||
.crawlerStatusDesc("Redirected to different domain")
|
||||
.redirectDomain(redirect.domain().toString())
|
||||
.id(id)
|
||||
.domain(domain)
|
||||
.ip(findIp(domain))
|
||||
.build());
|
||||
.ip(ip)
|
||||
.build()
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
|
||||
var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null));
|
||||
if (!fetchResult.ok()) {
|
||||
logger.debug("Bad status on {}", domain);
|
||||
return Optional.of(createErrorPostFromStatus(fetchResult));
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
throw new IllegalStateException("Unknown probe result: " + probeResult);
|
||||
};
|
||||
|
||||
private int crawlDomain() {
|
||||
String ip = findIp(domain);
|
||||
|
||||
assert !queue.isEmpty();
|
||||
assert !crawlFrontier.isEmpty();
|
||||
|
||||
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain);
|
||||
var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
|
||||
long crawlDelay = robotsRules.getCrawlDelay();
|
||||
|
||||
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
|
||||
|
||||
int fetchedCount = 0;
|
||||
|
||||
while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) {
|
||||
var top = queue.removeFirst();
|
||||
configureLinkFilter();
|
||||
|
||||
while (!crawlFrontier.isEmpty()
|
||||
&& !crawlFrontier.isCrawlDepthReached()
|
||||
&& errorCount < MAX_ERRORS)
|
||||
{
|
||||
var top = crawlFrontier.takeNextUrl();
|
||||
|
||||
if (!robotsRules.isAllowed(top.toString())) {
|
||||
crawledDomainWriter.accept(createRobotsError(top));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!crawlFrontier.filterLink(top))
|
||||
continue;
|
||||
if (urlBlocklist.isUrlBlocked(top))
|
||||
continue;
|
||||
if (!isAllowedProtocol(top.proto))
|
||||
continue;
|
||||
if (top.toString().length() > 255)
|
||||
continue;
|
||||
if (!visited.add(top.toString()))
|
||||
if (!crawlFrontier.addVisited(top))
|
||||
continue;
|
||||
|
||||
if (fetchDocument(top, crawlDelay)) {
|
||||
@ -176,8 +180,22 @@ public class CrawlerRetreiver {
|
||||
return fetchedCount;
|
||||
}
|
||||
|
||||
private void configureLinkFilter() {
|
||||
try {
|
||||
logger.info("Configuring link filter");
|
||||
|
||||
fetchUrl(crawlFrontier.peek())
|
||||
.map(linkFilterSelector::selectFilter)
|
||||
.ifPresent(crawlFrontier::setLinkFilter);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error configuring link filter", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean fetchDocument(EdgeUrl top, long crawlDelay) {
|
||||
logger.debug("Fetching {}", top);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
var doc = fetchUrl(top);
|
||||
@ -186,10 +204,10 @@ public class CrawlerRetreiver {
|
||||
crawledDomainWriter.accept(d);
|
||||
|
||||
if (d.url != null) {
|
||||
EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
|
||||
EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
|
||||
}
|
||||
|
||||
if ("ERROR".equals(d.crawlerStatus)) {
|
||||
if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
|
||||
errorCount++;
|
||||
}
|
||||
|
||||
@ -211,7 +229,6 @@ public class CrawlerRetreiver {
|
||||
var doc = fetchContent(top);
|
||||
|
||||
if (doc.documentBody != null) {
|
||||
|
||||
doc.documentBodyHash = createHash(doc.documentBody.decode());
|
||||
|
||||
Optional<Document> parsedDoc = parseDoc(doc);
|
||||
@ -260,37 +277,23 @@ public class CrawlerRetreiver {
|
||||
return Optional.of(Jsoup.parse(doc.documentBody.decode()));
|
||||
}
|
||||
|
||||
public boolean isSameDomain(EdgeUrl url) {
|
||||
return domain.equalsIgnoreCase(url.domain.toString());
|
||||
}
|
||||
|
||||
private void findLinks(EdgeUrl baseUrl, Document parsed) {
|
||||
baseUrl = linkParser.getBaseLink(parsed, baseUrl);
|
||||
|
||||
for (var link : parsed.getElementsByTag("a")) {
|
||||
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue);
|
||||
linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
|
||||
}
|
||||
for (var link : parsed.getElementsByTag("frame")) {
|
||||
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
|
||||
linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
|
||||
}
|
||||
for (var link : parsed.getElementsByTag("iframe")) {
|
||||
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue);
|
||||
linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
|
||||
}
|
||||
for (var link : parsed.getElementsByTag("link")) {
|
||||
String rel = link.attr("rel");
|
||||
if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
|
||||
linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
|
||||
}
|
||||
|
||||
private void addToQueue(EdgeUrl url) {
|
||||
if (!isSameDomain(url))
|
||||
return;
|
||||
if (urlBlocklist.isUrlBlocked(url))
|
||||
return;
|
||||
if (urlBlocklist.isMailingListLink(url))
|
||||
return;
|
||||
// reduce memory usage by not growing queue huge when crawling large sites
|
||||
if (queue.size() + visited.size() >= depth + 100)
|
||||
return;
|
||||
|
||||
if (known.add(url.toString())) {
|
||||
queue.addLast(url);
|
||||
}
|
||||
}
|
||||
|
||||
@ -314,6 +317,9 @@ public class CrawlerRetreiver {
|
||||
|
||||
@SneakyThrows
|
||||
private void delay(long sleepTime, long spentTime) {
|
||||
if (testFlagIgnoreDelay)
|
||||
return;
|
||||
|
||||
if (sleepTime >= 1) {
|
||||
if (spentTime > sleepTime)
|
||||
return;
|
||||
@ -355,17 +361,17 @@ public class CrawlerRetreiver {
|
||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
|
||||
.build();
|
||||
}
|
||||
private CrawledDomain createErrorPostFromStatus(HttpFetcher.FetchResult ret) {
|
||||
private CrawledDomain createErrorPostFromStatus(FetchResult ret) {
|
||||
String ip = findIp(domain);
|
||||
|
||||
if (ret.state == HttpFetcher.FetchResultState.ERROR) {
|
||||
if (ret.state == FetchResultState.ERROR) {
|
||||
return CrawledDomain.builder()
|
||||
.crawlerStatus(CrawlerDomainStatus.ERROR.name())
|
||||
.id(id).domain(domain)
|
||||
.ip(ip)
|
||||
.build();
|
||||
}
|
||||
if (ret.state == HttpFetcher.FetchResultState.REDIRECT) {
|
||||
if (ret.state == FetchResultState.REDIRECT) {
|
||||
return CrawledDomain.builder()
|
||||
.crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
|
||||
.id(id)
|
||||
@ -377,4 +383,5 @@ public class CrawlerRetreiver {
|
||||
throw new AssertionError("Unexpected case");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,99 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class DomainCrawlFrontier {
|
||||
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
|
||||
private final HashSet<String> visited;
|
||||
private final HashSet<String> known;
|
||||
|
||||
private final EdgeDomain thisDomain;
|
||||
private final UrlBlocklist urlBlocklist;
|
||||
|
||||
private Predicate<EdgeUrl> linkFilter = url -> true;
|
||||
|
||||
final int depth;
|
||||
|
||||
public DomainCrawlFrontier(EdgeDomain thisDomain, Collection<String> urls, int depth) {
|
||||
this.thisDomain = thisDomain;
|
||||
this.urlBlocklist = new UrlBlocklist();
|
||||
this.depth = depth;
|
||||
|
||||
visited = new HashSet<>((int)(urls.size() * 1.5));
|
||||
known = new HashSet<>(urls.size() * 10);
|
||||
|
||||
for (String urlStr : urls) {
|
||||
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
||||
}
|
||||
}
|
||||
|
||||
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
|
||||
this.linkFilter = linkFilter;
|
||||
}
|
||||
|
||||
public boolean isCrawlDepthReached() {
|
||||
return visited.size() >= depth;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return queue.isEmpty();
|
||||
}
|
||||
public boolean addKnown(EdgeUrl url) {
|
||||
return known.contains(url.toString());
|
||||
}
|
||||
public void addFirst(EdgeUrl url) {
|
||||
queue.addFirst(url);
|
||||
}
|
||||
|
||||
public EdgeUrl takeNextUrl() {
|
||||
return queue.removeFirst();
|
||||
}
|
||||
|
||||
public EdgeUrl peek() {
|
||||
return queue.peek();
|
||||
}
|
||||
|
||||
public boolean addVisited(EdgeUrl url) {
|
||||
return visited.add(url.toString());
|
||||
}
|
||||
|
||||
public boolean filterLink(EdgeUrl url) {
|
||||
return linkFilter.test(url);
|
||||
}
|
||||
|
||||
public void addToQueue(EdgeUrl url) {
|
||||
if (!isSameDomain(url))
|
||||
return;
|
||||
if (urlBlocklist.isUrlBlocked(url))
|
||||
return;
|
||||
if (urlBlocklist.isMailingListLink(url))
|
||||
return;
|
||||
if (!linkFilter.test(url))
|
||||
return;
|
||||
|
||||
// reduce memory usage by not growing queue huge when crawling large sites
|
||||
if (queue.size() + visited.size() >= depth + 100)
|
||||
return;
|
||||
|
||||
if (known.add(url.toString())) {
|
||||
queue.addLast(url);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean isSameDomain(EdgeUrl url) {
|
||||
return Objects.equals(thisDomain, url.domain);
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawling.model.CrawlerDomainStatus;
|
||||
import nu.marginalia.ip_blocklist.GeoIpBlocklist;
|
||||
import nu.marginalia.ip_blocklist.IpBlockList;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public class DomainProber {
|
||||
private final Logger logger = LoggerFactory.getLogger(DomainProber.class);
|
||||
private static IpBlockList ipBlockList;
|
||||
|
||||
static {
|
||||
try {
|
||||
ipBlockList = new IpBlockList(new GeoIpBlocklist());
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** To detect problems early we do a probing request to the domain before we start crawling it properly.
|
||||
* This is a HEAD, typically to the root path. We check the IP against the blocklist, we check that it
|
||||
* doesn't immediately redirect to another domain (which should be crawled separately, not under the name
|
||||
* of this domain).
|
||||
*/
|
||||
public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
|
||||
|
||||
if (firstUrlInQueue == null) {
|
||||
logger.warn("No valid URLs for domain {}", domain);
|
||||
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
|
||||
}
|
||||
|
||||
if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
|
||||
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
|
||||
|
||||
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
|
||||
|
||||
if (fetchResult.ok())
|
||||
return new ProbeResultOk();
|
||||
|
||||
if (fetchResult.state == FetchResultState.REDIRECT)
|
||||
return new ProbeResultRedirect(fetchResult.domain);
|
||||
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
|
||||
}
|
||||
|
||||
interface ProbeResult {};
|
||||
|
||||
record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
|
||||
record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
|
||||
record ProbeResultOk() implements ProbeResult {}
|
||||
}
|
@ -1,108 +0,0 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.client.exception.NetworkException;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.Response;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
// TODO: Is this used?
|
||||
@Singleton
|
||||
public class HttpRedirectResolver {
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final String userAgent;
|
||||
private final Cookies cookies = new Cookies();
|
||||
|
||||
private final OkHttpClient client = createClient();
|
||||
|
||||
@SneakyThrows
|
||||
private OkHttpClient createClient() {
|
||||
|
||||
return new OkHttpClient.Builder()
|
||||
.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
|
||||
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
|
||||
.cookieJar(cookies.getJar())
|
||||
.followRedirects(false)
|
||||
.followSslRedirects(false)
|
||||
.connectTimeout(8, TimeUnit.SECONDS)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpRedirectResolver(@Named("user-agent") String userAgent) {
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Observable<EdgeUrl> probe(EdgeUrl url) {
|
||||
return probe(url, 0);
|
||||
}
|
||||
|
||||
private Observable<EdgeUrl> probe(EdgeUrl url, int depth) {
|
||||
if (depth > 10) {
|
||||
return Observable.error(new IllegalStateException("Too many redirects"));
|
||||
}
|
||||
if (!url.proto.toLowerCase().startsWith("http")) {
|
||||
return Observable.empty();
|
||||
}
|
||||
var head = new Request.Builder().get().addHeader("User-agent", userAgent)
|
||||
.url(url.toString())
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
var call = client.newCall(head);
|
||||
try (var rsp = call.execute()) {
|
||||
return resolveRedirects(depth, url, rsp);
|
||||
} catch (IOException e) {
|
||||
return Observable.error(e);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private Observable<EdgeUrl> resolveRedirects(int depth, EdgeUrl url, Response response) {
|
||||
int code = response.code();
|
||||
response.close();
|
||||
|
||||
if (code < 300) {
|
||||
return Observable.just(url);
|
||||
}
|
||||
if (code < 309) {
|
||||
String newUrl = response.header("Location");
|
||||
return Observable.fromOptional(linkParser.parseLink(url, newUrl))
|
||||
.flatMap(u -> probe(u, depth + 1));
|
||||
}
|
||||
if (code >= 400) {
|
||||
return Observable.just(url);
|
||||
}
|
||||
return Observable.error(new IllegalStateException("HttpStatusCode " + code));
|
||||
}
|
||||
|
||||
|
||||
private boolean failOnBadStatus(Response response) {
|
||||
if (response.code() >= 400) {
|
||||
response.close();
|
||||
throw new NetworkException("Bad status " + response.code());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static class BadContentType extends RuntimeException {
|
||||
public BadContentType(String type) {
|
||||
super(type);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,61 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class LinkFilterSelector {
|
||||
|
||||
/* With websites that run e.g. forum software or wiki software, it's
|
||||
very beneficial to cherry-pick the URLs that we want to crawl to
|
||||
exclude e.g. user profiles, and other similar noise.
|
||||
*/
|
||||
public Predicate<EdgeUrl> selectFilter(CrawledDocument sample) {
|
||||
|
||||
if (sample.httpStatus != 200) {
|
||||
return LinkFilterSelector::defaultFilter;
|
||||
}
|
||||
|
||||
// Sniff the software based on the sample document
|
||||
|
||||
var doc = Jsoup.parse(sample.documentBody.decode());
|
||||
var head = doc.getElementsByTag("head").first();
|
||||
if (null == head) {
|
||||
return url -> true;
|
||||
}
|
||||
|
||||
if (isLemmy(head)) {
|
||||
return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/");
|
||||
}
|
||||
if (isMediawiki(head)) {
|
||||
return url -> url.path.startsWith("/wiki/") && !url.path.contains(":");
|
||||
}
|
||||
if (isDiscourse(head)) {
|
||||
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
|
||||
}
|
||||
|
||||
return LinkFilterSelector::defaultFilter;
|
||||
}
|
||||
|
||||
public static boolean defaultFilter(EdgeUrl url) {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isMediawiki(Element head) {
|
||||
return head.select("meta[name=generator]").attr("content").toLowerCase().contains("mediawiki");
|
||||
}
|
||||
private boolean isDiscourse(Element head) {
|
||||
return head.select("meta[name=generator]").attr("content").toLowerCase().contains("discourse");
|
||||
}
|
||||
private boolean isLemmy(Element head) {
|
||||
for (var scriptTags : head.select("script")) {
|
||||
if (scriptTags.html().contains("window.lemmyConfig")) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import javax.net.SocketFactory;
|
||||
import java.io.IOException;
|
@ -0,0 +1,16 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
@AllArgsConstructor
|
||||
@ToString
|
||||
public class FetchResult {
|
||||
public final FetchResultState state;
|
||||
public final EdgeDomain domain;
|
||||
|
||||
public boolean ok() {
|
||||
return state == FetchResultState.OK;
|
||||
}
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
/** Possible states of a {@code FetchResult}. */
public enum FetchResultState {
    /** The request succeeded. */
    OK,
    /** The request was redirected. */
    REDIRECT,
    /** The request failed. */
    ERROR
}
|
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.google.inject.ImplementedBy;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ImplementedBy(HttpFetcherImpl.class)
|
||||
public interface HttpFetcher {
|
||||
void setAllowAllContentTypes(boolean allowAllContentTypes);
|
||||
|
||||
List<String> getCookies();
|
||||
void clearCookies();
|
||||
|
||||
FetchResult probeDomain(EdgeUrl url);
|
||||
|
||||
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
|
||||
|
||||
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
||||
}
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.crawl.retreival.Cookies;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.crawling.model.ContentType;
|
||||
@ -35,7 +35,7 @@ import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class HttpFetcher {
|
||||
public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final String userAgent;
|
||||
@ -46,29 +46,15 @@ public class HttpFetcher {
|
||||
|
||||
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||
|
||||
@Override
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
||||
}
|
||||
|
||||
private final OkHttpClient client;
|
||||
|
||||
public enum FetchResultState {
|
||||
OK,
|
||||
REDIRECT,
|
||||
ERROR
|
||||
}
|
||||
|
||||
@AllArgsConstructor @ToString
|
||||
public static class FetchResult {
|
||||
public final FetchResultState state;
|
||||
public final EdgeDomain domain;
|
||||
|
||||
public boolean ok() {
|
||||
return state == FetchResultState.OK;
|
||||
}
|
||||
}
|
||||
|
||||
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
|
||||
|
||||
@SneakyThrows
|
||||
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
|
||||
var builder = new OkHttpClient.Builder();
|
||||
@ -90,25 +76,28 @@ public class HttpFetcher {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getCookies() {
|
||||
return cookies.getCookies();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clearCookies() {
|
||||
cookies.clear();
|
||||
}
|
||||
|
||||
@Inject
|
||||
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
|
||||
this.client = createClient(dispatcher, connectionPool);
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
public HttpFetcher(@Named("user-agent") String userAgent) {
|
||||
public HttpFetcherImpl(@Named("user-agent") String userAgent) {
|
||||
this.client = createClient(null, new ConnectionPool());
|
||||
this.userAgent = userAgent;
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
|
||||
@ -126,6 +115,7 @@ public class HttpFetcher {
|
||||
}
|
||||
return new FetchResult(FetchResultState.OK, requestDomain);
|
||||
}
|
||||
|
||||
catch (Exception ex) {
|
||||
if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
|
||||
return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
|
||||
@ -151,6 +141,7 @@ public class HttpFetcher {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException {
|
||||
|
||||
@ -312,6 +303,7 @@ public class HttpFetcher {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
||||
return fetchRobotsForProto("https", domain)
|
||||
.or(() -> fetchRobotsForProto("http", domain))
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
@ -1,9 +1,8 @@
|
||||
package nu.marginalia.crawling;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.HttpRedirectResolver;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
@ -29,44 +28,15 @@ class HttpFetcherTest {
|
||||
|
||||
@Test
|
||||
void fetchUTF8() throws URISyntaxException, RateLimitException {
|
||||
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
|
||||
System.out.println(str.contentType);
|
||||
}
|
||||
|
||||
@Test
|
||||
void fetchText() throws URISyntaxException, RateLimitException {
|
||||
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler");
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
|
||||
System.out.println(str);
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveRedirect() throws URISyntaxException {
|
||||
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
|
||||
System.out.println(str);
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveRedirect2() throws URISyntaxException {
|
||||
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt")).blockingFirst();
|
||||
System.out.println(str);
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolveRedirect3() throws URISyntaxException {
|
||||
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
|
||||
System.out.println(str);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void resolveRedirect4() throws URISyntaxException {
|
||||
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
|
||||
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
|
||||
System.out.println(str);
|
||||
}
|
||||
}
|
@ -0,0 +1,152 @@
|
||||
package nu.marginalia.crawling.retreival;
|
||||
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.bigstring.BigString;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class CrawlerMockFetcherTest {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawlerMockFetcherTest.class);
|
||||
|
||||
Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
|
||||
HttpFetcher fetcherMock = new MockFetcher();
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() {
|
||||
mockData.clear();
|
||||
}
|
||||
|
||||
private void registerUrl(EdgeUrl url, String documentData) {
|
||||
mockData.put(url, CrawledDocument.builder()
|
||||
.crawlId("1")
|
||||
.url(url.toString())
|
||||
.contentType("text/html")
|
||||
.httpStatus(200)
|
||||
.crawlerStatus(CrawlerDocumentStatus.OK.name())
|
||||
.documentBody(BigString.encode(documentData))
|
||||
.build());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void registerUrlClasspathData(EdgeUrl url, String path) {
|
||||
try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
|
||||
if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);
|
||||
|
||||
var data = BigString.encode(new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
|
||||
|
||||
mockData.put(url, CrawledDocument.builder()
|
||||
.crawlId("1")
|
||||
.url(url.toString())
|
||||
.contentType("text/html")
|
||||
.httpStatus(200)
|
||||
.crawlerStatus(CrawlerDocumentStatus.OK.name())
|
||||
.documentBody(data)
|
||||
.build());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLemmy() throws URISyntaxException {
|
||||
List<SerializableCrawlData> out = new ArrayList<>();
|
||||
|
||||
registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
|
||||
registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
|
||||
.withNoDelay()
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMediawiki() throws URISyntaxException {
|
||||
List<SerializableCrawlData> out = new ArrayList<>();
|
||||
|
||||
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
|
||||
.withNoDelay()
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDiscourse() throws URISyntaxException {
|
||||
List<SerializableCrawlData> out = new ArrayList<>();
|
||||
|
||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
|
||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
|
||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
|
||||
|
||||
new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
|
||||
.withNoDelay()
|
||||
.fetch();
|
||||
|
||||
out.forEach(System.out::println);
|
||||
}
|
||||
|
||||
class MockFetcher implements HttpFetcher {
|
||||
|
||||
@Override
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
|
||||
|
||||
@Override
|
||||
public List<String> getCookies() { return List.of();}
|
||||
|
||||
@Override
|
||||
public void clearCookies() {}
|
||||
|
||||
@Override
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
logger.info("Probing {}", url);
|
||||
return new FetchResult(FetchResultState.OK, url.domain);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CrawledDocument fetchContent(EdgeUrl url) {
|
||||
logger.info("Fetching {}", url);
|
||||
if (mockData.containsKey(url)) {
|
||||
return mockData.get(url);
|
||||
}
|
||||
else {
|
||||
return CrawledDocument.builder()
|
||||
.crawlId("1")
|
||||
.url(url.toString())
|
||||
.contentType("text/html")
|
||||
.httpStatus(404)
|
||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
|
||||
return new SimpleRobotRules();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,7 +1,8 @@
|
||||
package nu.marginalia.crawling.retreival;
|
||||
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
|
||||
import nu.marginalia.crawling.model.SerializableCrawlData;
|
||||
@ -23,7 +24,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
|
||||
|
||||
HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");
|
||||
HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu");
|
||||
|
||||
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
@ -0,0 +1,860 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Combined mode but grid - Development - Tiny Tiny RSS: Community</title>
|
||||
<meta name="description" content="horrible, huh?
|
||||
@media screen and (min-width: 1400px) {
|
||||
#headlines-frame {
|
||||
display : grid;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
grid-gap : 8px;
|
||||
|
||||
.cdm.expanded {
|
||||
.footer {
|
||||
border : 0;
|
||||
}
|
||||
|
||||
b&hellip;">
|
||||
<meta name="generator" content="Discourse 3.1.0.beta4 - https://github.com/discourse/discourse version 7ff8e5580f9a900cde4be66377cf4f1dcd253a35">
|
||||
<link rel="icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_32x32.png">
|
||||
<link rel="apple-touch-icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_180x180.png">
|
||||
<meta name="theme-color" media="all" content="#ffffff">
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, user-scalable=yes, viewport-fit=cover">
|
||||
<link rel="canonical" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
|
||||
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="https://community.tt-rss.org/opensearch.xml" title="Tiny Tiny RSS: Community Search">
|
||||
|
||||
<link href="/stylesheets/color_definitions_base__2_e1a3786e9787d3094d4ad821cf887d92d1d46700.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" class="light-scheme"/>
|
||||
|
||||
<link href="/stylesheets/desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop" />
|
||||
|
||||
|
||||
|
||||
<link href="/stylesheets/discourse-details_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-details" />
|
||||
<link href="/stylesheets/discourse-lazy-videos_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-lazy-videos" />
|
||||
<link href="/stylesheets/discourse-local-dates_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-local-dates" />
|
||||
<link href="/stylesheets/discourse-narrative-bot_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-narrative-bot" />
|
||||
<link href="/stylesheets/discourse-presence_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-presence" />
|
||||
<link href="/stylesheets/discourse-reactions_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions" />
|
||||
<link href="/stylesheets/discourse-solved_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-solved" />
|
||||
<link href="/stylesheets/docker_manager_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="docker_manager" />
|
||||
<link href="/stylesheets/poll_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll" />
|
||||
<link href="/stylesheets/discourse-reactions_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions_desktop" />
|
||||
<link href="/stylesheets/poll_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll_desktop" />
|
||||
|
||||
<link href="/stylesheets/desktop_theme_7_fe50c691a9fb30bb61c18aa08d3b7a6cb61a0150.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="7" data-theme-name="custom header links"/>
|
||||
<link href="/stylesheets/desktop_theme_6_7ca213afc12b87349d5efdb08c76d0a62c01d53a.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="6" data-theme-name="bears"/>
|
||||
<link href="/stylesheets/desktop_theme_2_88a998cf58047c52104c0e1f454c9c1c16e18c70.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="2" data-theme-name="ttrss"/>
|
||||
|
||||
<!-- <script type="text/discourse-plugin" version="0.2">
|
||||
api.onPageChange((url, title) => {
|
||||
if (_paq) {
|
||||
_paq.push(["setCustomUrl", url]);
|
||||
_paq.push(["setDocumentTitle", title]);
|
||||
_paq.push(["trackPageView"]);
|
||||
|
||||
const currentUser = api.getCurrentUser();
|
||||
|
||||
if (currentUser && currentUser['username']) {
|
||||
_paq.push(['setUserId', currentUser['username']]);
|
||||
}
|
||||
}
|
||||
});
|
||||
</script> -->
|
||||
|
||||
|
||||
<link rel="alternate nofollow" type="application/rss+xml" title="RSS feed of 'Combined mode but grid'" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489.rss" />
|
||||
<meta property="og:site_name" content="Tiny Tiny RSS: Community" />
|
||||
<meta property="og:type" content="website" />
|
||||
<meta name="twitter:card" content="summary" />
|
||||
<meta name="twitter:image" content="https://community.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1024x670.jpeg" />
|
||||
<meta property="og:image" content="https://community.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1024x670.jpeg" />
|
||||
<meta property="og:url" content="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
|
||||
<meta name="twitter:url" content="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
|
||||
<meta property="og:title" content="Combined mode but grid" />
|
||||
<meta name="twitter:title" content="Combined mode but grid" />
|
||||
<meta property="og:description" content="horrible, huh? @media screen and (min-width: 1400px) { #headlines-frame { display : grid; grid-template-columns: repeat(2, 1fr); grid-gap : 8px; .cdm.expanded { .footer { border : 0; } border : 1px solid @border-default; } } } }" />
|
||||
<meta name="twitter:description" content="horrible, huh? @media screen and (min-width: 1400px) { #headlines-frame { display : grid; grid-template-columns: repeat(2, 1fr); grid-gap : 8px; .cdm.expanded { .footer { border : 0; } border : 1px solid @border-default; } } } }" />
|
||||
<meta property="og:article:section" content="Tiny Tiny RSS" />
|
||||
<meta property="og:article:section:color" content="25AAE2" />
|
||||
<meta property="og:article:section" content="Development" />
|
||||
<meta property="og:article:section:color" content="3AB54A" />
|
||||
<meta property="article:published_time" content="2021-03-09T18:27:09+00:00" />
|
||||
<meta property="og:ignore_canonical" content="true" />
|
||||
|
||||
|
||||
<script type="application/ld+json">{"@context":"http://schema.org","@type":"QAPage","name":"Combined mode but grid","mainEntity":{"@type":"Question","name":"Combined mode but grid","text":"horrible, huh?\n\n@media screen and (min-width: 1400px) {\n\n#headlines-frame {\n\ndisplay : grid;\n\ngrid-template-columns: repeat(2, 1fr);\n\ngrid-gap : 8px;\n\n.cdm.expanded {\n\n.footer {\n\nborder : 0;\n\n}\n\nborder : 1px solid @border-default;\n\n}\n\n}\n\n}\n\n}\n\n<a class=\"lightbox\" href=\"https://discourse.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg\" data-download-href=\"https://discourse.tt-rss.org/uploads/default/d253dab1dab0906bf5361527e16cd069401ac1cc\" title=\"image\">[image]<\/a>","upvoteCount":0,"answerCount":0,"dateCreated":"2021-03-09T18:27:09.054Z","author":{"@type":"Person","name":""}}}</script>
|
||||
</head>
|
||||
<body class="crawler ">
|
||||
|
||||
<header>
|
||||
<a href="/">
|
||||
Tiny Tiny RSS: Community
|
||||
</a>
|
||||
</header>
|
||||
|
||||
<div id="main-outlet" class="wrap" role="main">
|
||||
<div id="topic-title">
|
||||
<h1>
|
||||
<a href="/t/combined-mode-but-grid/4489">Combined mode but grid</a>
|
||||
</h1>
|
||||
|
||||
<div class="topic-category" itemscope itemtype="http://schema.org/BreadcrumbList">
|
||||
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
|
||||
<a href="https://community.tt-rss.org/c/tiny-tiny-rss/8" class="badge-wrapper bullet" itemprop="item">
|
||||
<span class='badge-category-bg' style='background-color: #25AAE2'></span>
|
||||
<span class='badge-category clear-badge'>
|
||||
<span class='category-name' itemprop='name'>Tiny Tiny RSS</span>
|
||||
</span>
|
||||
</a>
|
||||
<meta itemprop="position" content="1" />
|
||||
</span>
|
||||
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
|
||||
<a href="https://community.tt-rss.org/c/tiny-tiny-rss/development/6" class="badge-wrapper bullet" itemprop="item">
|
||||
<span class='badge-category-bg' style='background-color: #3AB54A'></span>
|
||||
<span class='badge-category clear-badge'>
|
||||
<span class='category-name' itemprop='name'>Development</span>
|
||||
</span>
|
||||
</a>
|
||||
<meta itemprop="position" content="2" />
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div itemscope itemtype='http://schema.org/DiscussionForumPosting'>
|
||||
<meta itemprop='headline' content='Combined mode but grid'>
|
||||
<meta itemprop='articleSection' content='Development'>
|
||||
<meta itemprop='keywords' content=''>
|
||||
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
|
||||
<meta itemprop='name' content='Tiny Tiny RSS: Community'>
|
||||
</div>
|
||||
|
||||
<div id='post_1' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg">
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-09T18:27:09Z' class='post-time'>
|
||||
March 9, 2021, 6:27pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-09T18:27:09Z'>
|
||||
<span itemprop='position'>1</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='articleBody'>
|
||||
<p>horrible, huh?</p>
|
||||
<pre><code class="lang-css">@media screen and (min-width: 1400px) {
|
||||
#headlines-frame {
|
||||
display : grid;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
grid-gap : 8px;
|
||||
|
||||
.cdm.expanded {
|
||||
.footer {
|
||||
border : 0;
|
||||
}
|
||||
|
||||
border : 1px solid @border-default;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
</code></pre>
|
||||
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg" data-download-href="https://discourse.tt-rss.org/uploads/default/d253dab1dab0906bf5361527e16cd069401ac1cc" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_690x451.jpeg" alt="image" data-base62-sha1="u0DVoAOiT7CgzBwmYIh7fWqFZz6" width="690" height="451" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_690x451.jpeg, https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1035x676.jpeg 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1380x902.jpeg 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_10x10.png"><div class="meta">
|
||||
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1826×1195 425 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
|
||||
</div></a></div></p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_2' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-09T18:52:37Z' class='post-time'>
|
||||
March 9, 2021, 6:52pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-09T18:52:37Z'>
|
||||
<span itemprop='position'>2</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>I kind of like it. The concept at least.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_3' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/linoth'><span itemprop='name'>linoth</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-10T01:41:53Z' class='post-time'>
|
||||
March 10, 2021, 1:41am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-10T01:41:53Z'>
|
||||
<span itemprop='position'>3</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>You got me curious.</p>
|
||||
<p>A little rough in its current form if you’re someone toggling collapsed mode. Not sure exactly what caused it, but I wound up with the page getting wider a time or two while I tried it out. I’m sure that it’s actually pretty desirable for image-heavy uses, such as your example of Reddit. Would not have guessed that it could be pulled off with just some lines of CSS, but I don’t know web dev.</p>
|
||||
<p>Neat.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="1" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_4' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08.jpeg">
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-10T05:03:00Z' class='post-time'>
|
||||
March 10, 2021, 5:03am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-10T05:41:19Z'>
|
||||
<span itemprop='position'>4</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>yeah this would obviously only work for expanded mode <img src="https://discourse.tt-rss.org/images/emoji/google_classic/slight_smile.png?v=9" title=":slight_smile:" class="emoji" alt=":slight_smile:"></p>
|
||||
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08.jpeg" data-download-href="https://discourse.tt-rss.org/uploads/default/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_690x448.jpeg" alt="image" data-base62-sha1="mX3IwjuahEZxyHk7fMxVA6XKJCw" width="690" height="448" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_690x448.jpeg, https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_1035x672.jpeg 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_1380x896.jpeg 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_10x10.png"><div class="meta">
|
||||
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1951×1267 552 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
|
||||
</div></a></div></p>
|
||||
<p>a bit more polished looking, i think.</p>
|
||||
<p>e: it’s an option now.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_5' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc.png">
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-11T20:17:10Z' class='post-time'>
|
||||
March 11, 2021, 8:17pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-11T20:27:54Z'>
|
||||
<span itemprop='position'>5</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>i made a really primitive plugin that fakes masonry layout for the grid:</p>
|
||||
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc.png" data-download-href="https://discourse.tt-rss.org/uploads/default/04be46f4f91f917188576fdeb34027184c5d8bbc" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_690x431.png" alt="image" data-base62-sha1="FXzLpxEuFssaI6B3WUk8pRDS3O" width="690" height="431" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_690x431.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_1035x646.png 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_1380x862.png 2x" data-dominant-color="E1E1E2"><div class="meta">
|
||||
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use href="#far-image"></use></svg><span class="filename">image</span><span class="informations">2033×1271 386 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use href="#discourse-expand"></use></svg>
|
||||
</div></a></div></p>
|
||||
<p>its somewhat buggy and lacks any optimization whatsoever but cool nonetheless (when it works). enjoy.</p>
|
||||
<p><a href="https://git.tt-rss.org/fox/ttrss-grid-masonry" class="onebox" target="_blank" rel="noopener">https://git.tt-rss.org/fox/ttrss-grid-masonry</a></p>
|
||||
<p>e: this needs latest master, just in case.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="1" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_6' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-11T21:24:30Z' class='post-time'>
|
||||
March 11, 2021, 9:24pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-11T21:26:55Z'>
|
||||
<span itemprop='position'>6</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>So I’ve been using this view and I like it but one small issue I can’t figure out how to fix is word wrapping in the frames. Words keep getting split between lines and making things hard to read sometimes. Is that something I can fix with the custom css?</p>
|
||||
<p>edit: I should have mentioned I am running the dynamic docker setup and I restarted the containers a few hours ago.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="1" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_7' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-12T04:38:48Z' class='post-time'>
|
||||
March 12, 2021, 4:38am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-12T04:38:48Z'>
|
||||
<span itemprop='position'>7</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>i should probably go easier on word-breaking in there, limit to links only or something like that.</p>
|
||||
<p><a href="https://git.tt-rss.org/fox/tt-rss/src/branch/master/themes/light/tt-rss.less#L749" class="onebox" target="_blank" rel="noopener">https://git.tt-rss.org/fox/tt-rss/src/branch/master/themes/light/tt-rss.less#L749</a></p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_8' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/levito'><span itemprop='name'>levito</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-12T09:07:30Z' class='post-time'>
|
||||
March 12, 2021, 9:07am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-12T09:07:30Z'>
|
||||
<span itemprop='position'>8</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Hi <a class="mention" href="/u/fox">@fox</a>,awesome to see that rush of progress!</p>
|
||||
<p>I’d suggest using <code>word-wrap: break-word;</code> instead of <code>word-break: break-all;</code>. This makes words only break if they are really wider than the container. Short words are not affected. So you might then also remove the restriction to links.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="1" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_9' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd.png">
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-12T09:34:16Z' class='post-time'>
|
||||
March 12, 2021, 9:34am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-12T09:34:16Z'>
|
||||
<span itemprop='position'>9</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>yeah this could work for text, but for links specifically it can make things ugly (uglier?):</p>
|
||||
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd.png" data-download-href="https://discourse.tt-rss.org/uploads/default/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_690x187.png" alt="image" data-base62-sha1="qw53FQsmq3Tnl5aap9yIPqLbSRT" width="690" height="187" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_690x187.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_1035x280.png 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_1380x374.png 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_10x10.png"><div class="meta">
|
||||
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1615×439 74.6 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
|
||||
</div></a></div></p>
|
||||
<p>which is why i went with break-all.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_10' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/levito'><span itemprop='name'>levito</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png">
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-12T09:43:23Z' class='post-time'>
|
||||
March 12, 2021, 9:43am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-12T09:43:23Z'>
|
||||
<span itemprop='position'>10</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Thanks for the quick reply and your explanation – I get your point. The only problematic edge-case I see with this is links at the end of a line.</p>
|
||||
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png" data-download-href="https://discourse.tt-rss.org/uploads/default/5d9654dc6e0e9cb1f655b3af565163a882300e84" title="Screenshot 2021-03-12 at 10.40.38"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_690x274.png" alt="Screenshot 2021-03-12 at 10.40.38" data-base62-sha1="dlUuhIyTaomOYut8VQNv9nyKtuY" width="690" height="274" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_690x274.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_1035x411.png 1.5x, https://discourse.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_10x10.png"><div class="meta">
|
||||
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">Screenshot 2021-03-12 at 10.40.38</span><span class="informations">1334×530 56.7 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
|
||||
</div></a></div></p>
|
||||
<p>But I think that’s okay.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_11' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-12T12:44:58Z' class='post-time'>
|
||||
March 12, 2021, 12:44pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-12T12:44:58Z'>
|
||||
<span itemprop='position'>11</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Thanks for the quick fix. It’s much better. I’m really liking the grid layout with mark as read on scroll. Thanks.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_12' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/OldBear'><span itemprop='name'>OldBear</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-15T16:33:52Z' class='post-time'>
|
||||
March 15, 2021, 4:33pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-15T16:33:52Z'>
|
||||
<span itemprop='position'>12</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>I think it’s a nice layout to be set to feeds like Dilbert</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_13' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/toomyzoom'><span itemprop='name'>toomyzoom</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-20T19:32:21Z' class='post-time'>
|
||||
March 20, 2021, 7:32pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-20T19:32:21Z'>
|
||||
<span itemprop='position'>13</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Good, now add it to android app <img src="https://discourse.tt-rss.org/images/emoji/google_classic/grin.png?v=9" title=":grin:" class="emoji" alt=":grin:"></p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_14' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/linoth'><span itemprop='name'>linoth</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2021-03-20T21:18:36Z' class='post-time'>
|
||||
March 20, 2021, 9:18pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2021-03-20T21:18:36Z'>
|
||||
<span itemprop='position'>14</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Feel a bit guilty since I gave feedback and disappeared.</p>
|
||||
<p>I have an image heavy feed I’m very far behind on, and your first post made me try out expanded instead of collapsed. Sped things up immensely because apparently marking posts as read when you flip through them was CPU heavy on my server.</p>
|
||||
<p>After that, columns just made things even faster for me.</p>
|
||||
<p>I did run into an edge case that most people probably wouldn’t have even noticed, where a headline would be top-to-bottom in one column at 1080p, so I ham-fisted some CSS to set a max-height on titleWrap.</p>
|
||||
<p>Thanks so much for this feature, fox. Didn’t even know I needed it.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_15' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2022-08-22T16:29:33Z' class='post-time'>
|
||||
August 22, 2022, 4:29pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2022-08-22T16:29:33Z'>
|
||||
<span itemprop='position'>15</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Sorry to resurrect this old thread. I’ve really grown to love grid mode but I noticed the article headlines sometimes break words on to separate lines. It seems to happen with all themes. I don’t have the masonry plugin active either.</p>
|
||||
<p>I’m using the stock docker setup.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="1" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_16' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2022-08-22T16:42:04Z' class='post-time'>
|
||||
August 22, 2022, 4:42pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2022-08-22T17:13:10Z'>
|
||||
<span itemprop='position'>16</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<aside class="quote no-group" data-username="sam302psu" data-post="15" data-topic="4489">
|
||||
<div class="title">
|
||||
<div class="quote-controls"></div>
|
||||
<img loading="lazy" alt="" width="20" height="20" src="https://discourse.tt-rss.org/user_avatar/discourse.tt-rss.org/sam302psu/40/326_2.png" class="avatar"> sam302psu:</div>
|
||||
<blockquote>
|
||||
<p>article headlines sometimes break words on to separate lines</p>
|
||||
</blockquote>
|
||||
</aside>
|
||||
<p>yeah it’s a stock CSS thing, it’s either that or super long words breaking layout. i’m not fond of either behaviors. <img src="https://discourse.tt-rss.org/images/emoji/google_classic/frowning.png?v=12" title=":frowning:" class="emoji" alt=":frowning:" loading="lazy" width="20" height="20"></p>
|
||||
<p>there are several rules like this in the .less files:</p>
|
||||
<pre><code class="lang-css">word-break : break-all;
|
||||
</code></pre>
|
||||
<p>ideas welcome, etc.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_17' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2022-08-22T17:31:29Z' class='post-time'>
|
||||
August 22, 2022, 5:31pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2022-08-22T17:31:29Z'>
|
||||
<span itemprop='position'>17</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>Thanks for confirming. I will take the occassional headline issue over breaking the layout. I’ll use this as a chance to learn something and if I come up with a fix I’ll let you know.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_18' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/ManuelW'><span itemprop='name'>ManuelW</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2023-06-24T11:30:44Z' class='post-time'>
|
||||
June 24, 2023, 11:30am
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2023-06-24T11:30:44Z'>
|
||||
<span itemprop='position'>18</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
<p>How do I activate this? I installed the Plugin and set the checkmark, but nothing happens.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
<footer class="container wrap">
|
||||
<nav class='crawler-nav'>
|
||||
<ul>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/' itemprop="url">Home </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/categories' itemprop="url">Categories </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/guidelines' itemprop="url">FAQ/Guidelines </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/tos' itemprop="url">Terms of Service </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/privacy' itemprop="url">Privacy Policy </a>
|
||||
</span>
|
||||
</li>
|
||||
</ul>
|
||||
</nav>
|
||||
<p class='powered-by-link'>Powered by <a href="https://www.discourse.org">Discourse</a>, best viewed with JavaScript enabled</p>
|
||||
</footer>
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,230 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Telegram channel to idle on - Everything else - Tiny Tiny RSS: Community</title>
|
||||
<meta name="description" content="Here: Telegram: Contact @TinyTinyRSS
|
||||
Now there’s a place to post on if something on tt-rss.org doesn’t work right.">
|
||||
<meta name="generator" content="Discourse 3.1.0.beta4 - https://github.com/discourse/discourse version 7ff8e5580f9a900cde4be66377cf4f1dcd253a35">
|
||||
<link rel="icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_32x32.png">
|
||||
<link rel="apple-touch-icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_180x180.png">
|
||||
<meta name="theme-color" media="all" content="#ffffff">
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, user-scalable=yes, viewport-fit=cover">
|
||||
<link rel="canonical" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
|
||||
|
||||
<link rel="search" type="application/opensearchdescription+xml" href="https://community.tt-rss.org/opensearch.xml" title="Tiny Tiny RSS: Community Search">
|
||||
|
||||
<link href="/stylesheets/color_definitions_base__2_e1a3786e9787d3094d4ad821cf887d92d1d46700.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" class="light-scheme"/>
|
||||
|
||||
<link href="/stylesheets/desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop" />
|
||||
|
||||
|
||||
|
||||
<link href="/stylesheets/discourse-details_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-details" />
|
||||
<link href="/stylesheets/discourse-lazy-videos_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-lazy-videos" />
|
||||
<link href="/stylesheets/discourse-local-dates_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-local-dates" />
|
||||
<link href="/stylesheets/discourse-narrative-bot_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-narrative-bot" />
|
||||
<link href="/stylesheets/discourse-presence_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-presence" />
|
||||
<link href="/stylesheets/discourse-reactions_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions" />
|
||||
<link href="/stylesheets/discourse-solved_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-solved" />
|
||||
<link href="/stylesheets/docker_manager_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="docker_manager" />
|
||||
<link href="/stylesheets/poll_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll" />
|
||||
<link href="/stylesheets/discourse-reactions_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions_desktop" />
|
||||
<link href="/stylesheets/poll_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll_desktop" />
|
||||
|
||||
<link href="/stylesheets/desktop_theme_7_fe50c691a9fb30bb61c18aa08d3b7a6cb61a0150.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="7" data-theme-name="custom header links"/>
|
||||
<link href="/stylesheets/desktop_theme_6_7ca213afc12b87349d5efdb08c76d0a62c01d53a.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="6" data-theme-name="bears"/>
|
||||
<link href="/stylesheets/desktop_theme_2_88a998cf58047c52104c0e1f454c9c1c16e18c70.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="2" data-theme-name="ttrss"/>
|
||||
|
||||
<!-- <script type="text/discourse-plugin" version="0.2">
|
||||
api.onPageChange((url, title) => {
|
||||
if (_paq) {
|
||||
_paq.push(["setCustomUrl", url]);
|
||||
_paq.push(["setDocumentTitle", title]);
|
||||
_paq.push(["trackPageView"]);
|
||||
|
||||
const currentUser = api.getCurrentUser();
|
||||
|
||||
if (currentUser && currentUser['username']) {
|
||||
_paq.push(['setUserId', currentUser['username']]);
|
||||
}
|
||||
}
|
||||
});
|
||||
</script> -->
|
||||
|
||||
|
||||
<link rel="alternate nofollow" type="application/rss+xml" title="RSS feed of 'Telegram channel to idle on'" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501.rss" />
|
||||
<meta property="og:site_name" content="Tiny Tiny RSS: Community" />
|
||||
<meta property="og:type" content="website" />
|
||||
<meta name="twitter:card" content="summary" />
|
||||
<meta name="twitter:image" content="https://community.tt-rss.org/uploads/default/original/1X/18a2e96275d1fffb21cce225d30a87be4544db60.png" />
|
||||
<meta property="og:image" content="https://community.tt-rss.org/uploads/default/original/1X/18a2e96275d1fffb21cce225d30a87be4544db60.png" />
|
||||
<meta property="og:url" content="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
|
||||
<meta name="twitter:url" content="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
|
||||
<meta property="og:title" content="Telegram channel to idle on" />
|
||||
<meta name="twitter:title" content="Telegram channel to idle on" />
|
||||
<meta property="og:description" content="Here: Telegram: Contact @TinyTinyRSS Now there’s a place to post on if something on tt-rss.org doesn’t work right." />
|
||||
<meta name="twitter:description" content="Here: Telegram: Contact @TinyTinyRSS Now there’s a place to post on if something on tt-rss.org doesn’t work right." />
|
||||
<meta property="og:article:section" content="Everything else" />
|
||||
<meta property="og:article:section:color" content="12A89D" />
|
||||
<meta property="article:published_time" content="2020-05-15T15:07:15+00:00" />
|
||||
<meta property="og:ignore_canonical" content="true" />
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
<body class="crawler ">
|
||||
|
||||
<header>
|
||||
<a href="/">
|
||||
Tiny Tiny RSS: Community
|
||||
</a>
|
||||
</header>
|
||||
|
||||
<div id="main-outlet" class="wrap" role="main">
|
||||
<div id="topic-title">
|
||||
<h1>
|
||||
<a href="/t/telegram-channel-to-idle-on/3501">Telegram channel to idle on</a>
|
||||
</h1>
|
||||
|
||||
<div class="topic-category" itemscope itemtype="http://schema.org/BreadcrumbList">
|
||||
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
|
||||
<a href="https://community.tt-rss.org/c/everything-else/12" class="badge-wrapper bullet" itemprop="item">
|
||||
<span class='badge-category-bg' style='background-color: #12A89D'></span>
|
||||
<span class='badge-category clear-badge'>
|
||||
<span class='category-name' itemprop='name'>Everything else</span>
|
||||
</span>
|
||||
</a>
|
||||
<meta itemprop="position" content="1" />
|
||||
</span>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div itemscope itemtype='http://schema.org/DiscussionForumPosting'>
|
||||
<meta itemprop='headline' content='Telegram channel to idle on'>
|
||||
<meta itemprop='articleSection' content='Everything else'>
|
||||
<meta itemprop='keywords' content=''>
|
||||
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
|
||||
<meta itemprop='name' content='Tiny Tiny RSS: Community'>
|
||||
</div>
|
||||
|
||||
<div id='post_1' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2020-05-15T15:07:15Z' class='post-time'>
|
||||
May 15, 2020, 3:07pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2020-05-15T15:07:15Z'>
|
||||
<span itemprop='position'>1</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='articleBody'>
|
||||
<p>Here: <a href="https://t.me/TinyTinyRSS" class="inline-onebox">Telegram: Contact @TinyTinyRSS</a></p>
|
||||
<p>Now there’s a place to post on if something on <a href="http://tt-rss.org">tt-rss.org</a> doesn’t work right.</p>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div id='post_2' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
|
||||
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
|
||||
|
||||
Pinned globally
|
||||
</span>
|
||||
|
||||
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501">
|
||||
|
||||
|
||||
<span class="crawler-post-infos">
|
||||
<time itemprop='datePublished' datetime='2020-05-15T15:07:48Z' class='post-time'>
|
||||
May 15, 2020, 3:07pm
|
||||
</time>
|
||||
<meta itemprop='dateModified' content='2020-05-15T15:07:48Z'>
|
||||
<span itemprop='position'>2</span>
|
||||
</span>
|
||||
</div>
|
||||
<div class='post' itemprop='text'>
|
||||
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
<span class='post-likes'></span>
|
||||
</div>
|
||||
|
||||
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
|
||||
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
|
||||
<meta itemprop="userInteractionCount" content="0" />
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
<footer class="container wrap">
|
||||
<nav class='crawler-nav'>
|
||||
<ul>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/' itemprop="url">Home </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/categories' itemprop="url">Categories </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/guidelines' itemprop="url">FAQ/Guidelines </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/tos' itemprop="url">Terms of Service </a>
|
||||
</span>
|
||||
</li>
|
||||
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
|
||||
<span itemprop='name'>
|
||||
<a href='/privacy' itemprop="url">Privacy Policy </a>
|
||||
</span>
|
||||
</li>
|
||||
</ul>
|
||||
</nav>
|
||||
<p class='powered-by-link'>Powered by <a href="https://www.discourse.org">Discourse</a>, best viewed with JavaScript enabled</p>
|
||||
</footer>
|
||||
|
||||
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,10 +1,17 @@
|
||||
package nu.marginalia.tools.experiments;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.tools.Experiment;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
public class DebugConverterExperiment extends Experiment {
|
||||
|
||||
@ -17,15 +24,55 @@ public class DebugConverterExperiment extends Experiment {
|
||||
|
||||
}
|
||||
|
||||
// Generator keywords that process() has already printed; process() only reports
// a keyword when adding it to this set succeeds, i.e. the first time it is seen.
Set<String> seenGenerators = new HashSet<>();
|
||||
|
||||
/**
 * Debug pass over one crawled domain that prints generator-related hints to stdout.
 *
 * <p>For every document of the domain that has a body, the HTML is parsed with Jsoup and:
 * <ul>
 *   <li>each comment found under {@code head} is printed if it contains
 *       "Generated by", "generated by", "Powered by" or "powered by" and does not
 *       contain {@code <script}, {@code [if} or {@code shim};</li>
 *   <li>each keyword returned by {@code DocumentGeneratorExtractor.generatorCleaned}
 *       that has not been seen before is printed as {@code keyword->type}; when the
 *       type is {@code GeneratorType.UNKNOWN}, the {@code content} attribute of
 *       {@code meta[name=generator]} and the document URL are printed as well.</li>
 * </ul>
 *
 * <p>NOTE(review): this text is a rendered diff with no +/- markers. The leading
 * {@code domainProcessor.process(domain)} / metadata-printing statements also appear
 * commented out further down, so they are presumably the removed side of the hunk
 * rather than live code. Confirm against the real file.
 *
 * @param domain the crawled domain to inspect; a null {@code domain.doc} ends the pass early
 * @return always {@code true}
 */
@Override
|
||||
public boolean process(CrawledDomain domain) {
|
||||
// Runs the converter over the domain and prints the URL and metadata of every fully
// processed document (presumably the pre-change body, see the NOTE above).
var ret = domainProcessor.process(domain);
|
||||
|
||||
ret.documents.stream()
|
||||
.filter(ProcessedDocument::isProcessedFully)
|
||||
.peek(d -> System.out.println(d.url))
|
||||
.map(d -> d.details.metadata)
|
||||
.forEach(System.out::println);
|
||||
// Nothing to scan when the domain carries no documents.
if (domain.doc == null) return true;
|
||||
|
||||
var dge = new DocumentGeneratorExtractor();
|
||||
|
||||
for (var doc : domain.doc) {
|
||||
// Skip documents that have no body.
if (doc.documentBody == null) continue;
|
||||
|
||||
var parsed = Jsoup.parse(doc.documentBody.decode());
|
||||
// Print head comments that look like "generated by" / "powered by" banners.
parsed.getElementsByTag("head").comments()
|
||||
.stream().filter(c -> {
|
||||
String data = c.getData();
|
||||
// Drop comments containing "<script", "[if" or "shim" before the banner check.
if (data.contains("<script"))
|
||||
return false;
|
||||
if (data.contains("[if"))
|
||||
return false;
|
||||
if (data.contains("shim"))
|
||||
return false;
|
||||
return data.contains("Generated by") || data.contains("generated by")
|
||||
|| data.contains("Powered by") || data.contains("powered by");
|
||||
}).forEach(System.out::println);
|
||||
|
||||
var generators = dge.generatorCleaned(parsed);
|
||||
for (var g : generators.keywords()) {
|
||||
// Set.add returns false for an already-present keyword, so each one is reported once.
if (seenGenerators.add(g)) {
|
||||
System.out.println(g + "->" + generators.type());
|
||||
// For unclassified generators, also dump the raw meta tag value and the URL.
if (generators.type() == GeneratorType.UNKNOWN) {
|
||||
System.out.println(parsed.select("meta[name=generator]")
|
||||
.attr("content"));
|
||||
System.out.println(doc.url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// var ret = domainProcessor.process(domain);
|
||||
//
|
||||
//
|
||||
// ret.documents.stream()
|
||||
// .filter(ProcessedDocument::isProcessedFully)
|
||||
// .peek(d -> System.out.println(d.url))
|
||||
// .map(d -> d.details.metadata)
|
||||
// .forEach(System.out::println);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user