Refactor crawler and add special logic for some platforms

* Break apart CrawlerRetreiver
* Break apart HttpFetcher into an interface and impl for testing sanity
* Add special logic for Lemmy, Mediawiki and Discourse to not waste requests on paths that aren't interesting.
This commit is contained in:
Viktor Lofgren 2023-06-24 20:09:54 +02:00 committed by Viktor
parent 5abaf13192
commit ed373eef61
22 changed files with 5094 additions and 270 deletions

View File

@ -2,13 +2,14 @@ package nu.marginalia.crawl;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.process.log.WorkLog; import nu.marginalia.process.log.WorkLog;
import plan.CrawlPlanLoader; import plan.CrawlPlanLoader;
import plan.CrawlPlan; import plan.CrawlPlan;
import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import okhttp3.ConnectionPool; import okhttp3.ConnectionPool;
import okhttp3.Dispatcher; import okhttp3.Dispatcher;
import okhttp3.internal.Util; import okhttp3.internal.Util;
@ -102,8 +103,8 @@ public class CrawlerMain implements AutoCloseable {
if (workLog.isJobFinished(specification.id)) if (workLog.isJobFinished(specification.id))
return; return;
HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) { try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);

View File

@ -3,11 +3,12 @@ package nu.marginalia.crawl.retreival;
import com.google.common.hash.HashFunction; import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.*; import nu.marginalia.crawling.model.*;
import nu.marginalia.ip_blocklist.GeoIpBlocklist;
import nu.marginalia.ip_blocklist.IpBlockList;
import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
@ -20,10 +21,9 @@ import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Optional; import java.util.Optional;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.function.Predicate;
import static java.lang.Math.max; import static java.lang.Math.max;
import static java.lang.Math.min; import static java.lang.Math.min;
@ -32,16 +32,18 @@ public class CrawlerRetreiver {
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000); private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 1000);
private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
private static final int MAX_ERRORS = 10; private static final int MAX_ERRORS = 20;
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
private final HttpFetcher fetcher; private final HttpFetcher fetcher;
private final HashSet<String> visited;
private final HashSet<String> known; /** Flag to indicate that the crawler should slow down, e.g. from 429s */
private boolean slowDown = false; private boolean slowDown = false;
private final int depth;
/** Testing flag to disable crawl delay (otherwise crawler tests take several minutes) */
private boolean testFlagIgnoreDelay = false;
private final String id; private final String id;
private final String domain; private final String domain;
private final Consumer<SerializableCrawlData> crawledDomainWriter; private final Consumer<SerializableCrawlData> crawledDomainWriter;
@ -50,118 +52,120 @@ public class CrawlerRetreiver {
private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
private static final HashFunction hashMethod = Hashing.murmur3_128(0); private static final HashFunction hashMethod = Hashing.murmur3_128(0);
private static final IpBlockList ipBlocklist;
private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector();
private static final DomainProber domainProber = new DomainProber();
private final DomainCrawlFrontier crawlFrontier;
int errorCount = 0; int errorCount = 0;
static {
try {
ipBlocklist = new IpBlockList(new GeoIpBlocklist());
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) { public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
this.fetcher = fetcher; this.fetcher = fetcher;
visited = new HashSet<>((int)(specs.urls.size() * 1.5));
known = new HashSet<>(specs.urls.size() * 10);
depth = specs.crawlDepth;
id = specs.id; id = specs.id;
domain = specs.domain; domain = specs.domain;
crawledDomainWriter = writer; crawledDomainWriter = writer;
for (String urlStr : specs.urls) { this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
}
if (queue.peek() != null) { var fst = crawlFrontier.peek();
var fst = queue.peek(); if (fst != null) {
// Ensure the index page is always crawled
var root = fst.withPathAndParam("/", null); var root = fst.withPathAndParam("/", null);
if (known.add(root.toString())) if (crawlFrontier.addKnown(root))
queue.addFirst(root); crawlFrontier.addFirst(root);
} }
else { else {
addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); // We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS
addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null)); crawlFrontier.addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
crawlFrontier.addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
} }
} }
public CrawlerRetreiver withNoDelay() {
testFlagIgnoreDelay = true;
return this;
}
public int fetch() { public int fetch() {
Optional<CrawledDomain> probeResult = probeDomainForProblems(domain); final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
if (probeResult.isPresent()) { if (probeResult instanceof DomainProber.ProbeResultOk) {
crawledDomainWriter.accept(probeResult.get());
return 1;
}
else {
return crawlDomain(); return crawlDomain();
} }
}
private Optional<CrawledDomain> probeDomainForProblems(String domain) { // handle error cases for probe
EdgeUrl fst = queue.peek();
var ip = findIp(domain);
if (fst == null) { if (probeResult instanceof DomainProber.ProbeResultError err) {
logger.warn("No URLs for domain {}", domain); crawledDomainWriter.accept(
CrawledDomain.builder()
return Optional.of(CrawledDomain.builder() .crawlerStatus(err.status().name())
.crawlerStatus(CrawlerDomainStatus.ERROR.name()) .crawlerStatusDesc(err.desc())
.crawlerStatusDesc("No known URLs")
.id(id) .id(id)
.domain(domain) .domain(domain)
.build()); .ip(ip)
.build()
);
return 1;
} }
if (!ipBlocklist.isAllowed(fst.domain)) { if (probeResult instanceof DomainProber.ProbeResultRedirect redirect) {
return Optional.of(CrawledDomain.builder() crawledDomainWriter.accept(
.crawlerStatus(CrawlerDomainStatus.BLOCKED.name()) CrawledDomain.builder()
.crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
.crawlerStatusDesc("Redirected to different domain")
.redirectDomain(redirect.domain().toString())
.id(id) .id(id)
.domain(domain) .domain(domain)
.ip(findIp(domain)) .ip(ip)
.build()); .build()
);
return 1;
} }
var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null)); throw new IllegalStateException("Unknown probe result: " + probeResult);
if (!fetchResult.ok()) { };
logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult));
}
return Optional.empty();
}
private int crawlDomain() { private int crawlDomain() {
String ip = findIp(domain); String ip = findIp(domain);
assert !queue.isEmpty(); assert !crawlFrontier.isEmpty();
var robotsRules = fetcher.fetchRobotRules(queue.peek().domain); var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
long crawlDelay = robotsRules.getCrawlDelay(); long crawlDelay = robotsRules.getCrawlDelay();
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
int fetchedCount = 0; int fetchedCount = 0;
while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) { configureLinkFilter();
var top = queue.removeFirst();
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
&& errorCount < MAX_ERRORS)
{
var top = crawlFrontier.takeNextUrl();
if (!robotsRules.isAllowed(top.toString())) { if (!robotsRules.isAllowed(top.toString())) {
crawledDomainWriter.accept(createRobotsError(top)); crawledDomainWriter.accept(createRobotsError(top));
continue; continue;
} }
if (!crawlFrontier.filterLink(top))
continue;
if (urlBlocklist.isUrlBlocked(top)) if (urlBlocklist.isUrlBlocked(top))
continue; continue;
if (!isAllowedProtocol(top.proto)) if (!isAllowedProtocol(top.proto))
continue; continue;
if (top.toString().length() > 255) if (top.toString().length() > 255)
continue; continue;
if (!visited.add(top.toString())) if (!crawlFrontier.addVisited(top))
continue; continue;
if (fetchDocument(top, crawlDelay)) { if (fetchDocument(top, crawlDelay)) {
@ -176,8 +180,22 @@ public class CrawlerRetreiver {
return fetchedCount; return fetchedCount;
} }
private void configureLinkFilter() {
try {
logger.info("Configuring link filter");
fetchUrl(crawlFrontier.peek())
.map(linkFilterSelector::selectFilter)
.ifPresent(crawlFrontier::setLinkFilter);
}
catch (Exception ex) {
logger.error("Error configuring link filter", ex);
}
}
private boolean fetchDocument(EdgeUrl top, long crawlDelay) { private boolean fetchDocument(EdgeUrl top, long crawlDelay) {
logger.debug("Fetching {}", top); logger.debug("Fetching {}", top);
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
var doc = fetchUrl(top); var doc = fetchUrl(top);
@ -186,10 +204,10 @@ public class CrawlerRetreiver {
crawledDomainWriter.accept(d); crawledDomainWriter.accept(d);
if (d.url != null) { if (d.url != null) {
EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add); EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
} }
if ("ERROR".equals(d.crawlerStatus)) { if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
errorCount++; errorCount++;
} }
@ -211,7 +229,6 @@ public class CrawlerRetreiver {
var doc = fetchContent(top); var doc = fetchContent(top);
if (doc.documentBody != null) { if (doc.documentBody != null) {
doc.documentBodyHash = createHash(doc.documentBody.decode()); doc.documentBodyHash = createHash(doc.documentBody.decode());
Optional<Document> parsedDoc = parseDoc(doc); Optional<Document> parsedDoc = parseDoc(doc);
@ -260,37 +277,23 @@ public class CrawlerRetreiver {
return Optional.of(Jsoup.parse(doc.documentBody.decode())); return Optional.of(Jsoup.parse(doc.documentBody.decode()));
} }
public boolean isSameDomain(EdgeUrl url) {
return domain.equalsIgnoreCase(url.domain.toString());
}
private void findLinks(EdgeUrl baseUrl, Document parsed) { private void findLinks(EdgeUrl baseUrl, Document parsed) {
baseUrl = linkParser.getBaseLink(parsed, baseUrl); baseUrl = linkParser.getBaseLink(parsed, baseUrl);
for (var link : parsed.getElementsByTag("a")) { for (var link : parsed.getElementsByTag("a")) {
linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
} }
for (var link : parsed.getElementsByTag("frame")) { for (var link : parsed.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
} }
for (var link : parsed.getElementsByTag("iframe")) { for (var link : parsed.getElementsByTag("iframe")) {
linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
} }
for (var link : parsed.getElementsByTag("link")) {
String rel = link.attr("rel");
if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) {
linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue);
} }
private void addToQueue(EdgeUrl url) {
if (!isSameDomain(url))
return;
if (urlBlocklist.isUrlBlocked(url))
return;
if (urlBlocklist.isMailingListLink(url))
return;
// reduce memory usage by not growing queue huge when crawling large sites
if (queue.size() + visited.size() >= depth + 100)
return;
if (known.add(url.toString())) {
queue.addLast(url);
} }
} }
@ -314,6 +317,9 @@ public class CrawlerRetreiver {
@SneakyThrows @SneakyThrows
private void delay(long sleepTime, long spentTime) { private void delay(long sleepTime, long spentTime) {
if (testFlagIgnoreDelay)
return;
if (sleepTime >= 1) { if (sleepTime >= 1) {
if (spentTime > sleepTime) if (spentTime > sleepTime)
return; return;
@ -355,17 +361,17 @@ public class CrawlerRetreiver {
.crawlerStatus(CrawlerDocumentStatus.ERROR.name()) .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build(); .build();
} }
private CrawledDomain createErrorPostFromStatus(HttpFetcher.FetchResult ret) { private CrawledDomain createErrorPostFromStatus(FetchResult ret) {
String ip = findIp(domain); String ip = findIp(domain);
if (ret.state == HttpFetcher.FetchResultState.ERROR) { if (ret.state == FetchResultState.ERROR) {
return CrawledDomain.builder() return CrawledDomain.builder()
.crawlerStatus(CrawlerDomainStatus.ERROR.name()) .crawlerStatus(CrawlerDomainStatus.ERROR.name())
.id(id).domain(domain) .id(id).domain(domain)
.ip(ip) .ip(ip)
.build(); .build();
} }
if (ret.state == HttpFetcher.FetchResultState.REDIRECT) { if (ret.state == FetchResultState.REDIRECT) {
return CrawledDomain.builder() return CrawledDomain.builder()
.crawlerStatus(CrawlerDomainStatus.REDIRECT.name()) .crawlerStatus(CrawlerDomainStatus.REDIRECT.name())
.id(id) .id(id)
@ -377,4 +383,5 @@ public class CrawlerRetreiver {
throw new AssertionError("Unexpected case"); throw new AssertionError("Unexpected case");
} }
} }

View File

@ -0,0 +1,99 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Objects;
import java.util.function.Predicate;
/**
 * Book-keeping for the crawl of a single domain: the queue of URLs still to be
 * fetched, the set of URLs that have ever been queued ("known"), and the set of
 * URLs that have been visited.
 *
 * <p>URLs are tracked by their {@code toString()} form.
 */
public class DomainCrawlFrontier {
    private final LinkedList<EdgeUrl> queue = new LinkedList<>();
    private final HashSet<String> visited;
    private final HashSet<String> known;

    private final EdgeDomain thisDomain;
    private final UrlBlocklist urlBlocklist;

    /** Accepts every URL until a more specific filter is installed with {@link #setLinkFilter}. */
    private Predicate<EdgeUrl> linkFilter = url -> true;

    /** Number of visited URLs at which {@link #isCrawlDepthReached()} starts returning true. */
    final int depth;

    /**
     * Creates a frontier seeded with the given URLs.
     *
     * @param thisDomain the domain being crawled; URLs on other domains are never queued
     * @param urls       seed URLs in string form; entries that fail to parse are skipped
     * @param depth      visit budget, see {@link #isCrawlDepthReached()}
     */
    public DomainCrawlFrontier(EdgeDomain thisDomain, Collection<String> urls, int depth) {
        this.thisDomain = thisDomain;
        this.urlBlocklist = new UrlBlocklist();
        this.depth = depth;

        visited = new HashSet<>((int) (urls.size() * 1.5));
        known = new HashSet<>(urls.size() * 10);

        for (String urlStr : urls) {
            EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
        }
    }

    /** Replaces the predicate consulted by {@link #filterLink} and {@link #addToQueue}. */
    public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
        this.linkFilter = linkFilter;
    }

    /** @return true once the number of visited URLs has reached {@code depth} */
    public boolean isCrawlDepthReached() {
        return visited.size() >= depth;
    }

    /** @return true if no URLs are waiting in the queue */
    public boolean isEmpty() {
        return queue.isEmpty();
    }

    /**
     * Records the URL as known.
     *
     * @return true if the URL was not known before this call, false if it already was
     */
    public boolean addKnown(EdgeUrl url) {
        // Must be add(), not contains(): callers use the result to decide whether
        // the URL still needs to be queued, and the URL has to be remembered so a
        // later addToQueue() of the same URL is rejected as a duplicate.
        return known.add(url.toString());
    }

    /** Puts the URL at the head of the queue without any filtering or de-duplication. */
    public void addFirst(EdgeUrl url) {
        queue.addFirst(url);
    }

    /**
     * Removes and returns the head of the queue.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public EdgeUrl takeNextUrl() {
        return queue.removeFirst();
    }

    /** @return the head of the queue without removing it, or null if the queue is empty */
    public EdgeUrl peek() {
        return queue.peek();
    }

    /**
     * Records the URL as visited.
     *
     * @return true if the URL had not been visited before this call
     */
    public boolean addVisited(EdgeUrl url) {
        return visited.add(url.toString());
    }

    /** @return true if the currently installed link filter accepts the URL */
    public boolean filterLink(EdgeUrl url) {
        return linkFilter.test(url);
    }

    /**
     * Appends the URL to the queue if it belongs to this domain, is not rejected by
     * the URL blocklist or the link filter, the size cap has not been hit, and it
     * has not been queued before. Otherwise does nothing.
     */
    public void addToQueue(EdgeUrl url) {
        if (!isSameDomain(url))
            return;
        if (urlBlocklist.isUrlBlocked(url))
            return;
        if (urlBlocklist.isMailingListLink(url))
            return;
        if (!linkFilter.test(url))
            return;

        // reduce memory usage by not growing queue huge when crawling large sites
        if (queue.size() + visited.size() >= depth + 100)
            return;

        if (known.add(url.toString())) {
            queue.addLast(url);
        }
    }

    /** @return true if the URL's domain equals the domain this frontier was created for */
    public boolean isSameDomain(EdgeUrl url) {
        return Objects.equals(thisDomain, url.domain);
    }
}

View File

@ -0,0 +1,59 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.CrawlerDomainStatus;
import nu.marginalia.ip_blocklist.GeoIpBlocklist;
import nu.marginalia.ip_blocklist.IpBlockList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
/**
 * Performs a cheap sanity check of a domain before it is crawled, and reports the
 * outcome as one of the {@link ProbeResult} variants declared at the bottom of
 * this class.
 */
public class DomainProber {
    private static final Logger logger = LoggerFactory.getLogger(DomainProber.class);

    /** Shared blocklist; assigned exactly once in the static initializer below. */
    private static final IpBlockList ipBlockList;

    static {
        try {
            ipBlockList = new IpBlockList(new GeoIpBlocklist());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** To detect problems early we do a probing request to the domain before we start crawling it properly.
     *  This is a HEAD, typically to the root path.  We check the IP against the blocklist, we check that it
     *  doesn't immediately redirect to another domain (which should be crawled separately, not under the name
     *  of this domain).
     *
     * @param fetcher         used to issue the probing request
     * @param domain          domain name, only used for logging here
     * @param firstUrlInQueue first URL queued for the domain, or null if there is none
     * @return {@link ProbeResultOk} if the probe succeeded, {@link ProbeResultRedirect} if the
     *         fetcher reported a redirect, otherwise a {@link ProbeResultError} describing the problem
     */
    public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
        if (firstUrlInQueue == null) {
            logger.warn("No valid URLs for domain {}", domain);
            return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
        }

        if (!ipBlockList.isAllowed(firstUrlInQueue.domain))
            return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");

        // Always probe the root path, regardless of which path the first queued URL points at
        var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));

        if (fetchResult.ok())
            return new ProbeResultOk();

        if (fetchResult.state == FetchResultState.REDIRECT)
            return new ProbeResultRedirect(fetchResult.domain);

        return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
    }

    /** Marker type for the possible outcomes of {@link #probeDomain}. */
    interface ProbeResult {}

    /** The domain should not be crawled; {@code status} and {@code desc} say why. */
    record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}

    /** The probe was redirected; {@code domain} is the domain carried by the fetch result. */
    record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}

    /** The probe succeeded. */
    record ProbeResultOk() implements ProbeResult {}
}

View File

@ -1,108 +0,0 @@
package nu.marginalia.crawl.retreival;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import io.reactivex.rxjava3.core.Observable;
import lombok.SneakyThrows;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.client.exception.NetworkException;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
// TODO: Is this used?
/**
 * Resolves where a URL ends up by following HTTP redirects by hand.
 *
 * The OkHttp client is configured not to follow redirects itself, so every hop
 * is issued explicitly by {@link #probe(EdgeUrl)} and inspected by
 * resolveRedirects.
 */
@Singleton
public class HttpRedirectResolver {
private static final LinkParser linkParser = new LinkParser();
private final Logger logger = LoggerFactory.getLogger(getClass());
private final String userAgent;
private final Cookies cookies = new Cookies();
private final OkHttpClient client = createClient();
/**
 * Builds the HTTP client: automatic redirect following is switched off (both
 * plain and SSL), the connect timeout is 8 seconds, and cookies go to this
 * instance's cookie jar.
 * NOTE(review): the NoSecuritySSL helpers presumably disable certificate and
 * hostname verification, judging by their names -- confirm.
 */
@SneakyThrows
private OkHttpClient createClient() {
return new OkHttpClient.Builder()
.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.cookieJar(cookies.getJar())
.followRedirects(false)
.followSslRedirects(false)
.connectTimeout(8, TimeUnit.SECONDS)
.build();
}
/**
 * @param userAgent value sent in the User-agent header of every request
 */
@Inject
public HttpRedirectResolver(@Named("user-agent") String userAgent) {
this.userAgent = userAgent;
}
/**
 * Follows redirects starting at the given URL.
 *
 * @param url the URL to start from
 * @return an Observable that emits the final URL, is empty when the protocol
 *         does not start with "http" or a Location header cannot be parsed,
 *         or signals an error (I/O failure, more than 10 redirects, or an
 *         unhandled status code)
 */
@SneakyThrows
public Observable<EdgeUrl> probe(EdgeUrl url) {
return probe(url, 0);
}
/**
 * One hop of the redirect chain; depth counts the redirects followed so far.
 * The request is executed synchronously inside this method, before the
 * Observable is returned.
 */
private Observable<EdgeUrl> probe(EdgeUrl url, int depth) {
if (depth > 10) {
return Observable.error(new IllegalStateException("Too many redirects"));
}
if (!url.proto.toLowerCase().startsWith("http")) {
return Observable.empty();
}
// Despite the variable name, this builds a GET request, not a HEAD
var head = new Request.Builder().get().addHeader("User-agent", userAgent)
.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
return resolveRedirects(depth, url, rsp);
} catch (IOException e) {
return Observable.error(e);
}
}
/**
 * Decides what to do with a response based on its status code:
 * below 300 the URL is emitted as-is; 300-308 the Location header is resolved
 * against the current URL and probed with depth + 1; 400 and above the URL is
 * emitted as-is; anything else (309-399) is reported as an error.
 */
@SneakyThrows
private Observable<EdgeUrl> resolveRedirects(int depth, EdgeUrl url, Response response) {
int code = response.code();
// NOTE(review): the response is closed here, before the Location header is read
// below -- confirm that headers stay readable after close().
response.close();
if (code < 300) {
return Observable.just(url);
}
if (code < 309) {
String newUrl = response.header("Location");
return Observable.fromOptional(linkParser.parseLink(url, newUrl))
.flatMap(u -> probe(u, depth + 1));
}
if (code >= 400) {
return Observable.just(url);
}
return Observable.error(new IllegalStateException("HttpStatusCode " + code));
}
/**
 * Closes the response and throws NetworkException for status codes of 400 and
 * above; otherwise returns true. Not called from anywhere within this class.
 */
private boolean failOnBadStatus(Response response) {
if (response.code() >= 400) {
response.close();
throw new NetworkException("Bad status " + response.code());
}
return true;
}
/** Unchecked exception carrying a content type as its message; not thrown within this class. */
public static class BadContentType extends RuntimeException {
public BadContentType(String type) {
super(type);
}
}
}

View File

@ -0,0 +1,61 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import java.util.function.Predicate;
/**
 * Chooses a URL filter for a domain by sniffing which software the site runs
 * from a sample document.
 */
public class LinkFilterSelector {

    /* With websites that run e.g. forum software or wiki software, it's
       very beneficial to cherry-pick the URLs that we want to crawl to
       exclude e.g. user profiles, and other similar noise.
     */

    /**
     * Picks a link filter based on the given sample document.
     *
     * @param sample a fetched document from the site, typically its index page
     * @return a predicate that accepts the URLs worth crawling; {@link #defaultFilter}
     *         when the sample was not a 200 response, has no body, has no head
     *         element, or matches none of the recognized platforms
     */
    public Predicate<EdgeUrl> selectFilter(CrawledDocument sample) {

        // A 200 response is not guaranteed to carry a body (e.g. a rejected
        // content type), so guard against that before decoding it.
        if (sample.httpStatus != 200 || sample.documentBody == null) {
            return LinkFilterSelector::defaultFilter;
        }

        // Sniff the software based on the sample document
        var doc = Jsoup.parse(sample.documentBody.decode());
        var head = doc.getElementsByTag("head").first();
        if (null == head) {
            return LinkFilterSelector::defaultFilter;
        }

        if (isLemmy(head)) {
            return url -> url.path.startsWith("/post/") || url.path.startsWith("/c/");
        }
        if (isMediawiki(head)) {
            return url -> url.path.startsWith("/wiki/") && !url.path.contains(":");
        }
        if (isDiscourse(head)) {
            return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
        }

        return LinkFilterSelector::defaultFilter;
    }

    /** Filter used when no platform is recognized; accepts every URL. */
    public static boolean defaultFilter(EdgeUrl url) {
        return true;
    }

    /** @return true if the generator meta tag mentions MediaWiki */
    private boolean isMediawiki(Element head) {
        return generatorContains(head, "mediawiki");
    }

    /** @return true if the generator meta tag mentions Discourse */
    private boolean isDiscourse(Element head) {
        return generatorContains(head, "discourse");
    }

    /** @return true if any script element in the head contains "window.lemmyConfig" */
    private boolean isLemmy(Element head) {
        for (var scriptTag : head.select("script")) {
            if (scriptTag.html().contains("window.lemmyConfig")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Case-insensitive test of the {@code <meta name="generator">} content.
     *
     * @param needle lower-case text to look for
     */
    private static boolean generatorContains(Element head, String needle) {
        // Locale.ROOT keeps the comparison independent of the JVM's default locale
        return head.select("meta[name=generator]")
                .attr("content")
                .toLowerCase(java.util.Locale.ROOT)
                .contains(needle);
    }
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival.fetcher;
import javax.net.SocketFactory; import javax.net.SocketFactory;
import java.io.IOException; import java.io.IOException;

View File

@ -0,0 +1,16 @@
package nu.marginalia.crawl.retreival.fetcher;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.model.EdgeDomain;
@AllArgsConstructor
@ToString
public class FetchResult {
public final FetchResultState state;
public final EdgeDomain domain;
public boolean ok() {
return state == FetchResultState.OK;
}
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.crawl.retreival.fetcher;
/** Possible outcomes carried by a FetchResult. */
public enum FetchResultState {
// The only state for which FetchResult.ok() returns true
OK,
// The request was answered with a redirect
REDIRECT,
// Any other failure
ERROR
}

View File

@ -0,0 +1,25 @@
package nu.marginalia.crawl.retreival.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import java.util.List;
/**
 * Abstraction over the HTTP operations the crawler needs, so the network layer
 * can be swapped out (the default binding is HttpFetcherImpl).
 */
@ImplementedBy(HttpFetcherImpl.class)
public interface HttpFetcher {
/** Toggles whether every content type is accepted when fetching. */
void setAllowAllContentTypes(boolean allowAllContentTypes);
/** Returns the cookies currently held by the fetcher. */
List<String> getCookies();
/** Discards all cookies held by the fetcher. */
void clearCookies();
/**
 * Issues a lightweight probing request against the given URL.
 *
 * @return the outcome of the probe together with a domain
 */
FetchResult probeDomain(EdgeUrl url);
/**
 * Fetches the document at the given URL.
 *
 * @throws RateLimitException presumably when the server signals rate limiting -- confirm against the implementation
 */
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
/** Fetches the robots rules that apply to the given domain. */
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
}

View File

@ -1,12 +1,12 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival.fetcher;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser; import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.ToString; import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.ContentType; import nu.marginalia.crawling.model.ContentType;
@ -35,7 +35,7 @@ import java.util.Optional;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
public class HttpFetcher { public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final String userAgent; private final String userAgent;
@ -46,29 +46,15 @@ public class HttpFetcher {
private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) { public void setAllowAllContentTypes(boolean allowAllContentTypes) {
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes); contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
} }
private final OkHttpClient client; private final OkHttpClient client;
public enum FetchResultState {
OK,
REDIRECT,
ERROR
}
@AllArgsConstructor @ToString
public static class FetchResult {
public final FetchResultState state;
public final EdgeDomain domain;
public boolean ok() {
return state == FetchResultState.OK;
}
}
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory(); private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
@SneakyThrows @SneakyThrows
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) { private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder(); var builder = new OkHttpClient.Builder();
@ -90,25 +76,28 @@ public class HttpFetcher {
} }
@Override
public List<String> getCookies() { public List<String> getCookies() {
return cookies.getCookies(); return cookies.getCookies();
} }
@Override
public void clearCookies() { public void clearCookies() {
cookies.clear(); cookies.clear();
} }
@Inject @Inject
public HttpFetcher(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) {
this.client = createClient(dispatcher, connectionPool); this.client = createClient(dispatcher, connectionPool);
this.userAgent = userAgent; this.userAgent = userAgent;
} }
public HttpFetcher(@Named("user-agent") String userAgent) { public HttpFetcherImpl(@Named("user-agent") String userAgent) {
this.client = createClient(null, new ConnectionPool()); this.client = createClient(null, new ConnectionPool());
this.userAgent = userAgent; this.userAgent = userAgent;
} }
@Override
@SneakyThrows @SneakyThrows
public FetchResult probeDomain(EdgeUrl url) { public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent) var head = new Request.Builder().head().addHeader("User-agent", userAgent)
@ -126,6 +115,7 @@ public class HttpFetcher {
} }
return new FetchResult(FetchResultState.OK, requestDomain); return new FetchResult(FetchResultState.OK, requestDomain);
} }
catch (Exception ex) { catch (Exception ex) {
if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) { if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param)); return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
@ -151,6 +141,7 @@ public class HttpFetcher {
} }
@Override
@SneakyThrows @SneakyThrows
public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException { public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException {
@ -312,6 +303,7 @@ public class HttpFetcher {
} }
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
return fetchRobotsForProto("https", domain) return fetchRobotsForProto("https", domain)
.or(() -> fetchRobotsForProto("http", domain)) .or(() -> fetchRobotsForProto("http", domain))

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival.fetcher;
import lombok.SneakyThrows; import lombok.SneakyThrows;

View File

@ -1,9 +1,8 @@
package nu.marginalia.crawling; package nu.marginalia.crawling;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.HttpFetcher;
import nu.marginalia.crawl.retreival.HttpRedirectResolver;
import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -29,44 +28,15 @@ class HttpFetcherTest {
@Test @Test
void fetchUTF8() throws URISyntaxException, RateLimitException { void fetchUTF8() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler"); var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu")); var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
System.out.println(str.contentType); System.out.println(str.contentType);
} }
@Test @Test
void fetchText() throws URISyntaxException, RateLimitException { void fetchText() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcher("nu.marginalia.edge-crawler"); var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt")); var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str); System.out.println(str);
} }
@Test
void resolveRedirect() throws URISyntaxException {
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str);
}
@Test
void resolveRedirect2() throws URISyntaxException {
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt")).blockingFirst();
System.out.println(str);
}
@Test
void resolveRedirect3() throws URISyntaxException {
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str);
}
@Test
void resolveRedirect4() throws URISyntaxException {
var fetcher = new HttpRedirectResolver("nu.marginalia.edge-crawler");
var str = fetcher.probe(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
System.out.println(str);
}
} }

View File

@ -0,0 +1,152 @@
package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Smoke tests that run {@link CrawlerRetreiver} against canned HTML served by an
 * in-memory {@link HttpFetcher}, so no network traffic is generated.
 *
 * <p>Pages are registered per URL in {@link #mockData}; any URL that has not been
 * registered is answered with a 404 document by {@link MockFetcher}.
 *
 * <p>NOTE(review): these tests only print what the crawler emitted and make no
 * assertions, so they only fail if the crawl throws.
 */
public class CrawlerMockFetcherTest {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerMockFetcherTest.class);

    /** Canned responses keyed by the exact URL the crawler is expected to request. */
    final Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();

    /** Fetcher handed to the crawler; serves documents out of {@link #mockData}. */
    final HttpFetcher fetcherMock = new MockFetcher();

    @AfterEach
    public void tearDown() {
        mockData.clear();
    }

    /**
     * Registers a successful (HTTP 200, text/html) response for {@code url}.
     *
     * @param url          the URL the mock fetcher should answer for
     * @param documentData the document body to serve
     */
    private void registerUrl(EdgeUrl url, String documentData) {
        mockData.put(url, CrawledDocument.builder()
                .crawlId("1")
                .url(url.toString())
                .contentType("text/html")
                .httpStatus(200)
                .crawlerStatus(CrawlerDocumentStatus.OK.name())
                .documentBody(BigString.encode(documentData))
                .build());
    }

    /**
     * Registers a successful response for {@code url} whose body is read, as UTF-8,
     * from a classpath resource.
     *
     * @param url  the URL the mock fetcher should answer for
     * @param path classpath-relative location of the resource holding the body
     * @throws IllegalArgumentException if no resource exists at {@code path}
     */
    @SneakyThrows
    private void registerUrlClasspathData(EdgeUrl url, String path) {
        try (var resourceStream = getClass().getClassLoader().getResourceAsStream(path)) {
            if (resourceStream == null) throw new IllegalArgumentException("No such resource: " + path);

            // Delegate so that the shape of a mocked document is defined in one place only.
            registerUrl(url, new String(resourceStream.readAllBytes(), StandardCharsets.UTF_8));
        }
    }

    /**
     * Crawls {@code domain} through the mock fetcher with delays disabled and
     * collects everything the crawler emits.
     *
     * @param domain the domain name placed in the crawling specification
     * @param limit  second argument of the crawling specification
     *               (presumably the crawl depth or document limit; confirm against CrawlingSpecification)
     * @return the emitted crawl data, in emission order
     */
    private List<SerializableCrawlData> crawl(String domain, int limit) {
        List<SerializableCrawlData> out = new ArrayList<>();

        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", limit, domain, new ArrayList<>()), out::add)
                .withNoDelay()
                .fetch();

        return out;
    }

    @Test
    public void testLemmy() throws URISyntaxException {
        registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
        registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");

        crawl("startrek.website", 10).forEach(System.out::println);
    }

    @Test
    public void testMediawiki() throws URISyntaxException {
        registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

        crawl("en.wikipedia.org", 10).forEach(System.out::println);
    }

    @Test
    public void testDiscourse() throws URISyntaxException {
        registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
        registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
        registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

        crawl("community.tt-rss.org", 100).forEach(System.out::println);
    }

    /**
     * In-memory {@link HttpFetcher}: every probe succeeds, robots rules are a
     * default-constructed {@link SimpleRobotRules}, and content comes from
     * {@link #mockData}. Deliberately an inner (non-static) class because it
     * reads the enclosing test's {@code mockData}.
     */
    class MockFetcher implements HttpFetcher {

        @Override
        public void setAllowAllContentTypes(boolean allowAllContentTypes) {}

        @Override
        public List<String> getCookies() { return List.of(); }

        @Override
        public void clearCookies() {}

        @Override
        public FetchResult probeDomain(EdgeUrl url) {
            logger.info("Probing {}", url);

            return new FetchResult(FetchResultState.OK, url.domain);
        }

        @Override
        public CrawledDocument fetchContent(EdgeUrl url) {
            logger.info("Fetching {}", url);

            // Single lookup; registered values are never null, so null means "not registered".
            CrawledDocument registered = mockData.get(url);
            if (registered != null) {
                return registered;
            }

            // Unregistered URLs are reported as a 404 error document with no body.
            return CrawledDocument.builder()
                    .crawlId("1")
                    .url(url.toString())
                    .contentType("text/html")
                    .httpStatus(404)
                    .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
                    .build();
        }

        @Override
        public SimpleRobotRules fetchRobotRules(EdgeDomain domain) {
            return new SimpleRobotRules();
        }
    }
}

View File

@ -1,7 +1,8 @@
package nu.marginalia.crawling.retreival; package nu.marginalia.crawling.retreival;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.spec.CrawlingSpecification; import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.crawling.model.SerializableCrawlData;
@ -23,7 +24,7 @@ class CrawlerRetreiverTest {
var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>()); var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu"); HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu");
List<SerializableCrawlData> data = new ArrayList<>(); List<SerializableCrawlData> data = new ArrayList<>();

View File

@ -0,0 +1,860 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Combined mode but grid - Development - Tiny Tiny RSS: Community</title>
<meta name="description" content="horrible, huh?
@media screen and (min-width: 1400px) {
#headlines-frame {
display : grid;
grid-template-columns: repeat(2, 1fr);
grid-gap : 8px;
.cdm.expanded {
.footer {
border : 0;
}
b&amp;hellip;">
<meta name="generator" content="Discourse 3.1.0.beta4 - https://github.com/discourse/discourse version 7ff8e5580f9a900cde4be66377cf4f1dcd253a35">
<link rel="icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_32x32.png">
<link rel="apple-touch-icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_180x180.png">
<meta name="theme-color" media="all" content="#ffffff">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, user-scalable=yes, viewport-fit=cover">
<link rel="canonical" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
<link rel="search" type="application/opensearchdescription+xml" href="https://community.tt-rss.org/opensearch.xml" title="Tiny Tiny RSS: Community Search">
<link href="/stylesheets/color_definitions_base__2_e1a3786e9787d3094d4ad821cf887d92d1d46700.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" class="light-scheme"/>
<link href="/stylesheets/desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop" />
<link href="/stylesheets/discourse-details_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-details" />
<link href="/stylesheets/discourse-lazy-videos_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-lazy-videos" />
<link href="/stylesheets/discourse-local-dates_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-local-dates" />
<link href="/stylesheets/discourse-narrative-bot_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-narrative-bot" />
<link href="/stylesheets/discourse-presence_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-presence" />
<link href="/stylesheets/discourse-reactions_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions" />
<link href="/stylesheets/discourse-solved_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-solved" />
<link href="/stylesheets/docker_manager_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="docker_manager" />
<link href="/stylesheets/poll_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll" />
<link href="/stylesheets/discourse-reactions_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions_desktop" />
<link href="/stylesheets/poll_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll_desktop" />
<link href="/stylesheets/desktop_theme_7_fe50c691a9fb30bb61c18aa08d3b7a6cb61a0150.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="7" data-theme-name="custom header links"/>
<link href="/stylesheets/desktop_theme_6_7ca213afc12b87349d5efdb08c76d0a62c01d53a.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="6" data-theme-name="bears"/>
<link href="/stylesheets/desktop_theme_2_88a998cf58047c52104c0e1f454c9c1c16e18c70.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="2" data-theme-name="ttrss"/>
<!-- <script type="text/discourse-plugin" version="0.2">
api.onPageChange((url, title) => {
if (_paq) {
_paq.push(["setCustomUrl", url]);
_paq.push(["setDocumentTitle", title]);
_paq.push(["trackPageView"]);
const currentUser = api.getCurrentUser();
if (currentUser && currentUser['username']) {
_paq.push(['setUserId', currentUser['username']]);
}
}
});
</script> -->
<link rel="alternate nofollow" type="application/rss+xml" title="RSS feed of &#39;Combined mode but grid&#39;" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489.rss" />
<meta property="og:site_name" content="Tiny Tiny RSS: Community" />
<meta property="og:type" content="website" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:image" content="https://community.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1024x670.jpeg" />
<meta property="og:image" content="https://community.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1024x670.jpeg" />
<meta property="og:url" content="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
<meta name="twitter:url" content="https://community.tt-rss.org/t/combined-mode-but-grid/4489" />
<meta property="og:title" content="Combined mode but grid" />
<meta name="twitter:title" content="Combined mode but grid" />
<meta property="og:description" content="horrible, huh? @media screen and (min-width: 1400px) { #headlines-frame { display : grid; grid-template-columns: repeat(2, 1fr); grid-gap : 8px; .cdm.expanded { .footer { border : 0; } border : 1px solid @border-default; } } } }" />
<meta name="twitter:description" content="horrible, huh? @media screen and (min-width: 1400px) { #headlines-frame { display : grid; grid-template-columns: repeat(2, 1fr); grid-gap : 8px; .cdm.expanded { .footer { border : 0; } border : 1px solid @border-default; } } } }" />
<meta property="og:article:section" content="Tiny Tiny RSS" />
<meta property="og:article:section:color" content="25AAE2" />
<meta property="og:article:section" content="Development" />
<meta property="og:article:section:color" content="3AB54A" />
<meta property="article:published_time" content="2021-03-09T18:27:09+00:00" />
<meta property="og:ignore_canonical" content="true" />
<script type="application/ld+json">{"@context":"http://schema.org","@type":"QAPage","name":"Combined mode but grid","mainEntity":{"@type":"Question","name":"Combined mode but grid","text":"horrible, huh?\n\n@media screen and (min-width: 1400px) {\n\n#headlines-frame {\n\ndisplay : grid;\n\ngrid-template-columns: repeat(2, 1fr);\n\ngrid-gap : 8px;\n\n.cdm.expanded {\n\n.footer {\n\nborder : 0;\n\n}\n\nborder : 1px solid @border-default;\n\n}\n\n}\n\n}\n\n}\n\n<a class=\"lightbox\" href=\"https://discourse.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg\" data-download-href=\"https://discourse.tt-rss.org/uploads/default/d253dab1dab0906bf5361527e16cd069401ac1cc\" title=\"image\">[image]<\/a>","upvoteCount":0,"answerCount":0,"dateCreated":"2021-03-09T18:27:09.054Z","author":{"@type":"Person","name":""}}}</script>
</head>
<body class="crawler ">
<header>
<a href="/">
Tiny Tiny RSS: Community
</a>
</header>
<div id="main-outlet" class="wrap" role="main">
<div id="topic-title">
<h1>
<a href="/t/combined-mode-but-grid/4489">Combined mode but grid</a>
</h1>
<div class="topic-category" itemscope itemtype="http://schema.org/BreadcrumbList">
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
<a href="https://community.tt-rss.org/c/tiny-tiny-rss/8" class="badge-wrapper bullet" itemprop="item">
<span class='badge-category-bg' style='background-color: #25AAE2'></span>
<span class='badge-category clear-badge'>
<span class='category-name' itemprop='name'>Tiny Tiny RSS</span>
</span>
</a>
<meta itemprop="position" content="1" />
</span>
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
<a href="https://community.tt-rss.org/c/tiny-tiny-rss/development/6" class="badge-wrapper bullet" itemprop="item">
<span class='badge-category-bg' style='background-color: #3AB54A'></span>
<span class='badge-category clear-badge'>
<span class='category-name' itemprop='name'>Development</span>
</span>
</a>
<meta itemprop="position" content="2" />
</span>
</div>
</div>
<div itemscope itemtype='http://schema.org/DiscussionForumPosting'>
<meta itemprop='headline' content='Combined mode but grid'>
<meta itemprop='articleSection' content='Development'>
<meta itemprop='keywords' content=''>
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
<meta itemprop='name' content='Tiny Tiny RSS: Community'>
</div>
<div id='post_1' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-09T18:27:09Z' class='post-time'>
March 9, 2021, 6:27pm
</time>
<meta itemprop='dateModified' content='2021-03-09T18:27:09Z'>
<span itemprop='position'>1</span>
</span>
</div>
<div class='post' itemprop='articleBody'>
<p>horrible, huh?</p>
<pre><code class="lang-css">@media screen and (min-width: 1400px) {
#headlines-frame {
display : grid;
grid-template-columns: repeat(2, 1fr);
grid-gap : 8px;
.cdm.expanded {
.footer {
border : 0;
}
border : 1px solid @border-default;
}
}
}
}
</code></pre>
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc.jpeg" data-download-href="https://discourse.tt-rss.org/uploads/default/d253dab1dab0906bf5361527e16cd069401ac1cc" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_690x451.jpeg" alt="image" data-base62-sha1="u0DVoAOiT7CgzBwmYIh7fWqFZz6" width="690" height="451" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_690x451.jpeg, https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1035x676.jpeg 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_1380x902.jpeg 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/d/d253dab1dab0906bf5361527e16cd069401ac1cc_2_10x10.png"><div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1826×1195 425 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
</div></a></div></p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_2' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-09T18:52:37Z' class='post-time'>
March 9, 2021, 6:52pm
</time>
<meta itemprop='dateModified' content='2021-03-09T18:52:37Z'>
<span itemprop='position'>2</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>I kind of like it. The concept at least.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_3' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/linoth'><span itemprop='name'>linoth</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-10T01:41:53Z' class='post-time'>
March 10, 2021, 1:41am
</time>
<meta itemprop='dateModified' content='2021-03-10T01:41:53Z'>
<span itemprop='position'>3</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>You got me curious.</p>
<p>A little rough in its current form if youre someone toggling collapsed mode. Not sure exactly what caused it, but I wound up with the page getting wider a time or two while I tried it out. Im sure that its actually pretty desirable for image-heavy uses, such as your example of Reddit. Would not have guessed that it could be pulled off with just some lines of CSS, but I dont know web dev.</p>
<p>Neat.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="1" />
</div>
</div>
<div id='post_4' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08.jpeg">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-10T05:03:00Z' class='post-time'>
March 10, 2021, 5:03am
</time>
<meta itemprop='dateModified' content='2021-03-10T05:41:19Z'>
<span itemprop='position'>4</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>yeah this would obviously only work for expanded mode <img src="https://discourse.tt-rss.org/images/emoji/google_classic/slight_smile.png?v=9" title=":slight_smile:" class="emoji" alt=":slight_smile:"></p>
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08.jpeg" data-download-href="https://discourse.tt-rss.org/uploads/default/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_690x448.jpeg" alt="image" data-base62-sha1="mX3IwjuahEZxyHk7fMxVA6XKJCw" width="690" height="448" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_690x448.jpeg, https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_1035x672.jpeg 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_1380x896.jpeg 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/a/a0dcebfc42b3940a0f4c55ae4640a7ee97e4da08_2_10x10.png"><div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1951×1267 552 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
</div></a></div></p>
<p>a bit more polished looking, i think.</p>
<p>e: its an option now.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_5' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc.png">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-11T20:17:10Z' class='post-time'>
March 11, 2021, 8:17pm
</time>
<meta itemprop='dateModified' content='2021-03-11T20:27:54Z'>
<span itemprop='position'>5</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>i made a really primitive plugin that fakes masonry layout for the grid:</p>
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc.png" data-download-href="https://discourse.tt-rss.org/uploads/default/04be46f4f91f917188576fdeb34027184c5d8bbc" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_690x431.png" alt="image" data-base62-sha1="FXzLpxEuFssaI6B3WUk8pRDS3O" width="690" height="431" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_690x431.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_1035x646.png 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/0/04be46f4f91f917188576fdeb34027184c5d8bbc_2_1380x862.png 2x" data-dominant-color="E1E1E2"><div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use href="#far-image"></use></svg><span class="filename">image</span><span class="informations">2033×1271 386 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use href="#discourse-expand"></use></svg>
</div></a></div></p>
<p>its somewhat buggy and lacks any optimization whatsoever but cool nonetheless (when it works). enjoy.</p>
<p><a href="https://git.tt-rss.org/fox/ttrss-grid-masonry" class="onebox" target="_blank" rel="noopener">https://git.tt-rss.org/fox/ttrss-grid-masonry</a></p>
<p>e: this needs latest master, just in case.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="1" />
</div>
</div>
<div id='post_6' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-11T21:24:30Z' class='post-time'>
March 11, 2021, 9:24pm
</time>
<meta itemprop='dateModified' content='2021-03-11T21:26:55Z'>
<span itemprop='position'>6</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>So Ive been using this view and I like it but one small issue I cant figure out how to fix is word wrapping in the frames. Words keep getting split between lines and making things hard to read sometimes. Is that something I can fix with the custom css?</p>
<p>edit: I should have mentioned I am running the dynamic docker setup and I restarted the containers a few hours ago.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="1" />
</div>
</div>
<div id='post_7' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-12T04:38:48Z' class='post-time'>
March 12, 2021, 4:38am
</time>
<meta itemprop='dateModified' content='2021-03-12T04:38:48Z'>
<span itemprop='position'>7</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>i should probably go easier on word-breaking in there, limit to links only or something like that.</p>
<p><a href="https://git.tt-rss.org/fox/tt-rss/src/branch/master/themes/light/tt-rss.less#L749" class="onebox" target="_blank" rel="noopener">https://git.tt-rss.org/fox/tt-rss/src/branch/master/themes/light/tt-rss.less#L749</a></p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_8' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/levito'><span itemprop='name'>levito</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-12T09:07:30Z' class='post-time'>
March 12, 2021, 9:07am
</time>
<meta itemprop='dateModified' content='2021-03-12T09:07:30Z'>
<span itemprop='position'>8</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Hi <a class="mention" href="/u/fox">@fox</a>, awesome to see that rush of progress!</p>
<p>I'd suggest using <code>word-wrap: break-word;</code> instead of <code>word-break: break-all;</code>. This makes words only break if they are really wider than the container. Short words are not affected. So you might then also remove the restriction to links.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="1" />
</div>
</div>
<div id='post_9' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd.png">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-12T09:34:16Z' class='post-time'>
March 12, 2021, 9:34am
</time>
<meta itemprop='dateModified' content='2021-03-12T09:34:16Z'>
<span itemprop='position'>9</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>yeah this could work for text, but for links specifically it can make things ugly (uglier?):</p>
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd.png" data-download-href="https://discourse.tt-rss.org/uploads/default/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd" title="image"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_690x187.png" alt="image" data-base62-sha1="qw53FQsmq3Tnl5aap9yIPqLbSRT" width="690" height="187" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_690x187.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_1035x280.png 1.5x, https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_1380x374.png 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/b/b9d8e8ecd722ea2f29e0fa101548482e9fe912fd_2_10x10.png"><div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">image</span><span class="informations">1615×439 74.6 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
</div></a></div></p>
<p>which is why i went with break-all.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_10' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/levito'><span itemprop='name'>levito</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<link itemprop="image" href="https://community.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-12T09:43:23Z' class='post-time'>
March 12, 2021, 9:43am
</time>
<meta itemprop='dateModified' content='2021-03-12T09:43:23Z'>
<span itemprop='position'>10</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Thanks for the quick reply and your explanation - I get your point. The only problematic edge-case I see with this is links at the end of a line.</p>
<p><div class="lightbox-wrapper"><a class="lightbox" href="https://discourse.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png" data-download-href="https://discourse.tt-rss.org/uploads/default/5d9654dc6e0e9cb1f655b3af565163a882300e84" title="Screenshot 2021-03-12 at 10.40.38"><img src="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_690x274.png" alt="Screenshot 2021-03-12 at 10.40.38" data-base62-sha1="dlUuhIyTaomOYut8VQNv9nyKtuY" width="690" height="274" srcset="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_690x274.png, https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_1035x411.png 1.5x, https://discourse.tt-rss.org/uploads/default/original/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84.png 2x" data-small-upload="https://discourse.tt-rss.org/uploads/default/optimized/2X/5/5d9654dc6e0e9cb1f655b3af565163a882300e84_2_10x10.png"><div class="meta">
<svg class="fa d-icon d-icon-far-image svg-icon" aria-hidden="true"><use xlink:href="#far-image"></use></svg><span class="filename">Screenshot 2021-03-12 at 10.40.38</span><span class="informations">1334×530 56.7 KB</span><svg class="fa d-icon d-icon-discourse-expand svg-icon" aria-hidden="true"><use xlink:href="#discourse-expand"></use></svg>
</div></a></div></p>
<p>But I think that's okay.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_11' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-12T12:44:58Z' class='post-time'>
March 12, 2021, 12:44pm
</time>
<meta itemprop='dateModified' content='2021-03-12T12:44:58Z'>
<span itemprop='position'>11</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Thanks for the quick fix. It's much better. I'm really liking the grid layout with mark as read on scroll. Thanks.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_12' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/OldBear'><span itemprop='name'>OldBear</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-15T16:33:52Z' class='post-time'>
March 15, 2021, 4:33pm
</time>
<meta itemprop='dateModified' content='2021-03-15T16:33:52Z'>
<span itemprop='position'>12</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>I think it's a nice layout to be set to feeds like Dilbert</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_13' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/toomyzoom'><span itemprop='name'>toomyzoom</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-20T19:32:21Z' class='post-time'>
March 20, 2021, 7:32pm
</time>
<meta itemprop='dateModified' content='2021-03-20T19:32:21Z'>
<span itemprop='position'>13</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Good, now add it to android app <img src="https://discourse.tt-rss.org/images/emoji/google_classic/grin.png?v=9" title=":grin:" class="emoji" alt=":grin:"></p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_14' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/linoth'><span itemprop='name'>linoth</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2021-03-20T21:18:36Z' class='post-time'>
March 20, 2021, 9:18pm
</time>
<meta itemprop='dateModified' content='2021-03-20T21:18:36Z'>
<span itemprop='position'>14</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Feel a bit guilty since I gave feedback and disappeared.</p>
<p>I have an image heavy feed I'm very far behind on, and your first post made me try out expanded instead of collapsed. Sped things up immensely because apparently marking posts as read when you flip through them was CPU heavy on my server.</p>
<p>After that, columns just made things even faster for me.</p>
<p>I did run into an edge case that most people probably wouldn't have even noticed, where a headline would be top-to-bottom in one column at 1080p, so I ham-fisted some CSS to set a max-height on titleWrap.</p>
<p>Thanks so much for this feature, fox. Didn't even know I needed it.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_15' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2022-08-22T16:29:33Z' class='post-time'>
August 22, 2022, 4:29pm
</time>
<meta itemprop='dateModified' content='2022-08-22T16:29:33Z'>
<span itemprop='position'>15</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Sorry to resurrect this old thread. I've really grown to love grid mode but I noticed the article headlines sometimes break words on to separate lines. It seems to happen with all themes. I don't have the masonry plugin active either.</p>
<p>I'm using the stock docker setup.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="1" />
</div>
</div>
<div id='post_16' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2022-08-22T16:42:04Z' class='post-time'>
August 22, 2022, 4:42pm
</time>
<meta itemprop='dateModified' content='2022-08-22T17:13:10Z'>
<span itemprop='position'>16</span>
</span>
</div>
<div class='post' itemprop='text'>
<aside class="quote no-group" data-username="sam302psu" data-post="15" data-topic="4489">
<div class="title">
<div class="quote-controls"></div>
<img loading="lazy" alt="" width="20" height="20" src="https://discourse.tt-rss.org/user_avatar/discourse.tt-rss.org/sam302psu/40/326_2.png" class="avatar"> sam302psu:</div>
<blockquote>
<p>article headlines sometimes break words on to separate lines</p>
</blockquote>
</aside>
<p>yeah it's a stock CSS thing, it's either that or super long words breaking layout. i'm not fond of either behaviors. <img src="https://discourse.tt-rss.org/images/emoji/google_classic/frowning.png?v=12" title=":frowning:" class="emoji" alt=":frowning:" loading="lazy" width="20" height="20"></p>
<p>there are several rules like this in the .less files:</p>
<pre><code class="lang-css">word-break : break-all;
</code></pre>
<p>ideas welcome, etc.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_17' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/sam302psu'><span itemprop='name'>sam302psu</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2022-08-22T17:31:29Z' class='post-time'>
August 22, 2022, 5:31pm
</time>
<meta itemprop='dateModified' content='2022-08-22T17:31:29Z'>
<span itemprop='position'>17</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>Thanks for confirming. I will take the occasional headline issue over breaking the layout. I'll use this as a chance to learn something and if I come up with a fix I'll let you know.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_18' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/ManuelW'><span itemprop='name'>ManuelW</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/combined-mode-but-grid/4489">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2023-06-24T11:30:44Z' class='post-time'>
June 24, 2023, 11:30am
</time>
<meta itemprop='dateModified' content='2023-06-24T11:30:44Z'>
<span itemprop='position'>18</span>
</span>
</div>
<div class='post' itemprop='text'>
<p>How do I activate this? I installed the Plugin and set the checkmark, but nothing happens.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
</div>
</div>
<footer class="container wrap">
<nav class='crawler-nav'>
<ul>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/' itemprop="url">Home </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/categories' itemprop="url">Categories </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/guidelines' itemprop="url">FAQ/Guidelines </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/tos' itemprop="url">Terms of Service </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/privacy' itemprop="url">Privacy Policy </a>
</span>
</li>
</ul>
</nav>
<p class='powered-by-link'>Powered by <a href="https://www.discourse.org">Discourse</a>, best viewed with JavaScript enabled</p>
</footer>
</body>
</html>

View File

@ -0,0 +1,230 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Telegram channel to idle on - Everything else - Tiny Tiny RSS: Community</title>
<meta name="description" content="Here: Telegram: Contact @TinyTinyRSS
Now there's a place to post on if something on tt-rss.org doesn't work right.">
<meta name="generator" content="Discourse 3.1.0.beta4 - https://github.com/discourse/discourse version 7ff8e5580f9a900cde4be66377cf4f1dcd253a35">
<link rel="icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_32x32.png">
<link rel="apple-touch-icon" type="image/png" href="https://community.tt-rss.org/uploads/default/optimized/1X/18a2e96275d1fffb21cce225d30a87be4544db60_2_180x180.png">
<meta name="theme-color" media="all" content="#ffffff">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, user-scalable=yes, viewport-fit=cover">
<link rel="canonical" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
<link rel="search" type="application/opensearchdescription+xml" href="https://community.tt-rss.org/opensearch.xml" title="Tiny Tiny RSS: Community Search">
<link href="/stylesheets/color_definitions_base__2_e1a3786e9787d3094d4ad821cf887d92d1d46700.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" class="light-scheme"/>
<link href="/stylesheets/desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop" />
<link href="/stylesheets/discourse-details_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-details" />
<link href="/stylesheets/discourse-lazy-videos_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-lazy-videos" />
<link href="/stylesheets/discourse-local-dates_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-local-dates" />
<link href="/stylesheets/discourse-narrative-bot_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-narrative-bot" />
<link href="/stylesheets/discourse-presence_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-presence" />
<link href="/stylesheets/discourse-reactions_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions" />
<link href="/stylesheets/discourse-solved_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-solved" />
<link href="/stylesheets/docker_manager_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="docker_manager" />
<link href="/stylesheets/poll_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll" />
<link href="/stylesheets/discourse-reactions_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="discourse-reactions_desktop" />
<link href="/stylesheets/poll_desktop_5d530b0376385f589f398785395c13114c8247b0.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="poll_desktop" />
<link href="/stylesheets/desktop_theme_7_fe50c691a9fb30bb61c18aa08d3b7a6cb61a0150.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="7" data-theme-name="custom header links"/>
<link href="/stylesheets/desktop_theme_6_7ca213afc12b87349d5efdb08c76d0a62c01d53a.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="6" data-theme-name="bears"/>
<link href="/stylesheets/desktop_theme_2_88a998cf58047c52104c0e1f454c9c1c16e18c70.css?__ws=community.tt-rss.org" media="all" rel="stylesheet" data-target="desktop_theme" data-theme-id="2" data-theme-name="ttrss"/>
<!-- <script type="text/discourse-plugin" version="0.2">
api.onPageChange((url, title) => {
if (_paq) {
_paq.push(["setCustomUrl", url]);
_paq.push(["setDocumentTitle", title]);
_paq.push(["trackPageView"]);
const currentUser = api.getCurrentUser();
if (currentUser && currentUser['username']) {
_paq.push(['setUserId', currentUser['username']]);
}
}
});
</script> -->
<link rel="alternate nofollow" type="application/rss+xml" title="RSS feed of &#39;Telegram channel to idle on&#39;" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501.rss" />
<meta property="og:site_name" content="Tiny Tiny RSS: Community" />
<meta property="og:type" content="website" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:image" content="https://community.tt-rss.org/uploads/default/original/1X/18a2e96275d1fffb21cce225d30a87be4544db60.png" />
<meta property="og:image" content="https://community.tt-rss.org/uploads/default/original/1X/18a2e96275d1fffb21cce225d30a87be4544db60.png" />
<meta property="og:url" content="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
<meta name="twitter:url" content="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501" />
<meta property="og:title" content="Telegram channel to idle on" />
<meta name="twitter:title" content="Telegram channel to idle on" />
<meta property="og:description" content="Here: Telegram: Contact @TinyTinyRSS Now there's a place to post on if something on tt-rss.org doesn't work right." />
<meta name="twitter:description" content="Here: Telegram: Contact @TinyTinyRSS Now there's a place to post on if something on tt-rss.org doesn't work right." />
<meta property="og:article:section" content="Everything else" />
<meta property="og:article:section:color" content="12A89D" />
<meta property="article:published_time" content="2020-05-15T15:07:15+00:00" />
<meta property="og:ignore_canonical" content="true" />
</head>
<body class="crawler ">
<header>
<a href="/">
Tiny Tiny RSS: Community
</a>
</header>
<div id="main-outlet" class="wrap" role="main">
<div id="topic-title">
<h1>
<a href="/t/telegram-channel-to-idle-on/3501">Telegram channel to idle on</a>
</h1>
<div class="topic-category" itemscope itemtype="http://schema.org/BreadcrumbList">
<span itemprop="itemListElement" itemscope itemtype="http://schema.org/ListItem">
<a href="https://community.tt-rss.org/c/everything-else/12" class="badge-wrapper bullet" itemprop="item">
<span class='badge-category-bg' style='background-color: #12A89D'></span>
<span class='badge-category clear-badge'>
<span class='category-name' itemprop='name'>Everything else</span>
</span>
</a>
<meta itemprop="position" content="1" />
</span>
</div>
</div>
<div itemscope itemtype='http://schema.org/DiscussionForumPosting'>
<meta itemprop='headline' content='Telegram channel to idle on'>
<meta itemprop='articleSection' content='Everything else'>
<meta itemprop='keywords' content=''>
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
<meta itemprop='name' content='Tiny Tiny RSS: Community'>
</div>
<div id='post_1' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2020-05-15T15:07:15Z' class='post-time'>
May 15, 2020, 3:07pm
</time>
<meta itemprop='dateModified' content='2020-05-15T15:07:15Z'>
<span itemprop='position'>1</span>
</span>
</div>
<div class='post' itemprop='articleBody'>
<p>Here: <a href="https://t.me/TinyTinyRSS" class="inline-onebox">Telegram: Contact @TinyTinyRSS</a></p>
<p>Now there's a place to post on if something on <a href="http://tt-rss.org">tt-rss.org</a> doesn't work right.</p>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
<div id='post_2' itemprop='comment' itemscope itemtype='http://schema.org/Comment' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<span class="creator" itemprop="author" itemscope itemtype="http://schema.org/Person">
<a itemprop="url" href='https://community.tt-rss.org/u/fox'><span itemprop='name'>fox</span></a>
Pinned globally
</span>
<link itemprop="mainEntityOfPage" href="https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501">
<span class="crawler-post-infos">
<time itemprop='datePublished' datetime='2020-05-15T15:07:48Z' class='post-time'>
May 15, 2020, 3:07pm
</time>
<meta itemprop='dateModified' content='2020-05-15T15:07:48Z'>
<span itemprop='position'>2</span>
</span>
</div>
<div class='post' itemprop='text'>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/LikeAction"/>
<meta itemprop="userInteractionCount" content="0" />
<span class='post-likes'></span>
</div>
<div itemprop="interactionStatistic" itemscope itemtype="http://schema.org/InteractionCounter">
<meta itemprop="interactionType" content="http://schema.org/CommentAction"/>
<meta itemprop="userInteractionCount" content="0" />
</div>
</div>
</div>
</div>
<footer class="container wrap">
<nav class='crawler-nav'>
<ul>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/' itemprop="url">Home </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/categories' itemprop="url">Categories </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/guidelines' itemprop="url">FAQ/Guidelines </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/tos' itemprop="url">Terms of Service </a>
</span>
</li>
<li itemscope itemtype='http://schema.org/SiteNavigationElement'>
<span itemprop='name'>
<a href='/privacy' itemprop="url">Privacy Policy </a>
</span>
</li>
</ul>
</nav>
<p class='powered-by-link'>Powered by <a href="https://www.discourse.org">Discourse</a>, best viewed with JavaScript enabled</p>
</footer>
</body>
</html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,10 +1,17 @@
package nu.marginalia.tools.experiments; package nu.marginalia.tools.experiments;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.tools.Experiment; import nu.marginalia.tools.Experiment;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.util.HashSet;
import java.util.Set;
public class DebugConverterExperiment extends Experiment { public class DebugConverterExperiment extends Experiment {
@ -17,15 +24,55 @@ public class DebugConverterExperiment extends Experiment {
} }
Set<String> seenGenerators = new HashSet<>();
@Override @Override
public boolean process(CrawledDomain domain) { public boolean process(CrawledDomain domain) {
var ret = domainProcessor.process(domain);
ret.documents.stream() if (domain.doc == null) return true;
.filter(ProcessedDocument::isProcessedFully)
.peek(d -> System.out.println(d.url)) var dge = new DocumentGeneratorExtractor();
.map(d -> d.details.metadata)
.forEach(System.out::println); for (var doc : domain.doc) {
if (doc.documentBody == null) continue;
var parsed = Jsoup.parse(doc.documentBody.decode());
parsed.getElementsByTag("head").comments()
.stream().filter(c -> {
String data = c.getData();
if (data.contains("<script"))
return false;
if (data.contains("[if"))
return false;
if (data.contains("shim"))
return false;
return data.contains("Generated by") || data.contains("generated by")
|| data.contains("Powered by") || data.contains("powered by");
}).forEach(System.out::println);
var generators = dge.generatorCleaned(parsed);
for (var g : generators.keywords()) {
if (seenGenerators.add(g)) {
System.out.println(g + "->" + generators.type());
if (generators.type() == GeneratorType.UNKNOWN) {
System.out.println(parsed.select("meta[name=generator]")
.attr("content"));
System.out.println(doc.url);
}
}
}
}
//
// var ret = domainProcessor.process(domain);
//
//
// ret.documents.stream()
// .filter(ProcessedDocument::isProcessedFully)
// .peek(d -> System.out.println(d.url))
// .map(d -> d.details.metadata)
// .forEach(System.out::println);
return true; return true;
} }