Add synthetic keywords for same-site files linked from a document (e.g. file:png). Also add category keywords, like file:image or file:document.

This commit is contained in:
Viktor Lofgren 2023-04-30 19:29:13 +02:00
parent 1e3b6934bb
commit a9f7b4c457
2 changed files with 143 additions and 40 deletions

View File

@ -0,0 +1,136 @@
package nu.marginalia.converting.processor.logic.links;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;
public class FileLinks {
// If a document links to a file on the same server, and that file has
// an appropriate file ending, then add the filename as a keyword so that it can
// be found even if it's not explicitly mentioned on the page
public static Set<String> createFileLinkKeywords(LinkProcessor lp, EdgeDomain domain) {
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
if (!domain.hasSameTopDomain(link.domain)) {
continue;
}
synthesizeFilenameKeyword(fileKeywords, link);
}
return fileKeywords;
}
private static void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
if (pFilename == null) return;
String filename = pFilename.toString();
if (filename.length() > 32
|| filename.endsWith(".xml")
|| filename.endsWith(".jpg")
|| filename.endsWith(".png")
|| filename.endsWith(".pdf")
|| filename.endsWith(".gif"))
return;
fileKeywords.add(filename.replace(' ', '_'));
}
/** Create synthetic keywords for file endings of files linked within the same server.
* Also generate categorical keywords for the type of file (audio, video, image, document, archive)
*/
public static Set<String> createFileEndingKeywords(Document doc) {
Set<String> endings = new HashSet<>();
doc.getElementsByTag("a").forEach(e -> {
var src = e.attr("href");
if (src.contains(":")) return;
if (src.contains("/")) src = src.substring(src.lastIndexOf("/"));
if (src.contains("?")) src = src.split("\\?", 2)[0];
if (src.contains("#")) src = src.split("#", 2)[0];
src = src.toLowerCase();
if (src.startsWith("www")) return;
final int firstPeriod = src.indexOf(".");
final int lastPeriod = src.lastIndexOf(".");
if (firstPeriod < 0) return;
if (firstPeriod != lastPeriod) return;
String ending = src.substring(lastPeriod + 1).trim();
if (ending.contains("_")) return;
if (ignoredEndings.contains(ending)) return;
int endingLength = ending.length();
if (endingLength > 1 && endingLength <= 4) {
endings.add(ending);
}
});
if (endings.isEmpty())
return endings;
Set<String> keywords = new HashSet<>(endings.size() + 8);
for (var ending : endings) {
keywords.add("file:" + ending);
}
if (hasEndingType(audioEndings, endings)) keywords.add("file:audio");
if (hasEndingType(videoEndings, endings)) keywords.add("file:video");
if (hasEndingType(imageEndings, endings)) keywords.add("file:image");
if (hasEndingType(documentEndings, endings)) keywords.add("file:document");
if (hasEndingType(archiveEndings, endings)) keywords.add("file:archive");
return keywords;
}
private static final Set<String> ignoredEndings = Set.of("html",
"htm", "cfm", "php", "asp", "aspx", "jsp", "shtml",
"xhtml", "com", "org", "cgi", "net", "edu", "gov", "jp", "nl",
"ly", "co", "io", "dev", "rss", "xml");
private static final String[] videoEndings = new String[] {
"avi", "mp4", "mov", "wmv", "flv", "mkv", "mpg", "mpeg", "m4v", "webm", "3gp"
};
private static final String[] audioEndings = new String[] {
"mp3", "wav", "ogg", "wma", "aac", "flac", "m4a", "mid", "midi", "aiff", "aif", "aifc", "au", "snd", "amr", "oga", "opus"
};
private static final String[] imageEndings = new String[] {
"jpg", "jpeg", "png", "gif", "bmp", "tiff", "tif", "svg", "webp"
};
private static final String[] documentEndings = new String[] {
"pdf", "doc", "docx", "xls", "xslx", "ppt", "pptx", "odt", "ods", "odp", "rtf", "txt", "csv", "tsv"
};
private static final String[] archiveEndings = new String[] {
"zip", "rar", "gz", "tar", "7z", "bz2", "xz", "bz2", "iso", "dmg", "pkg", "deb", "rpm", "apk", "jar", "war", "ear", "tgz"
};
private static boolean hasEndingType(String[] included, Set<String> endings) {
for (var ending : included) {
if (endings.contains(ending)) return true;
}
return false;
}
}

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.logic.links.FileLinks;
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature;
@ -243,54 +244,20 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.ifPresent(lp::acceptFeed);
}
createFileLinkKeywords(words, lp, domain);
createLinkKeywords(words, lp);
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
words.addAllSyntheticTerms(createLinkKeywords(lp));
}
// If a document links to a file on the same server, and that file has
// a salient file ending, then add the filename as a keyword so that it can
// be found
private void createFileLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp, EdgeDomain domain) {
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
if (!domain.hasSameTopDomain(link.domain)) {
continue;
}
synthesizeFilenameKeyword(fileKeywords, link);
}
words.addAllSyntheticTerms(fileKeywords);
}
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
if (pFilename == null) return;
String filename = pFilename.toString();
if (filename.length() > 32
|| filename.endsWith(".xml")
|| filename.endsWith(".jpg")
|| filename.endsWith(".png")
|| filename.endsWith(".pdf")
|| filename.endsWith(".gif"))
return;
fileKeywords.add(filename.replace(' ', '_'));
}
private void createLinkKeywords(DocumentKeywordsBuilder words, LinkProcessor lp) {
private Set<String> createLinkKeywords(LinkProcessor lp) {
final Set<String> linkTerms = new HashSet<>();
for (var fd : lp.getForeignDomains()) {
linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase());
}
words.addAllSyntheticTerms(linkTerms);
return linkTerms;
}
private HtmlStandard getHtmlStandard(Document doc) {