mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Specialization for javadocs
This commit is contained in:
parent
24dce8c03b
commit
42375f0e53
@ -13,15 +13,19 @@ public class HtmlProcessorSpecializations {
|
|||||||
private final LemmySpecialization lemmySpecialization;
|
private final LemmySpecialization lemmySpecialization;
|
||||||
private final XenForoSpecialization xenforoSpecialization;
|
private final XenForoSpecialization xenforoSpecialization;
|
||||||
private final PhpBBSpecialization phpBBSpecialization;
|
private final PhpBBSpecialization phpBBSpecialization;
|
||||||
|
private final JavadocSpecialization javadocSpecialization;
|
||||||
private final DefaultSpecialization defaultSpecialization;
|
private final DefaultSpecialization defaultSpecialization;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
|
public HtmlProcessorSpecializations(LemmySpecialization lemmySpecialization,
|
||||||
XenForoSpecialization xenforoSpecialization,
|
XenForoSpecialization xenforoSpecialization,
|
||||||
PhpBBSpecialization phpBBSpecialization, DefaultSpecialization defaultSpecialization) {
|
PhpBBSpecialization phpBBSpecialization,
|
||||||
|
JavadocSpecialization javadocSpecialization,
|
||||||
|
DefaultSpecialization defaultSpecialization) {
|
||||||
this.lemmySpecialization = lemmySpecialization;
|
this.lemmySpecialization = lemmySpecialization;
|
||||||
this.xenforoSpecialization = xenforoSpecialization;
|
this.xenforoSpecialization = xenforoSpecialization;
|
||||||
this.phpBBSpecialization = phpBBSpecialization;
|
this.phpBBSpecialization = phpBBSpecialization;
|
||||||
|
this.javadocSpecialization = javadocSpecialization;
|
||||||
this.defaultSpecialization = defaultSpecialization;
|
this.defaultSpecialization = defaultSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -36,6 +40,10 @@ public class HtmlProcessorSpecializations {
|
|||||||
if (generator.keywords().contains("phpbb")) {
|
if (generator.keywords().contains("phpbb")) {
|
||||||
return xenforoSpecialization;
|
return xenforoSpecialization;
|
||||||
}
|
}
|
||||||
|
if (generator.keywords().contains("javadoc")) {
|
||||||
|
return javadocSpecialization;
|
||||||
|
}
|
||||||
|
|
||||||
return defaultSpecialization;
|
return defaultSpecialization;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,40 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class JavadocSpecialization extends DefaultSpecialization {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(JavadocSpecialization.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public JavadocSpecialization(SummaryExtractor summaryExtractor) {
|
||||||
|
super(summaryExtractor);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document prune(Document doc) {
|
||||||
|
final var prunedDoc = super.prune(doc);
|
||||||
|
|
||||||
|
prunedDoc.getElementsByTag("noscript").remove();
|
||||||
|
|
||||||
|
return prunedDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSummary(Document doc,
|
||||||
|
Set<String> importantWords) {
|
||||||
|
var block = doc.getElementsByClass("block").first();
|
||||||
|
|
||||||
|
if (block != null)
|
||||||
|
return block.text();
|
||||||
|
|
||||||
|
return super.getSummary(doc, importantWords);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,48 @@
|
|||||||
|
package nu.marginalia.converting.processor.plugin.specialization;
|
||||||
|
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
|
||||||
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
|
import nu.marginalia.test.CommonTestData;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
class JavadocSpecializationTest {
|
||||||
|
|
||||||
|
static JavadocSpecialization specialization;
|
||||||
|
static DocumentGeneratorExtractor generatorExtractor = new DocumentGeneratorExtractor();
|
||||||
|
|
||||||
|
String thread = CommonTestData.loadTestData("mock-crawl-data/javadoc/stream.html");
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUpAll() {
|
||||||
|
specialization = new JavadocSpecialization(
|
||||||
|
new SummaryExtractor(255,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void prune() {
|
||||||
|
System.out.println(specialization.prune(Jsoup.parse(thread)));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void generatorExtraction() {
|
||||||
|
var gen = generatorExtractor.generatorCleaned(Jsoup.parse(thread));
|
||||||
|
|
||||||
|
System.out.println(gen);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void getSummary() {
|
||||||
|
String summary = specialization.getSummary(Jsoup.parse(thread), Set.of(""));
|
||||||
|
|
||||||
|
System.out.println(summary);
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user