mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
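Use the document's generator metadata to complement search set membership when selecting documents.
This lets documents through that are not in the set but have a matching generator, e.g. a site built with a modern static site generator (SSG) in the small web filter.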
This commit is contained in:
parent
b5ef67ed28
commit
55c65f0935
@ -20,7 +20,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
|||||||
int domainId = forwardIndexReader.getDomainId(urlId);
|
int domainId = forwardIndexReader.getDomainId(urlId);
|
||||||
long meta = forwardIndexReader.getDocMeta(urlId);
|
long meta = forwardIndexReader.getDocMeta(urlId);
|
||||||
|
|
||||||
if (!validateDomain(domainId)) {
|
if (!validateDomain(domainId, meta)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,8 +43,8 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean validateDomain(int domainId) {
|
private boolean validateDomain(int domainId, long meta) {
|
||||||
return params.searchSet().contains(domainId);
|
return params.searchSet().contains(domainId, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean validateQuality(long meta) {
|
private boolean validateQuality(long meta) {
|
||||||
|
@ -1,6 +1,12 @@
|
|||||||
package nu.marginalia.index.searchset;
|
package nu.marginalia.index.searchset;
|
||||||
|
|
||||||
public interface SearchSet {
|
public interface SearchSet {
|
||||||
boolean contains(int urlId);
|
|
||||||
|
/**
|
||||||
|
* Returns true if the given urlId is contained in the set
|
||||||
|
* or if the documentMetadata vibes with the set
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
boolean contains(int urlId, long documentMetadata);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,8 @@ package nu.marginalia.index.svc.searchset;
|
|||||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -13,7 +15,7 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
/** A serializable bit map of domains
|
/** A serializable bit map of domains corresponding to a method of ranking the domains
|
||||||
*
|
*
|
||||||
* @see SearchSetIdentifier
|
* @see SearchSetIdentifier
|
||||||
*
|
*
|
||||||
@ -61,10 +63,27 @@ public class RankingSearchSet implements SearchSet {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean contains(int urlId) {
|
public boolean contains(int urlId, long documentMetadata) {
|
||||||
// Fallback on allow-all if no items are in set
|
// For ranked search sets, exclude excessively commercial sites
|
||||||
|
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
|
||||||
|
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return set.contains(urlId) || set.isEmpty();
|
// This is the main check
|
||||||
|
if (set.contains(urlId) || set.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the rest, let through some domains that are not in the set based on the generator tag
|
||||||
|
if (identifier == SearchSetIdentifier.SMALLWEB) {
|
||||||
|
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
|
||||||
|
}
|
||||||
|
if (identifier == SearchSetIdentifier.RETRO) {
|
||||||
|
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
|
||||||
|
}
|
||||||
|
|
||||||
|
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write() throws IOException {
|
public void write() throws IOException {
|
||||||
|
@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet;
|
|||||||
|
|
||||||
public class SearchSetAny implements SearchSet {
|
public class SearchSetAny implements SearchSet {
|
||||||
@Override
|
@Override
|
||||||
public boolean contains(int urlId) {
|
public boolean contains(int urlId, long meta) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@ import nu.marginalia.index.searchset.SearchSet;
|
|||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
|
/** A specialized search set for a small number of entries, for use when specifying the exact domains to query */
|
||||||
public class SmallSearchSet implements SearchSet {
|
public class SmallSearchSet implements SearchSet {
|
||||||
public TIntHashSet entries;
|
public TIntHashSet entries;
|
||||||
|
|
||||||
@ -14,7 +15,7 @@ public class SmallSearchSet implements SearchSet {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean contains(int domainId) {
|
public boolean contains(int domainId, long meta) {
|
||||||
return entries.contains(domainId);
|
return entries.contains(domainId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,10 +26,10 @@ class RankingSearchSetTest {
|
|||||||
set.write();
|
set.write();
|
||||||
|
|
||||||
RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p);
|
RankingSearchSet set2 = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, p);
|
||||||
assertTrue(set2.contains(1));
|
assertTrue(set2.contains(1, 0));
|
||||||
assertTrue(set2.contains(5));
|
assertTrue(set2.contains(5, 0));
|
||||||
assertTrue(set2.contains(7));
|
assertTrue(set2.contains(7, 0));
|
||||||
assertTrue(set2.contains(9));
|
assertTrue(set2.contains(9, 0));
|
||||||
|
|
||||||
Files.delete(p);
|
Files.delete(p);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user