mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(search) Add implicit coherence constraints based on segmentation
This commit is contained in:
parent
af8ff8ce99
commit
579295a673
@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils;
|
|||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
/** Responsible for expanding a query, that is creating alternative branches of query execution
|
/** Responsible for expanding a query, that is creating alternative branches of query execution
|
||||||
@ -25,8 +24,7 @@ public class QueryExpansion {
|
|||||||
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
||||||
this::joinDashes,
|
this::joinDashes,
|
||||||
this::splitWordNum,
|
this::splitWordNum,
|
||||||
this::joinTerms,
|
this::joinTerms
|
||||||
this::createSegments
|
|
||||||
);
|
);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
@ -37,7 +35,7 @@ public class QueryExpansion {
|
|||||||
this.lexicon = lexicon;
|
this.lexicon = lexicon;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String expandQuery(List<String> words) {
|
public Expansion expandQuery(List<String> words) {
|
||||||
|
|
||||||
QWordGraph graph = new QWordGraph(words);
|
QWordGraph graph = new QWordGraph(words);
|
||||||
|
|
||||||
@ -45,7 +43,11 @@ public class QueryExpansion {
|
|||||||
strategy.expand(graph);
|
strategy.expand(graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
return QWordPathsRenderer.render(graph);
|
List<List<String>> coherences = createSegments(graph);
|
||||||
|
|
||||||
|
var compiled = QWordPathsRenderer.render(graph);
|
||||||
|
|
||||||
|
return new Expansion(compiled, coherences);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern dashPattern = Pattern.compile("-");
|
private static final Pattern dashPattern = Pattern.compile("-");
|
||||||
@ -99,8 +101,12 @@ public class QueryExpansion {
|
|||||||
/** Create an alternative interpretation of the query that replaces a sequence of words
|
/** Create an alternative interpretation of the query that replaces a sequence of words
|
||||||
* with a word n-gram. This makes it so that when possible, the order of words in the document
|
* with a word n-gram. This makes it so that when possible, the order of words in the document
|
||||||
* matches the order of the words in the query.
|
* matches the order of the words in the query.
|
||||||
|
*
|
||||||
|
* The function modifies the graph in place, adding new variants to the graph; but also
|
||||||
|
* returns a list of the new groupings that were added.
|
||||||
*/
|
*/
|
||||||
public void createSegments(QWordGraph graph) {
|
public List<List<String>> createSegments(QWordGraph graph)
|
||||||
|
{
|
||||||
List<QWord> nodes = new ArrayList<>();
|
List<QWord> nodes = new ArrayList<>();
|
||||||
|
|
||||||
for (var qw : graph) {
|
for (var qw : graph) {
|
||||||
@ -118,25 +124,32 @@ public class QueryExpansion {
|
|||||||
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
|
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
|
||||||
|
|
||||||
if (allSegments.isEmpty()) {
|
if (allSegments.isEmpty()) {
|
||||||
return;
|
return List.of();
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<NgramLexicon.SentenceSegment> bestSegmentation =
|
Set<NgramLexicon.SentenceSegment> bestSegmentation =
|
||||||
findBestSegmentation(allSegments);
|
findBestSegmentation(allSegments);
|
||||||
|
|
||||||
|
List<List<String>> coherences = new ArrayList<>();
|
||||||
|
|
||||||
for (var segment : bestSegmentation) {
|
for (var segment : bestSegmentation) {
|
||||||
|
|
||||||
int start = segment.start();
|
int start = segment.start();
|
||||||
int end = segment.start() + segment.length();
|
int end = segment.start() + segment.length();
|
||||||
|
|
||||||
var word = IntStream.range(start, end)
|
List<String> components =IntStream.range(start, end)
|
||||||
.mapToObj(nodes::get)
|
.mapToObj(nodes::get)
|
||||||
.map(QWord::word)
|
.map(QWord::word)
|
||||||
.collect(Collectors.joining("_"));
|
.toList();
|
||||||
|
|
||||||
|
coherences.add(components);
|
||||||
|
|
||||||
|
String word = String.join("_", components);
|
||||||
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return coherences;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
|
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
|
||||||
@ -178,4 +191,6 @@ public class QueryExpansion {
|
|||||||
public interface ExpansionStrategy {
|
public interface ExpansionStrategy {
|
||||||
void expand(QWordGraph graph);
|
void expand(QWordGraph graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {}
|
||||||
}
|
}
|
||||||
|
@ -137,10 +137,11 @@ public class QueryFactory {
|
|||||||
limits = limits.forSingleDomain();
|
limits = limits.forSingleDomain();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var expansion = queryExpansion.expandQuery(searchTermsInclude);
|
||||||
|
searchTermCoherences.addAll(expansion.extraCoherences());
|
||||||
|
|
||||||
var searchQuery = new SearchQuery(
|
var searchQuery = new SearchQuery(
|
||||||
queryExpansion.expandQuery(
|
expansion.compiledQuery(),
|
||||||
searchTermsInclude
|
|
||||||
),
|
|
||||||
searchTermsInclude,
|
searchTermsInclude,
|
||||||
searchTermsExclude,
|
searchTermsExclude,
|
||||||
searchTermsAdvice,
|
searchTermsAdvice,
|
||||||
|
@ -178,4 +178,15 @@ public class QueryFactoryTest {
|
|||||||
System.out.println(subquery.compiledQuery);
|
System.out.println(subquery.compiledQuery);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExpansion2() {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
var subquery = parseAndGetSpecs("need for speed").query;
|
||||||
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
|
System.out.println(subquery);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user