From 0a383a712dc83c0ea6bd5a3b5a7378b7392de10c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 15 Aug 2024 11:44:17 +0200 Subject: [PATCH] (qdebug) Accurately display positions when intersecting with spans --- .../marginalia/model/idx/CodedWordSpan.java | 26 ---------- .../index/forward/spans/DocumentSpan.java | 49 ++++++++++++++++--- .../index/forward/spans/DocumentSpans.java | 2 +- .../model/DocumentKeywordsBuilder.java | 14 +++++- .../test/nu/marginalia/IntegrationTest.java | 4 +- 5 files changed, 60 insertions(+), 35 deletions(-) diff --git a/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java index 484636a9..7dd25cec 100644 --- a/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java +++ b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java @@ -2,31 +2,5 @@ package nu.marginalia.model.idx; import nu.marginalia.sequence.GammaCodedSequence; -import java.util.List; - public record CodedWordSpan(byte code, GammaCodedSequence spans) { - public static SplitSpansList fromSplit(String codes, List<GammaCodedSequence> spans) { - return new SplitSpansList(codes, spans); - } - public static SplitSpansList split(List<CodedWordSpan> spanList) { - return new SplitSpansList( - spanList.stream() - .map(CodedWordSpan::code) - .collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(), - spanList.stream() - .map(CodedWordSpan::spans) - .toList() - ); - } - - public record SplitSpansList(String codes, List<GammaCodedSequence> spans) { - public List<CodedWordSpan> unite() { - if (null == codes) { - return List.of(); - } - else { - return codes.chars().mapToObj(c -> new CodedWordSpan((byte) c, spans.get(codes.indexOf(c)))).toList(); - } - } - } } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java index b2a4def4..6ca8584c 100644 --- 
a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -3,7 +3,6 @@ package nu.marginalia.index.forward.spans; import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.sequence.CodedSequence; -import nu.marginalia.sequence.SequenceOperations; public class DocumentSpan { @@ -58,17 +57,55 @@ public class DocumentSpan { return false; } - public boolean overlapsRange(CodedSequence sequence) { - return SequenceOperations.intersectSequences(iterator(), sequence.iterator()); - } - /** Returns an iterator over the start and end positions of each span in the document of this type */ public IntIterator iterator() { if (null == startsEnds) { return IntList.of().iterator(); } - return startsEnds.iterator(); + return new DocumentSpanPositionsIterator(); + } + + /** Iterator over the values between the start and end positions of each span in the document of this type */ + class DocumentSpanPositionsIterator implements IntIterator { + private final IntIterator startStopIterator; + + private int value = -1; + private int current = -1; + private int end = -1; + + public DocumentSpanPositionsIterator() { + this.startStopIterator = startsEnds.iterator(); + } + + @Override + public int nextInt() { + if (hasNext()) { + int ret = value; + value = -1; + return ret; + } + throw new IllegalStateException(); + } + + @Override + public boolean hasNext() { + if (value >= 0) { + return true; + } + else if (current >= 0 && current < end) { + value = ++current; + return true; + } + else if (startStopIterator.hasNext()) { + current = startStopIterator.nextInt(); + end = startStopIterator.nextInt(); + value = current; + return true; + } + + return false; + } } public int length() { diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java 
b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java index d3646faf..56bb51e9 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java @@ -4,7 +4,7 @@ import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.sequence.CodedSequence; public class DocumentSpans { - private static DocumentSpan EMPTY_SPAN = new DocumentSpan(); + private static final DocumentSpan EMPTY_SPAN = new DocumentSpan(); public DocumentSpan title = EMPTY_SPAN; public DocumentSpan heading = EMPTY_SPAN; diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 699cf096..1f3629e9 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -142,7 +142,19 @@ public class DocumentKeywordsBuilder { StringBuilder sb = new StringBuilder("[ "); wordToMeta.forEach((word, meta) -> { - sb.append(word).append("->").append(WordFlags.decode(meta)).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' '); + sb.append(word) + .append("->") + .append(WordFlags.decode(meta)) + .append(',') + .append(wordToPos.getOrDefault(word, new IntArrayList())) + .append(' '); + }); + + wordSpans.forEach((tag, spans) -> { + sb.append(tag) + .append("->") + .append(spans) + .append(' '); }); return sb.append(']').toString(); } diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index 820525b9..7fbcdefc 100644 --- 
a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -209,7 +209,9 @@ public class IntegrationTest { var params = QueryProtobufCodec.convertRequest(request); - var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults()); + var p = ResultRankingParameters.sensibleDefaults(); + p.exportDebugData = true; + var query = queryFactory.createQuery(params, p); var indexRequest = QueryProtobufCodec.convertQuery(request, query);