From e11ebf18e5c0e9e7407952fad6668b121abdcb7e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Nov 2024 13:36:25 +0100 Subject: [PATCH] (span) Correct intersection counting logic, add comprehensive tests --- .../index/forward/spans/DocumentSpan.java | 47 ++++++++-- .../forward/ForwardIndexSpansReaderTest.java | 90 ++++++++++++++++++- 2 files changed, 130 insertions(+), 7 deletions(-) diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java index 3510f669..5ab5d166 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -101,13 +101,14 @@ public class DocumentSpan { int start = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++); - for (int pi = 0; pi < positions.size(); pi++) { + for (int pi = 0; pi < positions.size();) { int position = positions.getInt(pi); if (position >= start && position + len <= end) { return true; } - - if (sei + 2 < startsEnds.size()) { + else if (position < end) { + pi++; + } else if (sei + 2 <= startsEnds.size()) { start = startsEnds.getInt(sei++); end = startsEnds.getInt(sei++); } @@ -133,14 +134,15 @@ public class DocumentSpan { int start = startsEnds.getInt(sei++); int end = startsEnds.getInt(sei++); - for (int pi = 0; pi < positions.size(); pi++) { + for (int pi = 0; pi < positions.size(); ) { int position = positions.getInt(pi); if (position == start && position + len == end) { return true; } - - if (sei + 2 < startsEnds.size()) { + else if (position < end) { + pi++; + } else if (sei + 2 <= startsEnds.size()) { start = startsEnds.getInt(sei++); end = startsEnds.getInt(sei++); } @@ -152,6 +154,39 @@ public class DocumentSpan { return false; } + public int countRangeMatches(IntList positions, int len) { + if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { + return 0; + } + + int sei = 0; + int ret = 0; + + int start = startsEnds.getInt(sei++); + int end = startsEnds.getInt(sei++); + + for (int pi = 0; pi < positions.size();) { + int position = positions.getInt(pi); + if (position >= start && position + len <= end) { + ret++; + pi++; + } + else if (position < end) { + pi++; + } + else if (sei + 2 <= startsEnds.size()) { + start = startsEnds.getInt(sei++); + end = startsEnds.getInt(sei++); + } + else { + return ret; + } + + } + + return ret; + } + /** Returns an iterator over each position between the start and end positions of each span in the document of this type */ public IntIterator iterator() { if (null == startsEnds) { diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java index f92ef785..f0170883 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -6,6 +6,7 @@ import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter; import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.sequence.VarintCodedSequence; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.IOException; @@ -28,7 +29,7 @@ class ForwardIndexSpansReaderTest { } @Test - void testSunnyDay() throws IOException { + void testContainsPosition() throws IOException { ByteBuffer wa = ByteBuffer.allocate(32); long offset1; @@ -72,4 +73,91 @@ class ForwardIndexSpansReaderTest { assertFalse(spans2.title.containsPosition(8)); } } + + @Test + void testContainsRange() throws IOException { + long offset1; + try (var writer = new ForwardIndexSpansWriter(testFile)) { + writer.beginRecord(1); + writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer()); + offset1 = writer.endRecord(); + } + + try (var reader = new ForwardIndexSpansReader(testFile); + var arena = Arena.ofConfined() + ) { + var spans1 = reader.readSpans(arena, offset1); + + assertTrue(spans1.heading.containsRange(IntList.of(10), 2)); + assertTrue(spans1.heading.containsRange(IntList.of(8, 10), 2)); + assertTrue(spans1.heading.containsRange(IntList.of(8, 10, 14), 2)); + + assertTrue(spans1.heading.containsRange(IntList.of(10), 5)); + assertTrue(spans1.heading.containsRange(IntList.of(8, 10), 5)); + assertTrue(spans1.heading.containsRange(IntList.of(8, 10, 14), 5)); + + assertFalse(spans1.heading.containsRange(IntList.of(11), 5)); + assertFalse(spans1.heading.containsRange(IntList.of(9), 5)); + } + } + + @Test + void testContainsRangeExact() throws IOException { + long offset1; + try (var writer = new ForwardIndexSpansWriter(testFile)) { + writer.beginRecord(1); + writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer()); + offset1 = writer.endRecord(); + } + + try (var reader = new ForwardIndexSpansReader(testFile); + var arena = Arena.ofConfined() + ) { + var spans1 = reader.readSpans(arena, offset1); + + assertFalse(spans1.heading.containsRangeExact(IntList.of(10), 2)); + assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10), 2)); + assertFalse(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 2)); + + assertTrue(spans1.heading.containsRangeExact(IntList.of(10), 5)); + assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10), 5)); + assertTrue(spans1.heading.containsRangeExact(IntList.of(8, 10, 14), 5)); + + assertFalse(spans1.heading.containsRangeExact(IntList.of(11), 5)); + assertFalse(spans1.heading.containsRangeExact(IntList.of(9), 5)); + } + } + + @Test + void testCountRangeMatches() throws IOException { + long offset1; + try (var writer = new ForwardIndexSpansWriter(testFile)) { + writer.beginRecord(1); + writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer()); + offset1 = writer.endRecord(); + } + + try (var reader = new ForwardIndexSpansReader(testFile); + var arena = Arena.ofConfined() + ) { + var spans1 = reader.readSpans(arena, offset1); + + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(10), 2)); + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(8, 10), 2)); + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(8, 10, 14), 2)); + + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(10), 5)); + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(8, 10), 5)); + Assertions.assertEquals(1, spans1.heading.countRangeMatches(IntList.of(8, 10, 14), 5)); + + Assertions.assertEquals(2, spans1.heading.countRangeMatches(IntList.of(10, 20), 5)); + Assertions.assertEquals(2, spans1.heading.countRangeMatches(IntList.of(8, 10, 13, 20), 5)); + Assertions.assertEquals(2, spans1.heading.countRangeMatches(IntList.of(8, 10, 14, 20, 55), 5)); + + Assertions.assertEquals(2, spans1.heading.countRangeMatches(IntList.of(10, 12), 2)); + + Assertions.assertEquals(0, spans1.heading.countRangeMatches(IntList.of(11), 5)); + Assertions.assertEquals(0, spans1.heading.countRangeMatches(IntList.of(9), 5)); + } + } } \ No newline at end of file