From 5393167bf8d00fb418ed9ad661538d65687638e3 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 20 Oct 2022 21:55:51 +0200 Subject: [PATCH] Fixes in sorting logic, and optimized update domain statistics to not take 4+ hours. --- .../util/language/UnicodeRanges.java | 2 +- .../util/multimap/MultimapFileLong.java | 39 +- .../util/multimap/MultimapSorter.java | 2 +- .../edge/converting/ReindexTriggerMain.java | 28 +- .../converting/UpdateDomainStatistics.java | 66 +++ .../util/btree/BTreeWriterTest.java | 8 +- .../btree/BTreeWriterTestCachedReader.java | 382 ------------------ .../edge/crawling/LanguageFilterTest.java | 2 +- .../edge/index/service/MultimapFileTest.java | 61 +-- .../edge/index/svc/query/IndexQueryTest.java | 6 +- 10 files changed, 115 insertions(+), 481 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java delete mode 100644 marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java index f4d89a1d..bd1d3043 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java @@ -74,7 +74,7 @@ public enum UnicodeRanges { for (int i = 0; i < Math.min(2000, text.length()); i++) { char c = text.charAt(i); - if (c >= min && c <= max) { + if (c >= min && c <= this.max) { if (count++ > max) { return true; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java index d5c64162..d3e1376d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapFileLong.java @@ -216,6 +216,9 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { @Override public long get(long idx) { + if (idx < 0) + throw new IllegalArgumentException("get("+idx+")"); + if (idx >= mappedSize) grow(idx); @@ -650,7 +653,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { if (start + n + wordSize - 1 >= mappedSize) grow(start + n + wordSize - 1); - if (n == 1) { + if (n <= 1) { return; } @@ -659,33 +662,29 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice { int off = (int) (start % bufferSize); for (int i = 1; i < n; i++) { - for (int j = i; j > 0; j--) { - int a = off + wordSize*(j-1); - int b = off + wordSize*j; + long key = buffer.get(off + i * wordSize); - if (buffer.get(a) > buffer.get(b)) { - for (int w = 0; w < wordSize; w++) { - long tmp = buffer.get(a+w); - buffer.put(a+w, buffer.get(b+w)); - buffer.put(b+w, tmp); - } + int j = i - 1; + while (j >= 0 && buffer.get(off + wordSize*j) > key) { + for (int w = 0; w < wordSize; w++) { + long tmp = buffer.get(off+wordSize*j+w); + buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w)); + buffer.put(off+wordSize*(j+1)+w, tmp); } - else break; + j--; } + buffer.put(off + (j+1) * wordSize, key); } } else for (int i = 1; i < n; i++) { - for (int j = i; j > 0; j--) { - long a = start + (long)wordSize*(j-1); - long b = start + (long)wordSize*j; + long key = get(start + (long) i * wordSize); - if (get(a) > get(b)) { - swap(a, b); - } - else { - break; - } + int j = i - 1; + while (j >= 0 && get(start + (long)wordSize*j) > key) { + swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1)); + j--; } + put(start + (long) (j+1) * wordSize, key); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java index c6b1827f..88d873a3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/multimap/MultimapSorter.java @@ -135,7 +135,7 @@ public class MultimapSorter { if (low >= 0 && highInclusive >= 0 && low < highInclusive) { if (highInclusive - low < 32) { - multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize)); + multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize)); } else { long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java index 1fe173bf..d1c8db01 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -28,32 +28,8 @@ public class ReindexTriggerMain { .followRedirects(true) .build(); - try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { - var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); - while (rs.next()) { - System.out.printf("%d %s %s %d\n", - rs.getInt(1), - rs.getString(2), - rs.getString(3), - rs.getInt(4)); - } - - rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100"); - while (rs.next()) { - System.out.printf("%d %d %s %d %s\n", - rs.getInt(1), - rs.getInt(2), - rs.getString(3), - rs.getInt(4), - rs.getString(5)); - - } - - stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT"); - stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT"); - } + var updateStatistics = new UpdateDomainStatistics(db.provideConnection()); + updateStatistics.run(); var rb = new RequestBody() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java new file mode 100644 index 00000000..162c2d8b --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/UpdateDomainStatistics.java @@ -0,0 +1,66 @@ +package nu.marginalia.wmsa.edge.converting; + +import com.zaxxer.hikari.HikariDataSource; +import gnu.trove.map.hash.TIntIntHashMap; + +import java.sql.SQLException; + +public class UpdateDomainStatistics { + private final HikariDataSource dataSource; + + public UpdateDomainStatistics(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public void run() throws SQLException { + + // This looks weird, but it's actually much faster than doing the computations with SQL queries + // + // ... in part because we can assume the data is immutable and don't mind consuming egregious + // resources + + try (var conn = dataSource.getConnection(); + var stmt = conn.createStatement(); + var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL"); + var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,GOOD_URLS,VISITED_URLS) VALUES (?, ?, ?, ?)") + ) { + + stmt.executeUpdate("DELETE FROM DOMAIN_METADATA"); + + TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0); + + domainInfoQuery.setFetchSize(10_000); + var rsp = domainInfoQuery.executeQuery(); + while (rsp.next()) { + int domainId = rsp.getInt(1); + boolean visited = rsp.getBoolean(2); + boolean stateOk = rsp.getBoolean(3); + + knownUrls.adjustOrPutValue(domainId, 1, 1); + if (visited) { + visitedUrls.adjustOrPutValue(domainId, 1, 1); + if (stateOk) { + goodUrls.adjustOrPutValue(domainId, 1, 1); + } + } + } + + int i = 0; + for (int domainId : knownUrls.keys()) { + insertDomainInfo.setInt(1, domainId); + insertDomainInfo.setInt(2, knownUrls.get(domainId)); + insertDomainInfo.setInt(3, visitedUrls.get(domainId)); + insertDomainInfo.setInt(4, goodUrls.get(domainId)); + insertDomainInfo.addBatch(); + if ((++i % 1000) == 0) { + insertDomainInfo.executeBatch(); + } + } + if ((i % 1000) != 0) { + insertDomainInfo.executeBatch(); + } + } + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java index 0d2d3e3c..c55b597d 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTest.java @@ -153,7 +153,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long)(Long.MAX_VALUE * Math.random()); while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry( val)); + assertTrue(reader.findEntry( val) < 0); } } } catch (Exception e) { @@ -203,7 +203,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val)); + assertTrue(reader.findEntry( val) < 0); } } } catch (Exception e) { @@ -255,7 +255,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val & mask)); + assertTrue(reader.findEntry(val & mask) < 0); } } } catch (Exception e) { @@ -309,7 +309,7 @@ class BTreeWriterTest { for (int i = 0; i < 500; i++) { long val = (long) (Long.MAX_VALUE * Math.random()); while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val & mask)); + assertTrue(reader.findEntry(val & mask) < 0); } } } catch (Exception e) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java b/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java deleted file mode 100644 index 8c98d2e6..00000000 --- a/marginalia_nu/src/test/java/nu/marginalia/util/btree/BTreeWriterTestCachedReader.java +++ /dev/null @@ -1,382 +0,0 @@ -package nu.marginalia.util.btree; - -import nu.marginalia.util.btree.model.BTreeContext; -import nu.marginalia.util.btree.model.BTreeHeader; -import nu.marginalia.util.multimap.MultimapFileLong; -import org.junit.jupiter.api.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; -import java.util.StringJoiner; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -class BTreeWriterTestCachedReader { - - final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 8); - final BTreeWriter writer = new BTreeWriter(null, ctx); - - Logger logger = LoggerFactory.getLogger(getClass()); - @Test - void testSmallDataBlock() { - var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2); - assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs()); - assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs()); - } - - @Test - void testLayerCount() { - int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - - assertEquals(2, writer.makeHeader(1024, wsq-1).layers()); - assertEquals(2, writer.makeHeader(1024, wsq).layers()); - assertEquals(3, writer.makeHeader(1024, wsq+1).layers()); - - assertEquals(3, writer.makeHeader(1024, wcub-1).layers()); - assertEquals(3, writer.makeHeader(1024, wcub).layers()); - assertEquals(4, writer.makeHeader(1024, wcub+1).layers()); - } - - @Test - void testLayerOffset() { - int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS(); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0)); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1)); - System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2)); - - for (int i = 0; i < 1024; i++) { - var header = writer.makeHeader(0, i); - - - printTreeLayout(i, header, ctx); - - if (header.layers() >= 1) { - assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS()); - } - } - } - - private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) { - StringJoiner sj = new StringJoiner(","); - for (int l = 0; l < header.layers(); l++) { - sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS()); - } - System.out.println(numEntries + ":" + sj); - } - - @Test - public void testWriteEntrySize2() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put( 2L*i + 1, i); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - - - @Test - public void testQB() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 144646; i++) { - while (!toPut.add(3*i)); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put( 2L*i + 1, i); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - } - - long[] d = new long[] { -1, 1, 5000, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 30, 300, 303, 306, 312, 330,3000,30000,300000}; - BTreeQueryBuffer buffer = new BTreeQueryBuffer(d, d.length); - Arrays.sort(buffer.data); - - System.out.println("layers = " + reader.getHeader().layers()); - reader.retainEntries(buffer); - buffer.finalizeFiltering(); - - for (int i = 0; i < buffer.end; i++) { - System.out.println(buffer.data[i]); - } - } - - - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - - @Test - public void testWriteEntrySize2Small() throws IOException { - - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - for (int i = 0; i < 5; i++) { - while (!toPut.add((int)(Integer.MAX_VALUE * Math.random()))); - } - - int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray(); - - try { - RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw"); - MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000); - - { - var writer = new BTreeWriter(mmf, ctx); - writer.write( 0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(2L*i, data[i]); - slice.put(2L*i + 1, i); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long)(Long.MAX_VALUE * Math.random()); - while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - - - @Test - public void testWriteEqualityNotMasked() throws IOException { - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - var ctx = new BTreeContext(5, 1, ~0, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - printTreeLayout(toPut.size(), reader.getHeader(), ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i]); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - @Test - public void testWriteEqualityMasked() throws IOException { - - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 1, mask, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i, data[i]); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - printTreeLayout(toPut.size(), reader.getHeader(), ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val & mask)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - @Test - public void testWriteTwoEqualityMasked() throws IOException { - - for (int bs = 2; bs <= 4; bs++) { - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); - Set toPut = new HashSet<>(); - - long mask = 0xFFFF_FFFF_0000_0000L; - var ctx = new BTreeContext(5, 2, mask, bs); - - for (int i = 0; i < 500; i++) { - while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ; - } - - long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray(); - - try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) { - { - var writer = new BTreeWriter(mmf, ctx); - writer.write(0, toPut.size(), (slice) -> { - for (int i = 0; i < data.length; i++) { - slice.put(i*2L, data[i]); - slice.put(i*2L+1, i); - } - }); - mmf.force(); - } - - { - var reader = new BTreeReader(mmf, ctx, 0); - - printTreeLayout(toPut.size(), reader.getHeader(), ctx); - - for (int i = 0; i < data.length; i++) { - long offset = reader.findEntry(data[i] & mask); - assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset); - assertEquals(data[i], mmf.get(offset)); - assertEquals(i, mmf.get(offset+1)); - } - - for (int i = 0; i < 500; i++) { - long val = (long) (Long.MAX_VALUE * Math.random()); - while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random()); - assertEquals(-1, reader.findEntry(val & mask)); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - Files.delete(tempFile); - } - } - } - - - -} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java index 7a0abab3..50730390 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java @@ -18,7 +18,7 @@ class LanguageFilterTest { } @Test - public void isStringChinsese() { + public void isStringChinese() { var languageFilter = new LanguageFilter(); assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n")); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java index fc13e7c3..99785031 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/MultimapFileTest.java @@ -103,6 +103,7 @@ class MultimapFileTest { var sorter = file.createSorter(Path.of("/tmp"), 16, 2); for (int start = 0; start < 8; start+=2) { + System.out.println("~"); for (int end = start; end < 128; end+=2) { for (int i = 0; i < 128; i+=2) { file.put(i, -i/2); @@ -110,9 +111,17 @@ class MultimapFileTest { } sorter.quickSortLH(start, end); for (int i = start+2; i < end; i+=2) { + + System.out.println("**" + i); + System.out.println(file.get(i-2)); + System.out.println(file.get(i-1)); + System.out.println(file.get(i)); + System.out.println(file.get(i+1)); + assertTrue(file.get(i-2) <= file.get(i)); assertEquals(file.get(i+1), -file.get(i)); } + System.out.println("~"); } } @@ -158,13 +167,17 @@ class MultimapFileTest { var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); var sorter = file.createSorter(Path.of("/tmp"), 16, 2); - for (int start = 0; start < 8; start+=2) { + for (int start = 2; start < 8; start+=2) { for (int end = start+2; end < 126; end+=2) { for (int i = 0; i < 128; i+=2) { file.put(i, -(128-i/2)); - file.put(i+1, (128-i)/2); + file.put(i+1, (128-i/2)); } - sorter.insertionSort(start, (end - start)); + file.put(0, 0xFFFF_FFFFL); + file.put(end, 0x7FFF_FFFFL); + sorter.insertionSort(start, (end - start)/2); + assertEquals(0xFFFF_FFFFL, file.get(0)); + assertEquals(file.get(end), 0x7FFF_FFFFL); for (int i = start+2; i < end; i+=2) { assertTrue(file.get(i-2) <= file.get(i)); assertEquals(file.get(i+1), -file.get(i)); @@ -178,14 +191,14 @@ class MultimapFileTest { var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8); var sorter = file.createSorter(Path.of("/tmp"), 16, 2); - for (int start = 0; start < 512; start+=2) { + for (int start = 0; start < 512; start+=18) { System.out.println(start); - for (int end = start+2; end < 8192; end+=2) { + for (int end = start+2; end < 8192; end+=68) { for (int i = 0; i < 8192; i+=2) { file.put(i, -i/2); file.put(i+1, i/2); } - sorter.quickSortLH(start, end); + sorter.mergeSort(start, end-start); assertEquals(file.get(start+1), -file.get(start)); for (int i = start+2; i < end; i+=2) { @@ -216,24 +229,6 @@ class MultimapFileTest { } } - @Test - void sortInternalSS2() throws IOException { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); - var sorter = file.createSorter(Path.of("/tmp"), 16, 2); - for (int i = 0; i < 32; i+=2) { - file.put(i, 32-i/2); - file.put(i+1, ~(32-i/2)); - } - - sorter.sortRange( 2, 14); - - for (int i = 2+2; i < 16; i+=2) { - System.out.println(file.get(i) + "-" + ~file.get(i+1)); - assertTrue(file.get(i) > file.get(i-2)); - } - } - - @Test void sortExternal() throws IOException { var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); @@ -252,25 +247,7 @@ class MultimapFileTest { assertTrue(searcher.binarySearchTest(file.get(i), 2, 16)); } } - @Test - void sortExternalSS2() throws IOException { - var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8); - var sorter = file.createSorter(Path.of("/tmp"), 2, 2); - var searcher = file.createSearcher(); - for (int i = 0; i < 32; i+=2) { - file.put(i, 32-i/2); - file.put(i+1, ~(32-i/2)); - } - - sorter.sortRange( 2, 14); - file.force(); - - for (int i = 2+2; i < 16; i+=2) { - System.out.println(file.get(i) + "-" + ~file.get(i+1)); - assertTrue(file.get(i) > file.get(i-2)); - } - } @Test void close() { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java index 374db415..7290a01a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryTest.java @@ -136,14 +136,12 @@ class IndexQueryTest { IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200))); - /** Read from 17s range */ - - // 17s range is shorter and should read fully in one go + /** Read from 3s range */ query.getMoreResults(buffer); System.out.println(Arrays.toString(buffer.copyData())); assertFalse(buffer.isEmpty()); - assertArrayEquals(LongStream.range(0, 100).map(l -> l*17).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(100, 200).filter(v -> (v % 3) == 0).toArray(), buffer.copyData()); }