mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Fixes in sorting logic, and an optimized domain-statistics update so it no longer takes 4+ hours.
This commit is contained in:
parent
05762fe200
commit
5393167bf8
@ -74,7 +74,7 @@ public enum UnicodeRanges {
|
||||
|
||||
for (int i = 0; i < Math.min(2000, text.length()); i++) {
|
||||
char c = text.charAt(i);
|
||||
if (c >= min && c <= max) {
|
||||
if (c >= min && c <= this.max) {
|
||||
if (count++ > max) {
|
||||
return true;
|
||||
}
|
||||
|
@ -216,6 +216,9 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
@Override
|
||||
public long get(long idx) {
|
||||
if (idx < 0)
|
||||
throw new IllegalArgumentException("get("+idx+")");
|
||||
|
||||
if (idx >= mappedSize)
|
||||
grow(idx);
|
||||
|
||||
@ -650,7 +653,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
if (start + n + wordSize - 1 >= mappedSize)
|
||||
grow(start + n + wordSize - 1);
|
||||
|
||||
if (n == 1) {
|
||||
if (n <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -659,33 +662,29 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
int off = (int) (start % bufferSize);
|
||||
|
||||
for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
int a = off + wordSize*(j-1);
|
||||
int b = off + wordSize*j;
|
||||
long key = buffer.get(off + i * wordSize);
|
||||
|
||||
if (buffer.get(a) > buffer.get(b)) {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(a+w);
|
||||
buffer.put(a+w, buffer.get(b+w));
|
||||
buffer.put(b+w, tmp);
|
||||
}
|
||||
int j = i - 1;
|
||||
while (j >= 0 && buffer.get(off + wordSize*j) > key) {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(off+wordSize*j+w);
|
||||
buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w));
|
||||
buffer.put(off+wordSize*(j+1)+w, tmp);
|
||||
}
|
||||
else break;
|
||||
j--;
|
||||
}
|
||||
buffer.put(off + (j+1) * wordSize, key);
|
||||
}
|
||||
}
|
||||
else for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
long a = start + (long)wordSize*(j-1);
|
||||
long b = start + (long)wordSize*j;
|
||||
long key = get(start + (long) i * wordSize);
|
||||
|
||||
if (get(a) > get(b)) {
|
||||
swap(a, b);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
int j = i - 1;
|
||||
while (j >= 0 && get(start + (long)wordSize*j) > key) {
|
||||
swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1));
|
||||
j--;
|
||||
}
|
||||
put(start + (long) (j+1) * wordSize, key);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -135,7 +135,7 @@ public class MultimapSorter {
|
||||
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||
|
||||
if (highInclusive - low < 32) {
|
||||
multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
|
||||
multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize));
|
||||
}
|
||||
else {
|
||||
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||
|
@ -28,32 +28,8 @@ public class ReindexTriggerMain {
|
||||
.followRedirects(true)
|
||||
.build();
|
||||
|
||||
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
||||
while (rs.next()) {
|
||||
System.out.printf("%d %s %s %d\n",
|
||||
rs.getInt(1),
|
||||
rs.getString(2),
|
||||
rs.getString(3),
|
||||
rs.getInt(4));
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
|
||||
while (rs.next()) {
|
||||
System.out.printf("%d %d %s %d %s\n",
|
||||
rs.getInt(1),
|
||||
rs.getInt(2),
|
||||
rs.getString(3),
|
||||
rs.getInt(4),
|
||||
rs.getString(5));
|
||||
|
||||
}
|
||||
|
||||
stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0");
|
||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT");
|
||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT");
|
||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT");
|
||||
}
|
||||
var updateStatistics = new UpdateDomainStatistics(db.provideConnection());
|
||||
updateStatistics.run();
|
||||
|
||||
var rb = new RequestBody() {
|
||||
|
||||
|
@ -0,0 +1,66 @@
|
||||
package nu.marginalia.wmsa.edge.converting;

import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntIntHashMap;

import java.sql.SQLException;

/**
 * Rebuilds the DOMAIN_METADATA table (per-domain URL counts) by streaming
 * EC_URL once and aggregating in memory, then bulk-inserting the results.
 *
 * <p>This looks weird, but it's actually much faster than doing the
 * computations with SQL queries — in part because we can assume the data is
 * immutable for the duration of the run, and don't mind consuming egregious
 * amounts of memory.</p>
 */
public class UpdateDomainStatistics {
    private final HikariDataSource dataSource;

    public UpdateDomainStatistics(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Recomputes KNOWN_URLS, GOOD_URLS and VISITED_URLS for every domain.
     *
     * @throws SQLException on any database error; the connection and statements
     *                      are released via try-with-resources either way.
     */
    public void run() throws SQLException {

        try (var conn = dataSource.getConnection();
             var stmt = conn.createStatement();
             var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL");
             var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,GOOD_URLS,VISITED_URLS) VALUES (?, ?, ?, ?)")
        ) {

            // Full rebuild: drop all existing metadata rows first.
            stmt.executeUpdate("DELETE FROM DOMAIN_METADATA");

            // domainId -> count; no-entry value 0 so get() on an absent key is a valid count.
            TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
            TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
            TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);

            // Stream the (potentially huge) EC_URL table instead of materializing it.
            domainInfoQuery.setFetchSize(10_000);
            var rsp = domainInfoQuery.executeQuery();
            while (rsp.next()) {
                int domainId = rsp.getInt(1);
                boolean visited = rsp.getBoolean(2);
                boolean stateOk = rsp.getBoolean(3);

                knownUrls.adjustOrPutValue(domainId, 1, 1);
                if (visited) {
                    visitedUrls.adjustOrPutValue(domainId, 1, 1);
                    if (stateOk) {
                        goodUrls.adjustOrPutValue(domainId, 1, 1);
                    }
                }
            }

            int i = 0;
            for (int domainId : knownUrls.keys()) {
                insertDomainInfo.setInt(1, domainId);
                insertDomainInfo.setInt(2, knownUrls.get(domainId));
                // BUGFIX: parameters 3 and 4 were swapped relative to the column
                // list (ID,KNOWN_URLS,GOOD_URLS,VISITED_URLS) — GOOD_URLS was
                // getting the visited count and vice versa.
                insertDomainInfo.setInt(3, goodUrls.get(domainId));
                insertDomainInfo.setInt(4, visitedUrls.get(domainId));
                insertDomainInfo.addBatch();
                // Flush the batch every 1000 rows to bound memory on the driver side.
                if ((++i % 1000) == 0) {
                    insertDomainInfo.executeBatch();
                }
            }
            // Flush any trailing partial batch.
            if ((i % 1000) != 0) {
                insertDomainInfo.executeBatch();
            }
        }
    }
}
|
@ -153,7 +153,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long)(Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry( val));
|
||||
assertTrue(reader.findEntry( val) < 0);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -203,7 +203,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val));
|
||||
assertTrue(reader.findEntry( val) < 0);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -255,7 +255,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val & mask));
|
||||
assertTrue(reader.findEntry(val & mask) < 0);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -309,7 +309,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val & mask));
|
||||
assertTrue(reader.findEntry(val & mask) < 0);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
@ -1,382 +0,0 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class BTreeWriterTestCachedReader {
|
||||
|
||||
final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 8);
|
||||
final BTreeWriter writer = new BTreeWriter(null, ctx);
|
||||
|
||||
Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@Test
|
||||
void testSmallDataBlock() {
|
||||
var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2);
|
||||
assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs());
|
||||
assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testLayerCount() {
|
||||
int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
||||
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
assertEquals(2, writer.makeHeader(1024, wsq-1).layers());
|
||||
assertEquals(2, writer.makeHeader(1024, wsq).layers());
|
||||
assertEquals(3, writer.makeHeader(1024, wsq+1).layers());
|
||||
|
||||
assertEquals(3, writer.makeHeader(1024, wcub-1).layers());
|
||||
assertEquals(3, writer.makeHeader(1024, wcub).layers());
|
||||
assertEquals(4, writer.makeHeader(1024, wcub+1).layers());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testLayerOffset() {
|
||||
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2));
|
||||
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
var header = writer.makeHeader(0, i);
|
||||
|
||||
|
||||
printTreeLayout(i, header, ctx);
|
||||
|
||||
if (header.layers() >= 1) {
|
||||
assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) {
|
||||
StringJoiner sj = new StringJoiner(",");
|
||||
for (int l = 0; l < header.layers(); l++) {
|
||||
sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
|
||||
}
|
||||
System.out.println(numEntries + ":" + sj);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteEntrySize2() throws IOException {
|
||||
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Integer> toPut = new HashSet<>();
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
|
||||
}
|
||||
|
||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write(0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(2L*i, data[i]);
|
||||
slice.put( 2L*i + 1, i);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testQB() throws IOException {
|
||||
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Integer> toPut = new HashSet<>();
|
||||
|
||||
for (int i = 0; i < 144646; i++) {
|
||||
while (!toPut.add(3*i));
|
||||
}
|
||||
|
||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write(0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(2L*i, data[i]);
|
||||
slice.put( 2L*i + 1, i);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i]);
|
||||
}
|
||||
|
||||
long[] d = new long[] { -1, 1, 5000, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 30, 300, 303, 306, 312, 330,3000,30000,300000};
|
||||
BTreeQueryBuffer buffer = new BTreeQueryBuffer(d, d.length);
|
||||
Arrays.sort(buffer.data);
|
||||
|
||||
System.out.println("layers = " + reader.getHeader().layers());
|
||||
reader.retainEntries(buffer);
|
||||
buffer.finalizeFiltering();
|
||||
|
||||
for (int i = 0; i < buffer.end; i++) {
|
||||
System.out.println(buffer.data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteEntrySize2Small() throws IOException {
|
||||
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Integer> toPut = new HashSet<>();
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
|
||||
}
|
||||
|
||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write( 0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(2L*i, data[i]);
|
||||
slice.put(2L*i + 1, i);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long)(Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testWriteEqualityNotMasked() throws IOException {
|
||||
for (int bs = 2; bs <= 4; bs++) {
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Long> toPut = new HashSet<>();
|
||||
|
||||
var ctx = new BTreeContext(5, 1, ~0, bs);
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
||||
}
|
||||
|
||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
||||
|
||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write(0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(i, data[i]);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
|
||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteEqualityMasked() throws IOException {
|
||||
|
||||
for (int bs = 2; bs <= 4; bs++) {
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Long> toPut = new HashSet<>();
|
||||
|
||||
long mask = 0xFFFF_FFFF_0000_0000L;
|
||||
var ctx = new BTreeContext(5, 1, mask, bs);
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
||||
}
|
||||
|
||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
||||
|
||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write(0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(i, data[i]);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
|
||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i] & mask);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val & mask));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteTwoEqualityMasked() throws IOException {
|
||||
|
||||
for (int bs = 2; bs <= 4; bs++) {
|
||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
||||
Set<Long> toPut = new HashSet<>();
|
||||
|
||||
long mask = 0xFFFF_FFFF_0000_0000L;
|
||||
var ctx = new BTreeContext(5, 2, mask, bs);
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
||||
}
|
||||
|
||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
||||
|
||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
writer.write(0, toPut.size(), (slice) -> {
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
slice.put(i*2L, data[i]);
|
||||
slice.put(i*2L+1, i);
|
||||
}
|
||||
});
|
||||
mmf.force();
|
||||
}
|
||||
|
||||
{
|
||||
var reader = new BTreeReader(mmf, ctx, 0);
|
||||
|
||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.findEntry(data[i] & mask);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.findEntry(val & mask));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
@ -18,7 +18,7 @@ class LanguageFilterTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void isStringChinsese() {
|
||||
public void isStringChinese() {
|
||||
var languageFilter = new LanguageFilter();
|
||||
assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n"));
|
||||
}
|
||||
|
@ -103,6 +103,7 @@ class MultimapFileTest {
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||
|
||||
for (int start = 0; start < 8; start+=2) {
|
||||
System.out.println("~");
|
||||
for (int end = start; end < 128; end+=2) {
|
||||
for (int i = 0; i < 128; i+=2) {
|
||||
file.put(i, -i/2);
|
||||
@ -110,9 +111,17 @@ class MultimapFileTest {
|
||||
}
|
||||
sorter.quickSortLH(start, end);
|
||||
for (int i = start+2; i < end; i+=2) {
|
||||
|
||||
System.out.println("**" + i);
|
||||
System.out.println(file.get(i-2));
|
||||
System.out.println(file.get(i-1));
|
||||
System.out.println(file.get(i));
|
||||
System.out.println(file.get(i+1));
|
||||
|
||||
assertTrue(file.get(i-2) <= file.get(i));
|
||||
assertEquals(file.get(i+1), -file.get(i));
|
||||
}
|
||||
System.out.println("~");
|
||||
}
|
||||
}
|
||||
|
||||
@ -158,13 +167,17 @@ class MultimapFileTest {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||
|
||||
for (int start = 0; start < 8; start+=2) {
|
||||
for (int start = 2; start < 8; start+=2) {
|
||||
for (int end = start+2; end < 126; end+=2) {
|
||||
for (int i = 0; i < 128; i+=2) {
|
||||
file.put(i, -(128-i/2));
|
||||
file.put(i+1, (128-i)/2);
|
||||
file.put(i+1, (128-i/2));
|
||||
}
|
||||
sorter.insertionSort(start, (end - start));
|
||||
file.put(0, 0xFFFF_FFFFL);
|
||||
file.put(end, 0x7FFF_FFFFL);
|
||||
sorter.insertionSort(start, (end - start)/2);
|
||||
assertEquals(0xFFFF_FFFFL, file.get(0));
|
||||
assertEquals(file.get(end), 0x7FFF_FFFFL);
|
||||
for (int i = start+2; i < end; i+=2) {
|
||||
assertTrue(file.get(i-2) <= file.get(i));
|
||||
assertEquals(file.get(i+1), -file.get(i));
|
||||
@ -178,14 +191,14 @@ class MultimapFileTest {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||
|
||||
for (int start = 0; start < 512; start+=2) {
|
||||
for (int start = 0; start < 512; start+=18) {
|
||||
System.out.println(start);
|
||||
for (int end = start+2; end < 8192; end+=2) {
|
||||
for (int end = start+2; end < 8192; end+=68) {
|
||||
for (int i = 0; i < 8192; i+=2) {
|
||||
file.put(i, -i/2);
|
||||
file.put(i+1, i/2);
|
||||
}
|
||||
sorter.quickSortLH(start, end);
|
||||
sorter.mergeSort(start, end-start);
|
||||
|
||||
assertEquals(file.get(start+1), -file.get(start));
|
||||
for (int i = start+2; i < end; i+=2) {
|
||||
@ -216,24 +229,6 @@ class MultimapFileTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void sortInternalSS2() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||
for (int i = 0; i < 32; i+=2) {
|
||||
file.put(i, 32-i/2);
|
||||
file.put(i+1, ~(32-i/2));
|
||||
}
|
||||
|
||||
sorter.sortRange( 2, 14);
|
||||
|
||||
for (int i = 2+2; i < 16; i+=2) {
|
||||
System.out.println(file.get(i) + "-" + ~file.get(i+1));
|
||||
assertTrue(file.get(i) > file.get(i-2));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void sortExternal() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
@ -252,25 +247,7 @@ class MultimapFileTest {
|
||||
assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
|
||||
}
|
||||
}
|
||||
@Test
|
||||
void sortExternalSS2() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 2, 2);
|
||||
var searcher = file.createSearcher();
|
||||
|
||||
for (int i = 0; i < 32; i+=2) {
|
||||
file.put(i, 32-i/2);
|
||||
file.put(i+1, ~(32-i/2));
|
||||
}
|
||||
|
||||
sorter.sortRange( 2, 14);
|
||||
file.force();
|
||||
|
||||
for (int i = 2+2; i < 16; i+=2) {
|
||||
System.out.println(file.get(i) + "-" + ~file.get(i+1));
|
||||
assertTrue(file.get(i) > file.get(i-2));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void close() {
|
||||
|
@ -136,14 +136,12 @@ class IndexQueryTest {
|
||||
|
||||
IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200)));
|
||||
|
||||
/** Read from 17s range */
|
||||
|
||||
// 17s range is shorter and should read fully in one go
|
||||
/** Read from 3s range */
|
||||
|
||||
query.getMoreResults(buffer);
|
||||
System.out.println(Arrays.toString(buffer.copyData()));
|
||||
assertFalse(buffer.isEmpty());
|
||||
assertArrayEquals(LongStream.range(0, 100).map(l -> l*17).toArray(), buffer.copyData());
|
||||
assertArrayEquals(LongStream.range(100, 200).filter(v -> (v % 3) == 0).toArray(), buffer.copyData());
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user