mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Fixes in sorting logic, and optimized update domain statistics to not take 4+ hours.
This commit is contained in:
parent
05762fe200
commit
5393167bf8
@ -74,7 +74,7 @@ public enum UnicodeRanges {
|
|||||||
|
|
||||||
for (int i = 0; i < Math.min(2000, text.length()); i++) {
|
for (int i = 0; i < Math.min(2000, text.length()); i++) {
|
||||||
char c = text.charAt(i);
|
char c = text.charAt(i);
|
||||||
if (c >= min && c <= max) {
|
if (c >= min && c <= this.max) {
|
||||||
if (count++ > max) {
|
if (count++ > max) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -216,6 +216,9 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long get(long idx) {
|
public long get(long idx) {
|
||||||
|
if (idx < 0)
|
||||||
|
throw new IllegalArgumentException("get("+idx+")");
|
||||||
|
|
||||||
if (idx >= mappedSize)
|
if (idx >= mappedSize)
|
||||||
grow(idx);
|
grow(idx);
|
||||||
|
|
||||||
@ -650,7 +653,7 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
if (start + n + wordSize - 1 >= mappedSize)
|
if (start + n + wordSize - 1 >= mappedSize)
|
||||||
grow(start + n + wordSize - 1);
|
grow(start + n + wordSize - 1);
|
||||||
|
|
||||||
if (n == 1) {
|
if (n <= 1) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -659,33 +662,29 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
int off = (int) (start % bufferSize);
|
int off = (int) (start % bufferSize);
|
||||||
|
|
||||||
for (int i = 1; i < n; i++) {
|
for (int i = 1; i < n; i++) {
|
||||||
for (int j = i; j > 0; j--) {
|
long key = buffer.get(off + i * wordSize);
|
||||||
int a = off + wordSize*(j-1);
|
|
||||||
int b = off + wordSize*j;
|
|
||||||
|
|
||||||
if (buffer.get(a) > buffer.get(b)) {
|
int j = i - 1;
|
||||||
|
while (j >= 0 && buffer.get(off + wordSize*j) > key) {
|
||||||
for (int w = 0; w < wordSize; w++) {
|
for (int w = 0; w < wordSize; w++) {
|
||||||
long tmp = buffer.get(a+w);
|
long tmp = buffer.get(off+wordSize*j+w);
|
||||||
buffer.put(a+w, buffer.get(b+w));
|
buffer.put(off+wordSize*j+w, buffer.get(off+wordSize*(j+1)+w));
|
||||||
buffer.put(b+w, tmp);
|
buffer.put(off+wordSize*(j+1)+w, tmp);
|
||||||
}
|
}
|
||||||
|
j--;
|
||||||
}
|
}
|
||||||
else break;
|
buffer.put(off + (j+1) * wordSize, key);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else for (int i = 1; i < n; i++) {
|
else for (int i = 1; i < n; i++) {
|
||||||
for (int j = i; j > 0; j--) {
|
long key = get(start + (long) i * wordSize);
|
||||||
long a = start + (long)wordSize*(j-1);
|
|
||||||
long b = start + (long)wordSize*j;
|
|
||||||
|
|
||||||
if (get(a) > get(b)) {
|
int j = i - 1;
|
||||||
swap(a, b);
|
while (j >= 0 && get(start + (long)wordSize*j) > key) {
|
||||||
}
|
swapn(wordSize, start + (long)wordSize*j, start + (long)wordSize*(j+1));
|
||||||
else {
|
j--;
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
put(start + (long) (j+1) * wordSize, key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -135,7 +135,7 @@ public class MultimapSorter {
|
|||||||
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||||
|
|
||||||
if (highInclusive - low < 32) {
|
if (highInclusive - low < 32) {
|
||||||
multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
|
multimapFileLong.insertionSort(wordSize, low, (int) ((wordSize + highInclusive - low) / wordSize));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||||
|
@ -28,32 +28,8 @@ public class ReindexTriggerMain {
|
|||||||
.followRedirects(true)
|
.followRedirects(true)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
var updateStatistics = new UpdateDomainStatistics(db.provideConnection());
|
||||||
var rs = stmt.executeQuery("SELECT ID, DOMAIN_NAME, STATE, INDEXED FROM EC_DOMAIN LIMIT 100");
|
updateStatistics.run();
|
||||||
while (rs.next()) {
|
|
||||||
System.out.printf("%d %s %s %d\n",
|
|
||||||
rs.getInt(1),
|
|
||||||
rs.getString(2),
|
|
||||||
rs.getString(3),
|
|
||||||
rs.getInt(4));
|
|
||||||
}
|
|
||||||
|
|
||||||
rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, PATH, VISITED, STATE FROM EC_URL LIMIT 100");
|
|
||||||
while (rs.next()) {
|
|
||||||
System.out.printf("%d %d %s %d %s\n",
|
|
||||||
rs.getInt(1),
|
|
||||||
rs.getInt(2),
|
|
||||||
rs.getString(3),
|
|
||||||
rs.getInt(4),
|
|
||||||
rs.getString(5));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0");
|
|
||||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT");
|
|
||||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT");
|
|
||||||
stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT");
|
|
||||||
}
|
|
||||||
|
|
||||||
var rb = new RequestBody() {
|
var rb = new RequestBody() {
|
||||||
|
|
||||||
|
@ -0,0 +1,66 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import gnu.trove.map.hash.TIntIntHashMap;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
public class UpdateDomainStatistics {
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
public UpdateDomainStatistics(HikariDataSource dataSource) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run() throws SQLException {
|
||||||
|
|
||||||
|
// This looks weird, but it's actually much faster than doing the computations with SQL queries
|
||||||
|
//
|
||||||
|
// ... in part because we can assume the data is immutable and don't mind consuming egregious
|
||||||
|
// resources
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.createStatement();
|
||||||
|
var domainInfoQuery = conn.prepareStatement("SELECT DOMAIN_ID, VISITED, STATE='ok' FROM EC_URL");
|
||||||
|
var insertDomainInfo = conn.prepareStatement("INSERT INTO DOMAIN_METADATA(ID,KNOWN_URLS,GOOD_URLS,VISITED_URLS) VALUES (?, ?, ?, ?)")
|
||||||
|
) {
|
||||||
|
|
||||||
|
stmt.executeUpdate("DELETE FROM DOMAIN_METADATA");
|
||||||
|
|
||||||
|
TIntIntHashMap knownUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||||
|
TIntIntHashMap visitedUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||||
|
TIntIntHashMap goodUrls = new TIntIntHashMap(1_000_000, 0.75f, 0, 0);
|
||||||
|
|
||||||
|
domainInfoQuery.setFetchSize(10_000);
|
||||||
|
var rsp = domainInfoQuery.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
int domainId = rsp.getInt(1);
|
||||||
|
boolean visited = rsp.getBoolean(2);
|
||||||
|
boolean stateOk = rsp.getBoolean(3);
|
||||||
|
|
||||||
|
knownUrls.adjustOrPutValue(domainId, 1, 1);
|
||||||
|
if (visited) {
|
||||||
|
visitedUrls.adjustOrPutValue(domainId, 1, 1);
|
||||||
|
if (stateOk) {
|
||||||
|
goodUrls.adjustOrPutValue(domainId, 1, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for (int domainId : knownUrls.keys()) {
|
||||||
|
insertDomainInfo.setInt(1, domainId);
|
||||||
|
insertDomainInfo.setInt(2, knownUrls.get(domainId));
|
||||||
|
insertDomainInfo.setInt(3, visitedUrls.get(domainId));
|
||||||
|
insertDomainInfo.setInt(4, goodUrls.get(domainId));
|
||||||
|
insertDomainInfo.addBatch();
|
||||||
|
if ((++i % 1000) == 0) {
|
||||||
|
insertDomainInfo.executeBatch();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((i % 1000) != 0) {
|
||||||
|
insertDomainInfo.executeBatch();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -153,7 +153,7 @@ class BTreeWriterTest {
|
|||||||
for (int i = 0; i < 500; i++) {
|
for (int i = 0; i < 500; i++) {
|
||||||
long val = (long)(Long.MAX_VALUE * Math.random());
|
long val = (long)(Long.MAX_VALUE * Math.random());
|
||||||
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
||||||
assertEquals(-1, reader.findEntry( val));
|
assertTrue(reader.findEntry( val) < 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -203,7 +203,7 @@ class BTreeWriterTest {
|
|||||||
for (int i = 0; i < 500; i++) {
|
for (int i = 0; i < 500; i++) {
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
assertEquals(-1, reader.findEntry(val));
|
assertTrue(reader.findEntry( val) < 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -255,7 +255,7 @@ class BTreeWriterTest {
|
|||||||
for (int i = 0; i < 500; i++) {
|
for (int i = 0; i < 500; i++) {
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
assertEquals(-1, reader.findEntry(val & mask));
|
assertTrue(reader.findEntry(val & mask) < 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -309,7 +309,7 @@ class BTreeWriterTest {
|
|||||||
for (int i = 0; i < 500; i++) {
|
for (int i = 0; i < 500; i++) {
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||||
assertEquals(-1, reader.findEntry(val & mask));
|
assertTrue(reader.findEntry(val & mask) < 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -1,382 +0,0 @@
|
|||||||
package nu.marginalia.util.btree;
|
|
||||||
|
|
||||||
import nu.marginalia.util.btree.model.BTreeContext;
|
|
||||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
|
||||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.RandomAccessFile;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.StringJoiner;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
class BTreeWriterTestCachedReader {
|
|
||||||
|
|
||||||
final BTreeContext ctx = new BTreeContext(4, 2, 0xFFFF_FFFF_FFFF_FFFFL, 8);
|
|
||||||
final BTreeWriter writer = new BTreeWriter(null, ctx);
|
|
||||||
|
|
||||||
Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
@Test
|
|
||||||
void testSmallDataBlock() {
|
|
||||||
var header = writer.makeHeader(1024, ctx.BLOCK_SIZE_WORDS()/2);
|
|
||||||
assertEquals(1024 + BTreeHeader.BTreeHeaderSizeLongs, header.dataOffsetLongs());
|
|
||||||
assertEquals(header.dataOffsetLongs(), header.indexOffsetLongs());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testLayerCount() {
|
|
||||||
int wsq = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
|
||||||
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
|
||||||
|
|
||||||
assertEquals(2, writer.makeHeader(1024, wsq-1).layers());
|
|
||||||
assertEquals(2, writer.makeHeader(1024, wsq).layers());
|
|
||||||
assertEquals(3, writer.makeHeader(1024, wsq+1).layers());
|
|
||||||
|
|
||||||
assertEquals(3, writer.makeHeader(1024, wcub-1).layers());
|
|
||||||
assertEquals(3, writer.makeHeader(1024, wcub).layers());
|
|
||||||
assertEquals(4, writer.makeHeader(1024, wcub+1).layers());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testLayerOffset() {
|
|
||||||
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
|
||||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0));
|
|
||||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1));
|
|
||||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2));
|
|
||||||
|
|
||||||
for (int i = 0; i < 1024; i++) {
|
|
||||||
var header = writer.makeHeader(0, i);
|
|
||||||
|
|
||||||
|
|
||||||
printTreeLayout(i, header, ctx);
|
|
||||||
|
|
||||||
if (header.layers() >= 1) {
|
|
||||||
assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) {
|
|
||||||
StringJoiner sj = new StringJoiner(",");
|
|
||||||
for (int l = 0; l < header.layers(); l++) {
|
|
||||||
sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
|
|
||||||
}
|
|
||||||
System.out.println(numEntries + ":" + sj);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWriteEntrySize2() throws IOException {
|
|
||||||
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Integer> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
|
|
||||||
}
|
|
||||||
|
|
||||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try {
|
|
||||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
|
||||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
|
||||||
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write(0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(2L*i, data[i]);
|
|
||||||
slice.put( 2L*i + 1, i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i]);
|
|
||||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
|
||||||
assertEquals(i, mmf.get(offset+1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testQB() throws IOException {
|
|
||||||
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Integer> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
for (int i = 0; i < 144646; i++) {
|
|
||||||
while (!toPut.add(3*i));
|
|
||||||
}
|
|
||||||
|
|
||||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try {
|
|
||||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
|
||||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
|
||||||
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write(0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(2L*i, data[i]);
|
|
||||||
slice.put( 2L*i + 1, i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
long[] d = new long[] { -1, 1, 5000, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 30, 300, 303, 306, 312, 330,3000,30000,300000};
|
|
||||||
BTreeQueryBuffer buffer = new BTreeQueryBuffer(d, d.length);
|
|
||||||
Arrays.sort(buffer.data);
|
|
||||||
|
|
||||||
System.out.println("layers = " + reader.getHeader().layers());
|
|
||||||
reader.retainEntries(buffer);
|
|
||||||
buffer.finalizeFiltering();
|
|
||||||
|
|
||||||
for (int i = 0; i < buffer.end; i++) {
|
|
||||||
System.out.println(buffer.data[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWriteEntrySize2Small() throws IOException {
|
|
||||||
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Integer> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
for (int i = 0; i < 5; i++) {
|
|
||||||
while (!toPut.add((int)(Integer.MAX_VALUE * Math.random())));
|
|
||||||
}
|
|
||||||
|
|
||||||
int[] data = toPut.stream().mapToInt(Integer::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try {
|
|
||||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
|
||||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
|
||||||
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write( 0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(2L*i, data[i]);
|
|
||||||
slice.put(2L*i + 1, i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i]);
|
|
||||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
|
||||||
assertEquals(i, mmf.get(offset+1));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
long val = (long)(Long.MAX_VALUE * Math.random());
|
|
||||||
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
|
||||||
assertEquals(-1, reader.findEntry(val));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWriteEqualityNotMasked() throws IOException {
|
|
||||||
for (int bs = 2; bs <= 4; bs++) {
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Long> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
var ctx = new BTreeContext(5, 1, ~0, bs);
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
|
||||||
}
|
|
||||||
|
|
||||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write(0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(i, data[i]);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
|
|
||||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
|
||||||
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i]);
|
|
||||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
|
||||||
assertEquals(data[i], mmf.get(offset));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
assertEquals(-1, reader.findEntry(val));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWriteEqualityMasked() throws IOException {
|
|
||||||
|
|
||||||
for (int bs = 2; bs <= 4; bs++) {
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Long> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
long mask = 0xFFFF_FFFF_0000_0000L;
|
|
||||||
var ctx = new BTreeContext(5, 1, mask, bs);
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
|
||||||
}
|
|
||||||
|
|
||||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write(0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(i, data[i]);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
|
|
||||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
|
||||||
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i] & mask);
|
|
||||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
|
||||||
assertEquals(data[i], mmf.get(offset));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
assertEquals(-1, reader.findEntry(val & mask));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testWriteTwoEqualityMasked() throws IOException {
|
|
||||||
|
|
||||||
for (int bs = 2; bs <= 4; bs++) {
|
|
||||||
var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
|
|
||||||
Set<Long> toPut = new HashSet<>();
|
|
||||||
|
|
||||||
long mask = 0xFFFF_FFFF_0000_0000L;
|
|
||||||
var ctx = new BTreeContext(5, 2, mask, bs);
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
while (!toPut.add((long) (Long.MAX_VALUE * Math.random()))) ;
|
|
||||||
}
|
|
||||||
|
|
||||||
long[] data = toPut.stream().mapToLong(Long::valueOf).sorted().toArray();
|
|
||||||
|
|
||||||
try (MultimapFileLong mmf = MultimapFileLong.forOutput(tempFile, 1000)) {
|
|
||||||
{
|
|
||||||
var writer = new BTreeWriter(mmf, ctx);
|
|
||||||
writer.write(0, toPut.size(), (slice) -> {
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
slice.put(i*2L, data[i]);
|
|
||||||
slice.put(i*2L+1, i);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
mmf.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
var reader = new BTreeReader(mmf, ctx, 0);
|
|
||||||
|
|
||||||
printTreeLayout(toPut.size(), reader.getHeader(), ctx);
|
|
||||||
|
|
||||||
for (int i = 0; i < data.length; i++) {
|
|
||||||
long offset = reader.findEntry(data[i] & mask);
|
|
||||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
|
||||||
assertEquals(data[i], mmf.get(offset));
|
|
||||||
assertEquals(i, mmf.get(offset+1));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < 500; i++) {
|
|
||||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
|
||||||
assertEquals(-1, reader.findEntry(val & mask));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
} finally {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -18,7 +18,7 @@ class LanguageFilterTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void isStringChinsese() {
|
public void isStringChinese() {
|
||||||
var languageFilter = new LanguageFilter();
|
var languageFilter = new LanguageFilter();
|
||||||
assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n"));
|
assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n"));
|
||||||
}
|
}
|
||||||
|
@ -103,6 +103,7 @@ class MultimapFileTest {
|
|||||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||||
|
|
||||||
for (int start = 0; start < 8; start+=2) {
|
for (int start = 0; start < 8; start+=2) {
|
||||||
|
System.out.println("~");
|
||||||
for (int end = start; end < 128; end+=2) {
|
for (int end = start; end < 128; end+=2) {
|
||||||
for (int i = 0; i < 128; i+=2) {
|
for (int i = 0; i < 128; i+=2) {
|
||||||
file.put(i, -i/2);
|
file.put(i, -i/2);
|
||||||
@ -110,9 +111,17 @@ class MultimapFileTest {
|
|||||||
}
|
}
|
||||||
sorter.quickSortLH(start, end);
|
sorter.quickSortLH(start, end);
|
||||||
for (int i = start+2; i < end; i+=2) {
|
for (int i = start+2; i < end; i+=2) {
|
||||||
|
|
||||||
|
System.out.println("**" + i);
|
||||||
|
System.out.println(file.get(i-2));
|
||||||
|
System.out.println(file.get(i-1));
|
||||||
|
System.out.println(file.get(i));
|
||||||
|
System.out.println(file.get(i+1));
|
||||||
|
|
||||||
assertTrue(file.get(i-2) <= file.get(i));
|
assertTrue(file.get(i-2) <= file.get(i));
|
||||||
assertEquals(file.get(i+1), -file.get(i));
|
assertEquals(file.get(i+1), -file.get(i));
|
||||||
}
|
}
|
||||||
|
System.out.println("~");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -158,13 +167,17 @@ class MultimapFileTest {
|
|||||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
||||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||||
|
|
||||||
for (int start = 0; start < 8; start+=2) {
|
for (int start = 2; start < 8; start+=2) {
|
||||||
for (int end = start+2; end < 126; end+=2) {
|
for (int end = start+2; end < 126; end+=2) {
|
||||||
for (int i = 0; i < 128; i+=2) {
|
for (int i = 0; i < 128; i+=2) {
|
||||||
file.put(i, -(128-i/2));
|
file.put(i, -(128-i/2));
|
||||||
file.put(i+1, (128-i)/2);
|
file.put(i+1, (128-i/2));
|
||||||
}
|
}
|
||||||
sorter.insertionSort(start, (end - start));
|
file.put(0, 0xFFFF_FFFFL);
|
||||||
|
file.put(end, 0x7FFF_FFFFL);
|
||||||
|
sorter.insertionSort(start, (end - start)/2);
|
||||||
|
assertEquals(0xFFFF_FFFFL, file.get(0));
|
||||||
|
assertEquals(file.get(end), 0x7FFF_FFFFL);
|
||||||
for (int i = start+2; i < end; i+=2) {
|
for (int i = start+2; i < end; i+=2) {
|
||||||
assertTrue(file.get(i-2) <= file.get(i));
|
assertTrue(file.get(i-2) <= file.get(i));
|
||||||
assertEquals(file.get(i+1), -file.get(i));
|
assertEquals(file.get(i+1), -file.get(i));
|
||||||
@ -178,14 +191,14 @@ class MultimapFileTest {
|
|||||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 128, 8);
|
||||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
||||||
|
|
||||||
for (int start = 0; start < 512; start+=2) {
|
for (int start = 0; start < 512; start+=18) {
|
||||||
System.out.println(start);
|
System.out.println(start);
|
||||||
for (int end = start+2; end < 8192; end+=2) {
|
for (int end = start+2; end < 8192; end+=68) {
|
||||||
for (int i = 0; i < 8192; i+=2) {
|
for (int i = 0; i < 8192; i+=2) {
|
||||||
file.put(i, -i/2);
|
file.put(i, -i/2);
|
||||||
file.put(i+1, i/2);
|
file.put(i+1, i/2);
|
||||||
}
|
}
|
||||||
sorter.quickSortLH(start, end);
|
sorter.mergeSort(start, end-start);
|
||||||
|
|
||||||
assertEquals(file.get(start+1), -file.get(start));
|
assertEquals(file.get(start+1), -file.get(start));
|
||||||
for (int i = start+2; i < end; i+=2) {
|
for (int i = start+2; i < end; i+=2) {
|
||||||
@ -216,24 +229,6 @@ class MultimapFileTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
void sortInternalSS2() throws IOException {
|
|
||||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
|
||||||
var sorter = file.createSorter(Path.of("/tmp"), 16, 2);
|
|
||||||
for (int i = 0; i < 32; i+=2) {
|
|
||||||
file.put(i, 32-i/2);
|
|
||||||
file.put(i+1, ~(32-i/2));
|
|
||||||
}
|
|
||||||
|
|
||||||
sorter.sortRange( 2, 14);
|
|
||||||
|
|
||||||
for (int i = 2+2; i < 16; i+=2) {
|
|
||||||
System.out.println(file.get(i) + "-" + ~file.get(i+1));
|
|
||||||
assertTrue(file.get(i) > file.get(i-2));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void sortExternal() throws IOException {
|
void sortExternal() throws IOException {
|
||||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||||
@ -252,25 +247,7 @@ class MultimapFileTest {
|
|||||||
assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
|
assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@Test
|
|
||||||
void sortExternalSS2() throws IOException {
|
|
||||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
|
||||||
var sorter = file.createSorter(Path.of("/tmp"), 2, 2);
|
|
||||||
var searcher = file.createSearcher();
|
|
||||||
|
|
||||||
for (int i = 0; i < 32; i+=2) {
|
|
||||||
file.put(i, 32-i/2);
|
|
||||||
file.put(i+1, ~(32-i/2));
|
|
||||||
}
|
|
||||||
|
|
||||||
sorter.sortRange( 2, 14);
|
|
||||||
file.force();
|
|
||||||
|
|
||||||
for (int i = 2+2; i < 16; i+=2) {
|
|
||||||
System.out.println(file.get(i) + "-" + ~file.get(i+1));
|
|
||||||
assertTrue(file.get(i) > file.get(i-2));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void close() {
|
void close() {
|
||||||
|
@ -136,14 +136,12 @@ class IndexQueryTest {
|
|||||||
|
|
||||||
IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200)));
|
IndexQuery query = new IndexQuery(List.of(threesRange().asPrefixSource(102, 200)));
|
||||||
|
|
||||||
/** Read from 17s range */
|
/** Read from 3s range */
|
||||||
|
|
||||||
// 17s range is shorter and should read fully in one go
|
|
||||||
|
|
||||||
query.getMoreResults(buffer);
|
query.getMoreResults(buffer);
|
||||||
System.out.println(Arrays.toString(buffer.copyData()));
|
System.out.println(Arrays.toString(buffer.copyData()));
|
||||||
assertFalse(buffer.isEmpty());
|
assertFalse(buffer.isEmpty());
|
||||||
assertArrayEquals(LongStream.range(0, 100).map(l -> l*17).toArray(), buffer.copyData());
|
assertArrayEquals(LongStream.range(100, 200).filter(v -> (v % 3) == 0).toArray(), buffer.copyData());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user