diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java new file mode 100644 index 00000000..86c8deac --- /dev/null +++ b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java @@ -0,0 +1,94 @@ +package nu.marginalia.model.id; + +/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away + * outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're + * actually sorted by rank. Next is the domain id part, which keeps documents from the same domain clustered. + * Finally is the document ordinal part, which is a non-unique sequence number for within the current set of + * documents loaded. The same ID may be re-used over time as a new index is loaded. + *
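+ * <table>
+ * <tr><th>Part</th><th>Bits</th><th>Cardinality</th></tr>
+ * <tr><td>rank</td><td>6 bits</td><td>64</td></tr>
+ * <tr><td>domain</td><td>31 bits</td><td>2 billion</td></tr>
+ * <tr><td>document</td><td>26 bits</td><td>67 million</td></tr>
+ * </table>
+ *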
+ * [ | rank | domain | url ]
+ * 0 1 7 38 64
+ *
+ */
+public class UrlIdCodec {
+    /** Left shift of the rank part; the rank occupies bits 57..62 of the id. */
+    private static final int RANK_SHIFT = 57;
+    /** Left shift of the domain part; the domain id occupies bits 26..56 of the id. */
+    private static final int DOMAIN_SHIFT = 26;
+    /** Largest rank value that fits in the six rank bits. */
+    private static final int MAX_RANK = 63;
+
+    /** Everything above the domain part: the unused top bit plus all six rank bits. */
+    private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
+    /** The 31 bits of a domain id, before shifting. */
+    private static final int DOMAIN_MASK = 0x7FFF_FFFF;
+    /** The 26 low bits that hold the document ordinal. */
+    private static final int DOCORD_MASK = 0x03FF_FFFF;
+
+    /** Encode a URL id without a ranking element.
+     *
+     * @param domainId the domain id; only the low 31 bits are kept
+     * @param documentOrdinal the document ordinal; only the low 26 bits are kept
+     * @return the combined id, with the rank bits and the top bit all zero
+     */
+    public static long encodeId(int domainId, int documentOrdinal) {
+        // Mask before widening so a negative int cannot smear sign bits into the rank part
+        long domainPart = (long) (domainId & DOMAIN_MASK) << DOMAIN_SHIFT;
+        long documentPart = documentOrdinal & DOCORD_MASK;
+
+        return domainPart | documentPart;
+    }
+
+    /** Encode a URL id with the optional ranking part.
+     *
+     * @param rank [0,1] the importance of the domain, low is good; values outside
+     *             the range are clamped
+     * @param domainId the domain id; only the low 31 bits are kept
+     * @param documentOrdinal the document ordinal; only the low 26 bits are kept
+     * @return the combined id including the quantized rank
+     */
+    public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) {
+        return encodeId(domainId, documentOrdinal) | (quantizeRank(rank) << RANK_SHIFT);
+    }
+
+    /** Add a ranking element to an existing combined URL id, replacing any
+     * rank that is already present.
+     *
+     * @param rank [0,1] the importance of the domain, low is good; values outside
+     *             the range are clamped
+     * @param urlId a combined URL id, with or without a rank
+     * @return the same domain and document parts combined with the new rank
+     */
+    public static long addRank(float rank, long urlId) {
+        return removeRank(urlId) | (quantizeRank(rank) << RANK_SHIFT);
+    }
+
+    /** Extract the domain component from this URL id */
+    public static int getDomainId(long combinedId) {
+        return (int) (combinedId >>> DOMAIN_SHIFT) & DOMAIN_MASK;
+    }
+
+    /** Extract the document ordinal component from this URL id */
+    public static int getDocumentOrdinal(long combinedId) {
+        return (int) (combinedId & DOCORD_MASK);
+    }
+
+    /** Extract the ranking component from this URL id.
+     *
+     * @return the quantized rank; in [0, 63] for ids produced by the encode methods,
+     *         which never set the top bit
+     */
+    public static int getRank(long combinedId) {
+        return (int) (combinedId >>> RANK_SHIFT);
+    }
+
+    /** Mask out the ranking element from this URL id, leaving only the
+     * domain and document ordinal parts.
+     */
+    public static long removeRank(long combinedId) {
+        return combinedId & ~RANK_MASK;
+    }
+
+    /** Scale a [0,1] rank to the six bit integer range [0, 63], clamping
+     * out-of-range input. NaN becomes 0, as the float-to-int cast maps NaN to zero.
+     */
+    private static long quantizeRank(float rank) {
+        int quantized = (int) (rank * (MAX_RANK + 1));
+
+        return Math.max(0, Math.min(MAX_RANK, quantized));
+    }
+
+}
diff --git a/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java
new file mode 100644
index 00000000..10fda63b
--- /dev/null
+++ b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java
@@ -0,0 +1,37 @@
+package nu.marginalia.model.id;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/** Boundary and round-trip tests for the bit layout produced by {@link UrlIdCodec}. */
+class UrlIdCodecTest {
+    /** An all-ones document ordinal is truncated to 26 bits and stays out of the other parts. */
+    @Test
+    public void testDocumentBounds() {
+        long encoded = UrlIdCodec.encodeId(0, ~0);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    /** An all-ones domain id is truncated to 31 bits and stays out of the other parts. */
+    @Test
+    public void testDomainBounds() {
+        long encoded = UrlIdCodec.encodeId(~0, 0);
+        assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    /** A rank of 1.0 is clamped to the largest six bit value. */
+    @Test
+    public void testRankBounds() {
+        long encoded = UrlIdCodec.encodeIdWithRank(1.0f, 0, 0);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+        assertEquals(63, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    /** A negative rank is clamped to zero. */
+    @Test
+    public void testRankBoundsNeg() {
+        long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    /** All three parts survive an encode/decode round trip when set together. */
+    @Test
+    public void testRoundTrip() {
+        long encoded = UrlIdCodec.encodeIdWithRank(0.5f, 1234, 5678);
+        assertEquals(1234, UrlIdCodec.getDomainId(encoded));
+        assertEquals(32, UrlIdCodec.getRank(encoded));
+        assertEquals(5678, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+}
\ No newline at end of file