Skip to main content

icydb_core/db/index/
fingerprint.rs

1use crate::{
2    error::{ErrorClass, ErrorOrigin, InternalError},
3    value::Value,
4};
5use canic_utils::hash::Xxh3;
6
7///
8/// ValueTag
9///
10/// Can we remove ValueTag?
11/// Yes, technically.
12///
13/// Should we?
14/// Almost certainly no, unless you control all serialization + don't need hashing + don't care about stability.
15///
16/// Why keep it?
17/// Binary stability, hashing, sorting, versioning, IC-safe ABI, robustness.
18///
19
20#[repr(u8)]
21#[derive(Clone, Copy, Debug, Eq, PartialEq)]
22pub enum ValueTag {
23    Account = 1,
24    Blob = 2,
25    Bool = 3,
26    Date = 4,
27    Decimal = 5,
28    Duration = 6,
29    Enum = 7,
30    E8s = 8,
31    E18s = 9,
32    Float32 = 10,
33    Float64 = 11,
34    Int = 12,
35    Int128 = 13,
36    IntBig = 14,
37    List = 15,
38    Map = 16,
39    Null = 17,
40    Principal = 18,
41    Subaccount = 19,
42    Text = 20,
43    Timestamp = 21,
44    Uint = 22,
45    Uint128 = 23,
46    UintBig = 24,
47    Ulid = 25,
48    Unit = 26,
49}
50
51impl ValueTag {
52    #[must_use]
53    pub const fn to_u8(self) -> u8 {
54        self as u8
55    }
56}
57
58///
59/// Canonical Byte Representation
60///
61
62const fn value_tag(value: &Value) -> u8 {
63    match value {
64        Value::Account(_) => ValueTag::Account,
65        Value::Blob(_) => ValueTag::Blob,
66        Value::Bool(_) => ValueTag::Bool,
67        Value::Date(_) => ValueTag::Date,
68        Value::Decimal(_) => ValueTag::Decimal,
69        Value::Duration(_) => ValueTag::Duration,
70        Value::Enum(_) => ValueTag::Enum,
71        Value::E8s(_) => ValueTag::E8s,
72        Value::E18s(_) => ValueTag::E18s,
73        Value::Float32(_) => ValueTag::Float32,
74        Value::Float64(_) => ValueTag::Float64,
75        Value::Int(_) => ValueTag::Int,
76        Value::Int128(_) => ValueTag::Int128,
77        Value::IntBig(_) => ValueTag::IntBig,
78        Value::List(_) => ValueTag::List,
79        Value::Map(_) => ValueTag::Map,
80        Value::Null => ValueTag::Null,
81        Value::Principal(_) => ValueTag::Principal,
82        Value::Subaccount(_) => ValueTag::Subaccount,
83        Value::Text(_) => ValueTag::Text,
84        Value::Timestamp(_) => ValueTag::Timestamp,
85        Value::Uint(_) => ValueTag::Uint,
86        Value::Uint128(_) => ValueTag::Uint128,
87        Value::UintBig(_) => ValueTag::UintBig,
88        Value::Ulid(_) => ValueTag::Ulid,
89        Value::Unit => ValueTag::Unit,
90    }
91    .to_u8()
92}
93
94fn feed_i32(h: &mut Xxh3, x: i32) {
95    h.update(&x.to_be_bytes());
96}
97fn feed_i64(h: &mut Xxh3, x: i64) {
98    h.update(&x.to_be_bytes());
99}
100fn feed_i128(h: &mut Xxh3, x: i128) {
101    h.update(&x.to_be_bytes());
102}
103fn feed_u8(h: &mut Xxh3, x: u8) {
104    h.update(&[x]);
105}
106fn feed_u32(h: &mut Xxh3, x: u32) {
107    h.update(&x.to_be_bytes());
108}
109fn feed_u64(h: &mut Xxh3, x: u64) {
110    h.update(&x.to_be_bytes());
111}
112fn feed_u128(h: &mut Xxh3, x: u128) {
113    h.update(&x.to_be_bytes());
114}
115fn feed_bytes(h: &mut Xxh3, b: &[u8]) {
116    h.update(b);
117}
118
119#[cfg(test)]
120thread_local! {
121    static TEST_HASH_OVERRIDE: std::cell::Cell<Option<[u8; 16]>> =
122        const { std::cell::Cell::new(None) };
123}
124
125#[cfg(test)]
126#[expect(dead_code)]
127pub(crate) fn with_test_hash_override<T>(hash: [u8; 16], f: impl FnOnce() -> T) -> T {
128    TEST_HASH_OVERRIDE.with(|cell| {
129        let previous = cell.replace(Some(hash));
130        let out = f();
131        cell.set(previous);
132        out
133    })
134}
135
136#[cfg(test)]
137#[allow(clippy::redundant_closure_for_method_calls)]
138fn test_hash_override() -> Option<[u8; 16]> {
139    TEST_HASH_OVERRIDE.with(|cell| cell.get())
140}
141
142#[allow(clippy::cast_possible_truncation)]
143#[allow(clippy::too_many_lines)]
144fn write_to_hasher(value: &Value, h: &mut Xxh3) -> Result<(), InternalError> {
145    feed_u8(h, value_tag(value));
146
147    match value {
148        Value::Account(a) => {
149            let bytes = a.to_bytes().map_err(|err| {
150                InternalError::new(
151                    ErrorClass::Unsupported,
152                    ErrorOrigin::Serialize,
153                    err.to_string(),
154                )
155            })?;
156            feed_bytes(h, &bytes);
157        }
158        Value::Blob(v) => {
159            feed_u8(h, 0x01);
160            feed_u32(h, v.len() as u32);
161            feed_bytes(h, v);
162        }
163        Value::Bool(b) => {
164            feed_u8(h, u8::from(*b));
165        }
166        Value::Date(d) => feed_i32(h, d.get()),
167        Value::Decimal(d) => {
168            // encode (sign, scale, mantissa) deterministically:
169            feed_u8(h, u8::from(d.is_sign_negative()));
170            feed_u32(h, d.scale());
171            feed_bytes(h, &d.mantissa().to_be_bytes());
172        }
173        Value::Duration(t) => {
174            feed_u64(h, t.get());
175        }
176        Value::Enum(v) => {
177            match &v.path {
178                Some(path) => {
179                    feed_u8(h, 0x01); // path present
180                    feed_u32(h, path.len() as u32);
181                    feed_bytes(h, path.as_bytes());
182                }
183                None => feed_u8(h, 0x00), // path absent -> loose match
184            }
185
186            feed_u32(h, v.variant.len() as u32);
187            feed_bytes(h, v.variant.as_bytes());
188
189            match &v.payload {
190                Some(payload) => {
191                    feed_u8(h, 0x01); // payload present
192                    write_to_hasher(payload, h)?; // include nested value
193                }
194                None => feed_u8(h, 0x00),
195            }
196        }
197        Value::E8s(v) => {
198            feed_u64(h, v.get());
199        }
200        Value::E18s(v) => {
201            feed_bytes(h, &v.to_be_bytes());
202        }
203        Value::Float32(v) => {
204            feed_bytes(h, &v.to_be_bytes());
205        }
206        Value::Float64(v) => {
207            feed_bytes(h, &v.to_be_bytes());
208        }
209        Value::Int(i) => {
210            feed_i64(h, *i);
211        }
212        Value::Int128(i) => {
213            feed_i128(h, i.get());
214        }
215        Value::IntBig(v) => {
216            let bytes = v.to_leb128();
217            feed_u32(h, bytes.len() as u32);
218            feed_bytes(h, &bytes);
219        }
220        Value::List(xs) => {
221            feed_u32(h, xs.len() as u32);
222            for x in xs {
223                feed_u8(h, 0xFF);
224                write_to_hasher(x, h)?; // recurse, no sub-hash
225            }
226        }
227        Value::Map(entries) => {
228            feed_u32(h, entries.len() as u32);
229            for (key, value) in entries {
230                feed_u8(h, 0xFD);
231                write_to_hasher(key, h)?;
232                feed_u8(h, 0xFE);
233                write_to_hasher(value, h)?;
234            }
235        }
236        Value::Principal(p) => {
237            let raw = p.to_bytes().map_err(|err| {
238                InternalError::new(
239                    ErrorClass::Unsupported,
240                    ErrorOrigin::Serialize,
241                    err.to_string(),
242                )
243            })?;
244            feed_u32(h, raw.len() as u32);
245            feed_bytes(h, &raw);
246        }
247        Value::Subaccount(s) => {
248            feed_bytes(h, &s.to_bytes());
249        }
250        Value::Text(s) => {
251            // If you need case/Unicode insensitivity, normalize; else skip (much faster)
252            // let norm = normalize_nfkc_casefold(s);
253            // feed_u32( h, norm.len() as u32);
254            // feed_bytes( h, norm.as_bytes());
255            feed_u32(h, s.len() as u32);
256            feed_bytes(h, s.as_bytes());
257        }
258        Value::Timestamp(t) => {
259            feed_u64(h, t.get());
260        }
261        Value::Uint(u) => {
262            feed_u64(h, *u);
263        }
264        Value::Uint128(u) => {
265            feed_u128(h, u.get());
266        }
267        Value::UintBig(v) => {
268            let bytes = v.to_leb128();
269            feed_u32(h, bytes.len() as u32);
270            feed_bytes(h, &bytes);
271        }
272        Value::Ulid(u) => {
273            feed_bytes(h, &u.to_bytes());
274        }
275        Value::Null | Value::Unit => {
276            // NOTE: Non-indexable values intentionally contribute no hash input.
277        }
278    }
279
280    Ok(())
281}
282
283/// Stable hash used for index/storage fingerprints.
284pub fn hash_value(value: &Value) -> Result<[u8; 16], InternalError> {
285    const VERSION: u8 = 1;
286
287    #[cfg(test)]
288    if let Some(override_hash) = test_hash_override() {
289        return Ok(override_hash);
290    }
291
292    let mut h = Xxh3::with_seed(0);
293    feed_u8(&mut h, VERSION); // version
294
295    write_to_hasher(value, &mut h)?;
296    Ok(h.digest128().to_be_bytes())
297}
298
299/// Index fingerprint semantics:
300///
301/// - Only indexable values produce fingerprints.
302/// - `Value::Null` does not produce fingerprints and
303///   therefore do not participate in indexing.
304/// - For unique indexes, uniqueness is enforced only over indexable values.
305///   Multiple rows with non-indexable values are permitted.
306///
307/// This behavior matches SQL-style UNIQUE constraints with NULL values.
308///
309/// Stable 128-bit hash used for index keys; returns `None` for non-indexable values.
310pub fn to_index_fingerprint(value: &Value) -> Result<Option<[u8; 16]>, InternalError> {
311    if matches!(value, Value::Null) {
312        // Intentionally skipped: non-indexable values do not participate in indexes.
313        return Ok(None);
314    }
315
316    Ok(Some(hash_value(value)?))
317}
318
319///
320/// TESTS
321///
322
323#[cfg(test)]
324mod tests {
325    use super::*;
326    use crate::{
327        types::{Float32 as F32, Float64 as F64},
328        value::{Value, ValueEnum},
329    };
330
331    fn v_f64(x: f64) -> Value {
332        Value::Float64(F64::try_new(x).expect("finite f64"))
333    }
334    fn v_f32(x: f32) -> Value {
335        Value::Float32(F32::try_new(x).expect("finite f32"))
336    }
337    fn v_i(x: i64) -> Value {
338        Value::Int(x)
339    }
340    fn v_txt(s: &str) -> Value {
341        Value::Text(s.to_string())
342    }
343
344    #[test]
345    fn hash_is_deterministic_for_int() {
346        let v = Value::Int(42);
347        let a = hash_value(&v).expect("hash value");
348        let b = hash_value(&v).expect("hash value");
349        assert_eq!(a, b, "hash should be deterministic for same value");
350    }
351
352    #[test]
353    fn different_variants_produce_different_hashes() {
354        let a = hash_value(&Value::Int(5)).expect("hash value");
355        let b = hash_value(&Value::Uint(5)).expect("hash value");
356        assert_ne!(
357            a, b,
358            "Int(5) and Uint(5) must hash differently (different tag)"
359        );
360    }
361
362    #[test]
363    fn enum_hash_tracks_path_presence() {
364        let strict = Value::Enum(ValueEnum::new("A", Some("MyEnum")));
365        let loose = Value::Enum(ValueEnum::new("A", None));
366        assert_ne!(
367            hash_value(&strict).expect("hash value"),
368            hash_value(&loose).expect("hash value"),
369            "Enum hashes must differ when path is present vs absent"
370        );
371    }
372
373    #[test]
374    fn enum_hash_includes_payload() {
375        let base = ValueEnum::new("A", Some("MyEnum"));
376        let with_one = Value::Enum(base.clone().with_payload(Value::Uint(1)));
377        let with_two = Value::Enum(base.with_payload(Value::Uint(2)));
378
379        assert_ne!(
380            hash_value(&with_one).expect("hash value"),
381            hash_value(&with_two).expect("hash value"),
382            "Enum payload must influence hash/fingerprint"
383        );
384    }
385
386    #[test]
387    fn float32_and_float64_hash_differ() {
388        let a = hash_value(&v_f32(1.0)).expect("hash value");
389        let b = hash_value(&v_f64(1.0)).expect("hash value");
390        assert_ne!(
391            a, b,
392            "Float32 and Float64 must hash differently (different tag)"
393        );
394    }
395
396    #[test]
397    fn text_is_length_and_content_sensitive() {
398        let a = hash_value(&v_txt("foo")).expect("hash value");
399        let b = hash_value(&v_txt("bar")).expect("hash value");
400        assert_ne!(a, b, "different strings should hash differently");
401
402        let c = hash_value(&v_txt("foo")).expect("hash value");
403        assert_eq!(a, c, "same string should hash the same");
404    }
405
406    #[test]
407    fn list_hash_is_order_sensitive() {
408        let l1 = Value::from_slice(&[v_i(1), v_i(2)]);
409        let l2 = Value::from_slice(&[v_i(2), v_i(1)]);
410        assert_ne!(
411            hash_value(&l1).expect("hash value"),
412            hash_value(&l2).expect("hash value"),
413            "list order should affect hash"
414        );
415    }
416
417    #[test]
418    fn list_hash_is_length_sensitive() {
419        let l1 = Value::from_slice(&[v_i(1)]);
420        let l2 = Value::from_slice(&[v_i(1), v_i(1)]);
421        assert_ne!(
422            hash_value(&l1).expect("hash value"),
423            hash_value(&l2).expect("hash value"),
424            "list length should affect hash"
425        );
426    }
427
428    #[test]
429    fn list_blob_boundaries_are_length_framed() {
430        let left = Value::List(vec![
431            Value::Blob(vec![0x10, 0xFF, 0x02, 0x11]),
432            Value::Blob(vec![0x12]),
433        ]);
434        let right = Value::List(vec![
435            Value::Blob(vec![0x10]),
436            Value::Blob(vec![0x11, 0xFF, 0x02, 0x12]),
437        ]);
438
439        assert_ne!(
440            hash_value(&left).expect("hash value"),
441            hash_value(&right).expect("hash value"),
442            "blob boundaries must be length-framed to avoid collisions"
443        );
444    }
445}