Skip to main content

icydb_core/db/index/
fingerprint.rs

1use crate::{
2    error::{ErrorClass, ErrorOrigin, InternalError},
3    value::Value,
4};
5use canic_utils::hash::Xxh3;
6
7///
8/// ValueTag
9///
10/// Can we remove ValueTag?
11/// Yes, technically.
12///
13/// Should we?
14/// Almost certainly no, unless you control all serialization + don't need hashing + don't care about stability.
15///
16/// Why keep it?
17/// Binary stability, hashing, sorting, versioning, IC-safe ABI, robustness.
18///
19
20#[repr(u8)]
21#[derive(Clone, Copy, Debug, Eq, PartialEq)]
22pub enum ValueTag {
23    Account = 1,
24    Blob = 2,
25    Bool = 3,
26    Date = 4,
27    Decimal = 5,
28    Duration = 6,
29    Enum = 7,
30    E8s = 8,
31    E18s = 9,
32    Float32 = 10,
33    Float64 = 11,
34    Int = 12,
35    Int128 = 13,
36    IntBig = 14,
37    List = 15,
38    Map = 16,
39    None = 17,
40    Principal = 18,
41    Subaccount = 19,
42    Text = 20,
43    Timestamp = 21,
44    Uint = 22,
45    Uint128 = 23,
46    UintBig = 24,
47    Ulid = 25,
48    Unit = 26,
49    Unsupported = 27,
50}
51
52impl ValueTag {
53    #[must_use]
54    pub const fn to_u8(self) -> u8 {
55        self as u8
56    }
57}
58
59///
60/// Canonical Byte Representation
61///
62
63const fn value_tag(value: &Value) -> u8 {
64    match value {
65        Value::Account(_) => ValueTag::Account,
66        Value::Blob(_) => ValueTag::Blob,
67        Value::Bool(_) => ValueTag::Bool,
68        Value::Date(_) => ValueTag::Date,
69        Value::Decimal(_) => ValueTag::Decimal,
70        Value::Duration(_) => ValueTag::Duration,
71        Value::Enum(_) => ValueTag::Enum,
72        Value::E8s(_) => ValueTag::E8s,
73        Value::E18s(_) => ValueTag::E18s,
74        Value::Float32(_) => ValueTag::Float32,
75        Value::Float64(_) => ValueTag::Float64,
76        Value::Int(_) => ValueTag::Int,
77        Value::Int128(_) => ValueTag::Int128,
78        Value::IntBig(_) => ValueTag::IntBig,
79        Value::List(_) => ValueTag::List,
80        Value::Map(_) => ValueTag::Map,
81        Value::None => ValueTag::None,
82        Value::Principal(_) => ValueTag::Principal,
83        Value::Subaccount(_) => ValueTag::Subaccount,
84        Value::Text(_) => ValueTag::Text,
85        Value::Timestamp(_) => ValueTag::Timestamp,
86        Value::Uint(_) => ValueTag::Uint,
87        Value::Uint128(_) => ValueTag::Uint128,
88        Value::UintBig(_) => ValueTag::UintBig,
89        Value::Ulid(_) => ValueTag::Ulid,
90        Value::Unit => ValueTag::Unit,
91        Value::Unsupported => ValueTag::Unsupported,
92    }
93    .to_u8()
94}
95
96fn feed_i32(h: &mut Xxh3, x: i32) {
97    h.update(&x.to_be_bytes());
98}
99fn feed_i64(h: &mut Xxh3, x: i64) {
100    h.update(&x.to_be_bytes());
101}
102fn feed_i128(h: &mut Xxh3, x: i128) {
103    h.update(&x.to_be_bytes());
104}
105fn feed_u8(h: &mut Xxh3, x: u8) {
106    h.update(&[x]);
107}
108fn feed_u32(h: &mut Xxh3, x: u32) {
109    h.update(&x.to_be_bytes());
110}
111fn feed_u64(h: &mut Xxh3, x: u64) {
112    h.update(&x.to_be_bytes());
113}
114fn feed_u128(h: &mut Xxh3, x: u128) {
115    h.update(&x.to_be_bytes());
116}
117fn feed_bytes(h: &mut Xxh3, b: &[u8]) {
118    h.update(b);
119}
120
121#[cfg(test)]
122thread_local! {
123    static TEST_HASH_OVERRIDE: std::cell::Cell<Option<[u8; 16]>> =
124        const { std::cell::Cell::new(None) };
125}
126
127#[cfg(test)]
128#[expect(dead_code)]
129pub(crate) fn with_test_hash_override<T>(hash: [u8; 16], f: impl FnOnce() -> T) -> T {
130    TEST_HASH_OVERRIDE.with(|cell| {
131        let previous = cell.replace(Some(hash));
132        let out = f();
133        cell.set(previous);
134        out
135    })
136}
137
138#[cfg(test)]
139#[allow(clippy::redundant_closure_for_method_calls)]
140fn test_hash_override() -> Option<[u8; 16]> {
141    TEST_HASH_OVERRIDE.with(|cell| cell.get())
142}
143
144#[allow(clippy::cast_possible_truncation)]
145#[allow(clippy::too_many_lines)]
146fn write_to_hasher(value: &Value, h: &mut Xxh3) -> Result<(), InternalError> {
147    feed_u8(h, value_tag(value));
148
149    match value {
150        Value::Account(a) => {
151            let bytes = a.to_bytes().map_err(|err| {
152                InternalError::new(
153                    ErrorClass::Unsupported,
154                    ErrorOrigin::Serialize,
155                    err.to_string(),
156                )
157            })?;
158            feed_bytes(h, &bytes);
159        }
160        Value::Blob(v) => {
161            feed_u8(h, 0x01);
162            feed_u32(h, v.len() as u32);
163            feed_bytes(h, v);
164        }
165        Value::Bool(b) => {
166            feed_u8(h, u8::from(*b));
167        }
168        Value::Date(d) => feed_i32(h, d.get()),
169        Value::Decimal(d) => {
170            // encode (sign, scale, mantissa) deterministically:
171            feed_u8(h, u8::from(d.is_sign_negative()));
172            feed_u32(h, d.scale());
173            feed_bytes(h, &d.mantissa().to_be_bytes());
174        }
175        Value::Duration(t) => {
176            feed_u64(h, t.get());
177        }
178        Value::Enum(v) => {
179            match &v.path {
180                Some(path) => {
181                    feed_u8(h, 0x01); // path present
182                    feed_u32(h, path.len() as u32);
183                    feed_bytes(h, path.as_bytes());
184                }
185                None => feed_u8(h, 0x00), // path absent -> loose match
186            }
187
188            feed_u32(h, v.variant.len() as u32);
189            feed_bytes(h, v.variant.as_bytes());
190
191            match &v.payload {
192                Some(payload) => {
193                    feed_u8(h, 0x01); // payload present
194                    write_to_hasher(payload, h)?; // include nested value
195                }
196                None => feed_u8(h, 0x00),
197            }
198        }
199        Value::E8s(v) => {
200            feed_u64(h, v.get());
201        }
202        Value::E18s(v) => {
203            feed_bytes(h, &v.to_be_bytes());
204        }
205        Value::Float32(v) => {
206            feed_bytes(h, &v.to_be_bytes());
207        }
208        Value::Float64(v) => {
209            feed_bytes(h, &v.to_be_bytes());
210        }
211        Value::Int(i) => {
212            feed_i64(h, *i);
213        }
214        Value::Int128(i) => {
215            feed_i128(h, i.get());
216        }
217        Value::IntBig(v) => {
218            let bytes = v.to_leb128();
219            feed_u32(h, bytes.len() as u32);
220            feed_bytes(h, &bytes);
221        }
222        Value::List(xs) => {
223            feed_u32(h, xs.len() as u32);
224            for x in xs {
225                feed_u8(h, 0xFF);
226                write_to_hasher(x, h)?; // recurse, no sub-hash
227            }
228        }
229        Value::Map(entries) => {
230            feed_u32(h, entries.len() as u32);
231            for (key, value) in entries {
232                feed_u8(h, 0xFD);
233                write_to_hasher(key, h)?;
234                feed_u8(h, 0xFE);
235                write_to_hasher(value, h)?;
236            }
237        }
238        Value::Principal(p) => {
239            let raw = p.to_bytes().map_err(|err| {
240                InternalError::new(
241                    ErrorClass::Unsupported,
242                    ErrorOrigin::Serialize,
243                    err.to_string(),
244                )
245            })?;
246            feed_u32(h, raw.len() as u32);
247            feed_bytes(h, &raw);
248        }
249        Value::Subaccount(s) => {
250            feed_bytes(h, &s.to_bytes());
251        }
252        Value::Text(s) => {
253            // If you need case/Unicode insensitivity, normalize; else skip (much faster)
254            // let norm = normalize_nfkc_casefold(s);
255            // feed_u32( h, norm.len() as u32);
256            // feed_bytes( h, norm.as_bytes());
257            feed_u32(h, s.len() as u32);
258            feed_bytes(h, s.as_bytes());
259        }
260        Value::Timestamp(t) => {
261            feed_u64(h, t.get());
262        }
263        Value::Uint(u) => {
264            feed_u64(h, *u);
265        }
266        Value::Uint128(u) => {
267            feed_u128(h, u.get());
268        }
269        Value::UintBig(v) => {
270            let bytes = v.to_leb128();
271            feed_u32(h, bytes.len() as u32);
272            feed_bytes(h, &bytes);
273        }
274        Value::Ulid(u) => {
275            feed_bytes(h, &u.to_bytes());
276        }
277        Value::None | Value::Unit | Value::Unsupported => {
278            // NOTE: Non-indexable values intentionally contribute no hash input.
279        }
280    }
281
282    Ok(())
283}
284
285/// Stable hash used for index/storage fingerprints.
286pub fn hash_value(value: &Value) -> Result<[u8; 16], InternalError> {
287    const VERSION: u8 = 1;
288
289    #[cfg(test)]
290    if let Some(override_hash) = test_hash_override() {
291        return Ok(override_hash);
292    }
293
294    let mut h = Xxh3::with_seed(0);
295    feed_u8(&mut h, VERSION); // version
296
297    write_to_hasher(value, &mut h)?;
298    Ok(h.digest128().to_be_bytes())
299}
300
301/// Index fingerprint semantics:
302///
303/// - Only indexable values produce fingerprints.
304/// - `Value::None` and `Value::Unsupported` do not produce fingerprints and
305///   therefore do not participate in indexing.
306/// - For unique indexes, uniqueness is enforced only over indexable values.
307///   Multiple rows with non-indexable values are permitted.
308///
309/// This behavior matches SQL-style UNIQUE constraints with NULL values.
310///
311/// Stable 128-bit hash used for index keys; returns `None` for non-indexable values.
312pub fn to_index_fingerprint(value: &Value) -> Result<Option<[u8; 16]>, InternalError> {
313    if matches!(value, Value::None | Value::Unsupported) {
314        // Intentionally skipped: non-indexable values do not participate in indexes.
315        return Ok(None);
316    }
317
318    Ok(Some(hash_value(value)?))
319}
320
321///
322/// TESTS
323///
324
325#[cfg(test)]
326mod tests {
327    use super::*;
328    use crate::{
329        types::{Float32 as F32, Float64 as F64},
330        value::{Value, ValueEnum},
331    };
332
333    fn v_f64(x: f64) -> Value {
334        Value::Float64(F64::try_new(x).expect("finite f64"))
335    }
336    fn v_f32(x: f32) -> Value {
337        Value::Float32(F32::try_new(x).expect("finite f32"))
338    }
339    fn v_i(x: i64) -> Value {
340        Value::Int(x)
341    }
342    fn v_txt(s: &str) -> Value {
343        Value::Text(s.to_string())
344    }
345
346    #[test]
347    fn hash_is_deterministic_for_int() {
348        let v = Value::Int(42);
349        let a = hash_value(&v).expect("hash value");
350        let b = hash_value(&v).expect("hash value");
351        assert_eq!(a, b, "hash should be deterministic for same value");
352    }
353
354    #[test]
355    fn different_variants_produce_different_hashes() {
356        let a = hash_value(&Value::Int(5)).expect("hash value");
357        let b = hash_value(&Value::Uint(5)).expect("hash value");
358        assert_ne!(
359            a, b,
360            "Int(5) and Uint(5) must hash differently (different tag)"
361        );
362    }
363
364    #[test]
365    fn enum_hash_tracks_path_presence() {
366        let strict = Value::Enum(ValueEnum::new("A", Some("MyEnum")));
367        let loose = Value::Enum(ValueEnum::new("A", None));
368        assert_ne!(
369            hash_value(&strict).expect("hash value"),
370            hash_value(&loose).expect("hash value"),
371            "Enum hashes must differ when path is present vs absent"
372        );
373    }
374
375    #[test]
376    fn enum_hash_includes_payload() {
377        let base = ValueEnum::new("A", Some("MyEnum"));
378        let with_one = Value::Enum(base.clone().with_payload(Value::Uint(1)));
379        let with_two = Value::Enum(base.with_payload(Value::Uint(2)));
380
381        assert_ne!(
382            hash_value(&with_one).expect("hash value"),
383            hash_value(&with_two).expect("hash value"),
384            "Enum payload must influence hash/fingerprint"
385        );
386    }
387
388    #[test]
389    fn float32_and_float64_hash_differ() {
390        let a = hash_value(&v_f32(1.0)).expect("hash value");
391        let b = hash_value(&v_f64(1.0)).expect("hash value");
392        assert_ne!(
393            a, b,
394            "Float32 and Float64 must hash differently (different tag)"
395        );
396    }
397
398    #[test]
399    fn text_is_length_and_content_sensitive() {
400        let a = hash_value(&v_txt("foo")).expect("hash value");
401        let b = hash_value(&v_txt("bar")).expect("hash value");
402        assert_ne!(a, b, "different strings should hash differently");
403
404        let c = hash_value(&v_txt("foo")).expect("hash value");
405        assert_eq!(a, c, "same string should hash the same");
406    }
407
408    #[test]
409    fn list_hash_is_order_sensitive() {
410        let l1 = Value::from_slice(&[v_i(1), v_i(2)]);
411        let l2 = Value::from_slice(&[v_i(2), v_i(1)]);
412        assert_ne!(
413            hash_value(&l1).expect("hash value"),
414            hash_value(&l2).expect("hash value"),
415            "list order should affect hash"
416        );
417    }
418
419    #[test]
420    fn list_hash_is_length_sensitive() {
421        let l1 = Value::from_slice(&[v_i(1)]);
422        let l2 = Value::from_slice(&[v_i(1), v_i(1)]);
423        assert_ne!(
424            hash_value(&l1).expect("hash value"),
425            hash_value(&l2).expect("hash value"),
426            "list length should affect hash"
427        );
428    }
429
430    #[test]
431    fn list_blob_boundaries_are_length_framed() {
432        let left = Value::List(vec![
433            Value::Blob(vec![0x10, 0xFF, 0x02, 0x11]),
434            Value::Blob(vec![0x12]),
435        ]);
436        let right = Value::List(vec![
437            Value::Blob(vec![0x10]),
438            Value::Blob(vec![0x11, 0xFF, 0x02, 0x12]),
439        ]);
440
441        assert_ne!(
442            hash_value(&left).expect("hash value"),
443            hash_value(&right).expect("hash value"),
444            "blob boundaries must be length-framed to avoid collisions"
445        );
446    }
447}