Skip to main content

trs_dataframe/dataframe/
index.rs

1use std::hash::{Hash, Hasher};
2
3use data_value::DataValue;
4use halfbrown::HashMap;
5
6use super::{column_store::ColumnFrame, Key};
7
8/// Fast non-allocating hash for DataValue
9fn hash_datavalue(value: &DataValue) -> u64 {
10    use data_value::DataValue::*;
11
12    let mut hasher = std::collections::hash_map::DefaultHasher::new();
13
14    // Write a discriminant byte to differentiate types
15    match value {
16        Null => hasher.write_u8(0),
17        Bool(b) => {
18            hasher.write_u8(1);
19            hasher.write_u8(*b as u8);
20        }
21        U8(u) => {
22            hasher.write_u8(2);
23            hasher.write_u8(*u);
24        }
25        U32(u) => {
26            hasher.write_u8(3);
27            hasher.write_u32(*u);
28        }
29        I32(i) => {
30            hasher.write_u8(4);
31            hasher.write_i32(*i);
32        }
33        U64(u) => {
34            hasher.write_u8(5);
35            hasher.write_u64(*u);
36        }
37        I64(i) => {
38            hasher.write_u8(6);
39            hasher.write_i64(*i);
40        }
41        F32(f) => {
42            hasher.write_u8(7);
43            hasher.write_u32(f.to_bits());
44        }
45        F64(f) => {
46            hasher.write_u8(8);
47            hasher.write_u64(f.to_bits());
48        }
49        U128(u) => {
50            hasher.write_u8(9);
51            hasher.write_u64((*u >> 64) as u64);
52            hasher.write_u64(*u as u64);
53        }
54        I128(i) => {
55            hasher.write_u8(10);
56            let u = *i as u128;
57            hasher.write_u64((u >> 64) as u64);
58            hasher.write_u64(u as u64);
59        }
60        String(s) => {
61            hasher.write_u8(11);
62            hasher.write(s.as_bytes());
63        }
64        Bytes(b) => {
65            hasher.write_u8(12);
66            hasher.write(b);
67        }
68        Vec(v) => {
69            hasher.write_u8(13);
70            hasher.write_usize(v.len());
71            for item in v {
72                // Recursive hashing - combine child hashes
73                let item_hash = hash_datavalue(item);
74                hasher.write_u64(item_hash);
75            }
76        }
77        Map(m) => {
78            hasher.write_u8(14);
79            hasher.write_usize(m.len());
80            // Sort keys for consistent hashing
81            let mut keys: std::vec::Vec<_> = m.keys().collect();
82            keys.sort();
83            for k in keys {
84                hasher.write(k.as_bytes());
85                if let Some(v) = m.get(k) {
86                    let val_hash = hash_datavalue(v);
87                    hasher.write_u64(val_hash);
88                }
89            }
90        }
91        EnumNumber(e) => {
92            hasher.write_u8(15);
93            hasher.write_i32(*e);
94        }
95    }
96
97    hasher.finish()
98}
99
100#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
101struct VecIndex {
102    hash: u64,
103}
104
105impl VecIndex {
106    pub fn new(value: &[DataValue]) -> Self {
107        // Combine hashes of all values using a simple combiner
108        // This is much faster than formatting to strings
109        let mut combined_hash: u64 = 0;
110        for v in value.iter() {
111            let h = hash_datavalue(v);
112            // Rotate and XOR for better distribution
113            combined_hash = combined_hash.wrapping_mul(31).wrapping_add(h);
114        }
115        Self {
116            hash: combined_hash,
117        }
118    }
119}
120impl Hash for VecIndex {
121    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
122        state.write_u64(self.hash);
123    }
124}
125
126impl From<&[DataValue]> for VecIndex {
127    fn from(value: &[DataValue]) -> Self {
128        Self::new(value)
129    }
130}
131
132#[derive(Debug)]
133pub struct Index {
134    index: HashMap<VecIndex, usize>,
135}
136
137impl Index {
138    /// Create the index for the given keys and the [`ColumnFrame`] for the given keys.
139    /// This will enumerate the values and store them in the index with current values
140    pub fn new(key: Vec<Key>, df: &ColumnFrame) -> Self {
141        let selected = df.select(Some(key.as_slice()));
142        let mut this = Self {
143            index: HashMap::new(),
144        };
145
146        for (index, candidate) in selected.rows().into_iter().enumerate() {
147            if let Some(slice) = candidate.as_slice() {
148                this.index.insert(VecIndex::from(slice), index);
149            }
150        }
151        this
152    }
153
154    pub fn get(&self, values: &[DataValue]) -> Option<usize> {
155        self.index.get(&VecIndex::from(values)).cloned()
156    }
157
158    pub fn join(self, other: Index) -> Vec<(usize, Option<usize>)> {
159        let mut output = Vec::with_capacity(self.index.len());
160        for (index, left_index) in self.index.into_iter() {
161            let idx = other.index.get(&index);
162            output.push((left_index, idx.cloned()));
163        }
164
165        output
166    }
167}
168
169#[cfg(test)]
170mod tests {
171    use super::*;
172    use data_value::DataValue;
173    use std::collections::HashMap;
174
175    #[test]
176    fn test_hash_datavalue_null() {
177        let val = DataValue::Null;
178        let hash1 = hash_datavalue(&val);
179        let hash2 = hash_datavalue(&val);
180        assert_eq!(hash1, hash2);
181    }
182
183    #[test]
184    fn test_hash_datavalue_bool() {
185        let true_val = DataValue::Bool(true);
186        let false_val = DataValue::Bool(false);
187
188        let hash_true = hash_datavalue(&true_val);
189        let hash_false = hash_datavalue(&false_val);
190
191        assert_eq!(hash_true, hash_datavalue(&true_val));
192        assert_eq!(hash_false, hash_datavalue(&false_val));
193        assert_ne!(hash_true, hash_false);
194    }
195
196    #[test]
197    fn test_hash_datavalue_u8() {
198        let val1 = DataValue::U8(0);
199        let val2 = DataValue::U8(255);
200        let val3 = DataValue::U8(42);
201
202        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
203        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
204        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
205        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
206    }
207
208    #[test]
209    fn test_hash_datavalue_u32() {
210        let val1 = DataValue::U32(0);
211        let val2 = DataValue::U32(u32::MAX);
212        let val3 = DataValue::U32(12345);
213
214        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
215        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
216        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
217        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
218    }
219
220    #[test]
221    fn test_hash_datavalue_i32() {
222        let val1 = DataValue::I32(0);
223        let val2 = DataValue::I32(-1);
224        let val3 = DataValue::I32(i32::MAX);
225        let val4 = DataValue::I32(i32::MIN);
226
227        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
228        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
229        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
230    }
231
232    #[test]
233    fn test_hash_datavalue_u64() {
234        let val1 = DataValue::U64(0);
235        let val2 = DataValue::U64(u64::MAX);
236        let val3 = DataValue::U64(123456789);
237
238        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
239        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
240        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
241        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
242    }
243
244    #[test]
245    fn test_hash_datavalue_i64() {
246        let val1 = DataValue::I64(0);
247        let val2 = DataValue::I64(-1);
248        let val3 = DataValue::I64(i64::MAX);
249        let val4 = DataValue::I64(i64::MIN);
250
251        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
252        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
253        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
254    }
255
256    #[test]
257    fn test_hash_datavalue_f32() {
258        let val1 = DataValue::F32(0.0);
259        let val2 = DataValue::F32(-0.0);
260        let val3 = DataValue::F32(3.14);
261        let val4 = DataValue::F32(f32::INFINITY);
262        let val5 = DataValue::F32(f32::NEG_INFINITY);
263        let val6 = DataValue::F32(f32::NAN);
264
265        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
266        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
267        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
268        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
269        assert_eq!(
270            hash_datavalue(&val6),
271            hash_datavalue(&DataValue::F32(f32::NAN))
272        );
273    }
274
275    #[test]
276    fn test_hash_datavalue_f64() {
277        let val1 = DataValue::F64(0.0);
278        let val2 = DataValue::F64(-0.0);
279        let val3 = DataValue::F64(3.14159);
280        let val4 = DataValue::F64(f64::INFINITY);
281        let val5 = DataValue::F64(f64::NEG_INFINITY);
282        let val6 = DataValue::F64(f64::NAN);
283
284        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
285        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
286        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
287        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
288        assert_eq!(
289            hash_datavalue(&val6),
290            hash_datavalue(&DataValue::F64(f64::NAN))
291        );
292    }
293
294    #[test]
295    fn test_hash_datavalue_u128() {
296        let val1 = DataValue::U128(0);
297        let val2 = DataValue::U128(u128::MAX);
298        let val3 = DataValue::U128(12345678901234567890);
299
300        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
301        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
302        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
303        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
304    }
305
306    #[test]
307    fn test_hash_datavalue_i128() {
308        let val1 = DataValue::I128(0);
309        let val2 = DataValue::I128(-1);
310        let val3 = DataValue::I128(i128::MAX);
311        let val4 = DataValue::I128(i128::MIN);
312
313        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
314        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
315        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
316    }
317
318    #[test]
319    fn test_hash_datavalue_string() {
320        let val1 = DataValue::String("hello".into());
321        let val2 = DataValue::String("world".into());
322        let val3 = DataValue::String("hello".into());
323        let val4 = DataValue::String("".into());
324
325        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val3));
326        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
327        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
328        assert_eq!(hash_datavalue(&val4), hash_datavalue(&val4));
329    }
330
331    #[test]
332    fn test_hash_datavalue_bytes() {
333        let val1 = DataValue::Bytes(vec![1, 2, 3]);
334        let val2 = DataValue::Bytes(vec![1, 2, 3]);
335        let val3 = DataValue::Bytes(vec![3, 2, 1]);
336        let val4 = DataValue::Bytes(vec![]);
337
338        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
339        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
340        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
341        assert_eq!(
342            hash_datavalue(&val4),
343            hash_datavalue(&DataValue::Bytes(vec![]))
344        );
345    }
346
347    #[test]
348    fn test_hash_datavalue_vec_empty() {
349        let val = DataValue::Vec(vec![]);
350        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
351    }
352
353    #[test]
354    fn test_hash_datavalue_vec_basic() {
355        let val1 = DataValue::Vec(vec![
356            DataValue::I32(1),
357            DataValue::I32(2),
358            DataValue::I32(3),
359        ]);
360        let val2 = DataValue::Vec(vec![
361            DataValue::I32(1),
362            DataValue::I32(2),
363            DataValue::I32(3),
364        ]);
365        let val3 = DataValue::Vec(vec![
366            DataValue::I32(3),
367            DataValue::I32(2),
368            DataValue::I32(1),
369        ]);
370
371        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
372        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
373    }
374
375    #[test]
376    fn test_hash_datavalue_vec_nested() {
377        let val1 = DataValue::Vec(vec![
378            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
379            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
380        ]);
381        let val2 = DataValue::Vec(vec![
382            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
383            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
384        ]);
385        let val3 = DataValue::Vec(vec![
386            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
387            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(5)]),
388        ]);
389
390        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
391        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
392    }
393
394    #[test]
395    fn test_hash_datavalue_vec_mixed_types() {
396        let val = DataValue::Vec(vec![
397            DataValue::I32(42),
398            DataValue::String("test".into()),
399            DataValue::Bool(true),
400            DataValue::Null,
401        ]);
402
403        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
404    }
405
406    #[test]
407    fn test_hash_datavalue_map_empty() {
408        let val = DataValue::Map(HashMap::new());
409        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
410    }
411
412    #[test]
413    fn test_hash_datavalue_map_basic() {
414        let mut map1 = HashMap::new();
415        map1.insert("key1".into(), DataValue::I32(1));
416        map1.insert("key2".into(), DataValue::I32(2));
417
418        let mut map2 = HashMap::new();
419        map2.insert("key1".into(), DataValue::I32(1));
420        map2.insert("key2".into(), DataValue::I32(2));
421
422        let val1 = DataValue::Map(map1);
423        let val2 = DataValue::Map(map2);
424
425        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
426    }
427
428    #[test]
429    fn test_hash_datavalue_map_insertion_order_independence() {
430        let mut map1 = HashMap::new();
431        map1.insert("a".into(), DataValue::I32(1));
432        map1.insert("b".into(), DataValue::I32(2));
433        map1.insert("c".into(), DataValue::I32(3));
434
435        let mut map2 = HashMap::new();
436        map2.insert("c".into(), DataValue::I32(3));
437        map2.insert("a".into(), DataValue::I32(1));
438        map2.insert("b".into(), DataValue::I32(2));
439
440        let val1 = DataValue::Map(map1);
441        let val2 = DataValue::Map(map2);
442
443        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
444    }
445
446    #[test]
447    fn test_hash_datavalue_map_different_values() {
448        let mut map1 = HashMap::new();
449        map1.insert("key".into(), DataValue::I32(1));
450
451        let mut map2 = HashMap::new();
452        map2.insert("key".into(), DataValue::I32(2));
453
454        let val1 = DataValue::Map(map1);
455        let val2 = DataValue::Map(map2);
456
457        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
458    }
459
460    #[test]
461    fn test_hash_datavalue_map_nested() {
462        let mut inner_map = HashMap::new();
463        inner_map.insert("inner".into(), DataValue::I32(42));
464
465        let mut outer_map = HashMap::new();
466        outer_map.insert("outer".into(), DataValue::Map(inner_map.clone()));
467
468        let mut outer_map2 = HashMap::new();
469        outer_map2.insert("outer".into(), DataValue::Map(inner_map));
470
471        let val1 = DataValue::Map(outer_map);
472        let val2 = DataValue::Map(outer_map2);
473
474        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
475    }
476
477    #[test]
478    fn test_hash_datavalue_enum_number() {
479        let val1 = DataValue::EnumNumber(0);
480        let val2 = DataValue::EnumNumber(1);
481        let val3 = DataValue::EnumNumber(-1);
482        let val4 = DataValue::EnumNumber(0);
483
484        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val4));
485        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
486        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
487    }
488
489    #[test]
490    fn test_hash_datavalue_type_differentiation() {
491        let u32_val = DataValue::U32(42);
492        let i32_val = DataValue::I32(42);
493        let u64_val = DataValue::U64(42);
494        let i64_val = DataValue::I64(42);
495        let f32_val = DataValue::F32(42.0);
496        let f64_val = DataValue::F64(42.0);
497
498        let u32_hash = hash_datavalue(&u32_val);
499        let i32_hash = hash_datavalue(&i32_val);
500        let u64_hash = hash_datavalue(&u64_val);
501        let i64_hash = hash_datavalue(&i64_val);
502        let f32_hash = hash_datavalue(&f32_val);
503        let f64_hash = hash_datavalue(&f64_val);
504
505        assert_ne!(u32_hash, i32_hash);
506        assert_ne!(u32_hash, u64_hash);
507        assert_ne!(u32_hash, i64_hash);
508        assert_ne!(i32_hash, u64_hash);
509        assert_ne!(i32_hash, i64_hash);
510        assert_ne!(u64_hash, i64_hash);
511        assert_ne!(f32_hash, f64_hash);
512    }
513
514    #[test]
515    fn test_hash_datavalue_null_vs_zero() {
516        let null_val = DataValue::Null;
517        let zero_i32 = DataValue::I32(0);
518        let zero_u32 = DataValue::U32(0);
519        let false_bool = DataValue::Bool(false);
520
521        let null_hash = hash_datavalue(&null_val);
522        let zero_i32_hash = hash_datavalue(&zero_i32);
523        let zero_u32_hash = hash_datavalue(&zero_u32);
524        let false_hash = hash_datavalue(&false_bool);
525
526        assert_ne!(null_hash, zero_i32_hash);
527        assert_ne!(null_hash, zero_u32_hash);
528        assert_ne!(null_hash, false_hash);
529    }
530
531    #[test]
532    fn test_hash_datavalue_string_vs_bytes() {
533        let string_val = DataValue::String("hello".into());
534        let bytes_val = DataValue::Bytes(b"hello".to_vec());
535
536        assert_ne!(hash_datavalue(&string_val), hash_datavalue(&bytes_val));
537    }
538
539    #[test]
540    fn test_hash_datavalue_consistency() {
541        let test_values = vec![
542            DataValue::Null,
543            DataValue::Bool(true),
544            DataValue::U8(255),
545            DataValue::I32(-42),
546            DataValue::String("test".into()),
547            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
548        ];
549
550        for val in test_values {
551            let hash1 = hash_datavalue(&val);
552            let hash2 = hash_datavalue(&val);
553            let hash3 = hash_datavalue(&val);
554            assert_eq!(hash1, hash2);
555            assert_eq!(hash2, hash3);
556        }
557    }
558}