Skip to main content

trs_dataframe/dataframe/
index.rs

1use std::hash::{Hash, Hasher};
2
3use data_value::DataValue;
4use halfbrown::HashMap;
5
6use super::{column_store::ColumnFrame, Key};
7
8/// Fast non-allocating hash for DataValue
9fn hash_datavalue(value: &DataValue) -> u64 {
10    use data_value::DataValue::*;
11
12    let mut hasher = std::collections::hash_map::DefaultHasher::new();
13
14    // Write a discriminant byte to differentiate types
15    match value {
16        Null => hasher.write_u8(0),
17        Bool(b) => {
18            hasher.write_u8(1);
19            hasher.write_u8(*b as u8);
20        }
21        U8(u) => {
22            hasher.write_u8(2);
23            hasher.write_u8(*u);
24        }
25        U32(u) => {
26            hasher.write_u8(3);
27            hasher.write_u32(*u);
28        }
29        I32(i) => {
30            hasher.write_u8(4);
31            hasher.write_i32(*i);
32        }
33        U64(u) => {
34            hasher.write_u8(5);
35            hasher.write_u64(*u);
36        }
37        I64(i) => {
38            hasher.write_u8(6);
39            hasher.write_i64(*i);
40        }
41        F32(f) => {
42            hasher.write_u8(7);
43            hasher.write_u32(f.to_bits());
44        }
45        F64(f) => {
46            hasher.write_u8(8);
47            hasher.write_u64(f.to_bits());
48        }
49        U128(u) => {
50            hasher.write_u8(9);
51            hasher.write_u64((*u >> 64) as u64);
52            hasher.write_u64(*u as u64);
53        }
54        I128(i) => {
55            hasher.write_u8(10);
56            let u = *i as u128;
57            hasher.write_u64((u >> 64) as u64);
58            hasher.write_u64(u as u64);
59        }
60        String(s) => {
61            hasher.write_u8(11);
62            hasher.write(s.as_bytes());
63        }
64        Bytes(b) => {
65            hasher.write_u8(12);
66            hasher.write(b);
67        }
68        Vec(v) => {
69            hasher.write_u8(13);
70            hasher.write_usize(v.len());
71            for item in v {
72                // Recursive hashing - combine child hashes
73                let item_hash = hash_datavalue(item);
74                hasher.write_u64(item_hash);
75            }
76        }
77        Map(m) => {
78            hasher.write_u8(14);
79            hasher.write_usize(m.len());
80            // Sort keys for consistent hashing
81            let mut keys: std::vec::Vec<_> = m.keys().collect();
82            keys.sort();
83            for k in keys {
84                hasher.write(k.as_bytes());
85                if let Some(v) = m.get(k) {
86                    let val_hash = hash_datavalue(v);
87                    hasher.write_u64(val_hash);
88                }
89            }
90        }
91        EnumNumber(e) => {
92            hasher.write_u8(15);
93            hasher.write_i32(*e);
94        }
95    }
96
97    hasher.finish()
98}
99
100#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
101struct VecIndex {
102    hash: u64,
103}
104
105impl VecIndex {
106    pub fn new(value: &[DataValue]) -> Self {
107        // Combine hashes of all values using a simple combiner
108        // This is much faster than formatting to strings
109        let mut combined_hash: u64 = 0;
110        for v in value.iter() {
111            let h = hash_datavalue(v);
112            // Rotate and XOR for better distribution
113            combined_hash = combined_hash.wrapping_mul(31).wrapping_add(h);
114        }
115        Self {
116            hash: combined_hash,
117        }
118    }
119}
120impl Hash for VecIndex {
121    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
122        state.write_u64(self.hash);
123    }
124}
125
126impl From<&[DataValue]> for VecIndex {
127    fn from(value: &[DataValue]) -> Self {
128        Self::new(value)
129    }
130}
131
132#[derive(Debug)]
133pub struct Index {
134    index: HashMap<VecIndex, Vec<usize>>,
135}
136
137impl Index {
138    /// Create the index for the given keys and the [`ColumnFrame`] for the given keys.
139    /// This will enumerate the values and store them in the index with current values
140    pub fn new(key: Vec<Key>, df: &ColumnFrame) -> Self {
141        let selected = df.select(Some(key.as_slice()));
142        let mut this = Self {
143            index: HashMap::new(),
144        };
145
146        for (index, candidate) in selected.rows().into_iter().enumerate() {
147            if let Some(slice) = candidate.as_slice() {
148                this.index
149                    .entry(VecIndex::from(slice))
150                    .or_default()
151                    .push(index);
152            }
153        }
154        this
155    }
156
157    pub fn get(&self, values: &[DataValue]) -> Option<&[usize]> {
158        self.index
159            .get(&VecIndex::from(values))
160            .map(|idx| idx.as_slice())
161    }
162
163    pub fn join(self, other: Index) -> Vec<(Vec<usize>, Vec<usize>)> {
164        let mut output = Vec::with_capacity(self.index.len());
165        for (index, left_index) in self.index.into_iter() {
166            if let Some(right_idx) = other.index.get(&index) {
167                output.push((left_index, right_idx.clone()));
168            }
169        }
170
171        output
172    }
173}
174
175#[cfg(test)]
176mod tests {
177    use super::*;
178    use data_value::DataValue;
179    use std::collections::HashMap;
180
181    #[test]
182    fn test_hash_datavalue_null() {
183        let val = DataValue::Null;
184        let hash1 = hash_datavalue(&val);
185        let hash2 = hash_datavalue(&val);
186        assert_eq!(hash1, hash2);
187    }
188
189    #[test]
190    fn test_hash_datavalue_bool() {
191        let true_val = DataValue::Bool(true);
192        let false_val = DataValue::Bool(false);
193
194        let hash_true = hash_datavalue(&true_val);
195        let hash_false = hash_datavalue(&false_val);
196
197        assert_eq!(hash_true, hash_datavalue(&true_val));
198        assert_eq!(hash_false, hash_datavalue(&false_val));
199        assert_ne!(hash_true, hash_false);
200    }
201
202    #[test]
203    fn test_hash_datavalue_u8() {
204        let val1 = DataValue::U8(0);
205        let val2 = DataValue::U8(255);
206        let val3 = DataValue::U8(42);
207
208        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
209        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
210        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
211        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
212    }
213
214    #[test]
215    fn test_hash_datavalue_u32() {
216        let val1 = DataValue::U32(0);
217        let val2 = DataValue::U32(u32::MAX);
218        let val3 = DataValue::U32(12345);
219
220        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
221        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
222        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
223        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
224    }
225
226    #[test]
227    fn test_hash_datavalue_i32() {
228        let val1 = DataValue::I32(0);
229        let val2 = DataValue::I32(-1);
230        let val3 = DataValue::I32(i32::MAX);
231        let val4 = DataValue::I32(i32::MIN);
232
233        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
234        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
235        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
236    }
237
238    #[test]
239    fn test_hash_datavalue_u64() {
240        let val1 = DataValue::U64(0);
241        let val2 = DataValue::U64(u64::MAX);
242        let val3 = DataValue::U64(123456789);
243
244        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
245        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
246        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
247        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
248    }
249
250    #[test]
251    fn test_hash_datavalue_i64() {
252        let val1 = DataValue::I64(0);
253        let val2 = DataValue::I64(-1);
254        let val3 = DataValue::I64(i64::MAX);
255        let val4 = DataValue::I64(i64::MIN);
256
257        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
258        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
259        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
260    }
261
262    #[test]
263    fn test_hash_datavalue_f32() {
264        let val1 = DataValue::F32(0.0);
265        let val2 = DataValue::F32(-0.0);
266        let val3 = DataValue::F32(3.14);
267        let val4 = DataValue::F32(f32::INFINITY);
268        let val5 = DataValue::F32(f32::NEG_INFINITY);
269        let val6 = DataValue::F32(f32::NAN);
270
271        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
272        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
273        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
274        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
275        assert_eq!(
276            hash_datavalue(&val6),
277            hash_datavalue(&DataValue::F32(f32::NAN))
278        );
279    }
280
281    #[test]
282    fn test_hash_datavalue_f64() {
283        let val1 = DataValue::F64(0.0);
284        let val2 = DataValue::F64(-0.0);
285        let val3 = DataValue::F64(3.14159);
286        let val4 = DataValue::F64(f64::INFINITY);
287        let val5 = DataValue::F64(f64::NEG_INFINITY);
288        let val6 = DataValue::F64(f64::NAN);
289
290        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
291        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
292        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
293        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
294        assert_eq!(
295            hash_datavalue(&val6),
296            hash_datavalue(&DataValue::F64(f64::NAN))
297        );
298    }
299
300    #[test]
301    fn test_hash_datavalue_u128() {
302        let val1 = DataValue::U128(0);
303        let val2 = DataValue::U128(u128::MAX);
304        let val3 = DataValue::U128(12345678901234567890);
305
306        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
307        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
308        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
309        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
310    }
311
312    #[test]
313    fn test_hash_datavalue_i128() {
314        let val1 = DataValue::I128(0);
315        let val2 = DataValue::I128(-1);
316        let val3 = DataValue::I128(i128::MAX);
317        let val4 = DataValue::I128(i128::MIN);
318
319        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
320        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
321        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
322    }
323
324    #[test]
325    fn test_hash_datavalue_string() {
326        let val1 = DataValue::String("hello".into());
327        let val2 = DataValue::String("world".into());
328        let val3 = DataValue::String("hello".into());
329        let val4 = DataValue::String("".into());
330
331        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val3));
332        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
333        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
334        assert_eq!(hash_datavalue(&val4), hash_datavalue(&val4));
335    }
336
337    #[test]
338    fn test_hash_datavalue_bytes() {
339        let val1 = DataValue::Bytes(vec![1, 2, 3]);
340        let val2 = DataValue::Bytes(vec![1, 2, 3]);
341        let val3 = DataValue::Bytes(vec![3, 2, 1]);
342        let val4 = DataValue::Bytes(vec![]);
343
344        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
345        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
346        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
347        assert_eq!(
348            hash_datavalue(&val4),
349            hash_datavalue(&DataValue::Bytes(vec![]))
350        );
351    }
352
353    #[test]
354    fn test_hash_datavalue_vec_empty() {
355        let val = DataValue::Vec(vec![]);
356        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
357    }
358
359    #[test]
360    fn test_hash_datavalue_vec_basic() {
361        let val1 = DataValue::Vec(vec![
362            DataValue::I32(1),
363            DataValue::I32(2),
364            DataValue::I32(3),
365        ]);
366        let val2 = DataValue::Vec(vec![
367            DataValue::I32(1),
368            DataValue::I32(2),
369            DataValue::I32(3),
370        ]);
371        let val3 = DataValue::Vec(vec![
372            DataValue::I32(3),
373            DataValue::I32(2),
374            DataValue::I32(1),
375        ]);
376
377        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
378        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
379    }
380
381    #[test]
382    fn test_hash_datavalue_vec_nested() {
383        let val1 = DataValue::Vec(vec![
384            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
385            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
386        ]);
387        let val2 = DataValue::Vec(vec![
388            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
389            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
390        ]);
391        let val3 = DataValue::Vec(vec![
392            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
393            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(5)]),
394        ]);
395
396        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
397        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
398    }
399
400    #[test]
401    fn test_hash_datavalue_vec_mixed_types() {
402        let val = DataValue::Vec(vec![
403            DataValue::I32(42),
404            DataValue::String("test".into()),
405            DataValue::Bool(true),
406            DataValue::Null,
407        ]);
408
409        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
410    }
411
412    #[test]
413    fn test_hash_datavalue_map_empty() {
414        let val = DataValue::Map(HashMap::new());
415        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
416    }
417
418    #[test]
419    fn test_hash_datavalue_map_basic() {
420        let mut map1 = HashMap::new();
421        map1.insert("key1".into(), DataValue::I32(1));
422        map1.insert("key2".into(), DataValue::I32(2));
423
424        let mut map2 = HashMap::new();
425        map2.insert("key1".into(), DataValue::I32(1));
426        map2.insert("key2".into(), DataValue::I32(2));
427
428        let val1 = DataValue::Map(map1);
429        let val2 = DataValue::Map(map2);
430
431        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
432    }
433
434    #[test]
435    fn test_hash_datavalue_map_insertion_order_independence() {
436        let mut map1 = HashMap::new();
437        map1.insert("a".into(), DataValue::I32(1));
438        map1.insert("b".into(), DataValue::I32(2));
439        map1.insert("c".into(), DataValue::I32(3));
440
441        let mut map2 = HashMap::new();
442        map2.insert("c".into(), DataValue::I32(3));
443        map2.insert("a".into(), DataValue::I32(1));
444        map2.insert("b".into(), DataValue::I32(2));
445
446        let val1 = DataValue::Map(map1);
447        let val2 = DataValue::Map(map2);
448
449        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
450    }
451
452    #[test]
453    fn test_hash_datavalue_map_different_values() {
454        let mut map1 = HashMap::new();
455        map1.insert("key".into(), DataValue::I32(1));
456
457        let mut map2 = HashMap::new();
458        map2.insert("key".into(), DataValue::I32(2));
459
460        let val1 = DataValue::Map(map1);
461        let val2 = DataValue::Map(map2);
462
463        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
464    }
465
466    #[test]
467    fn test_hash_datavalue_map_nested() {
468        let mut inner_map = HashMap::new();
469        inner_map.insert("inner".into(), DataValue::I32(42));
470
471        let mut outer_map = HashMap::new();
472        outer_map.insert("outer".into(), DataValue::Map(inner_map.clone()));
473
474        let mut outer_map2 = HashMap::new();
475        outer_map2.insert("outer".into(), DataValue::Map(inner_map));
476
477        let val1 = DataValue::Map(outer_map);
478        let val2 = DataValue::Map(outer_map2);
479
480        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
481    }
482
483    #[test]
484    fn test_hash_datavalue_enum_number() {
485        let val1 = DataValue::EnumNumber(0);
486        let val2 = DataValue::EnumNumber(1);
487        let val3 = DataValue::EnumNumber(-1);
488        let val4 = DataValue::EnumNumber(0);
489
490        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val4));
491        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
492        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
493    }
494
495    #[test]
496    fn test_hash_datavalue_type_differentiation() {
497        let u32_val = DataValue::U32(42);
498        let i32_val = DataValue::I32(42);
499        let u64_val = DataValue::U64(42);
500        let i64_val = DataValue::I64(42);
501        let f32_val = DataValue::F32(42.0);
502        let f64_val = DataValue::F64(42.0);
503
504        let u32_hash = hash_datavalue(&u32_val);
505        let i32_hash = hash_datavalue(&i32_val);
506        let u64_hash = hash_datavalue(&u64_val);
507        let i64_hash = hash_datavalue(&i64_val);
508        let f32_hash = hash_datavalue(&f32_val);
509        let f64_hash = hash_datavalue(&f64_val);
510
511        assert_ne!(u32_hash, i32_hash);
512        assert_ne!(u32_hash, u64_hash);
513        assert_ne!(u32_hash, i64_hash);
514        assert_ne!(i32_hash, u64_hash);
515        assert_ne!(i32_hash, i64_hash);
516        assert_ne!(u64_hash, i64_hash);
517        assert_ne!(f32_hash, f64_hash);
518    }
519
520    #[test]
521    fn test_hash_datavalue_null_vs_zero() {
522        let null_val = DataValue::Null;
523        let zero_i32 = DataValue::I32(0);
524        let zero_u32 = DataValue::U32(0);
525        let false_bool = DataValue::Bool(false);
526
527        let null_hash = hash_datavalue(&null_val);
528        let zero_i32_hash = hash_datavalue(&zero_i32);
529        let zero_u32_hash = hash_datavalue(&zero_u32);
530        let false_hash = hash_datavalue(&false_bool);
531
532        assert_ne!(null_hash, zero_i32_hash);
533        assert_ne!(null_hash, zero_u32_hash);
534        assert_ne!(null_hash, false_hash);
535    }
536
537    #[test]
538    fn test_hash_datavalue_string_vs_bytes() {
539        let string_val = DataValue::String("hello".into());
540        let bytes_val = DataValue::Bytes(b"hello".to_vec());
541
542        assert_ne!(hash_datavalue(&string_val), hash_datavalue(&bytes_val));
543    }
544
545    #[test]
546    fn test_hash_datavalue_consistency() {
547        let test_values = vec![
548            DataValue::Null,
549            DataValue::Bool(true),
550            DataValue::U8(255),
551            DataValue::I32(-42),
552            DataValue::String("test".into()),
553            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
554        ];
555
556        for val in test_values {
557            let hash1 = hash_datavalue(&val);
558            let hash2 = hash_datavalue(&val);
559            let hash3 = hash_datavalue(&val);
560            assert_eq!(hash1, hash2);
561            assert_eq!(hash2, hash3);
562        }
563    }
564}