Skip to main content

trs_dataframe/dataframe/
index.rs

1use std::hash::{Hash, Hasher};
2
3use data_value::DataValue;
4use halfbrown::HashMap;
5
6use super::{column_store::ColumnFrame, Key};
7
8/// Fast non-allocating hash for DataValue
9pub fn hash_datavalue(value: &DataValue) -> u64 {
10    use data_value::DataValue::*;
11
12    let mut hasher = std::collections::hash_map::DefaultHasher::new();
13
14    // Write a discriminant byte to differentiate types
15    match value {
16        Null => hasher.write_u8(0),
17        Bool(b) => {
18            hasher.write_u8(1);
19            hasher.write_u8(*b as u8);
20        }
21        U8(u) => {
22            hasher.write_u8(2);
23            hasher.write_u8(*u);
24        }
25        U32(u) => {
26            hasher.write_u8(3);
27            hasher.write_u32(*u);
28        }
29        I32(i) => {
30            hasher.write_u8(4);
31            hasher.write_i32(*i);
32        }
33        U64(u) => {
34            hasher.write_u8(5);
35            hasher.write_u64(*u);
36        }
37        I64(i) => {
38            hasher.write_u8(6);
39            hasher.write_i64(*i);
40        }
41        F32(f) => {
42            hasher.write_u8(7);
43            hasher.write_u32(f.to_bits());
44        }
45        F64(f) => {
46            hasher.write_u8(8);
47            hasher.write_u64(f.to_bits());
48        }
49        U128(u) => {
50            hasher.write_u8(9);
51            hasher.write_u64((*u >> 64) as u64);
52            hasher.write_u64(*u as u64);
53        }
54        I128(i) => {
55            hasher.write_u8(10);
56            let u = *i as u128;
57            hasher.write_u64((u >> 64) as u64);
58            hasher.write_u64(u as u64);
59        }
60        String(s) => {
61            hasher.write_u8(11);
62            hasher.write(s.as_bytes());
63        }
64        Bytes(b) => {
65            hasher.write_u8(12);
66            hasher.write(b);
67        }
68        Vec(v) => {
69            hasher.write_u8(13);
70            hasher.write_usize(v.len());
71            for item in v {
72                // Recursive hashing - combine child hashes
73                let item_hash = hash_datavalue(item);
74                hasher.write_u64(item_hash);
75            }
76        }
77        Map(m) => {
78            hasher.write_u8(14);
79            hasher.write_usize(m.len());
80            // Sort keys for consistent hashing
81            let mut keys: std::vec::Vec<_> = m.keys().collect();
82            keys.sort();
83            for k in keys {
84                hasher.write(k.as_bytes());
85                if let Some(v) = m.get(k) {
86                    let val_hash = hash_datavalue(v);
87                    hasher.write_u64(val_hash);
88                }
89            }
90        }
91        EnumNumber(e) => {
92            hasher.write_u8(15);
93            hasher.write_i32(*e);
94        }
95    }
96
97    hasher.finish()
98}
99
100#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
101struct VecIndex {
102    hash: u64,
103}
104
105impl VecIndex {
106    pub fn new(value: &[DataValue]) -> Self {
107        // Combine hashes of all values using a simple combiner
108        // This is much faster than formatting to strings
109        let mut combined_hash: u64 = 0;
110        for v in value.iter() {
111            let h = hash_datavalue(v);
112            // Rotate and XOR for better distribution
113            combined_hash = combined_hash.wrapping_mul(31).wrapping_add(h);
114        }
115        Self {
116            hash: combined_hash,
117        }
118    }
119}
120impl Hash for VecIndex {
121    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
122        state.write_u64(self.hash);
123    }
124}
125
126impl From<&[DataValue]> for VecIndex {
127    fn from(value: &[DataValue]) -> Self {
128        Self::new(value)
129    }
130}
131
132/// Hash-based row index for join operations over composite keys.
133#[derive(Debug)]
134pub struct Index {
135    index: HashMap<VecIndex, Vec<usize>>,
136}
137
138impl Index {
139    /// Create the index for the given keys and the [`ColumnFrame`] for the given keys.
140    /// This will enumerate the values and store them in the index with current values
141    pub fn new(key: Vec<Key>, df: &ColumnFrame) -> Self {
142        let selected = df.select(Some(key.as_slice()));
143        let mut this = Self {
144            index: HashMap::new(),
145        };
146
147        for (index, candidate) in selected.rows().into_iter().enumerate() {
148            if let Some(slice) = candidate.as_slice() {
149                this.index
150                    .entry(VecIndex::from(slice))
151                    .or_default()
152                    .push(index);
153            }
154        }
155        this
156    }
157
158    /// Returns the row indices matching the given composite key values, if any.
159    pub fn get(&self, values: &[DataValue]) -> Option<&[usize]> {
160        self.index
161            .get(&VecIndex::from(values))
162            .map(|idx| idx.as_slice())
163    }
164
165    /// Performs an inner hash-join with `other`, returning pairs of matching row-index vectors.
166    pub fn join(self, other: Index) -> Vec<(Vec<usize>, Vec<usize>)> {
167        let mut output = Vec::with_capacity(self.index.len());
168        for (index, left_index) in self.index.into_iter() {
169            if let Some(right_idx) = other.index.get(&index) {
170                output.push((left_index, right_idx.clone()));
171            }
172        }
173
174        output
175    }
176}
177
178#[cfg(test)]
179mod tests {
180    use super::*;
181    use data_value::DataValue;
182    use std::collections::HashMap;
183
184    #[test]
185    fn test_hash_datavalue_null() {
186        let val = DataValue::Null;
187        let hash1 = hash_datavalue(&val);
188        let hash2 = hash_datavalue(&val);
189        assert_eq!(hash1, hash2);
190    }
191
192    #[test]
193    fn test_hash_datavalue_bool() {
194        let true_val = DataValue::Bool(true);
195        let false_val = DataValue::Bool(false);
196
197        let hash_true = hash_datavalue(&true_val);
198        let hash_false = hash_datavalue(&false_val);
199
200        assert_eq!(hash_true, hash_datavalue(&true_val));
201        assert_eq!(hash_false, hash_datavalue(&false_val));
202        assert_ne!(hash_true, hash_false);
203    }
204
205    #[test]
206    fn test_hash_datavalue_u8() {
207        let val1 = DataValue::U8(0);
208        let val2 = DataValue::U8(255);
209        let val3 = DataValue::U8(42);
210
211        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
212        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
213        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
214        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
215    }
216
217    #[test]
218    fn test_hash_datavalue_u32() {
219        let val1 = DataValue::U32(0);
220        let val2 = DataValue::U32(u32::MAX);
221        let val3 = DataValue::U32(12345);
222
223        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
224        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
225        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
226        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
227    }
228
229    #[test]
230    fn test_hash_datavalue_i32() {
231        let val1 = DataValue::I32(0);
232        let val2 = DataValue::I32(-1);
233        let val3 = DataValue::I32(i32::MAX);
234        let val4 = DataValue::I32(i32::MIN);
235
236        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
237        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
238        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
239    }
240
241    #[test]
242    fn test_hash_datavalue_u64() {
243        let val1 = DataValue::U64(0);
244        let val2 = DataValue::U64(u64::MAX);
245        let val3 = DataValue::U64(123456789);
246
247        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
248        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
249        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
250        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
251    }
252
253    #[test]
254    fn test_hash_datavalue_i64() {
255        let val1 = DataValue::I64(0);
256        let val2 = DataValue::I64(-1);
257        let val3 = DataValue::I64(i64::MAX);
258        let val4 = DataValue::I64(i64::MIN);
259
260        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
261        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
262        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
263    }
264
265    #[test]
266    fn test_hash_datavalue_f32() {
267        let val1 = DataValue::F32(0.0);
268        let val2 = DataValue::F32(-0.0);
269        let val3 = DataValue::F32(3.14);
270        let val4 = DataValue::F32(f32::INFINITY);
271        let val5 = DataValue::F32(f32::NEG_INFINITY);
272        let val6 = DataValue::F32(f32::NAN);
273
274        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
275        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
276        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
277        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
278        assert_eq!(
279            hash_datavalue(&val6),
280            hash_datavalue(&DataValue::F32(f32::NAN))
281        );
282    }
283
284    #[test]
285    fn test_hash_datavalue_f64() {
286        let val1 = DataValue::F64(0.0);
287        let val2 = DataValue::F64(-0.0);
288        let val3 = DataValue::F64(3.14159);
289        let val4 = DataValue::F64(f64::INFINITY);
290        let val5 = DataValue::F64(f64::NEG_INFINITY);
291        let val6 = DataValue::F64(f64::NAN);
292
293        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
294        assert_eq!(hash_datavalue(&val3), hash_datavalue(&val3));
295        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
296        assert_ne!(hash_datavalue(&val4), hash_datavalue(&val5));
297        assert_eq!(
298            hash_datavalue(&val6),
299            hash_datavalue(&DataValue::F64(f64::NAN))
300        );
301    }
302
303    #[test]
304    fn test_hash_datavalue_u128() {
305        let val1 = DataValue::U128(0);
306        let val2 = DataValue::U128(u128::MAX);
307        let val3 = DataValue::U128(12345678901234567890);
308
309        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
310        assert_eq!(hash_datavalue(&val2), hash_datavalue(&val2));
311        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
312        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
313    }
314
315    #[test]
316    fn test_hash_datavalue_i128() {
317        let val1 = DataValue::I128(0);
318        let val2 = DataValue::I128(-1);
319        let val3 = DataValue::I128(i128::MAX);
320        let val4 = DataValue::I128(i128::MIN);
321
322        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val1));
323        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
324        assert_ne!(hash_datavalue(&val3), hash_datavalue(&val4));
325    }
326
327    #[test]
328    fn test_hash_datavalue_string() {
329        let val1 = DataValue::String("hello".into());
330        let val2 = DataValue::String("world".into());
331        let val3 = DataValue::String("hello".into());
332        let val4 = DataValue::String("".into());
333
334        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val3));
335        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
336        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
337        assert_eq!(hash_datavalue(&val4), hash_datavalue(&val4));
338    }
339
340    #[test]
341    fn test_hash_datavalue_bytes() {
342        let val1 = DataValue::Bytes(vec![1, 2, 3]);
343        let val2 = DataValue::Bytes(vec![1, 2, 3]);
344        let val3 = DataValue::Bytes(vec![3, 2, 1]);
345        let val4 = DataValue::Bytes(vec![]);
346
347        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
348        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
349        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val4));
350        assert_eq!(
351            hash_datavalue(&val4),
352            hash_datavalue(&DataValue::Bytes(vec![]))
353        );
354    }
355
356    #[test]
357    fn test_hash_datavalue_vec_empty() {
358        let val = DataValue::Vec(vec![]);
359        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
360    }
361
362    #[test]
363    fn test_hash_datavalue_vec_basic() {
364        let val1 = DataValue::Vec(vec![
365            DataValue::I32(1),
366            DataValue::I32(2),
367            DataValue::I32(3),
368        ]);
369        let val2 = DataValue::Vec(vec![
370            DataValue::I32(1),
371            DataValue::I32(2),
372            DataValue::I32(3),
373        ]);
374        let val3 = DataValue::Vec(vec![
375            DataValue::I32(3),
376            DataValue::I32(2),
377            DataValue::I32(1),
378        ]);
379
380        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
381        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
382    }
383
384    #[test]
385    fn test_hash_datavalue_vec_nested() {
386        let val1 = DataValue::Vec(vec![
387            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
388            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
389        ]);
390        let val2 = DataValue::Vec(vec![
391            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
392            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(4)]),
393        ]);
394        let val3 = DataValue::Vec(vec![
395            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
396            DataValue::Vec(vec![DataValue::I32(3), DataValue::I32(5)]),
397        ]);
398
399        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
400        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
401    }
402
403    #[test]
404    fn test_hash_datavalue_vec_mixed_types() {
405        let val = DataValue::Vec(vec![
406            DataValue::I32(42),
407            DataValue::String("test".into()),
408            DataValue::Bool(true),
409            DataValue::Null,
410        ]);
411
412        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
413    }
414
415    #[test]
416    fn test_hash_datavalue_map_empty() {
417        let val = DataValue::Map(HashMap::new());
418        assert_eq!(hash_datavalue(&val), hash_datavalue(&val));
419    }
420
421    #[test]
422    fn test_hash_datavalue_map_basic() {
423        let mut map1 = HashMap::new();
424        map1.insert("key1".into(), DataValue::I32(1));
425        map1.insert("key2".into(), DataValue::I32(2));
426
427        let mut map2 = HashMap::new();
428        map2.insert("key1".into(), DataValue::I32(1));
429        map2.insert("key2".into(), DataValue::I32(2));
430
431        let val1 = DataValue::Map(map1);
432        let val2 = DataValue::Map(map2);
433
434        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
435    }
436
437    #[test]
438    fn test_hash_datavalue_map_insertion_order_independence() {
439        let mut map1 = HashMap::new();
440        map1.insert("a".into(), DataValue::I32(1));
441        map1.insert("b".into(), DataValue::I32(2));
442        map1.insert("c".into(), DataValue::I32(3));
443
444        let mut map2 = HashMap::new();
445        map2.insert("c".into(), DataValue::I32(3));
446        map2.insert("a".into(), DataValue::I32(1));
447        map2.insert("b".into(), DataValue::I32(2));
448
449        let val1 = DataValue::Map(map1);
450        let val2 = DataValue::Map(map2);
451
452        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
453    }
454
455    #[test]
456    fn test_hash_datavalue_map_different_values() {
457        let mut map1 = HashMap::new();
458        map1.insert("key".into(), DataValue::I32(1));
459
460        let mut map2 = HashMap::new();
461        map2.insert("key".into(), DataValue::I32(2));
462
463        let val1 = DataValue::Map(map1);
464        let val2 = DataValue::Map(map2);
465
466        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
467    }
468
469    #[test]
470    fn test_hash_datavalue_map_nested() {
471        let mut inner_map = HashMap::new();
472        inner_map.insert("inner".into(), DataValue::I32(42));
473
474        let mut outer_map = HashMap::new();
475        outer_map.insert("outer".into(), DataValue::Map(inner_map.clone()));
476
477        let mut outer_map2 = HashMap::new();
478        outer_map2.insert("outer".into(), DataValue::Map(inner_map));
479
480        let val1 = DataValue::Map(outer_map);
481        let val2 = DataValue::Map(outer_map2);
482
483        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val2));
484    }
485
486    #[test]
487    fn test_hash_datavalue_enum_number() {
488        let val1 = DataValue::EnumNumber(0);
489        let val2 = DataValue::EnumNumber(1);
490        let val3 = DataValue::EnumNumber(-1);
491        let val4 = DataValue::EnumNumber(0);
492
493        assert_eq!(hash_datavalue(&val1), hash_datavalue(&val4));
494        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val2));
495        assert_ne!(hash_datavalue(&val1), hash_datavalue(&val3));
496    }
497
498    #[test]
499    fn test_hash_datavalue_type_differentiation() {
500        let u32_val = DataValue::U32(42);
501        let i32_val = DataValue::I32(42);
502        let u64_val = DataValue::U64(42);
503        let i64_val = DataValue::I64(42);
504        let f32_val = DataValue::F32(42.0);
505        let f64_val = DataValue::F64(42.0);
506
507        let u32_hash = hash_datavalue(&u32_val);
508        let i32_hash = hash_datavalue(&i32_val);
509        let u64_hash = hash_datavalue(&u64_val);
510        let i64_hash = hash_datavalue(&i64_val);
511        let f32_hash = hash_datavalue(&f32_val);
512        let f64_hash = hash_datavalue(&f64_val);
513
514        assert_ne!(u32_hash, i32_hash);
515        assert_ne!(u32_hash, u64_hash);
516        assert_ne!(u32_hash, i64_hash);
517        assert_ne!(i32_hash, u64_hash);
518        assert_ne!(i32_hash, i64_hash);
519        assert_ne!(u64_hash, i64_hash);
520        assert_ne!(f32_hash, f64_hash);
521    }
522
523    #[test]
524    fn test_hash_datavalue_null_vs_zero() {
525        let null_val = DataValue::Null;
526        let zero_i32 = DataValue::I32(0);
527        let zero_u32 = DataValue::U32(0);
528        let false_bool = DataValue::Bool(false);
529
530        let null_hash = hash_datavalue(&null_val);
531        let zero_i32_hash = hash_datavalue(&zero_i32);
532        let zero_u32_hash = hash_datavalue(&zero_u32);
533        let false_hash = hash_datavalue(&false_bool);
534
535        assert_ne!(null_hash, zero_i32_hash);
536        assert_ne!(null_hash, zero_u32_hash);
537        assert_ne!(null_hash, false_hash);
538    }
539
540    #[test]
541    fn test_hash_datavalue_string_vs_bytes() {
542        let string_val = DataValue::String("hello".into());
543        let bytes_val = DataValue::Bytes(b"hello".to_vec());
544
545        assert_ne!(hash_datavalue(&string_val), hash_datavalue(&bytes_val));
546    }
547
548    #[test]
549    fn test_hash_datavalue_consistency() {
550        let test_values = vec![
551            DataValue::Null,
552            DataValue::Bool(true),
553            DataValue::U8(255),
554            DataValue::I32(-42),
555            DataValue::String("test".into()),
556            DataValue::Vec(vec![DataValue::I32(1), DataValue::I32(2)]),
557        ];
558
559        for val in test_values {
560            let hash1 = hash_datavalue(&val);
561            let hash2 = hash_datavalue(&val);
562            let hash3 = hash_datavalue(&val);
563            assert_eq!(hash1, hash2);
564            assert_eq!(hash2, hash3);
565        }
566    }
567}