Skip to main content

nodedb_query/msgpack_scan/
compare.rs

1//! Byte-level comparison and hashing for MessagePack field values.
2//!
3//! Operates on raw byte ranges returned by `extract_field`. Used for
4//! join key matching, GROUP BY key deduplication, ORDER BY, and DISTINCT.
5
6use std::cmp::Ordering;
7use std::hash::{BuildHasher, Hasher};
8
9use crate::msgpack_scan::reader::{read_f64, read_i64, read_null, str_bounds};
10
11/// Hash the raw bytes of a MessagePack value at `range` within `buf`.
12/// Uses a fast non-cryptographic hash suitable for hash joins and GROUP BY.
13///
14/// For canonical-encoded documents (integers in smallest form, sorted keys),
15/// semantically equal values produce identical byte sequences and thus
16/// identical hashes.
17pub fn hash_field_bytes(buf: &[u8], range: (usize, usize)) -> u64 {
18    let slice = match buf.get(range.0..range.1) {
19        Some(s) => s,
20        None => return 0,
21    };
22    let hasher_builder = std::collections::hash_map::RandomState::new();
23    let mut hasher = hasher_builder.build_hasher();
24    hasher.write(slice);
25    hasher.finish()
26}
27
28/// Hash the raw bytes using a provided `RandomState` for consistent hashing
29/// within a single query (all docs hashed with the same seed).
30pub fn hash_field_bytes_with(
31    buf: &[u8],
32    range: (usize, usize),
33    state: &std::collections::hash_map::RandomState,
34) -> u64 {
35    let slice = match buf.get(range.0..range.1) {
36        Some(s) => s,
37        None => return 0,
38    };
39    let mut hasher = state.build_hasher();
40    hasher.write(slice);
41    hasher.finish()
42}
43
44/// Compare two MessagePack values by their decoded content.
45///
46/// Comparison order:
47/// 1. Null < Bool < Number < String < Binary < Array < Map
48/// 2. Within numbers: compare as f64
49/// 3. Within strings: lexicographic on raw bytes (valid UTF-8 guarantees
50///    byte order = Unicode code-point order for ASCII/Latin-1)
51/// 4. Fallback: raw byte comparison
52pub fn compare_field_bytes(
53    a_buf: &[u8],
54    a_range: (usize, usize),
55    b_buf: &[u8],
56    b_range: (usize, usize),
57) -> Ordering {
58    let a_off = a_range.0;
59    let b_off = b_range.0;
60
61    let a_tag = match a_buf.get(a_off) {
62        Some(&t) => t,
63        None => return Ordering::Less,
64    };
65    let b_tag = match b_buf.get(b_off) {
66        Some(&t) => t,
67        None => return Ordering::Greater,
68    };
69
70    let a_type = type_rank(a_tag);
71    let b_type = type_rank(b_tag);
72
73    if a_type != b_type {
74        return a_type.cmp(&b_type);
75    }
76
77    match a_type {
78        0 => Ordering::Equal, // both null
79        1 => {
80            // bool
81            let a_val = a_tag == 0xc3; // true
82            let b_val = b_tag == 0xc3;
83            a_val.cmp(&b_val)
84        }
85        2 => {
86            // number — compare as f64
87            match (read_f64(a_buf, a_off), read_f64(b_buf, b_off)) {
88                (Some(a), Some(b)) => a.partial_cmp(&b).unwrap_or(Ordering::Equal),
89                (Some(_), None) => Ordering::Greater,
90                (None, Some(_)) => Ordering::Less,
91                (None, None) => Ordering::Equal,
92            }
93        }
94        3 => {
95            // string — compare raw bytes
96            match (str_bounds(a_buf, a_off), str_bounds(b_buf, b_off)) {
97                (Some((a_s, a_l)), Some((b_s, b_l))) => {
98                    let a_bytes = &a_buf[a_s..a_s + a_l];
99                    let b_bytes = &b_buf[b_s..b_s + b_l];
100                    a_bytes.cmp(b_bytes)
101                }
102                _ => Ordering::Equal,
103            }
104        }
105        _ => {
106            // binary, array, map, ext — fallback to raw byte comparison
107            let a_slice = &a_buf[a_range.0..a_range.1];
108            let b_slice = &b_buf[b_range.0..b_range.1];
109            a_slice.cmp(b_slice)
110        }
111    }
112}
113
114/// Compare two numeric MessagePack values as i64.
115/// Useful when the caller knows both values are integers.
116pub fn compare_field_i64(a_buf: &[u8], a_off: usize, b_buf: &[u8], b_off: usize) -> Ordering {
117    match (read_i64(a_buf, a_off), read_i64(b_buf, b_off)) {
118        (Some(a), Some(b)) => a.cmp(&b),
119        (Some(_), None) => Ordering::Greater,
120        (None, Some(_)) => Ordering::Less,
121        (None, None) => Ordering::Equal,
122    }
123}
124
125/// Check if two MessagePack values are byte-identical.
126/// For canonical-encoded documents, byte equality implies semantic equality.
127pub fn field_bytes_eq(
128    a_buf: &[u8],
129    a_range: (usize, usize),
130    b_buf: &[u8],
131    b_range: (usize, usize),
132) -> bool {
133    let a_slice = match a_buf.get(a_range.0..a_range.1) {
134        Some(s) => s,
135        None => return false,
136    };
137    let b_slice = match b_buf.get(b_range.0..b_range.1) {
138        Some(s) => s,
139        None => return false,
140    };
141    a_slice == b_slice
142}
143
144/// Check if a field value is null without extracting it.
145pub fn is_field_null(buf: &[u8], range: (usize, usize)) -> bool {
146    read_null(buf, range.0)
147}
148
149/// Type rank for cross-type ordering. Lower rank = sorts first.
150/// Null(0) < Bool(1) < Number(2) < String(3) < Binary(4) < Array(5) < Map(6) < Ext(7)
151fn type_rank(tag: u8) -> u8 {
152    match tag {
153        0xc0 => 0,                      // nil
154        0xc2 | 0xc3 => 1,               // bool
155        0x00..=0x7f | 0xe0..=0xff => 2, // fixint
156        0xca..=0xd3 => 2,               // float/uint/int
157        0xa0..=0xbf | 0xd9..=0xdb => 3, // string
158        0xc4..=0xc6 => 4,               // binary
159        0x90..=0x9f | 0xdc | 0xdd => 5, // array
160        0x80..=0x8f | 0xde | 0xdf => 6, // map
161        0xc7..=0xc9 | 0xd4..=0xd8 => 7, // ext
162        _ => 8,                         // unknown
163    }
164}
165
166#[cfg(test)]
167mod tests {
168    use super::*;
169    use serde_json::json;
170
171    fn encode(v: &serde_json::Value) -> Vec<u8> {
172        nodedb_types::json_msgpack::json_to_msgpack(v).expect("encode")
173    }
174
175    fn val_range(buf: &[u8]) -> (usize, usize) {
176        (0, buf.len())
177    }
178
179    #[test]
180    fn hash_same_bytes_same_hash() {
181        let buf = encode(&json!(42));
182        let state = std::collections::hash_map::RandomState::new();
183        let h1 = hash_field_bytes_with(&buf, val_range(&buf), &state);
184        let h2 = hash_field_bytes_with(&buf, val_range(&buf), &state);
185        assert_eq!(h1, h2);
186    }
187
188    #[test]
189    fn hash_different_values_likely_differ() {
190        let buf1 = encode(&json!(42));
191        let buf2 = encode(&json!(43));
192        let state = std::collections::hash_map::RandomState::new();
193        let h1 = hash_field_bytes_with(&buf1, val_range(&buf1), &state);
194        let h2 = hash_field_bytes_with(&buf2, val_range(&buf2), &state);
195        assert_ne!(h1, h2);
196    }
197
198    #[test]
199    fn compare_integers() {
200        let a = encode(&json!(10));
201        let b = encode(&json!(20));
202        assert_eq!(
203            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
204            Ordering::Less
205        );
206        assert_eq!(
207            compare_field_bytes(&b, val_range(&b), &a, val_range(&a)),
208            Ordering::Greater
209        );
210        assert_eq!(
211            compare_field_bytes(&a, val_range(&a), &a, val_range(&a)),
212            Ordering::Equal
213        );
214    }
215
216    #[test]
217    fn compare_strings() {
218        let a = encode(&json!("apple"));
219        let b = encode(&json!("banana"));
220        assert_eq!(
221            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
222            Ordering::Less
223        );
224    }
225
226    #[test]
227    fn compare_cross_type_null_vs_number() {
228        let a = encode(&json!(null));
229        let b = encode(&json!(42));
230        assert_eq!(
231            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
232            Ordering::Less
233        );
234    }
235
236    #[test]
237    fn compare_cross_type_string_vs_number() {
238        let a = encode(&json!(42));
239        let b = encode(&json!("hello"));
240        assert_eq!(
241            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
242            Ordering::Less
243        );
244    }
245
246    #[test]
247    fn compare_booleans() {
248        let a = encode(&json!(false));
249        let b = encode(&json!(true));
250        assert_eq!(
251            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
252            Ordering::Less
253        );
254    }
255
256    #[test]
257    fn compare_negative_integers() {
258        let a = encode(&json!(-10));
259        let b = encode(&json!(-5));
260        assert_eq!(
261            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
262            Ordering::Less
263        );
264    }
265
266    #[test]
267    fn field_bytes_eq_works() {
268        let a = encode(&json!("test"));
269        let b = encode(&json!("test"));
270        let c = encode(&json!("other"));
271        assert!(field_bytes_eq(&a, val_range(&a), &b, val_range(&b)));
272        assert!(!field_bytes_eq(&a, val_range(&a), &c, val_range(&c)));
273    }
274
275    #[test]
276    fn is_field_null_works() {
277        let null_buf = encode(&json!(null));
278        let int_buf = encode(&json!(42));
279        assert!(is_field_null(&null_buf, val_range(&null_buf)));
280        assert!(!is_field_null(&int_buf, val_range(&int_buf)));
281    }
282
283    #[test]
284    fn compare_floats() {
285        let a = encode(&json!(1.5));
286        let b = encode(&json!(2.5));
287        assert_eq!(
288            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
289            Ordering::Less
290        );
291    }
292
293    #[test]
294    fn hash_from_extracted_field() {
295        let buf = encode(&json!({"id": 42}));
296        let range = crate::msgpack_scan::field::extract_field(&buf, 0, "id").unwrap();
297        let h = hash_field_bytes(&buf, range);
298        assert_ne!(h, 0);
299    }
300}