Skip to main content

nodedb_query/msgpack_scan/
compare.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Byte-level comparison and hashing for MessagePack field values.
4//!
5//! Operates on raw byte ranges returned by `extract_field`. Used for
6//! join key matching, GROUP BY key deduplication, ORDER BY, and DISTINCT.
7
8use std::cmp::Ordering;
9use std::hash::{BuildHasher, Hasher};
10
11use crate::msgpack_scan::reader::{read_f64, read_i64, read_null, str_bounds};
12
13/// Hash the raw bytes of a MessagePack value at `range` within `buf`.
14/// Uses a fast non-cryptographic hash suitable for hash joins and GROUP BY.
15///
16/// For canonical-encoded documents (integers in smallest form, sorted keys),
17/// semantically equal values produce identical byte sequences and thus
18/// identical hashes.
19pub fn hash_field_bytes(buf: &[u8], range: (usize, usize)) -> u64 {
20    let slice = match buf.get(range.0..range.1) {
21        Some(s) => s,
22        None => return 0,
23    };
24    let hasher_builder = std::collections::hash_map::RandomState::new();
25    let mut hasher = hasher_builder.build_hasher();
26    hasher.write(slice);
27    hasher.finish()
28}
29
30/// Hash the raw bytes using a provided `RandomState` for consistent hashing
31/// within a single query (all docs hashed with the same seed).
32pub fn hash_field_bytes_with(
33    buf: &[u8],
34    range: (usize, usize),
35    state: &std::collections::hash_map::RandomState,
36) -> u64 {
37    let slice = match buf.get(range.0..range.1) {
38        Some(s) => s,
39        None => return 0,
40    };
41    let mut hasher = state.build_hasher();
42    hasher.write(slice);
43    hasher.finish()
44}
45
46/// Compare two MessagePack values by their decoded content.
47///
48/// Comparison order:
49/// 1. Null < Bool < Number < String < Binary < Array < Map
50/// 2. Within numbers: compare as f64
51/// 3. Within strings: lexicographic on raw bytes (valid UTF-8 guarantees
52///    byte order = Unicode code-point order for ASCII/Latin-1)
53/// 4. Fallback: raw byte comparison
54pub fn compare_field_bytes(
55    a_buf: &[u8],
56    a_range: (usize, usize),
57    b_buf: &[u8],
58    b_range: (usize, usize),
59) -> Ordering {
60    let a_off = a_range.0;
61    let b_off = b_range.0;
62
63    let a_tag = match a_buf.get(a_off) {
64        Some(&t) => t,
65        None => return Ordering::Less,
66    };
67    let b_tag = match b_buf.get(b_off) {
68        Some(&t) => t,
69        None => return Ordering::Greater,
70    };
71
72    let a_type = type_rank(a_tag);
73    let b_type = type_rank(b_tag);
74
75    if a_type != b_type {
76        return a_type.cmp(&b_type);
77    }
78
79    match a_type {
80        0 => Ordering::Equal, // both null
81        1 => {
82            // bool
83            let a_val = a_tag == 0xc3; // true
84            let b_val = b_tag == 0xc3;
85            a_val.cmp(&b_val)
86        }
87        2 => {
88            // number — compare as f64
89            match (read_f64(a_buf, a_off), read_f64(b_buf, b_off)) {
90                (Some(a), Some(b)) => a.partial_cmp(&b).unwrap_or(Ordering::Equal),
91                (Some(_), None) => Ordering::Greater,
92                (None, Some(_)) => Ordering::Less,
93                (None, None) => Ordering::Equal,
94            }
95        }
96        3 => {
97            // string — compare raw bytes
98            match (str_bounds(a_buf, a_off), str_bounds(b_buf, b_off)) {
99                (Some((a_s, a_l)), Some((b_s, b_l))) => {
100                    let a_bytes = &a_buf[a_s..a_s + a_l];
101                    let b_bytes = &b_buf[b_s..b_s + b_l];
102                    a_bytes.cmp(b_bytes)
103                }
104                _ => Ordering::Equal,
105            }
106        }
107        _ => {
108            // binary, array, map, ext — fallback to raw byte comparison
109            let a_slice = &a_buf[a_range.0..a_range.1];
110            let b_slice = &b_buf[b_range.0..b_range.1];
111            a_slice.cmp(b_slice)
112        }
113    }
114}
115
116/// Compare two numeric MessagePack values as i64.
117/// Useful when the caller knows both values are integers.
118pub fn compare_field_i64(a_buf: &[u8], a_off: usize, b_buf: &[u8], b_off: usize) -> Ordering {
119    match (read_i64(a_buf, a_off), read_i64(b_buf, b_off)) {
120        (Some(a), Some(b)) => a.cmp(&b),
121        (Some(_), None) => Ordering::Greater,
122        (None, Some(_)) => Ordering::Less,
123        (None, None) => Ordering::Equal,
124    }
125}
126
127/// Check if two MessagePack values are byte-identical.
128/// For canonical-encoded documents, byte equality implies semantic equality.
129pub fn field_bytes_eq(
130    a_buf: &[u8],
131    a_range: (usize, usize),
132    b_buf: &[u8],
133    b_range: (usize, usize),
134) -> bool {
135    let a_slice = match a_buf.get(a_range.0..a_range.1) {
136        Some(s) => s,
137        None => return false,
138    };
139    let b_slice = match b_buf.get(b_range.0..b_range.1) {
140        Some(s) => s,
141        None => return false,
142    };
143    a_slice == b_slice
144}
145
146/// Check if a field value is null without extracting it.
147pub fn is_field_null(buf: &[u8], range: (usize, usize)) -> bool {
148    read_null(buf, range.0)
149}
150
151/// Type rank for cross-type ordering. Lower rank = sorts first.
152/// Null(0) < Bool(1) < Number(2) < String(3) < Binary(4) < Array(5) < Map(6) < Ext(7)
153fn type_rank(tag: u8) -> u8 {
154    match tag {
155        0xc0 => 0,                      // nil
156        0xc2 | 0xc3 => 1,               // bool
157        0x00..=0x7f | 0xe0..=0xff => 2, // fixint
158        0xca..=0xd3 => 2,               // float/uint/int
159        0xa0..=0xbf | 0xd9..=0xdb => 3, // string
160        0xc4..=0xc6 => 4,               // binary
161        0x90..=0x9f | 0xdc | 0xdd => 5, // array
162        0x80..=0x8f | 0xde | 0xdf => 6, // map
163        0xc7..=0xc9 | 0xd4..=0xd8 => 7, // ext
164        _ => 8,                         // unknown
165    }
166}
167
168#[cfg(test)]
169mod tests {
170    use super::*;
171    use serde_json::json;
172
173    fn encode(v: &serde_json::Value) -> Vec<u8> {
174        nodedb_types::json_msgpack::json_to_msgpack(v).expect("encode")
175    }
176
177    fn val_range(buf: &[u8]) -> (usize, usize) {
178        (0, buf.len())
179    }
180
181    #[test]
182    fn hash_same_bytes_same_hash() {
183        let buf = encode(&json!(42));
184        let state = std::collections::hash_map::RandomState::new();
185        let h1 = hash_field_bytes_with(&buf, val_range(&buf), &state);
186        let h2 = hash_field_bytes_with(&buf, val_range(&buf), &state);
187        assert_eq!(h1, h2);
188    }
189
190    #[test]
191    fn hash_different_values_likely_differ() {
192        let buf1 = encode(&json!(42));
193        let buf2 = encode(&json!(43));
194        let state = std::collections::hash_map::RandomState::new();
195        let h1 = hash_field_bytes_with(&buf1, val_range(&buf1), &state);
196        let h2 = hash_field_bytes_with(&buf2, val_range(&buf2), &state);
197        assert_ne!(h1, h2);
198    }
199
200    #[test]
201    fn compare_integers() {
202        let a = encode(&json!(10));
203        let b = encode(&json!(20));
204        assert_eq!(
205            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
206            Ordering::Less
207        );
208        assert_eq!(
209            compare_field_bytes(&b, val_range(&b), &a, val_range(&a)),
210            Ordering::Greater
211        );
212        assert_eq!(
213            compare_field_bytes(&a, val_range(&a), &a, val_range(&a)),
214            Ordering::Equal
215        );
216    }
217
218    #[test]
219    fn compare_strings() {
220        let a = encode(&json!("apple"));
221        let b = encode(&json!("banana"));
222        assert_eq!(
223            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
224            Ordering::Less
225        );
226    }
227
228    #[test]
229    fn compare_cross_type_null_vs_number() {
230        let a = encode(&json!(null));
231        let b = encode(&json!(42));
232        assert_eq!(
233            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
234            Ordering::Less
235        );
236    }
237
238    #[test]
239    fn compare_cross_type_string_vs_number() {
240        let a = encode(&json!(42));
241        let b = encode(&json!("hello"));
242        assert_eq!(
243            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
244            Ordering::Less
245        );
246    }
247
248    #[test]
249    fn compare_booleans() {
250        let a = encode(&json!(false));
251        let b = encode(&json!(true));
252        assert_eq!(
253            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
254            Ordering::Less
255        );
256    }
257
258    #[test]
259    fn compare_negative_integers() {
260        let a = encode(&json!(-10));
261        let b = encode(&json!(-5));
262        assert_eq!(
263            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
264            Ordering::Less
265        );
266    }
267
268    #[test]
269    fn field_bytes_eq_works() {
270        let a = encode(&json!("test"));
271        let b = encode(&json!("test"));
272        let c = encode(&json!("other"));
273        assert!(field_bytes_eq(&a, val_range(&a), &b, val_range(&b)));
274        assert!(!field_bytes_eq(&a, val_range(&a), &c, val_range(&c)));
275    }
276
277    #[test]
278    fn is_field_null_works() {
279        let null_buf = encode(&json!(null));
280        let int_buf = encode(&json!(42));
281        assert!(is_field_null(&null_buf, val_range(&null_buf)));
282        assert!(!is_field_null(&int_buf, val_range(&int_buf)));
283    }
284
285    #[test]
286    fn compare_floats() {
287        let a = encode(&json!(1.5));
288        let b = encode(&json!(2.5));
289        assert_eq!(
290            compare_field_bytes(&a, val_range(&a), &b, val_range(&b)),
291            Ordering::Less
292        );
293    }
294
295    #[test]
296    fn hash_from_extracted_field() {
297        let buf = encode(&json!({"id": 42}));
298        let range = crate::msgpack_scan::field::extract_field(&buf, 0, "id").unwrap();
299        let h = hash_field_bytes(&buf, range);
300        assert_ne!(h, 0);
301    }
302}