quamina 0.6.0

Fast pattern-matching library for filtering JSON events
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
//! IEEE 754 float64 to ordered bytes conversion for automaton numeric matching.
//!
//! This module provides a representation of float64 numbers that preserves ordering
//! when compared as byte sequences. This allows numeric comparisons to be performed
//! using the same automaton-based approach used for string matching.
//!
//! The implementation is based on Arne Hormann's "numbits" construct, which was
//! also used in the disk format of DB2.
//!
//! # How it works
//!
//! IEEE 754 float64 representation: (sign | exponent | mantissa)
//! - 1 bit sign
//! - 11 bits exponent
//! - 52 bits mantissa
//!
//! The problem: IEEE 754 doesn't preserve numeric ordering when compared as unsigned integers.
//! Negative numbers have the sign bit set, making them appear "larger" than positive numbers.
//!
//! The solution: Transform the bit representation:
//! - For positive numbers (sign bit 0): XOR with sign bit (1 << 63)
//! - For negative numbers (sign bit 1): Negate (XOR with !0)
//!
//! This transformation ensures that when comparing the resulting integers:
//! - Negative numbers compare less than positive numbers
//! - Numbers within the same sign compare in the correct order

/// Tag byte prepended to all Q-number sequences to create namespace separation
/// from raw ASCII/UTF-8 bytes. This prevents numeric range FAs from matching
/// string values — `0x80` never appears as the first byte of a JSON string value
/// (strings start with `"` = 34), while all Q-number content bytes are in 0-127.
pub(crate) const Q_NUMBER_PREFIX: u8 = 0x80;

/// Maximum bytes needed for base-128 encoding of a 64-bit value, plus prefix tag.
pub(crate) const MAX_BYTES_IN_ENCODING: usize = 11;

/// Stack-allocated Q-number representation for zero-allocation numeric matching.
///
/// This type holds a Q-number in a fixed 11-byte stack buffer (prefix tag + up to
/// 10 bytes of base-128 encoded 64-bit value), avoiding heap allocation during the
/// matching hot path.
///
/// Created via [`q_num_stack`]. Use [`as_slice`](Self::as_slice) to get the
/// actual bytes for FA traversal.
#[derive(Clone, Copy, Debug)]
pub struct QNumberStack {
    bytes: [u8; MAX_BYTES_IN_ENCODING],
    len: u8,
}

impl QNumberStack {
    /// Returns the Q-number bytes as a slice.
    #[inline]
    #[must_use]
    pub fn as_slice(&self) -> &[u8] {
        &self.bytes[..self.len as usize]
    }

    /// Returns the length of the Q-number.
    #[inline]
    #[must_use]
    pub const fn len(&self) -> usize {
        self.len as usize
    }

    /// Returns true if the Q-number is empty (should never happen for valid floats).
    #[inline]
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
}

/// Convert numbits to a stack-allocated Q-number with prefix tag.
pub(crate) fn to_q_number_stack(nb: u64) -> QNumberStack {
    let mut nb = nb;
    let mut bytes = [0u8; MAX_BYTES_IN_ENCODING];

    // First byte is always the prefix tag
    bytes[0] = Q_NUMBER_PREFIX;

    // Count trailing zero septets (same algorithm as to_q_number)
    let mut trailing_zeroes = 0usize;
    // Content bytes start at index 1 (after prefix), max 10 content bytes
    let mut index = MAX_BYTES_IN_ENCODING - 1; // 10

    loop {
        let prev = index;
        if nb & 0x7f != 0 {
            break;
        }
        trailing_zeroes += 1;
        nb >>= 7;
        if index <= 1 {
            break;
        }
        index -= 1;
        debug_assert!(index < prev, "to_q_number_stack: index must decrease");
    }

    // Content length (excluding prefix)
    let content_len = (MAX_BYTES_IN_ENCODING - 1) - trailing_zeroes;

    // Fill content bytes from right to left (starting at index 1)
    for i in (1..=content_len).rev() {
        bytes[i] = (nb & 0x7f) as u8;
        nb >>= 7;
    }

    // Total length = prefix + content. `content_len` is bounded by
    // MAX_BYTES_IN_ENCODING - 1 (i.e. 10), so the sum never overflows u8.
    QNumberStack {
        bytes,
        len: u8::try_from(1 + content_len)
            .expect("Q-number length bounded by MAX_BYTES_IN_ENCODING"),
    }
}

/// Convert a float64 to a stack-allocated Q-number (zero heap allocation).
///
/// Use this for the **matching hot path** where the Q-number is temporary and
/// doesn't need to be stored. The stack allocation avoids heap overhead.
///
/// For pattern building where you need to store the bytes, use [`q_num_from_f64`]
/// which returns a `Vec<u8>`.
///
/// Both functions produce identical byte sequences.
#[must_use]
pub fn q_num_stack(f: f64) -> QNumberStack {
    to_q_number_stack(numbits_from_f64(f))
}

/// Convert a float64 to its numbits representation.
///
/// The resulting u64 can be compared directly to preserve numeric ordering:
/// if a < b as floats, then numbits_from_f64(a) < numbits_from_f64(b) as u64.
///
/// Note: This implementation ignores NaN, -0, and infinities because JSON
/// rules and Quamina's parsers prevent those values from occurring.
pub(crate) const fn numbits_from_f64(f: f64) -> u64 {
    let u = f.to_bits();
    // Branchless transform: if the high bit is 0, XOR with the sign bit
    // (1 << 63); if it's 1, negate (XOR with !0). Raph Levien spotted that
    // a sign-extending right shift gives us the right mask for free —
    // see https://mastodon.online/@raph/113071041069390831 .
    //
    // `cast_signed` / `cast_unsigned` reinterpret the bits without changing
    // the value, so the `>> 63` between them is the arithmetic shift that
    // turns into a single instruction.
    let mask = (u.cast_signed() >> 63).cast_unsigned() | (1 << 63);
    u ^ mask
}

/// Convert numbits to a minimal variable-width encoding that preserves ordering.
///
/// Storing 8 bytes of data in base-128 would in principle require 10 bytes,
/// but since the encoding is big-endian, trailing zeroes don't count, so
/// the encoding can be as short as one byte.
///
/// Idea and some code by Axel Wagner.
pub(crate) fn to_q_number(nb: u64) -> Vec<u8> {
    let mut nb = nb;

    // Iterate through the numbits 7 bits at a time, right to left,
    // first bypassing bits that generate trailing zeroes in the encoded form.
    // Note that index could go to 0 if the numbits value was 0,
    // but that value represents NaN and can't appear in JSON.
    let mut trailing_zeroes = 0usize;
    let max_content_bytes = MAX_BYTES_IN_ENCODING - 1; // 10 content bytes
    let mut index = max_content_bytes - 1;

    loop {
        let prev = index;
        if nb & 0x7f != 0 {
            break;
        }
        trailing_zeroes += 1;
        nb >>= 7;
        if index == 0 {
            break;
        }
        index -= 1;
        debug_assert!(index < prev, "to_q_number: index must decrease");
    }

    // Now fill in the byte encoding for the digits up to the last non-zero
    let content_len = max_content_bytes - trailing_zeroes;
    // Total: 1 prefix byte + content bytes
    let mut result = vec![0u8; 1 + content_len];
    result[0] = Q_NUMBER_PREFIX;

    for i in (1..=content_len).rev() {
        result[i] = (nb & 0x7f) as u8;
        nb >>= 7;
    }

    result
}

/// Convert a float64 to its Q-number representation (heap-allocated).
///
/// Use this for **pattern building** where the Q-number bytes need to be stored
/// or passed to FA construction functions.
///
/// For the matching hot path where the Q-number is temporary, use [`q_num_stack`]
/// which avoids heap allocation.
///
/// Both functions produce identical byte sequences.
#[must_use]
pub fn q_num_from_f64(f: f64) -> Vec<u8> {
    to_q_number(numbits_from_f64(f))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse a byte string as a float64 and convert to Q-number.
    /// Returns None if the string cannot be parsed as a float.
    fn q_num_from_bytes(bytes: &[u8]) -> Option<Vec<u8>> {
        let s = std::str::from_utf8(bytes).ok()?;
        let f: f64 = s.parse().ok()?;
        Some(q_num_from_f64(f))
    }

    /// Format a Q-number for debugging.
    fn q_num_to_string(q: &[u8]) -> String {
        q.iter()
            .map(|b| format!("{b:02x}"))
            .collect::<Vec<_>>()
            .join("-")
    }

    #[test]
    #[allow(clippy::inconsistent_digit_grouping)]
    fn test_wildly_varying_numbers_are_comparable() {
        let data: Vec<f64> = vec![
            -5_000_000_000.0,
            -4_999_999_999.99999,
            -4_999_999_999.99998,
            -4_999_999_999.99997,
            -999999999.99999,
            -999999999.99,
            -10000.0,
            -122.413496,
            -0.000002,
            0.0,
            0.000001,
            3.8,
            3.9,
            11.0,
            12.0,
            122.415028,
            2.5e4,
            999999999.999998,
            999999999.999999,
            4_999_999_999.99997,
            4_999_999_999.99998,
            4_999_999_999.99999,
            5_000_000_000.0,
        ];

        for i in 1..data.len() {
            let s0 = q_num_from_f64(data[i - 1]);
            let s1 = q_num_from_f64(data[i]);
            assert!(
                s0 < s1,
                "Ordering failed at index {}: {} ({:?}) should be < {} ({:?})",
                i,
                data[i - 1],
                q_num_to_string(&s0),
                data[i],
                q_num_to_string(&s1)
            );
        }
    }

    #[test]
    fn test_float_variants() {
        // Different representations of the same value should produce the same Q-number
        let floats: Vec<f64> = vec![350.0, 350.0, 350.0000000000, 3.5e2];
        let q_nums: Vec<Vec<u8>> = floats.iter().map(|&f| q_num_from_f64(f)).collect();

        for i in 1..q_nums.len() {
            assert_eq!(
                q_nums[i],
                q_nums[i - 1],
                "Q-numbers differ for {} vs {}",
                floats[i - 1],
                floats[i]
            );
        }
    }

    #[test]
    fn test_byte_variants() {
        // Different string representations of the same value should produce the same Q-number
        let strings: Vec<&str> = vec!["350", "350.0", "350.0000", "3.5e2"];
        let q_nums: Vec<Vec<u8>> = strings
            .iter()
            .map(|s| q_num_from_bytes(s.as_bytes()).unwrap())
            .collect();

        for i in 1..q_nums.len() {
            assert_eq!(
                q_nums[i],
                q_nums[i - 1],
                "Q-numbers differ for '{}' vs '{}'",
                strings[i - 1],
                strings[i]
            );
        }
    }

    /// Shared helper: generate `count` random floats via LCG, sort them, and verify
    /// that Q-number ordering is preserved.
    fn verify_ordering_random(count: usize) {
        use std::cmp::Ordering;

        let mut floats: Vec<f64> = Vec::new();
        let mut rng_state = 12345u64;

        for _ in 0..count {
            rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
            // Stuff 52 random bits into a 1.0-exponent and subtract 1.0 to
            // get a uniform [0.0, 1.0) without any lossy `as f64` cast.
            let unit = f64::from_bits((rng_state >> 12) | 0x3FF0_0000_0000_0000) - 1.0;
            let f = unit.mul_add(2_000_000_000.0, -1_000_000_000.0);
            floats.push(f);
        }

        floats.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));

        let q_nums: Vec<Vec<u8>> = floats.iter().map(|&f| q_num_from_f64(f)).collect();

        for i in 1..q_nums.len() {
            assert!(
                q_nums[i - 1] <= q_nums[i],
                "Q-number ordering failed at index {}: {:?} > {:?} (floats: {} > {})",
                i,
                q_nums[i - 1],
                q_nums[i],
                floats[i - 1],
                floats[i]
            );
        }
    }

    // MIRI SKIP RATIONALE: 10,000 random floats with sort + Q-number conversion is slow
    // under Miri (~90s). Coverage: test_ordering_random_miri_friendly uses 50 floats.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_ordering_random() {
        verify_ordering_random(10_000);
    }

    /// Miri-friendly version — 50 random floats instead of 10,000.
    #[test]
    fn test_ordering_random_miri_friendly() {
        verify_ordering_random(50);
    }

    #[test]
    fn test_bad_numbers() {
        let bads = vec!["xy", "- 53", "124x", "1.5ee7"];
        for bad in bads {
            assert!(
                q_num_from_bytes(bad.as_bytes()).is_none(),
                "Should reject: {bad}"
            );
        }
    }

    #[test]
    fn test_q_number_length() {
        // Test that Q-numbers are variable length (strip trailing zeros)
        let q0 = q_num_from_f64(0.0);
        let q_large = q_num_from_f64(1e15);

        // Both should be <= MAX_BYTES_IN_ENCODING
        assert!(q0.len() <= MAX_BYTES_IN_ENCODING);
        assert!(q_large.len() <= MAX_BYTES_IN_ENCODING);

        // Different values may have different lengths
        // (specific lengths depend on the encoding)
    }

    #[test]
    fn test_numbits_ordering_property() {
        // Verify that numbits preserves ordering at the u64 level
        let pairs: Vec<(f64, f64)> = vec![
            (-100.0, -50.0),
            (-50.0, 0.0),
            (0.0, 50.0),
            (50.0, 100.0),
            (-1e10, 1e10),
            (0.0001, 0.0002),
        ];

        for (a, b) in pairs {
            let na = numbits_from_f64(a);
            let nb = numbits_from_f64(b);
            assert!(
                na < nb,
                "numbits ordering failed: {a} ({na}) should be < {b} ({nb})"
            );
        }
    }

    #[test]
    fn test_zero_handling() {
        // Test that 0.0 and -0.0 both work (though JSON typically only has 0)
        let q_zero = q_num_from_f64(0.0);
        let q_neg_zero = q_num_from_f64(-0.0);

        // Both should produce valid Q-numbers
        assert!(!q_zero.is_empty());
        assert!(!q_neg_zero.is_empty());
    }

    // =========================================================================
    // Tests for Q-number variant equivalence
    // =========================================================================

    #[test]
    fn test_q_number_variants_equivalence() {
        // Vec and Stack implementations must produce identical byte sequences
        let test_values: Vec<f64> = vec![
            0.0,
            1.0,
            -1.0,
            42.0,
            -42.0,
            999.0,
            1000.0,
            123456.0,
            0.000001,
            -0.000001,
            std::f64::consts::PI,
            -std::f64::consts::PI,
            1e10,
            -1e10,
            1e-10,
            -1e-10,
            f64::MIN_POSITIVE,
            f64::MAX,
            f64::MIN,
        ];

        for &val in &test_values {
            let vec_result = q_num_from_f64(val);
            let stack_result = q_num_stack(val);

            assert_eq!(
                vec_result.as_slice(),
                stack_result.as_slice(),
                "Stack variant differs from Vec for value {val}"
            );
        }
    }

    #[test]
    fn test_q_number_variants_random() {
        // Test with random values to ensure equivalence
        let mut rng_state = 54321u64;

        for _ in 0..100 {
            rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
            let f = f64::from_bits(rng_state & 0x7FEFFFFFFFFFFFFF); // Avoid NaN/Inf

            // Skip if not finite
            if !f.is_finite() {
                continue;
            }

            let vec_result = q_num_from_f64(f);
            let stack_result = q_num_stack(f);

            assert_eq!(
                vec_result.as_slice(),
                stack_result.as_slice(),
                "Stack variant differs from Vec for value {f}"
            );
        }
    }

    #[test]
    fn test_q_number_stack_length() {
        // Test that QNumberStack correctly reports length
        let q0 = q_num_stack(0.0);
        let q_large = q_num_stack(1e15);

        assert!(!q0.is_empty());
        assert!(q0.len() <= MAX_BYTES_IN_ENCODING);
        assert!(q_large.len() <= MAX_BYTES_IN_ENCODING);
        assert!(!q0.is_empty());
    }

    #[test]
    fn test_q_number_stack_len_matches_slice() {
        let values = [
            0.0,
            1.0,
            -1.0,
            42.0,
            1e15,
            -1e-10,
            f64::MAX,
            f64::MIN_POSITIVE,
        ];
        for &val in &values {
            let q = q_num_stack(val);
            assert_eq!(
                q.len(),
                q.as_slice().len(),
                "len() disagrees with as_slice().len() for {val}"
            );
        }
    }

    #[test]
    fn test_q_number_stack_is_empty_synthetic() {
        // A zero-length QNumberStack should report is_empty() == true.
        // This can't happen via q_num_stack (all valid floats produce non-empty),
        // but verifies is_empty correctness at the boundary.
        let empty = QNumberStack {
            bytes: [0; MAX_BYTES_IN_ENCODING],
            len: 0,
        };
        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    #[test]
    fn test_encoding_with_zero_numbits() {
        // nb=0 corresponds to NaN (can't appear in JSON), but exercising
        // the encoding boundary verifies the trailing-zero-skip loop guard.
        // With the Q_NUMBER_PREFIX, even zero numbits produces the prefix byte.
        let vec_result = to_q_number(0);
        assert_eq!(vec_result, vec![Q_NUMBER_PREFIX]);

        let stack_result = to_q_number_stack(0);
        assert_eq!(stack_result.len(), 1);
        assert_eq!(stack_result.as_slice(), &[Q_NUMBER_PREFIX]);
    }
}