Skip to main content

jetro_core/
scan.rs

1//! SIMD byte-scan over raw JSON bytes.
2//!
3//! Locates `"key":` occurrences in a JSON document without first parsing
4//! the document into a tree.  `memchr` (AVX2-internal when available)
5//! jumps byte-by-byte to the next structurally relevant character — a
6//! `"` outside strings, a `"` or `\\` inside strings — so the scanner
7//! traverses the document at near-memory-bandwidth speed.
8//!
9//! ## When to use
10//!
11//! For `$..key` (all descendants by name) or `$..find(@.key op lit)`
12//! shapes over a **large** JSON document where the caller retained the
13//! raw bytes (see `Jetro::from_bytes`).  Skips the tree walk entirely —
14//! scan cost is bounded by byte length, not node count.
15//!
16//! ## When not to use
17//!
18//! - Document already parsed; raw bytes discarded — fall back to the
19//!   tree walker in `eval/mod.rs::collect_desc`.
20//! - Document is tiny (< a few KB): `serde_json` per-hit parse cost
21//!   overtakes the scan win.
22//!
23//! ## Correctness
24//!
25//! The scanner respects JSON string-literal escape rules: an unescaped
26//! `"` toggles `in_string`, and a `\\` inside a string skips the next
27//! byte.  A needle match must begin at a `"` encountered while
28//! `in_string` is *false* — exactly where a JSON object-key literal
29//! can legally appear.  Hits inside string values (`"comment":"the
30//! \"test\" case"`) are therefore rejected.
31
32use memchr::{memchr, memchr2};
33use serde_json::Value;
34
35/// Scan raw JSON `bytes` for every `"key":` occurrence that starts at a
36/// structural position (i.e. not inside a string literal).  Returns the
37/// byte offset of each matching opening `"` in document order.
38pub fn find_key_positions(bytes: &[u8], key: &str) -> Vec<usize> {
39    let needle = {
40        let mut s = String::with_capacity(key.len() + 3);
41        s.push('"');
42        s.push_str(key);
43        s.push_str("\":");
44        s
45    };
46    let needle_b = needle.as_bytes();
47    if needle_b.len() > bytes.len() { return Vec::new(); }
48
49    let mut out = Vec::new();
50    let mut i = 0usize;
51    let mut in_string = false;
52    let mut escape = false;
53
54    while i < bytes.len() {
55        if escape {
56            escape = false;
57            i += 1;
58            continue;
59        }
60        if in_string {
61            // SIMD jump to next `\\` or `"` (the only state-changing bytes
62            // inside a string literal).
63            let rest = &bytes[i..];
64            match memchr2(b'\\', b'"', rest) {
65                Some(off) => {
66                    i += off;
67                    match bytes[i] {
68                        b'\\' => { escape = true;     i += 1; }
69                        b'"'  => { in_string = false; i += 1; }
70                        _     => unreachable!(),
71                    }
72                }
73                None => break,
74            }
75        } else {
76            // SIMD jump to next `"` — only positions where a key literal
77            // can start (or where a top-level string value can open).
78            let rest = &bytes[i..];
79            match memchr(b'"', rest) {
80                Some(off) => {
81                    let q = i + off;
82                    if q + needle_b.len() <= bytes.len()
83                        && &bytes[q..q + needle_b.len()] == needle_b {
84                        out.push(q);
85                        // Past `"key":` we remain outside any string,
86                        // pointing at the start of the value.
87                        i = q + needle_b.len();
88                    } else {
89                        in_string = true;
90                        i = q + 1;
91                    }
92                }
93                None => break,
94            }
95        }
96    }
97    out
98}
99
100/// Extract every value paired with `key` at any depth.  Uses
101/// `find_key_positions` to locate each `"key":` site and then parses the
102/// single value that follows via a streaming `serde_json::Deserializer`
103/// (stops at the end of the first value — not the whole document).
104///
105/// Parse failures are silently skipped; they should never arise on
106/// valid JSON input, and we refuse to panic on malformed payloads.
107pub fn extract_values(bytes: &[u8], key: &str) -> Vec<Value> {
108    let positions = find_key_positions(bytes, key);
109    let mut out = Vec::with_capacity(positions.len());
110    let prefix_len = key.len() + 3; // `"key":`
111    for pos in positions {
112        let mut start = pos + prefix_len;
113        while start < bytes.len()
114            && matches!(bytes[start], b' ' | b'\t' | b'\n' | b'\r') {
115            start += 1;
116        }
117        if start >= bytes.len() { continue; }
118        let mut stream = serde_json::Deserializer::from_slice(&bytes[start..])
119            .into_iter::<Value>();
120        if let Some(Ok(v)) = stream.next() {
121            out.push(v);
122        }
123    }
124    out
125}
126
127/// Span of a single JSON value in `bytes`: start offset inclusive,
128/// end offset exclusive.  Produced by `find_key_value_spans`; the caller
129/// may compare raw bytes against a literal without allocating a `Value`.
130#[derive(Debug, Clone, Copy)]
131pub struct ValueSpan {
132    pub start: usize,
133    pub end:   usize,
134}
135
136/// Early-exit variant of `find_key_value_spans` — returns the first
137/// span paired with `key` encountered in document order, or `None` if
138/// the key does not appear.  Powers the `Descendant(k) + .first()`
139/// fast path: walks only as far as needed to find one match, rather
140/// than scanning the entire byte buffer.
141pub fn find_first_key_value_span(bytes: &[u8], key: &str) -> Option<ValueSpan> {
142    let needle = {
143        let mut s = String::with_capacity(key.len() + 3);
144        s.push('"');
145        s.push_str(key);
146        s.push_str("\":");
147        s
148    };
149    let needle_b = needle.as_bytes();
150    if needle_b.len() > bytes.len() { return None; }
151    let prefix_len = needle_b.len();
152
153    let mut i = 0usize;
154    let mut in_string = false;
155    let mut escape = false;
156
157    while i < bytes.len() {
158        if escape { escape = false; i += 1; continue; }
159        if in_string {
160            let rest = &bytes[i..];
161            match memchr2(b'\\', b'"', rest) {
162                Some(off) => {
163                    i += off;
164                    match bytes[i] {
165                        b'\\' => { escape = true;     i += 1; }
166                        b'"'  => { in_string = false; i += 1; }
167                        _     => unreachable!(),
168                    }
169                }
170                None => return None,
171            }
172        } else {
173            let rest = &bytes[i..];
174            match memchr(b'"', rest) {
175                Some(off) => {
176                    let q = i + off;
177                    if q + prefix_len <= bytes.len()
178                        && &bytes[q..q + prefix_len] == needle_b
179                    {
180                        let mut start = q + prefix_len;
181                        while start < bytes.len()
182                            && matches!(bytes[start], b' ' | b'\t' | b'\n' | b'\r')
183                        { start += 1; }
184                        if start >= bytes.len() { return None; }
185                        return value_end(bytes, start)
186                            .map(|end| ValueSpan { start, end });
187                    } else {
188                        in_string = true;
189                        i = q + 1;
190                    }
191                }
192                None => return None,
193            }
194        }
195    }
196    None
197}
198
199/// Locate the byte span of every value paired with `key`.  Skips
200/// whitespace between `:` and the value and then walks the value to its
201/// end — strings obey escape rules, containers track nesting depth,
202/// scalars run until the next structural terminator.
203pub fn find_key_value_spans(bytes: &[u8], key: &str) -> Vec<ValueSpan> {
204    let positions = find_key_positions(bytes, key);
205    let prefix_len = key.len() + 3;
206    let mut out = Vec::with_capacity(positions.len());
207    for pos in positions {
208        let mut start = pos + prefix_len;
209        while start < bytes.len()
210            && matches!(bytes[start], b' ' | b'\t' | b'\n' | b'\r') {
211            start += 1;
212        }
213        if start >= bytes.len() { continue; }
214        if let Some(end) = value_end(bytes, start) {
215            out.push(ValueSpan { start, end });
216        }
217    }
218    out
219}
220
221/// Extract the parsed `Value` for every `key` site whose raw bytes
222/// equal `lit`.  Matches by bytewise equality on the span — safe for
223/// JSON primitives (strings, numbers, bools, null) which serialise
224/// canonically, not for objects/arrays.  Non-matching sites are
225/// skipped without paying the `serde_json` parse cost.
226pub fn extract_values_eq(bytes: &[u8], key: &str, lit: &[u8]) -> Vec<Value> {
227    let mut out = Vec::new();
228    for span in find_key_value_spans(bytes, key) {
229        if span.end - span.start == lit.len()
230            && &bytes[span.start..span.end] == lit
231        {
232            if let Ok(v) = serde_json::from_slice::<Value>(&bytes[span.start..span.end]) {
233                out.push(v);
234            }
235        }
236    }
237    out
238}
239
240/// Extract every value for `key` **whose raw bytes equal `lit`** after
241/// trimming leading whitespace.  `lit` is expected to be pre-serialised
242/// JSON (e.g. `br#""action""#`, `b"42"`).  Bytewise comparison is safe
243/// for JSON primitives with canonical serialisation; it is *not* correct
244/// for objects/arrays where key order or whitespace may differ.
245///
246/// Skips non-matching sites entirely — no `Value` allocation.
247pub fn count_key_value_eq(bytes: &[u8], key: &str, lit: &[u8]) -> usize {
248    let mut n = 0;
249    for span in find_key_value_spans(bytes, key) {
250        if span.end - span.start == lit.len()
251            && &bytes[span.start..span.end] == lit
252        {
253            n += 1;
254        }
255    }
256    n
257}
258
259/// Locate the byte span of every **enclosing object** whose `key` field
260/// equals the canonical-serialised literal `lit`.  Powers the SIMD fast
261/// path for `$..find(@.key == lit)`.
262///
263/// Implementation: single forward pass.  An explicit stack tracks the
264/// start position of every currently-open `{`.  When `"key":` is encountered
265/// at the top object and the following value bytes equal `lit`, the top
266/// frame is flagged.  On matching `}`, flagged frames emit a `ValueSpan`
267/// covering the object.  Output is sorted by `start` so order matches the
268/// DFS pre-order of the tree walker.
269///
270/// Bytewise literal comparison is safe for JSON primitives (int, string,
271/// bool, null) because they serialise canonically.  It is **not** correct
272/// for floats (`1.0` / `1` representation variance) or structured values
273/// (object key order / whitespace) — callers must reject those literals.
274pub fn find_enclosing_objects_eq(bytes: &[u8], key: &str, lit: &[u8]) -> Vec<ValueSpan> {
275    let needle = {
276        let mut s = String::with_capacity(key.len() + 3);
277        s.push('"');
278        s.push_str(key);
279        s.push_str("\":");
280        s
281    };
282    let needle_b = needle.as_bytes();
283    let mut out = Vec::new();
284    let mut stack: Vec<(usize, bool)> = Vec::new();
285    let mut i = 0usize;
286    let mut in_string = false;
287    let mut escape = false;
288
289    while i < bytes.len() {
290        if escape { escape = false; i += 1; continue; }
291        if in_string {
292            let rest = &bytes[i..];
293            match memchr2(b'\\', b'"', rest) {
294                Some(off) => {
295                    i += off;
296                    match bytes[i] {
297                        b'\\' => { escape = true;     i += 1; }
298                        b'"'  => { in_string = false; i += 1; }
299                        _     => unreachable!(),
300                    }
301                }
302                None => break,
303            }
304            continue;
305        }
306        match bytes[i] {
307            b'{' => { stack.push((i, false)); i += 1; }
308            b'}' => {
309                if let Some((start, matched)) = stack.pop() {
310                    if matched { out.push(ValueSpan { start, end: i + 1 }); }
311                }
312                i += 1;
313            }
314            b'"' => {
315                if i + needle_b.len() <= bytes.len()
316                    && &bytes[i..i + needle_b.len()] == needle_b
317                {
318                    let mut vs = i + needle_b.len();
319                    while vs < bytes.len()
320                        && matches!(bytes[vs], b' ' | b'\t' | b'\n' | b'\r')
321                    { vs += 1; }
322                    if let Some(ve) = value_end(bytes, vs) {
323                        if ve - vs == lit.len() && &bytes[vs..ve] == lit {
324                            if let Some(top) = stack.last_mut() { top.1 = true; }
325                        }
326                        i = ve;
327                    } else {
328                        i = vs;
329                    }
330                } else {
331                    in_string = true;
332                    i += 1;
333                }
334            }
335            _ => i += 1,
336        }
337    }
338
339    out.sort_by_key(|s| s.start);
340    out
341}
342
343/// Like `find_enclosing_objects_eq` but accepts N `(key, lit)` conjuncts.
344/// An object is emitted iff it *directly* contains every listed key with
345/// the matching canonical literal value.  Each frame carries a bitmask of
346/// which conjuncts have matched so far (max 64 conjuncts).
347pub fn find_enclosing_objects_eq_multi(
348    bytes: &[u8],
349    conjuncts: &[(String, Vec<u8>)],
350) -> Vec<ValueSpan> {
351    assert!(conjuncts.len() <= 64, "at most 64 conjuncts supported");
352    if conjuncts.is_empty() { return Vec::new(); }
353
354    // Pre-build each needle as `"<key>":` so we compare against a
355    // contiguous slice at the current cursor.
356    let needles: Vec<Vec<u8>> = conjuncts.iter().map(|(k, _)| {
357        let mut s = Vec::with_capacity(k.len() + 3);
358        s.push(b'"');
359        s.extend_from_slice(k.as_bytes());
360        s.extend_from_slice(b"\":");
361        s
362    }).collect();
363    let full_mask: u64 = if conjuncts.len() == 64 { u64::MAX }
364                         else { (1u64 << conjuncts.len()) - 1 };
365
366    let mut out = Vec::new();
367    let mut stack: Vec<(usize, u64)> = Vec::new();
368    let mut i = 0usize;
369    let mut in_string = false;
370    let mut escape = false;
371
372    while i < bytes.len() {
373        if escape { escape = false; i += 1; continue; }
374        if in_string {
375            let rest = &bytes[i..];
376            match memchr2(b'\\', b'"', rest) {
377                Some(off) => {
378                    i += off;
379                    match bytes[i] {
380                        b'\\' => { escape = true;     i += 1; }
381                        b'"'  => { in_string = false; i += 1; }
382                        _     => unreachable!(),
383                    }
384                }
385                None => break,
386            }
387            continue;
388        }
389        match bytes[i] {
390            b'{' => { stack.push((i, 0u64)); i += 1; }
391            b'}' => {
392                if let Some((start, mask)) = stack.pop() {
393                    if mask == full_mask {
394                        out.push(ValueSpan { start, end: i + 1 });
395                    }
396                }
397                i += 1;
398            }
399            b'"' => {
400                let mut matched_idx: Option<usize> = None;
401                for (idx, nb) in needles.iter().enumerate() {
402                    if i + nb.len() <= bytes.len() && &bytes[i..i + nb.len()] == &nb[..] {
403                        matched_idx = Some(idx);
404                        break;
405                    }
406                }
407                if let Some(idx) = matched_idx {
408                    let nb = &needles[idx];
409                    let mut vs = i + nb.len();
410                    while vs < bytes.len()
411                        && matches!(bytes[vs], b' ' | b'\t' | b'\n' | b'\r')
412                    { vs += 1; }
413                    if let Some(ve) = value_end(bytes, vs) {
414                        let lit = &conjuncts[idx].1;
415                        if ve - vs == lit.len() && &bytes[vs..ve] == &lit[..] {
416                            if let Some(top) = stack.last_mut() {
417                                top.1 |= 1u64 << idx;
418                            }
419                        }
420                        i = ve;
421                    } else {
422                        i = vs;
423                    }
424                } else {
425                    in_string = true;
426                    i += 1;
427                }
428            }
429            _ => i += 1,
430        }
431    }
432
433    out.sort_by_key(|s| s.start);
434    out
435}
436
437/// Comparison operator for numeric-range byte scans.  Mirrors the subset of
438/// `ast::BinOp` that makes sense against a canonical JSON number literal.
439#[derive(Debug, Clone, Copy, PartialEq, Eq)]
440pub enum ScanCmp { Lt, Lte, Gt, Gte }
441
442impl ScanCmp {
443    #[inline]
444    fn holds(self, lhs: f64, rhs: f64) -> bool {
445        match self {
446            ScanCmp::Lt  => lhs <  rhs,
447            ScanCmp::Lte => lhs <= rhs,
448            ScanCmp::Gt  => lhs >  rhs,
449            ScanCmp::Gte => lhs >= rhs,
450        }
451    }
452}
453
454/// A single predicate against the value paired with a key inside an
455/// enclosing object.  Drives `find_enclosing_objects_mixed`.
456#[derive(Debug, Clone)]
457pub enum ScanPred {
458    /// Bytewise equality against a canonical JSON literal (int/string/
459    /// bool/null — same shape as `find_enclosing_objects_eq_multi`).
460    Eq(Vec<u8>),
461    /// Numeric comparison: value parsed as f64 then `op` applied vs
462    /// `threshold`.  Non-numeric values do not match.
463    Cmp(ScanCmp, f64),
464}
465
466/// Locate the byte span of every enclosing object whose `key` field is a
467/// JSON number satisfying `op threshold`.  Powers the fast path for
468/// `$..find(@.key op num)` where `op` ∈ `<`, `<=`, `>`, `>=`.
469///
470/// Matches the shape of `find_enclosing_objects_eq` — single forward pass,
471/// stack of opened `{` frames, flag-on-match, emit-on-close, output sorted
472/// by start offset so order mirrors the tree walker's DFS pre-order.
473///
474/// The value is byte-parsed via `parse_num_span`.  Non-numeric values and
475/// malformed numbers are skipped (the conjunct simply doesn't fire).
476pub fn find_enclosing_objects_cmp(
477    bytes: &[u8],
478    key: &str,
479    op: ScanCmp,
480    threshold: f64,
481) -> Vec<ValueSpan> {
482    let needle = {
483        let mut s = String::with_capacity(key.len() + 3);
484        s.push('"');
485        s.push_str(key);
486        s.push_str("\":");
487        s
488    };
489    let needle_b = needle.as_bytes();
490    let mut out = Vec::new();
491    let mut stack: Vec<(usize, bool)> = Vec::new();
492    let mut i = 0usize;
493    let mut in_string = false;
494    let mut escape = false;
495
496    while i < bytes.len() {
497        if escape { escape = false; i += 1; continue; }
498        if in_string {
499            let rest = &bytes[i..];
500            match memchr2(b'\\', b'"', rest) {
501                Some(off) => {
502                    i += off;
503                    match bytes[i] {
504                        b'\\' => { escape = true;     i += 1; }
505                        b'"'  => { in_string = false; i += 1; }
506                        _     => unreachable!(),
507                    }
508                }
509                None => break,
510            }
511            continue;
512        }
513        match bytes[i] {
514            b'{' => { stack.push((i, false)); i += 1; }
515            b'}' => {
516                if let Some((start, matched)) = stack.pop() {
517                    if matched { out.push(ValueSpan { start, end: i + 1 }); }
518                }
519                i += 1;
520            }
521            b'"' => {
522                if i + needle_b.len() <= bytes.len()
523                    && &bytes[i..i + needle_b.len()] == needle_b
524                {
525                    let mut vs = i + needle_b.len();
526                    while vs < bytes.len()
527                        && matches!(bytes[vs], b' ' | b'\t' | b'\n' | b'\r')
528                    { vs += 1; }
529                    if let Some(ve) = value_end(bytes, vs) {
530                        if let Some((_, as_f, _)) = parse_num_span(&bytes[vs..ve]) {
531                            if op.holds(as_f, threshold) {
532                                if let Some(top) = stack.last_mut() { top.1 = true; }
533                            }
534                        }
535                        i = ve;
536                    } else {
537                        i = vs;
538                    }
539                } else {
540                    in_string = true;
541                    i += 1;
542                }
543            }
544            _ => i += 1,
545        }
546    }
547
548    out.sort_by_key(|s| s.start);
549    out
550}
551
552/// Extract the span of the *direct* child named `key` inside an object
553/// whose bytes span is `obj_bytes[0] == b'{'`.  Depth-aware: matches
554/// only keys at the top level of the object, not keys nested inside
555/// arrays or sub-objects.  Returned span is relative to `obj_bytes`.
556pub fn find_direct_field(obj_bytes: &[u8], key: &str) -> Option<ValueSpan> {
557    if obj_bytes.is_empty() || obj_bytes[0] != b'{' { return None; }
558    let needle = {
559        let mut s = String::with_capacity(key.len() + 3);
560        s.push('"');
561        s.push_str(key);
562        s.push_str("\":");
563        s
564    };
565    let needle_b = needle.as_bytes();
566    let mut depth: usize = 0;
567    let mut i = 0usize;
568    let mut in_string = false;
569    let mut escape = false;
570    while i < obj_bytes.len() {
571        if escape { escape = false; i += 1; continue; }
572        if in_string {
573            match memchr2(b'\\', b'"', &obj_bytes[i..]) {
574                Some(off) => {
575                    i += off;
576                    match obj_bytes[i] {
577                        b'\\' => { escape = true;     i += 1; }
578                        b'"'  => { in_string = false; i += 1; }
579                        _     => unreachable!(),
580                    }
581                }
582                None => return None,
583            }
584            continue;
585        }
586        match obj_bytes[i] {
587            b'{' | b'[' => { depth += 1; i += 1; }
588            b'}' | b']' => {
589                if depth == 0 { return None; }
590                depth -= 1;
591                i += 1;
592            }
593            b'"' => {
594                if depth == 1
595                    && i + needle_b.len() <= obj_bytes.len()
596                    && &obj_bytes[i..i + needle_b.len()] == needle_b
597                {
598                    let mut vs = i + needle_b.len();
599                    while vs < obj_bytes.len()
600                        && matches!(obj_bytes[vs], b' ' | b'\t' | b'\n' | b'\r')
601                    { vs += 1; }
602                    return value_end(obj_bytes, vs)
603                        .map(|end| ValueSpan { start: vs, end });
604                }
605                in_string = true;
606                i += 1;
607            }
608            _ => i += 1,
609        }
610    }
611    None
612}
613
614/// Mixed multi-conjunct scan: each conjunct is `(key, ScanPred)` and an
615/// enclosing object is emitted iff every conjunct matches on the same
616/// `{...}` frame.  Generalises `find_enclosing_objects_eq_multi` to allow
617/// equality literals and numeric-range comparisons in the same query.
618/// Frames carry a bitmask of satisfied conjuncts (max 64).
619pub fn find_enclosing_objects_mixed(
620    bytes: &[u8],
621    conjuncts: &[(String, ScanPred)],
622) -> Vec<ValueSpan> {
623    assert!(conjuncts.len() <= 64, "at most 64 conjuncts supported");
624    if conjuncts.is_empty() { return Vec::new(); }
625
626    let needles: Vec<Vec<u8>> = conjuncts.iter().map(|(k, _)| {
627        let mut s = Vec::with_capacity(k.len() + 3);
628        s.push(b'"');
629        s.extend_from_slice(k.as_bytes());
630        s.extend_from_slice(b"\":");
631        s
632    }).collect();
633    let full_mask: u64 = if conjuncts.len() == 64 { u64::MAX }
634                         else { (1u64 << conjuncts.len()) - 1 };
635
636    let mut out = Vec::new();
637    let mut stack: Vec<(usize, u64)> = Vec::new();
638    let mut i = 0usize;
639    let mut in_string = false;
640    let mut escape = false;
641
642    while i < bytes.len() {
643        if escape { escape = false; i += 1; continue; }
644        if in_string {
645            let rest = &bytes[i..];
646            match memchr2(b'\\', b'"', rest) {
647                Some(off) => {
648                    i += off;
649                    match bytes[i] {
650                        b'\\' => { escape = true;     i += 1; }
651                        b'"'  => { in_string = false; i += 1; }
652                        _     => unreachable!(),
653                    }
654                }
655                None => break,
656            }
657            continue;
658        }
659        match bytes[i] {
660            b'{' => { stack.push((i, 0u64)); i += 1; }
661            b'}' => {
662                if let Some((start, mask)) = stack.pop() {
663                    if mask == full_mask {
664                        out.push(ValueSpan { start, end: i + 1 });
665                    }
666                }
667                i += 1;
668            }
669            b'"' => {
670                let mut matched_idx: Option<usize> = None;
671                for (idx, nb) in needles.iter().enumerate() {
672                    if i + nb.len() <= bytes.len() && &bytes[i..i + nb.len()] == &nb[..] {
673                        matched_idx = Some(idx);
674                        break;
675                    }
676                }
677                if let Some(idx) = matched_idx {
678                    let nb = &needles[idx];
679                    let mut vs = i + nb.len();
680                    while vs < bytes.len()
681                        && matches!(bytes[vs], b' ' | b'\t' | b'\n' | b'\r')
682                    { vs += 1; }
683                    if let Some(ve) = value_end(bytes, vs) {
684                        let fires = match &conjuncts[idx].1 {
685                            ScanPred::Eq(lit) =>
686                                ve - vs == lit.len() && &bytes[vs..ve] == &lit[..],
687                            ScanPred::Cmp(op, thresh) =>
688                                parse_num_span(&bytes[vs..ve])
689                                    .map(|(_, f, _)| op.holds(f, *thresh))
690                                    .unwrap_or(false),
691                        };
692                        if fires {
693                            if let Some(top) = stack.last_mut() {
694                                top.1 |= 1u64 << idx;
695                            }
696                        }
697                        i = ve;
698                    } else {
699                        i = vs;
700                    }
701                } else {
702                    in_string = true;
703                    i += 1;
704                }
705            }
706            _ => i += 1,
707        }
708    }
709
710    out.sort_by_key(|s| s.start);
711    out
712}
713
714/// Fold numeric values over `spans` into `(int_sum, float_sum, is_float, n)`.
715/// Integer spans accumulate into `int_sum`; a single float promotes the
716/// whole fold to `float_sum` (which tracks the running total as f64).
717/// Spans that don't parse as numbers are skipped.
718#[derive(Debug, Clone, Copy, Default)]
719pub struct NumFold {
720    pub int_sum:   i64,
721    pub float_sum: f64,
722    pub is_float:  bool,
723    pub count:     usize,
724    pub min_i:     i64,
725    pub max_i:     i64,
726    pub min_f:     f64,
727    pub max_f:     f64,
728    pub any:       bool,
729}
730
731/// Parse a span of JSON numeric bytes. Returns Some((as_i64, as_f64, is_int))
732/// or None if not a valid number. Canonical JSON numbers only: `-?\d+(\.\d+)?(e±\d+)?`.
733#[inline]
734pub fn parse_num_span(s: &[u8]) -> Option<(i64, f64, bool)> {
735    let s = std::str::from_utf8(s).ok()?;
736    // Integer-looking path (no '.', 'e', 'E') — try i64 first.
737    let has_frac_or_exp = s.bytes().any(|b| matches!(b, b'.' | b'e' | b'E'));
738    if !has_frac_or_exp {
739        if let Ok(n) = s.parse::<i64>() {
740            return Some((n, n as f64, true));
741        }
742    }
743    s.parse::<f64>().ok().map(|f| (f as i64, f, false))
744}
745
746/// Fold numeric spans for sum/avg/min/max. Walks each span, parses as
747/// number, updates the accumulators. Non-numeric spans are skipped.
748pub fn fold_nums(bytes: &[u8], spans: &[ValueSpan]) -> NumFold {
749    let mut f = NumFold::default();
750    for s in spans {
751        let slice = &bytes[s.start..s.end];
752        let Some((i, x, is_int)) = parse_num_span(slice) else { continue };
753        f.count += 1;
754        if !f.any {
755            f.any = true;
756            f.min_i = i; f.max_i = i;
757            f.min_f = x; f.max_f = x;
758        } else {
759            if i < f.min_i { f.min_i = i; }
760            if i > f.max_i { f.max_i = i; }
761            if x < f.min_f { f.min_f = x; }
762            if x > f.max_f { f.max_f = x; }
763        }
764        if is_int && !f.is_float {
765            f.int_sum = f.int_sum.wrapping_add(i);
766            f.float_sum += x;
767        } else {
768            if !f.is_float { f.float_sum = f.int_sum as f64; f.is_float = true; }
769            f.float_sum += x;
770        }
771    }
772    f
773}
774
775/// Fold the direct child named `key` of each enclosing object span
776/// into a single `NumFold`.  Combines `find_direct_field` +
777/// `parse_num_span` without materialising any intermediate `Val`.
778/// Spans missing the key or whose value is non-numeric are skipped.
779pub fn fold_direct_field_nums(
780    bytes: &[u8], spans: &[ValueSpan], key: &str,
781) -> NumFold {
782    let mut f = NumFold::default();
783    for s in spans {
784        let obj_bytes = &bytes[s.start..s.end];
785        let Some(vs) = find_direct_field(obj_bytes, key) else { continue };
786        let Some((i, x, is_int)) = parse_num_span(&obj_bytes[vs.start..vs.end])
787            else { continue };
788        f.count += 1;
789        if !f.any {
790            f.any = true;
791            f.min_i = i; f.max_i = i;
792            f.min_f = x; f.max_f = x;
793        } else {
794            if i < f.min_i { f.min_i = i; }
795            if i > f.max_i { f.max_i = i; }
796            if x < f.min_f { f.min_f = x; }
797            if x > f.max_f { f.max_f = x; }
798        }
799        if is_int && !f.is_float {
800            f.int_sum = f.int_sum.wrapping_add(i);
801            f.float_sum += x;
802        } else {
803            if !f.is_float { f.float_sum = f.int_sum as f64; f.is_float = true; }
804            f.float_sum += x;
805        }
806    }
807    f
808}
809
810/// Walk a JSON value starting at `start`, return the exclusive end offset.
811/// Returns `None` on malformed input (missing close, truncated literal).
812fn value_end(bytes: &[u8], start: usize) -> Option<usize> {
813    if start >= bytes.len() { return None; }
814    match bytes[start] {
815        b'"' => {
816            // Walk the string respecting escapes.
817            let mut i = start + 1;
818            let mut escape = false;
819            while i < bytes.len() {
820                if escape { escape = false; i += 1; continue; }
821                match bytes[i] {
822                    b'\\' => { escape = true; i += 1; }
823                    b'"'  => return Some(i + 1),
824                    _     => i += 1,
825                }
826            }
827            None
828        }
829        b'{' | b'[' => {
830            let open = bytes[start];
831            let close = if open == b'{' { b'}' } else { b']' };
832            let mut depth = 1usize;
833            let mut i = start + 1;
834            let mut in_string = false;
835            let mut escape = false;
836            while i < bytes.len() {
837                let b = bytes[i];
838                if escape { escape = false; i += 1; continue; }
839                if in_string {
840                    match b {
841                        b'\\' => escape = true,
842                        b'"'  => in_string = false,
843                        _     => {}
844                    }
845                    i += 1;
846                    continue;
847                }
848                match b {
849                    b'"' => in_string = true,
850                    c if c == open  => depth += 1,
851                    c if c == close => {
852                        depth -= 1;
853                        if depth == 0 { return Some(i + 1); }
854                    }
855                    _ => {}
856                }
857                i += 1;
858            }
859            None
860        }
861        _ => {
862            // Scalar (number / bool / null) — scan until structural terminator.
863            let mut i = start;
864            while i < bytes.len() {
865                match bytes[i] {
866                    b',' | b'}' | b']' | b' ' | b'\t' | b'\n' | b'\r' => break,
867                    _ => i += 1,
868                }
869            }
870            if i == start { None } else { Some(i) }
871        }
872    }
873}
874
875#[cfg(test)]
876mod tests {
877    use super::*;
878
879    #[test]
880    fn finds_top_level_key() {
881        let doc = br#"{"test": 42, "other": 7}"#;
882        let pos = find_key_positions(doc, "test");
883        assert_eq!(pos, vec![1]);
884    }
885
886    #[test]
887    fn finds_nested_keys() {
888        let doc = br#"{"a":{"test":1},"b":[{"test":2},{"test":3}]}"#;
889        let pos = find_key_positions(doc, "test");
890        assert_eq!(pos.len(), 3);
891    }
892
893    #[test]
894    fn ignores_hits_inside_string_values() {
895        let doc = br#"{"comment":"the \"test\": lie","test":99}"#;
896        let vals = extract_values(doc, "test");
897        assert_eq!(vals, vec![serde_json::json!(99)]);
898    }
899
900    #[test]
901    fn does_not_match_longer_key_suffix() {
902        let doc = br#"{"nottest":1,"test":2}"#;
903        let vals = extract_values(doc, "test");
904        assert_eq!(vals, vec![serde_json::json!(2)]);
905    }
906
907    #[test]
908    fn handles_escaped_backslash_then_quote() {
909        // Backslash escapes itself: `"c:\\"` closes normally.  The
910        // subsequent `"test":` must then be recognised.
911        let doc = br#"{"path":"c:\\","test":"ok"}"#;
912        let vals = extract_values(doc, "test");
913        assert_eq!(vals, vec![serde_json::json!("ok")]);
914    }
915
916    #[test]
917    fn empty_on_missing_key() {
918        let doc = br#"{"a":1,"b":2}"#;
919        assert!(find_key_positions(doc, "zzz").is_empty());
920        assert!(extract_values(doc, "zzz").is_empty());
921    }
922
923    #[test]
924    fn extracts_nested_object_value() {
925        let doc = br#"{"test":{"nested":[1,2,3]}}"#;
926        let vals = extract_values(doc, "test");
927        assert_eq!(vals, vec![serde_json::json!({"nested":[1,2,3]})]);
928    }
929
930    #[test]
931    fn extracts_all_nested_hits_in_order() {
932        let doc = br#"{"a":{"test":1},"b":[{"test":2},{"test":3}]}"#;
933        let vals = extract_values(doc, "test");
934        assert_eq!(vals, vec![
935            serde_json::json!(1),
936            serde_json::json!(2),
937            serde_json::json!(3),
938        ]);
939    }
940
941    #[test]
942    fn spans_cover_every_value_kind() {
943        let doc = br#"{"a":1,"b":"two","c":[1,2,3],"d":{"x":1},"e":true}"#;
944        let keys_and_expected: &[(&str, &[u8])] = &[
945            ("a", b"1"),
946            ("b", b"\"two\""),
947            ("c", b"[1,2,3]"),
948            ("d", b"{\"x\":1}"),
949            ("e", b"true"),
950        ];
951        for (k, want) in keys_and_expected {
952            let spans = find_key_value_spans(doc, k);
953            assert_eq!(spans.len(), 1, "key {} not found", k);
954            assert_eq!(&doc[spans[0].start..spans[0].end], *want);
955        }
956    }
957
958    #[test]
959    fn count_eq_matches_only_literal_equals() {
960        let doc = br#"{"a":[{"type":"action"},{"type":"idle"},{"type":"action"},{"type":"noop"}]}"#;
961        assert_eq!(count_key_value_eq(doc, "type", br#""action""#), 2);
962        assert_eq!(count_key_value_eq(doc, "type", br#""missing""#), 0);
963    }
964
965    #[test]
966    fn count_eq_numeric_literal() {
967        let doc = br#"{"xs":[{"n":10},{"n":42},{"n":10},{"n":42}]}"#;
968        assert_eq!(count_key_value_eq(doc, "n", b"42"), 2);
969        assert_eq!(count_key_value_eq(doc, "n", b"10"), 2);
970    }
971
972    #[test]
973    fn spans_skip_whitespace_after_colon() {
974        let doc = br#"{"a":   42   ,"b":  "x"}"#;
975        let a = find_key_value_spans(doc, "a");
976        assert_eq!(&doc[a[0].start..a[0].end], b"42");
977        let b = find_key_value_spans(doc, "b");
978        assert_eq!(&doc[b[0].start..b[0].end], b"\"x\"");
979    }
980
981    #[test]
982    fn enclosing_object_simple_match() {
983        let doc = br#"{"events":[{"type":"action","id":1},{"type":"idle","id":2},{"type":"action","id":3}]}"#;
984        let spans = find_enclosing_objects_eq(doc, "type", br#""action""#);
985        assert_eq!(spans.len(), 2);
986        let objs: Vec<_> = spans.iter()
987            .map(|s| serde_json::from_slice::<serde_json::Value>(&doc[s.start..s.end]).unwrap())
988            .collect();
989        assert_eq!(objs[0], serde_json::json!({"type":"action","id":1}));
990        assert_eq!(objs[1], serde_json::json!({"type":"action","id":3}));
991    }
992
993    #[test]
994    fn enclosing_object_nested_both_match() {
995        // Outer and inner object both have type:"x" — both must be emitted
996        // in start-offset order (matches tree walker DFS pre-order).
997        let doc = br#"{"type":"x","child":{"type":"x","n":2}}"#;
998        let spans = find_enclosing_objects_eq(doc, "type", br#""x""#);
999        assert_eq!(spans.len(), 2);
1000        assert!(spans[0].start < spans[1].start);
1001        assert_eq!(&doc[spans[0].start..spans[0].end], doc);
1002        assert_eq!(&doc[spans[1].start..spans[1].end], br#"{"type":"x","n":2}"#);
1003    }
1004
1005    #[test]
1006    fn enclosing_object_nested_inner_only() {
1007        let doc = br#"{"type":"a","child":{"type":"b","n":2}}"#;
1008        let spans = find_enclosing_objects_eq(doc, "type", br#""b""#);
1009        assert_eq!(spans.len(), 1);
1010        assert_eq!(&doc[spans[0].start..spans[0].end], br#"{"type":"b","n":2}"#);
1011    }
1012
1013    #[test]
1014    fn enclosing_object_ignores_string_value_containing_needle() {
1015        let doc = br#"{"comment":"the \"type\":\"action\" label","events":[{"type":"action"}]}"#;
1016        let spans = find_enclosing_objects_eq(doc, "type", br#""action""#);
1017        assert_eq!(spans.len(), 1);
1018        assert_eq!(&doc[spans[0].start..spans[0].end], br#"{"type":"action"}"#);
1019    }
1020
1021    #[test]
1022    fn enclosing_object_numeric_literal() {
1023        let doc = br#"[{"v":10},{"v":42},{"v":42}]"#;
1024        let spans = find_enclosing_objects_eq(doc, "v", b"42");
1025        assert_eq!(spans.len(), 2);
1026    }
1027
1028    #[test]
1029    fn enclosing_object_no_match() {
1030        let doc = br#"{"xs":[{"v":1},{"v":2}]}"#;
1031        let spans = find_enclosing_objects_eq(doc, "v", b"99");
1032        assert!(spans.is_empty());
1033    }
1034
1035    #[test]
1036    fn enclosing_object_multi_and_both_match() {
1037        let doc = br#"[{"t":"a","v":1},{"t":"a","v":2},{"t":"b","v":1}]"#;
1038        let c = vec![
1039            ("t".to_string(), br#""a""#.to_vec()),
1040            ("v".to_string(), b"1".to_vec()),
1041        ];
1042        let spans = find_enclosing_objects_eq_multi(doc, &c);
1043        assert_eq!(spans.len(), 1);
1044        assert_eq!(&doc[spans[0].start..spans[0].end], br#"{"t":"a","v":1}"#);
1045    }
1046
1047    #[test]
1048    fn enclosing_object_multi_and_nested_propagates() {
1049        // Child must match on its own; parent's fields don't leak inward.
1050        let doc = br#"{"t":"a","child":{"t":"a","v":1},"v":1}"#;
1051        let c = vec![
1052            ("t".to_string(), br#""a""#.to_vec()),
1053            ("v".to_string(), b"1".to_vec()),
1054        ];
1055        let spans = find_enclosing_objects_eq_multi(doc, &c);
1056        assert_eq!(spans.len(), 2);
1057        assert!(spans[0].start < spans[1].start);
1058    }
1059
1060    #[test]
1061    fn enclosing_object_multi_and_partial_no_match() {
1062        // Only one conjunct matches → no emit.
1063        let doc = br#"[{"t":"a","v":2},{"t":"b","v":1}]"#;
1064        let c = vec![
1065            ("t".to_string(), br#""a""#.to_vec()),
1066            ("v".to_string(), b"1".to_vec()),
1067        ];
1068        let spans = find_enclosing_objects_eq_multi(doc, &c);
1069        assert!(spans.is_empty());
1070    }
1071}