Skip to main content

jetro_experimental/
api.rs

1//! Public API surface for jetro-experimental.
2//!
3//! Designed for jetro-core and other downstream tools to consume without
4//! depending on internal column layouts.  Internals (`Stage1`,
5//! `StructIndex`, `KeyBitmaps`) remain accessible for tests/benches but
6//! should not be used by library callers.
7//!
8//! Stable types:
9//!   - [`TokenId`]            — opaque token handle (newtype around u32)
10//!   - [`TokenKind`]          — closed enum, `#[non_exhaustive]`
11//!   - [`ByteSpan`]           — byte range in source buffer
12//!   - [`BuildOptions`]       — knobs for partial builds
13//!   - [`StructuralIndex`]    — opaque facade over the internal columns
14//!   - [`KeyHits`]            — lazy iterator over key matches
15//!   - [`Error`]              — public error enum
16//!
17//! Stable entry points:
18//!   - [`from_bytes`]         — build a structural index from JSON bytes
19//!
20//! Stable fused query helpers:
21//!   - [`find_eq`]            — `$..find(key == literal)`
22//!   - [`count_key`]          — popcount-only count of `$..find(key)` hits
23//!   - [`find_eq_compound`]   — multi-key AND
24//!   - [`json_string_eq`]     — byte-compare a JSON string value to a plain
25//!                               literal
26//!
27//! All iterators are lazy.  Memory layout is private; future refactors can
28//! swap Roaring for any other bitmap library without breaking callers.
29
30use std::sync::Arc;
31
32use crate::index::StructIndex;
33use crate::keys::{value_for_key as keys_value_for_key, KeyBitmaps, Role};
34use crate::stage1::Kind;
35
36/// Opaque token handle.  Internally a `u32` index into the structural index.
37#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
38pub struct TokenId(pub(crate) u32);
39
40impl TokenId {
41    #[inline]
42    pub fn raw(self) -> u32 {
43        self.0
44    }
45}
46
47impl From<u32> for TokenId {
48    #[inline]
49    fn from(v: u32) -> Self {
50        Self(v)
51    }
52}
53
54#[derive(Copy, Clone, Eq, PartialEq, Debug)]
55#[non_exhaustive]
56pub enum TokenKind {
57    Object,
58    Array,
59    Key,
60    String,
61    Scalar,
62    ObjectEnd,
63    ArrayEnd,
64}
65
66#[derive(Copy, Clone, Eq, PartialEq, Debug)]
67pub struct ByteSpan {
68    pub start: u32,
69    pub end: u32,
70}
71
72impl ByteSpan {
73    #[inline]
74    pub fn len(self) -> u32 {
75        self.end.saturating_sub(self.start)
76    }
77    #[inline]
78    pub fn is_empty(self) -> bool {
79        self.end <= self.start
80    }
81    #[inline]
82    pub fn slice<'a>(self, bytes: &'a [u8]) -> &'a [u8] {
83        &bytes[self.start as usize..self.end as usize]
84    }
85}
86
87/// Public error enum.  Implements `std::error::Error`.
88#[derive(Debug)]
89pub enum Error {
90    Parse(String),
91    UnbalancedClose,
92    Truncated,
93    InvalidUtf8,
94}
95
96impl std::fmt::Display for Error {
97    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
98        match self {
99            Error::Parse(s) => write!(f, "parse error: {s}"),
100            Error::UnbalancedClose => write!(f, "unbalanced container close"),
101            Error::Truncated => write!(f, "truncated input"),
102            Error::InvalidUtf8 => write!(f, "invalid UTF-8"),
103        }
104    }
105}
106
107impl std::error::Error for Error {}
108
109#[derive(Clone, Debug)]
110#[non_exhaustive]
111pub struct BuildOptions {
112    pub keys: bool,
113    pub roles: bool,
114    pub close_of: bool,
115    pub tape_alignment: bool,
116}
117
118impl Default for BuildOptions {
119    fn default() -> Self {
120        Self {
121            keys: true,
122            roles: true,
123            close_of: true,
124            tape_alignment: false,
125        }
126    }
127}
128
129impl BuildOptions {
130    pub fn minimal() -> Self {
131        Self {
132            keys: false,
133            roles: false,
134            close_of: false,
135            tape_alignment: false,
136        }
137    }
138
139    pub fn keys_only() -> Self {
140        Self {
141            keys: true,
142            roles: true,
143            close_of: false,
144            tape_alignment: false,
145        }
146    }
147
148    pub fn for_jetro_tape() -> Self {
149        Self {
150            keys: true,
151            roles: true,
152            close_of: true,
153            tape_alignment: true,
154        }
155    }
156}
157
158
159/// Opaque structural index over a JSON document.  Internal layout is
160/// subject to change; consume only via the public methods.
161pub struct StructuralIndex {
162    pub(crate) inner: Arc<Inner>,
163}
164
165pub(crate) struct Inner {
166    pub idx: StructIndex,
167    pub keys: Option<KeyBitmaps>,
168}
169
170// Compile-time assertions that the public API surface is thread-safe.
171// `StructuralIndex` is just an `Arc<Inner>`; both `StructIndex` and
172// `KeyBitmaps` are owned-data structures (Vec / HashMap / Box<str> /
173// croaring::Bitmap) which are all `Send + Sync`.
174const _: fn() = || {
175    fn assert_send<T: Send>() {}
176    fn assert_sync<T: Sync>() {}
177    assert_send::<StructuralIndex>();
178    assert_sync::<StructuralIndex>();
179    assert_send::<TokenId>();
180    assert_sync::<TokenId>();
181    assert_send::<ByteSpan>();
182    assert_sync::<ByteSpan>();
183    assert_send::<Error>();
184    assert_sync::<Error>();
185    assert_send::<BuildOptions>();
186    assert_sync::<BuildOptions>();
187};
188
189impl StructuralIndex {
190    /// Total number of stage1 tokens covered by this index.
191    pub fn token_count(&self) -> u32 {
192        self.inner.idx.stage1.len() as u32
193    }
194
195    /// Maximum nesting depth observed in the source document.
196    pub fn max_depth(&self) -> u16 {
197        self.inner.idx.stage1.depth.iter().copied().max().unwrap_or(0)
198    }
199
200    /// All tokens in document order.
201    pub fn tokens(&self) -> Tokens<'_> {
202        Tokens {
203            idx: self,
204            cur: 0,
205            end: self.token_count(),
206        }
207    }
208
209    /// Classify a token as Object/Array/Key/String/Scalar/etc.  Reads the
210    /// stage1 kind plus (when keys are built) the role to disambiguate
211    /// Quote-as-Key from Quote-as-Value.
212    #[inline]
213    pub fn kind(&self, tok: TokenId) -> TokenKind {
214        let i = tok.0 as usize;
215        let k = self.inner.idx.stage1.kind[i];
216        let role = self
217            .inner
218            .keys
219            .as_ref()
220            .map(|kb| kb.role[i])
221            .unwrap_or(Role::None);
222        match (k, role) {
223            (Kind::ObjOpen, _) => TokenKind::Object,
224            (Kind::ArrOpen, _) => TokenKind::Array,
225            (Kind::ObjClose, _) => TokenKind::ObjectEnd,
226            (Kind::ArrClose, _) => TokenKind::ArrayEnd,
227            (Kind::Quote, Role::Key) => TokenKind::Key,
228            (Kind::Quote, _) => TokenKind::String,
229            (Kind::Scalar, _) => TokenKind::Scalar,
230            (Kind::Colon | Kind::Comma, _) => TokenKind::Scalar, // unreachable in practice
231        }
232    }
233
234    /// Nesting depth of a token (root = 0).
235    #[inline]
236    pub fn depth(&self, tok: TokenId) -> u16 {
237        self.inner.idx.stage1.depth[tok.0 as usize]
238    }
239
240    /// Byte offset of a token's first byte in the source buffer.
241    #[inline]
242    pub fn byte_offset(&self, tok: TokenId) -> u32 {
243        self.inner.idx.stage1.offset[tok.0 as usize]
244    }
245
246    /// Byte span of this token in the source.
247    ///
248    /// **Container** (`Object` / `Array`): `[open_off, close_off+1)`.
249    /// **Container close**: single byte.
250    /// **String / Scalar**: requires `byte_span_in(tok, bytes)` for
251    /// byte-accurate ranges; this method returns a *coarse* upper bound by
252    /// peeking at the next token's offset.  Use `byte_span_in` whenever
253    /// you have the source bytes available.
254    pub fn byte_span(&self, tok: TokenId) -> ByteSpan {
255        let s = &self.inner.idx.stage1;
256        let i = tok.0 as usize;
257        let start = s.offset[i];
258
259        let end = match s.kind[i] {
260            Kind::ObjOpen | Kind::ArrOpen => {
261                let close = self.inner.idx.close_of[i];
262                if close >= 0 {
263                    s.offset[close as usize] + 1
264                } else {
265                    start + 1
266                }
267            }
268            Kind::ObjClose | Kind::ArrClose => start + 1,
269            Kind::Quote | Kind::Scalar => {
270                if i + 1 < s.offset.len() {
271                    s.offset[i + 1]
272                } else {
273                    start + 1
274                }
275            }
276            Kind::Colon | Kind::Comma => start + 1,
277        };
278        ByteSpan { start, end }
279    }
280
281    /// Byte-accurate span using source bytes to find the exact end of
282    /// strings (closing `"` honouring escapes) and scalars (next delimiter).
283    pub fn byte_span_in(&self, tok: TokenId, bytes: &[u8]) -> ByteSpan {
284        let s = &self.inner.idx.stage1;
285        let i = tok.0 as usize;
286        let start = s.offset[i];
287        let end = match s.kind[i] {
288            Kind::ObjOpen | Kind::ArrOpen => {
289                let close = self.inner.idx.close_of[i];
290                if close >= 0 {
291                    s.offset[close as usize] + 1
292                } else {
293                    start + 1
294                }
295            }
296            Kind::ObjClose | Kind::ArrClose => start + 1,
297            Kind::Quote => scan_string_end(bytes, start),
298            Kind::Scalar => scan_scalar_end(bytes, start),
299            Kind::Colon | Kind::Comma => start + 1,
300        };
301        ByteSpan { start, end }
302    }
303
304    pub fn parent(&self, tok: TokenId) -> Option<TokenId> {
305        let p = self.inner.idx.parent[tok.0 as usize];
306        if p < 0 {
307            None
308        } else {
309            Some(TokenId(p as u32))
310        }
311    }
312
313    pub fn close_of(&self, container: TokenId) -> Option<TokenId> {
314        let c = self.inner.idx.close_of[container.0 as usize];
315        if c < 0 {
316            None
317        } else {
318            Some(TokenId(c as u32))
319        }
320    }
321
322    pub fn tape_index(&self, tok: TokenId) -> Option<u32> {
323        let t = self.inner.idx.tape_of[tok.0 as usize];
324        if t == u32::MAX {
325            None
326        } else {
327            Some(t)
328        }
329    }
330
331    /// Innermost container token enclosing the given byte position.
332    /// Returns None if `pos` is outside any container.
333    pub fn container_at_byte(&self, pos: u32) -> Option<TokenId> {
334        self.inner.idx.container_at(pos).map(TokenId)
335    }
336
337    /// Walk parents innermost→root.
338    pub fn ancestors(&self, tok: TokenId) -> Ancestors<'_> {
339        Ancestors {
340            parent: &self.inner.idx.parent,
341            cur: self.inner.idx.parent[tok.0 as usize],
342        }
343    }
344
345    pub fn slice<'a>(&self, bytes: &'a [u8], tok: TokenId) -> &'a [u8] {
346        let span = self.byte_span_in(tok, bytes);
347        &bytes[span.start as usize..(span.end as usize).min(bytes.len())]
348    }
349
350    /// Whether the Mison key-bitmap layer was built (controlled by
351    /// `BuildOptions::keys`).
352    pub fn has_keys(&self) -> bool {
353        self.inner.keys.is_some()
354    }
355
356    pub fn keys_named<'a>(&'a self, name: &str, depth: Option<u16>) -> KeyHits<'a> {
357        let kb = match &self.inner.keys {
358            Some(k) => k,
359            None => return KeyHits::empty(),
360        };
361        let id = match kb.by_name.get(name) {
362            Some(&id) => id,
363            None => return KeyHits::empty(),
364        };
365        let bm = &kb.bitmaps[id as usize];
366        let result = match depth.and_then(|d| kb.depth_bitmaps.get(d as usize)) {
367            Some(dm) => bm.and(dm),
368            None => bm.clone(),
369        };
370        KeyHits::from_bitmap(result)
371    }
372
373    pub fn has_key(&self, name: &str) -> bool {
374        self.inner
375            .keys
376            .as_ref()
377            .map(|k| k.by_name.contains_key(name))
378            .unwrap_or(false)
379    }
380
381    pub fn keys_seen(&self) -> impl Iterator<Item = &str> + '_ {
382        self.inner
383            .keys
384            .as_ref()
385            .into_iter()
386            .flat_map(|k| k.dict.iter().map(|s| &**s))
387    }
388
389    pub fn value_for_key(&self, key_tok: TokenId) -> Option<TokenId> {
390        keys_value_for_key(&self.inner.idx, key_tok.0).map(TokenId)
391    }
392
393    /// Subtree-restricted key search: matches only tokens whose token-index
394    /// falls within `[root.raw(), close_of(root)]`.
395    ///
396    /// Iterates the precomputed key bitmap via a borrowed roaring cursor —
397    /// no bitmap allocation, no range AND. The returned `KeyHits` walks the
398    /// underlying bitmap with `reset_at_or_after(lo)` and stops once the
399    /// cursor's value exceeds `close_of(root)`.
400    pub fn keys_named_in<'a>(&'a self, name: &str, root: TokenId) -> KeyHits<'a> {
401        let kb = match &self.inner.keys {
402            Some(k) => k,
403            None => return KeyHits::empty(),
404        };
405        let id = match kb.by_name.get(name) {
406            Some(&id) => id,
407            None => return KeyHits::empty(),
408        };
409        let close = self
410            .close_of(root)
411            .map(|t| t.0)
412            .unwrap_or(self.token_count().saturating_sub(1));
413        let bm = &kb.bitmaps[id as usize];
414        KeyHits::bounded(bm, root.0, close)
415    }
416
417    /// Direct field lookup on a single container token: returns the value
418    /// token of the named key, if present.
419    ///
420    /// Walks the global key bitmap for `name` via a borrowed roaring cursor
421    /// seeked to `parent.0 + 1` and stopping at `close_of(parent)`. Matches
422    /// the first key token whose `parent[]` entry equals `parent`. No bitmap
423    /// allocation, no range AND, no `KeyHits` materialisation — pure cursor
424    /// seek over the precomputed key bitmap.
425    pub fn field_of(&self, parent: TokenId, name: &str) -> Option<TokenId> {
426        let kb = self.inner.keys.as_ref()?;
427        let id = *kb.by_name.get(name)?;
428        let bm = &kb.bitmaps[id as usize];
429        let close = self
430            .close_of(parent)
431            .map(|t| t.0)
432            .unwrap_or_else(|| self.token_count().saturating_sub(1));
433        let lo = parent.0.saturating_add(1);
434        if lo > close {
435            return None;
436        }
437        let mut cur = bm.cursor();
438        cur.reset_at_or_after(lo);
439        while let Some(v) = cur.current() {
440            if v > close {
441                break;
442            }
443            let k_tok = TokenId(v);
444            if self.parent(k_tok) == Some(parent) {
445                return self.value_for_key(k_tok);
446            }
447            cur.move_next();
448        }
449        None
450    }
451
452    /// Subtree range as a bitmap.  Cheap; range AND is a Roaring SIMD
453    /// primitive on the consumer side.
454    pub fn subtree_bitmap(&self, root: TokenId) -> croaring::Bitmap {
455        let close = self
456            .close_of(root)
457            .map(|t| t.0)
458            .unwrap_or(self.token_count().saturating_sub(1));
459        let mut out = croaring::Bitmap::new();
460        out.add_range(root.0..=close);
461        out
462    }
463}
464
465
466pub struct Tokens<'a> {
467    idx: &'a StructuralIndex,
468    cur: u32,
469    end: u32,
470}
471
472impl<'a> Iterator for Tokens<'a> {
473    type Item = TokenId;
474    fn next(&mut self) -> Option<TokenId> {
475        if self.cur >= self.end {
476            return None;
477        }
478        let _ = self.idx;
479        let t = TokenId(self.cur);
480        self.cur += 1;
481        Some(t)
482    }
483}
484
485pub struct Ancestors<'a> {
486    parent: &'a [i32],
487    cur: i32,
488}
489
490impl<'a> Iterator for Ancestors<'a> {
491    type Item = TokenId;
492    fn next(&mut self) -> Option<TokenId> {
493        if self.cur < 0 {
494            return None;
495        }
496        let out = TokenId(self.cur as u32);
497        self.cur = self.parent[self.cur as usize];
498        Some(out)
499    }
500}
501
502/// Lazy iterator over key matches.  Backed by a Roaring bitmap; supports
503/// composition (and/or) without materialising intermediate Vecs.
504///
505/// Iteration uses a materialised `Vec<u32>` (cached on first call) so the
506/// underlying bitmap is preserved for subsequent ops like `count()` /
507/// `first()` / `last()`.
508/// Iterator over key tokens. Has two modes:
509/// - **Owned**: holds a `croaring::Bitmap` (used by `keys_named` and after
510///   `and`/`or` set ops). Iteration materialises a `Vec<u32>` once.
511/// - **Bounded**: borrows the source key bitmap and walks it via a
512///   roaring cursor restricted to `[lo, hi]`. No allocation, no AND.
513///   Used by `keys_named_in` for subtree-restricted scans (the hot path).
514pub struct KeyHits<'a> {
515    state: KeyHitsState<'a>,
516}
517
518enum KeyHitsState<'a> {
519    Empty,
520    Owned {
521        bitmap: croaring::Bitmap,
522        cache: Option<Vec<u32>>,
523        pos: usize,
524    },
525    Bounded {
526        bitmap: &'a croaring::Bitmap,
527        cursor: Option<croaring::bitmap::BitmapCursor<'a>>,
528        lo: u32,
529        hi: u32,
530        started: bool,
531    },
532}
533
534impl<'a> KeyHits<'a> {
535    fn from_bitmap(bm: croaring::Bitmap) -> Self {
536        Self {
537            state: KeyHitsState::Owned {
538                bitmap: bm,
539                cache: None,
540                pos: 0,
541            },
542        }
543    }
544
545    pub(crate) fn bounded(bitmap: &'a croaring::Bitmap, lo: u32, hi: u32) -> Self {
546        if lo > hi {
547            return Self::empty();
548        }
549        Self {
550            state: KeyHitsState::Bounded {
551                bitmap,
552                cursor: None,
553                lo,
554                hi,
555                started: false,
556            },
557        }
558    }
559
560    fn empty() -> Self {
561        Self {
562            state: KeyHitsState::Empty,
563        }
564    }
565
566    /// Convert any state into an owned bitmap (allocates only when bounded);
567    /// used by set-ops (`and`/`or`) which require materialised bitmaps.
568    fn into_owned_bitmap(self) -> Option<croaring::Bitmap> {
569        match self.state {
570            KeyHitsState::Empty => None,
571            KeyHitsState::Owned { bitmap, .. } => Some(bitmap),
572            KeyHitsState::Bounded { bitmap, lo, hi, .. } => {
573                let mut range = croaring::Bitmap::new();
574                range.add_range(lo..=hi);
575                let mut out = bitmap.clone();
576                out.and_inplace(&range);
577                Some(out)
578            }
579        }
580    }
581
582    pub fn at_depth(self, _depth: u16) -> Self {
583        self
584    }
585
586    pub fn and(self, other: KeyHits<'a>) -> Self {
587        match (self.into_owned_bitmap(), other.into_owned_bitmap()) {
588            (Some(mut a), Some(b)) => {
589                a.and_inplace(&b);
590                Self::from_bitmap(a)
591            }
592            _ => Self::empty(),
593        }
594    }
595
596    pub fn or(self, other: KeyHits<'a>) -> Self {
597        match (self.into_owned_bitmap(), other.into_owned_bitmap()) {
598            (Some(mut a), Some(b)) => {
599                a.or_inplace(&b);
600                Self::from_bitmap(a)
601            }
602            (Some(a), None) | (None, Some(a)) => Self::from_bitmap(a),
603            _ => Self::empty(),
604        }
605    }
606
607    pub fn count(self) -> u64 {
608        match self.state {
609            KeyHitsState::Empty => 0,
610            KeyHitsState::Owned { bitmap, .. } => bitmap.cardinality(),
611            KeyHitsState::Bounded { bitmap, lo, hi, .. } => {
612                bitmap.range_cardinality(lo..=hi)
613            }
614        }
615    }
616
617    pub fn first(self) -> Option<TokenId> {
618        match self.state {
619            KeyHitsState::Empty => None,
620            KeyHitsState::Owned { bitmap, .. } => bitmap.minimum().map(TokenId),
621            KeyHitsState::Bounded { bitmap, lo, hi, .. } => {
622                let mut cur = bitmap.cursor();
623                cur.reset_at_or_after(lo);
624                cur.current().filter(|&v| v <= hi).map(TokenId)
625            }
626        }
627    }
628
629    pub fn last(self) -> Option<TokenId> {
630        match self.state {
631            KeyHitsState::Empty => None,
632            KeyHitsState::Owned { bitmap, .. } => bitmap.maximum().map(TokenId),
633            KeyHitsState::Bounded { bitmap, lo, hi, .. } => {
634                // Walk forward from lo, tracking the last value <= hi.
635                // Roaring cursors don't expose a backwards-from variant in this
636                // crate version; the bounded last is rare so the linear scan is
637                // acceptable.
638                let mut cur = bitmap.cursor();
639                cur.reset_at_or_after(lo);
640                let mut last = None;
641                while let Some(v) = cur.current() {
642                    if v > hi {
643                        break;
644                    }
645                    last = Some(TokenId(v));
646                    cur.move_next();
647                }
648                last
649            }
650        }
651    }
652
653    pub fn collect_into(self, buf: &mut Vec<TokenId>) {
654        match self.state {
655            KeyHitsState::Empty => {}
656            KeyHitsState::Owned { bitmap, .. } => {
657                buf.extend(bitmap.to_vec().into_iter().map(TokenId));
658            }
659            KeyHitsState::Bounded { bitmap, lo, hi, .. } => {
660                let mut cur = bitmap.cursor();
661                cur.reset_at_or_after(lo);
662                while let Some(v) = cur.current() {
663                    if v > hi {
664                        break;
665                    }
666                    buf.push(TokenId(v));
667                    cur.move_next();
668                }
669            }
670        }
671    }
672}
673
674impl<'a> Iterator for KeyHits<'a> {
675    type Item = TokenId;
676    fn next(&mut self) -> Option<TokenId> {
677        match &mut self.state {
678            KeyHitsState::Empty => None,
679            KeyHitsState::Owned { bitmap, cache, pos } => {
680                if cache.is_none() {
681                    *cache = Some(bitmap.to_vec());
682                }
683                let v = cache.as_ref()?.get(*pos).copied()?;
684                *pos += 1;
685                Some(TokenId(v))
686            }
687            KeyHitsState::Bounded {
688                bitmap,
689                cursor,
690                lo,
691                hi,
692                started,
693            } => {
694                if !*started {
695                    let mut c = bitmap.cursor();
696                    c.reset_at_or_after(*lo);
697                    *cursor = Some(c);
698                    *started = true;
699                } else if let Some(c) = cursor.as_mut() {
700                    c.move_next();
701                }
702                let c = cursor.as_ref()?;
703                let v = c.current()?;
704                if v > *hi {
705                    return None;
706                }
707                Some(TokenId(v))
708            }
709        }
710    }
711}
712
713
714pub fn from_bytes(bytes: &[u8]) -> Result<StructuralIndex, Error> {
715    from_bytes_with(bytes, BuildOptions::default())
716}
717
718pub fn from_bytes_with(bytes: &[u8], opts: BuildOptions) -> Result<StructuralIndex, Error> {
719    let idx = StructIndex::build(bytes).map_err(|s| Error::Parse(s.to_string()))?;
720    let keys = if opts.keys {
721        Some(KeyBitmaps::build(&idx, bytes))
722    } else {
723        None
724    };
725    Ok(StructuralIndex {
726        inner: Arc::new(Inner { idx, keys }),
727    })
728}
729
730/// `$..find(key == literal)` — emit enclosing-Object token ids.
731///
732/// `literal` is the **plain** value bytes to match (e.g. `b"PushEvent"`).
733/// String values are unquoted before comparison.
734pub fn find_eq<'a>(
735    idx: &'a StructuralIndex,
736    bytes: &'a [u8],
737    key: &str,
738    literal: &[u8],
739) -> impl Iterator<Item = TokenId> + 'a {
740    let key_hits = idx.keys_named(key, None);
741    let bytes_ref = bytes;
742    let literal_ref = literal.to_vec();
743    let idx_ref = idx;
744    key_hits
745        .filter_map(move |k_tok| {
746            let v_tok = idx_ref.value_for_key(k_tok)?;
747            let span = idx_ref.byte_span_in(v_tok, bytes_ref);
748            let v_bytes = &bytes_ref[span.start as usize..span.end as usize];
749            if value_matches(v_bytes, &literal_ref) {
750                idx_ref.parent(k_tok)
751            } else {
752                None
753            }
754        })
755}
756
757/// O(1) — popcount of the key bitmap.
758pub fn count_key(idx: &StructuralIndex, key: &str) -> u64 {
759    idx.keys_named(key, None).count()
760}
761
762/// Compound `$..find(k1 == l1 AND k2 == l2 ...)`.  Conds applied within the
763/// SAME object (parent-token equality).
764pub fn find_eq_compound<'a>(
765    idx: &'a StructuralIndex,
766    bytes: &'a [u8],
767    conds: &'a [(&str, &[u8])],
768) -> impl Iterator<Item = TokenId> + 'a {
769    // Strategy: bitmap-AND key sets across conds, but key positions don't
770    // share parents directly — must verify per candidate that all conds match
771    // within the same enclosing object.
772    let mut cands: Vec<TokenId> = Vec::new();
773    if let Some((first_key, _)) = conds.first() {
774        idx.keys_named(first_key, None).collect_into(&mut cands);
775    }
776    cands.into_iter().filter_map(move |k_tok| {
777        let parent_obj = idx.parent(k_tok)?;
778        // Verify every cond resolves to the same parent obj.
779        for (key, lit) in conds.iter() {
780            let mut matched = false;
781            for k in idx.keys_named(key, None) {
782                if idx.parent(k) == Some(parent_obj) {
783                    if let Some(v) = idx.value_for_key(k) {
784                        let span = idx.byte_span_in(v, bytes);
785                        let v_bytes = &bytes[span.start as usize..span.end as usize];
786                        if value_matches(v_bytes, lit) {
787                            matched = true;
788                            break;
789                        }
790                    }
791                }
792            }
793            if !matched {
794                return None;
795            }
796        }
797        Some(parent_obj)
798    })
799}
800
801
802/// Compare a JSON-encoded value byte-slice against a plain literal.
803/// - If `value` is a JSON string (`"…"`), compares the unquoted body.
804/// - Otherwise compares the raw bytes (number / bool / null).
805///
806/// Fast path uses `memchr::memchr` (SIMD-accelerated) to detect escape
807/// sequences and short-circuit the comparison.  When the body contains
808/// no `\` byte, we fall through to a direct slice equality which LLVM
809/// auto-vectorises to AVX2/SSE on x86_64.
810pub fn json_string_eq(value: &[u8], literal: &[u8]) -> bool {
811    value_matches(value, literal)
812}
813
814fn value_matches(value: &[u8], literal: &[u8]) -> bool {
815    if value.len() >= 2 && value[0] == b'"' && value[value.len() - 1] == b'"' {
816        let body = &value[1..value.len() - 1];
817        // SIMD escape probe: memchr is AVX2/SSE2 on x86_64, NEON on aarch64.
818        // Common case (no escapes) takes one scan + one slice compare.
819        if memchr::memchr(b'\\', body).is_some() {
820            // Slow path: needs unescape decode.  Conservative for now;
821            // upgrade to SIMD JSON-unescape when needed.
822            return slow_decode_eq(body, literal);
823        }
824        body == literal
825    } else {
826        value == literal
827    }
828}
829
830/// Fallback for escaped strings: decode standard JSON escapes byte-by-byte
831/// and compare to the literal.  Handles `\\ \" \/ \n \t \r \b \f`.  Does
832/// NOT handle `\uXXXX` (caller takes responsibility — returns false).
833fn slow_decode_eq(body: &[u8], literal: &[u8]) -> bool {
834    let mut i = 0;
835    let mut j = 0;
836    while i < body.len() && j < literal.len() {
837        let (decoded, consumed) = match body[i] {
838            b'\\' if i + 1 < body.len() => match body[i + 1] {
839                b'"' => (b'"', 2),
840                b'\\' => (b'\\', 2),
841                b'/' => (b'/', 2),
842                b'n' => (b'\n', 2),
843                b't' => (b'\t', 2),
844                b'r' => (b'\r', 2),
845                b'b' => (b'\x08', 2),
846                b'f' => (b'\x0c', 2),
847                _ => return false, // \uXXXX or unknown escape — punt
848            },
849            c => (c, 1),
850        };
851        if decoded != literal[j] {
852            return false;
853        }
854        i += consumed;
855        j += 1;
856    }
857    i == body.len() && j == literal.len()
858}
859
860/// Scan from `start` (the opening `"`) to the matching closing `"`,
861/// honouring `\\` escapes.  Returns one past the closing quote.
862fn scan_string_end(bytes: &[u8], start: u32) -> u32 {
863    let mut i = (start + 1) as usize;
864    while i < bytes.len() {
865        match bytes[i] {
866            b'\\' => i = i.saturating_add(2),
867            b'"' => return (i + 1) as u32,
868            _ => i += 1,
869        }
870    }
871    bytes.len() as u32
872}
873
874/// Scan from `start` until the next JSON delimiter byte.
875fn scan_scalar_end(bytes: &[u8], start: u32) -> u32 {
876    let mut i = start as usize;
877    while i < bytes.len() {
878        match bytes[i] {
879            b',' | b'}' | b']' | b' ' | b'\t' | b'\n' | b'\r' => return i as u32,
880            _ => i += 1,
881        }
882    }
883    bytes.len() as u32
884}
885
886
887/// Parse a JSON number byte-slice as `f64`.  Uses `fast-float` (SSE/AVX
888/// SIMD) when the feature is enabled, falls back to `str::parse` otherwise.
889///
890/// Returns None on malformed input or non-UTF-8 bytes.
891pub fn parse_f64(bytes: &[u8]) -> Option<f64> {
892    #[cfg(feature = "fast-numbers")]
893    {
894        fast_float::parse(bytes).ok()
895    }
896    #[cfg(not(feature = "fast-numbers"))]
897    {
898        std::str::from_utf8(bytes).ok()?.parse::<f64>().ok()
899    }
900}
901
902/// Parse a JSON number byte-slice as `i64`.  Tries integer parse first,
903/// then falls back to `parse_f64` (truncating fractional component).
904pub fn parse_i64(bytes: &[u8]) -> Option<i64> {
905    if let Ok(s) = std::str::from_utf8(bytes) {
906        if let Ok(n) = s.parse::<i64>() {
907            return Some(n);
908        }
909    }
910    parse_f64(bytes).map(|f| f as i64)
911}
912
913/// Determine whether a value byte-slice is a JSON number that compares
914/// equal to the literal interpreted as a number.  Used by `find_eq` when
915/// the caller wants numeric semantics rather than byte-for-byte match.
916pub fn json_number_eq(value: &[u8], literal: &[u8]) -> bool {
917    match (parse_f64(value), parse_f64(literal)) {
918        (Some(a), Some(b)) => a == b,
919        _ => false,
920    }
921}
922
923
924/// Build an Aho-Corasick + Teddy automaton over multiple JSON-key patterns
925/// for raw byte-scan over NDJSON / streaming input.  Each pattern is the
926/// full `"keyname":` byte sequence — caller wraps `keyname` in quotes +
927/// trailing colon to avoid sub-string false positives.
928///
929/// Useful when the index has not yet been built — e.g. per-line scanning
930/// of a JSONL stream.  After building the index via `from_bytes` /
931/// `from_simdjson`, prefer `keys_named` (Roaring AND).
932#[cfg(feature = "multi-key")]
933pub fn multi_key_finder(keys: &[&str]) -> aho_corasick::AhoCorasick {
934    use aho_corasick::AhoCorasickBuilder;
935    let patterns: Vec<String> = keys.iter().map(|k| format!("\"{}\":", k)).collect();
936    AhoCorasickBuilder::new()
937        .ascii_case_insensitive(false)
938        .build(&patterns)
939        .expect("aho-corasick build")
940}
941
942/// Yield (pattern_index, byte_offset) for every match of any pattern in
943/// `bytes`.  pattern_index is the index into the `keys` slice originally
944/// passed to `multi_key_finder`.
945#[cfg(feature = "multi-key")]
946pub fn multi_key_scan<'a>(
947    finder: &'a aho_corasick::AhoCorasick,
948    bytes: &'a [u8],
949) -> impl Iterator<Item = (usize, usize)> + 'a {
950    finder.find_iter(bytes).map(|m| (m.pattern().as_usize(), m.start()))
951}
952
953
954/// SIMD-accelerated UTF-8 validation via `simdutf8` (5-10 GB/s).  Returns
955/// `Ok(())` if the input is valid UTF-8.
956#[cfg(feature = "validate-utf8")]
957pub fn validate_utf8(bytes: &[u8]) -> Result<(), Error> {
958    simdutf8::basic::from_utf8(bytes)
959        .map(|_| ())
960        .map_err(|_| Error::InvalidUtf8)
961}
962
963
964#[cfg(test)]
965mod tests {
966    use super::*;
967
968    #[test]
969    fn from_bytes_basic_roundtrip() {
970        let buf = br#"{"a":1,"b":"hi","c":[1,2,3]}"#;
971        let idx = from_bytes(buf).unwrap();
972        assert!(idx.token_count() > 0);
973        assert!(idx.has_keys());
974
975        let mut keys: Vec<&str> = idx.keys_seen().collect();
976        keys.sort();
977        assert_eq!(keys, vec!["a", "b", "c"]);
978    }
979
980    #[test]
981    fn keys_named_returns_token_ids() {
982        let buf = br#"{"x":1,"y":2,"x":3}"#;
983        let idx = from_bytes(buf).unwrap();
984        let xs: Vec<TokenId> = idx.keys_named("x", None).collect();
985        assert!(!xs.is_empty(), "expected at least one 'x' match");
986        for t in &xs {
987            assert_eq!(idx.kind(*t), TokenKind::Key);
988        }
989    }
990
991    #[test]
992    fn count_key_uses_popcount() {
993        let buf = br#"{"a":{"x":1},"b":{"x":2},"c":{"x":3,"y":4}}"#;
994        let idx = from_bytes(buf).unwrap();
995        let c = count_key(&idx, "x");
996        assert_eq!(c, 3);
997        let cy = count_key(&idx, "y");
998        assert_eq!(cy, 1);
999    }
1000
1001    #[test]
1002    fn key_hits_first_short_circuits() {
1003        let buf = br#"{"a":1,"b":2,"c":3,"d":4}"#;
1004        let idx = from_bytes(buf).unwrap();
1005        let first = idx.keys_named("c", None).first();
1006        assert!(first.is_some());
1007        assert_eq!(idx.kind(first.unwrap()), TokenKind::Key);
1008    }
1009
1010    #[test]
1011    fn find_eq_returns_enclosing_objects() {
1012        let buf = br#"{"a":{"x":"test"},"b":{"x":"nope"},"c":{"x":"test"}}"#;
1013        let idx = from_bytes(buf).unwrap();
1014        let hits: Vec<TokenId> = find_eq(&idx, buf, "x", b"test").collect();
1015        assert_eq!(hits.len(), 2);
1016        for t in &hits {
1017            assert_eq!(idx.kind(*t), TokenKind::Object);
1018        }
1019    }
1020
1021    #[test]
1022    fn container_at_byte_works() {
1023        let buf = br#"{"a":{"x":1},"b":2}"#;
1024        let idx = from_bytes(buf).unwrap();
1025        // byte 9 lands inside "x" inside the inner object.
1026        let c = idx.container_at_byte(9).unwrap();
1027        // Should resolve to the inner object (deeper than root).
1028        assert_eq!(idx.kind(c), TokenKind::Object);
1029        assert!(idx.depth(c) >= 1);
1030    }
1031
1032    #[test]
1033    fn build_options_minimal_skips_keys() {
1034        let buf = br#"{"a":1}"#;
1035        let idx = from_bytes_with(buf, BuildOptions::minimal()).unwrap();
1036        assert!(!idx.has_keys());
1037        assert_eq!(idx.keys_named("a", None).count(), 0);
1038    }
1039
1040    #[test]
1041    fn ancestors_walks_to_root() {
1042        let buf = br#"{"a":{"b":{"c":42}}}"#;
1043        let idx = from_bytes(buf).unwrap();
1044        // Find the deepest scalar
1045        let scalar_tok = idx
1046            .tokens()
1047            .find(|t| idx.kind(*t) == TokenKind::Scalar)
1048            .unwrap();
1049        let chain: Vec<TokenId> = idx.ancestors(scalar_tok).collect();
1050        assert!(!chain.is_empty());
1051        // Last ancestor's parent must be root (None)
1052        assert!(idx.parent(*chain.last().unwrap()).is_none());
1053    }
1054
1055    #[test]
1056    fn json_string_eq_handles_quotes() {
1057        assert!(json_string_eq(b"\"hello\"", b"hello"));
1058        assert!(!json_string_eq(b"\"hello\"", b"world"));
1059        // Escape decode now handled — `\n` decodes to newline.
1060        assert!(json_string_eq(b"\"he\\nllo\"", b"he\nllo"));
1061        // Non-string raw match.
1062        assert!(json_string_eq(b"42", b"42"));
1063        assert!(json_string_eq(b"true", b"true"));
1064    }
1065
1066    #[test]
1067    fn find_eq_compound_intersect() {
1068        let buf = br#"[{"k1":"a","k2":"b"},{"k1":"a","k2":"c"},{"k1":"x","k2":"b"}]"#;
1069        let idx = from_bytes(buf).unwrap();
1070        let conds: &[(&str, &[u8])] = &[("k1", b"a"), ("k2", b"b")];
1071        let hits: Vec<TokenId> = find_eq_compound(&idx, buf, conds).collect();
1072        assert_eq!(hits.len(), 1, "exactly one obj should match both conds");
1073    }
1074
1075    #[test]
1076    fn json_string_eq_handles_simple_escapes() {
1077        assert!(json_string_eq(b"\"a\\nb\"", b"a\nb"));
1078        assert!(json_string_eq(b"\"a\\\\b\"", b"a\\b"));
1079        assert!(json_string_eq(b"\"a\\\"b\"", b"a\"b"));
1080        assert!(!json_string_eq(b"\"a\\nb\"", b"axb"));
1081        // \uXXXX still punted (returns false).
1082        assert!(!json_string_eq(b"\"\\u0041\"", b"A"));
1083    }
1084
1085    #[test]
1086    fn parse_f64_works_with_or_without_fast_numbers() {
1087        assert_eq!(parse_f64(b"3.14"), Some(3.14));
1088        assert_eq!(parse_f64(b"-1e10"), Some(-1e10));
1089        assert_eq!(parse_f64(b"42"), Some(42.0));
1090        assert_eq!(parse_f64(b"not a number"), None);
1091    }
1092
1093    #[test]
1094    fn parse_i64_falls_back_to_f64_truncate() {
1095        assert_eq!(parse_i64(b"42"), Some(42));
1096        assert_eq!(parse_i64(b"-7"), Some(-7));
1097        assert_eq!(parse_i64(b"3.9"), Some(3)); // truncates
1098    }
1099
1100    #[test]
1101    fn json_number_eq_compares_numerically() {
1102        assert!(json_number_eq(b"42", b"42"));
1103        assert!(json_number_eq(b"42.0", b"42")); // numeric equality, not byte
1104        assert!(!json_number_eq(b"42", b"43"));
1105        assert!(!json_number_eq(b"abc", b"42"));
1106    }
1107
1108    #[cfg(feature = "multi-key")]
1109    #[test]
1110    fn multi_key_finder_matches_top_level_keys() {
1111        let keys = ["type", "actor", "repo"];
1112        let finder = multi_key_finder(&keys);
1113        let bytes = br#"{"type":"PushEvent","actor":{"login":"x"},"repo":{"name":"y"}}"#;
1114        let hits: Vec<(usize, usize)> = multi_key_scan(&finder, bytes).collect();
1115        assert_eq!(hits.len(), 3);
1116        // Patterns matched in input order
1117        let pattern_indices: Vec<usize> = hits.iter().map(|(p, _)| *p).collect();
1118        assert_eq!(pattern_indices, vec![0, 1, 2]);
1119    }
1120
1121    #[cfg(feature = "validate-utf8")]
1122    #[test]
1123    fn validate_utf8_accepts_valid_input() {
1124        assert!(validate_utf8(b"hello").is_ok());
1125        assert!(validate_utf8(b"\xff\xfe").is_err()); // invalid UTF-8
1126    }
1127
1128}