Skip to main content

jetro_core/
lib.rs

1//! Jetro core — parser, compiler, and VM for the Jetro JSON query language.
2//!
3//! # Execution path
4//!
5//! ```text
6//! source text
7//!   │  parse::parser::parse() → Expr AST
8//!   │  plan::physical::plan_query() → QueryPlan (physical IR)
9//!   │  exec::router::collect_*() → dispatches to:
10//!   │    StructuralIndex backend  (jetro-experimental bitmap)
11//!   │    ViewPipeline backend     (borrowed tape/Val navigation)
12//!   │    Pipeline backend         (pull-based composed stages)
13//!   └─  VM fallback               (bytecode stack machine)
14//! ```
15//!
16//! # Quick start
17//!
18//! ```rust
19//! use jetro_core::Jetro;
20//! let j = Jetro::from_bytes(br#"{"books":[{"price":12}]}"#.to_vec()).unwrap();
21//! assert_eq!(j.collect("$.books.len()").unwrap(), serde_json::json!(1));
22//! ```
23
24pub(crate) mod builtins;
25pub(crate) mod compile;
26pub(crate) mod data;
27pub(crate) mod exec;
28pub(crate) mod ir;
29pub(crate) mod parse;
30pub(crate) mod plan;
31pub(crate) mod util;
32pub(crate) mod vm;
33
34#[cfg(test)]
35mod tests;
36
37use serde_json::Value;
38use std::cell::{OnceCell, RefCell};
39use std::collections::HashMap;
40use std::sync::Arc;
41use std::sync::Mutex;
42use data::value::Val;
43
44pub use data::context::EvalError;
45#[cfg(test)]
46use parse::parser::ParseError;
47use vm::VM;
48
/// Fuzzing-only re-exports of the parser and planner entry points.
///
/// Compiled only when the `fuzz_internal` feature is enabled, so that the
/// `cargo-fuzz` harness can call the PEG parser directly instead of going
/// through `Jetro::collect`. This is NOT a stable public API.
#[cfg(feature = "fuzz_internal")]
pub mod __fuzz_internal {
    pub use crate::{
        parse::parser::{parse, ParseError},
        plan::physical::plan_query,
    };
}
57
58
/// Test-only umbrella error that can carry either a parser error or an
/// evaluation error.
#[cfg(test)]
#[derive(Debug)]
pub(crate) enum Error {
    /// Wraps a [`ParseError`] from the parser module.
    Parse(ParseError),
    /// Wraps an [`EvalError`].
    Eval(EvalError),
}

/// Prints the wrapped error's own message, with no extra prefix.
#[cfg(test)]
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let inner: &dyn std::fmt::Display = match self {
            Self::Parse(cause) => cause,
            Self::Eval(cause) => cause,
        };
        write!(f, "{}", inner)
    }
}

/// Only the `Parse` variant exposes its payload as an error source.
#[cfg(test)]
impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Parse(cause) => Some(cause),
            Self::Eval(_) => None,
        }
    }
}

/// Lets `?` lift a [`ParseError`] into [`Error`].
#[cfg(test)]
impl From<ParseError> for Error {
    fn from(cause: ParseError) -> Self {
        Self::Parse(cause)
    }
}

/// Lets `?` lift an [`EvalError`] into [`Error`].
#[cfg(test)]
impl From<EvalError> for Error {
    fn from(cause: EvalError) -> Self {
        Self::Eval(cause)
    }
}
97
98/// Primary entry point. Holds a JSON document and evaluates expressions against
99/// it. Lazy fields (`root_val`, `tape`, `structural_index`, `objvec_cache`)
100/// are populated on first use so callers only pay for the representations a
101/// particular query actually needs.
102pub struct Jetro {
103    /// The `serde_json::Value` root document; unused when `simd-json` is enabled
104    /// (the tape is the authoritative source in that case).
105    document: Value,
106    /// Cached `Val` tree — built once and reused across `collect()` calls.
107    root_val: OnceCell<Val>,
108    /// Retained raw bytes for lazy tape and structural-index materialisation.
109    raw_bytes: Option<Arc<[u8]>>,
110
111    /// Lazily parsed simd-json tape; `Err` is cached to avoid re-parsing after failure.
112    #[cfg(feature = "simd-json")]
113    tape: OnceCell<std::result::Result<Arc<crate::data::tape::TapeData>, String>>,
114    /// Unused placeholder so the field name is consistent regardless of features.
115    #[cfg(not(feature = "simd-json"))]
116    #[allow(dead_code)]
117    tape: OnceCell<()>,
118
119    /// Lazily built bitmap structural index for accelerated key-presence queries.
120    structural_index:
121        OnceCell<std::result::Result<Arc<jetro_experimental::StructuralIndex>, String>>,
122
123    /// Per-document cache from `Arc<Vec<Val>>` pointer addresses to promoted
124    /// `ObjVecData` columnar representations; keyed by pointer to avoid re-promotion.
125    pub(crate) objvec_cache:
126        std::sync::Mutex<std::collections::HashMap<usize, Arc<crate::data::value::ObjVecData>>>,
127
128    /// Per-document VM cache used by `Jetro::collect`; not shared across document handles.
129    vm: RefCell<VM>,
130}
131
132
133/// Long-lived multi-document query engine with an explicit plan cache.
134/// Use when the same process evaluates many expressions over many documents —
135/// parse/lower/compile work is amortised by this object, not hidden in
136/// thread-local state.
137pub struct JetroEngine {
138    /// Maps `"<context_key>\0<expr>"` to compiled `QueryPlan`; evicted wholesale when full.
139    plan_cache: Mutex<HashMap<String, ir::physical::QueryPlan>>,
140    /// Maximum number of entries before the cache is cleared; 0 disables caching.
141    plan_cache_limit: usize,
142    /// The shared `VM` used by all `collect*` calls on this engine instance.
143    vm: Mutex<VM>,
144    /// Engine-owned JSON object-key intern cache. Used by [`JetroEngine::parse_value`]
145    /// and [`JetroEngine::parse_bytes`] (and the `collect_*` shortcuts that go through
146    /// them) so each engine instance has an isolated key cache. Documents built via
147    /// the standalone `Jetro::from_bytes`/`From<serde_json::Value>` paths use the
148    /// process-wide [`crate::data::intern::default_cache`] instead.
149    keys: Arc<crate::data::intern::KeyCache>,
150}
151
152/// Error returned by `JetroEngine::collect_bytes` and similar methods that
153/// may fail during JSON parsing or during expression evaluation.
154#[derive(Debug)]
155pub enum JetroEngineError {
156    /// JSON parsing failed before evaluation could begin.
157    Json(serde_json::Error),
158    /// Expression evaluation failed (the JSON was valid but the query errored).
159    Eval(EvalError),
160}
161
162impl std::fmt::Display for JetroEngineError {
163    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
164        match self {
165            Self::Json(err) => write!(f, "{}", err),
166            Self::Eval(err) => write!(f, "{}", err),
167        }
168    }
169}
170
171impl std::error::Error for JetroEngineError {
172    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
173        match self {
174            Self::Json(err) => Some(err),
175            Self::Eval(_) => None,
176        }
177    }
178}
179
180impl From<serde_json::Error> for JetroEngineError {
181    fn from(err: serde_json::Error) -> Self {
182        Self::Json(err)
183    }
184}
185
186impl From<EvalError> for JetroEngineError {
187    fn from(err: EvalError) -> Self {
188        Self::Eval(err)
189    }
190}
191
192impl Default for JetroEngine {
193    fn default() -> Self {
194        Self::new()
195    }
196}
197
198impl JetroEngine {
199    /// Default maximum plan-cache size; the cache is cleared wholesale when reached.
200    const DEFAULT_PLAN_CACHE_LIMIT: usize = 256;
201
202    /// Create a `JetroEngine` with the default plan-cache limit of 256 entries.
203    pub fn new() -> Self {
204        Self::with_plan_cache_limit(Self::DEFAULT_PLAN_CACHE_LIMIT)
205    }
206
207    /// Create a `JetroEngine` with an explicit plan-cache capacity.
208    /// Set `plan_cache_limit` to 0 to disable caching entirely.
209    pub fn with_plan_cache_limit(plan_cache_limit: usize) -> Self {
210        Self {
211            plan_cache: Mutex::new(HashMap::new()),
212            plan_cache_limit,
213            vm: Mutex::new(VM::new()),
214            keys: crate::data::intern::KeyCache::new(),
215        }
216    }
217
218    /// Borrow this engine's JSON key-intern cache.
219    pub fn keys(&self) -> &Arc<crate::data::intern::KeyCache> {
220        &self.keys
221    }
222
223    /// Discard all cached query plans and the engine's key-intern cache,
224    /// forcing re-compilation and re-interning on the next call.
225    pub fn clear_cache(&self) {
226        self.plan_cache.lock().expect("plan cache poisoned").clear();
227        self.keys.clear();
228    }
229
230    /// Build a `Jetro` document from a `serde_json::Value` with object keys
231    /// interned into this engine's key cache. Use this in place of
232    /// `Jetro::from(...)` / the `From<serde_json::Value>` impl when
233    /// per-engine key isolation is required.
234    pub fn parse_value(&self, document: Value) -> Jetro {
235        let root = Val::from_value_with(&self.keys, &document);
236        Jetro::from_val_and_value(root, document)
237    }
238
239    /// Parse raw JSON bytes into a `Jetro` document with object keys
240    /// interned into this engine's key cache. With `simd-json`, the tape
241    /// is materialised eagerly so interning happens once at parse time
242    /// (subsequent `collect` calls reuse the cached `Val` tree).
243    pub fn parse_bytes(
244        &self,
245        bytes: Vec<u8>,
246    ) -> std::result::Result<Jetro, JetroEngineError> {
247        let document = Jetro::from_bytes(bytes)?;
248        // Force materialisation so keys are interned through this
249        // engine's cache rather than the default thread-local one when
250        // `collect` later asks for `root_val`.
251        let _ = document.root_val_with(&self.keys)?;
252        Ok(document)
253    }
254
255    /// Evaluate a Jetro expression against an already-constructed `Jetro` document,
256    /// using the engine's shared plan cache and `VM`.
257    pub fn collect<S: AsRef<str>>(
258        &self,
259        document: &Jetro,
260        expr: S,
261    ) -> std::result::Result<Value, EvalError> {
262        let plan = self.cached_plan(expr.as_ref(), exec::router::planning_context(document));
263        let mut vm = self.vm.lock().expect("vm cache poisoned");
264        exec::router::collect_plan_json_with_vm(document, &plan, &mut vm)
265    }
266
267    /// Convenience wrapper: wrap a `serde_json::Value` in a `Jetro` and evaluate `expr`.
268    /// Routes through [`JetroEngine::parse_value`] so the document's object keys are
269    /// interned into this engine's key cache.
270    pub fn collect_value<S: AsRef<str>>(
271        &self,
272        document: Value,
273        expr: S,
274    ) -> std::result::Result<Value, EvalError> {
275        let document = self.parse_value(document);
276        self.collect(&document, expr)
277    }
278
279    /// Parse raw JSON bytes into a `Jetro` document and evaluate `expr`,
280    /// returning a `JetroEngineError` on either parse or evaluation failure.
281    /// Routes through [`JetroEngine::parse_bytes`] so the document's object keys
282    /// are interned into this engine's key cache.
283    pub fn collect_bytes<S: AsRef<str>>(
284        &self,
285        bytes: Vec<u8>,
286        expr: S,
287    ) -> std::result::Result<Value, JetroEngineError> {
288        let document = self.parse_bytes(bytes)?;
289        Ok(self.collect(&document, expr)?)
290    }
291
292    /// Look up a compiled `QueryPlan` by expression string and planning context,
293    /// compiling and inserting it if not already cached; evicts the whole cache if full.
294    fn cached_plan(&self, expr: &str, context: plan::physical::PlanningContext) -> ir::physical::QueryPlan {
295        let mut cache = self.plan_cache.lock().expect("plan cache poisoned");
296        let cache_key = format!("{}\0{}", context.cache_key(), expr);
297        if let Some(plan) = cache.get(&cache_key) {
298            return plan.clone();
299        }
300
301        let plan = plan::physical::plan_query_with_context(expr, context);
302        if self.plan_cache_limit > 0 {
303            if cache.len() >= self.plan_cache_limit {
304                cache.clear();
305            }
306            cache.insert(cache_key, plan.clone());
307        }
308        plan
309    }
310}
311
312impl exec::pipeline::PipelineData for Jetro {
313    fn promote_objvec(&self, arr: &Arc<Vec<Val>>) -> Option<Arc<crate::data::value::ObjVecData>> {
314        self.get_or_promote_objvec(arr)
315    }
316}
317
318impl Jetro {
319    /// Return a reference to the lazily parsed simd-json `TapeData`, parsing raw bytes
320    /// on first access. Returns `Ok(None)` when no raw bytes are stored.
321    #[cfg(feature = "simd-json")]
322    pub(crate) fn lazy_tape(
323        &self,
324    ) -> std::result::Result<Option<&Arc<crate::data::tape::TapeData>>, EvalError> {
325        if let Some(result) = self.tape.get() {
326            return result
327                .as_ref()
328                .map(Some)
329                .map_err(|err| EvalError(format!("Invalid JSON: {err}")));
330        }
331        let Some(raw) = self.raw_bytes.as_ref() else {
332            return Ok(None);
333        };
334        let bytes: Vec<u8> = (**raw).to_vec();
335        let parsed = crate::data::tape::TapeData::parse(bytes).map_err(|err| err.to_string());
336        let _ = self.tape.set(parsed);
337        self.tape
338            .get()
339            .expect("tape cache initialized")
340            .as_ref()
341            .map(Some)
342            .map_err(|err| EvalError(format!("Invalid JSON: {err}")))
343    }
344
345    /// Look up or build an `ObjVecData` columnar representation for the given
346    /// `Arc<Vec<Val>>` array, caching the result by pointer address.
347    pub(crate) fn get_or_promote_objvec(
348        &self,
349        arr: &Arc<Vec<Val>>,
350    ) -> Option<Arc<crate::data::value::ObjVecData>> {
351        let key = Arc::as_ptr(arr) as usize;
352        if let Ok(cache) = self.objvec_cache.lock() {
353            if let Some(d) = cache.get(&key) {
354                return Some(Arc::clone(d));
355            }
356        }
357        let promoted = exec::pipeline::Pipeline::try_promote_objvec_arr(arr)?;
358        if let Ok(mut cache) = self.objvec_cache.lock() {
359            cache.entry(key).or_insert_with(|| Arc::clone(&promoted));
360        }
361        Some(promoted)
362    }
363
364    /// Internal constructor that wraps a `serde_json::Value` without raw bytes.
365    pub(crate) fn new(document: Value) -> Self {
366        Self {
367            document,
368            root_val: OnceCell::new(),
369            objvec_cache: Default::default(),
370            raw_bytes: None,
371            tape: OnceCell::new(),
372            structural_index: OnceCell::new(),
373            vm: RefCell::new(VM::new()),
374        }
375    }
376
377    /// Build a `Jetro` whose `root_val` is pre-cached with `root` (constructed by the
378    /// caller, typically via [`Val::from_value_with`] using an engine-owned key cache).
379    /// `document` is retained for back-compat with non-`simd-json` callers and tests
380    /// that read the original `serde_json::Value`.
381    pub(crate) fn from_val_and_value(root: Val, document: Value) -> Self {
382        let root_val = OnceCell::new();
383        let _ = root_val.set(root);
384        Self {
385            document,
386            root_val,
387            objvec_cache: Default::default(),
388            raw_bytes: None,
389            tape: OnceCell::new(),
390            structural_index: OnceCell::new(),
391            vm: RefCell::new(VM::new()),
392        }
393    }
394
395    /// Like [`Jetro::root_val`] but interns object keys through `keys` instead of the
396    /// process-wide default. Used by [`JetroEngine::parse_bytes`] to materialise the
397    /// `Val` tree once at parse time so subsequent `collect` calls find a populated
398    /// `root_val` cache and skip re-interning.
399    pub(crate) fn root_val_with(
400        &self,
401        keys: &crate::data::intern::KeyCache,
402    ) -> std::result::Result<Val, EvalError> {
403        if let Some(root) = self.root_val.get() {
404            return Ok(root.clone());
405        }
406        let root = {
407            #[cfg(feature = "simd-json")]
408            {
409                if let Some(tape) = self.lazy_tape()? {
410                    Val::from_tape_data_with(keys, tape)
411                } else {
412                    Val::from_value_with(keys, &self.document)
413                }
414            }
415            #[cfg(not(feature = "simd-json"))]
416            {
417                Val::from_value_with(keys, &self.document)
418            }
419        };
420        let _ = self.root_val.set(root);
421        Ok(self.root_val.get().expect("root val initialized").clone())
422    }
423
424    /// Parse raw JSON bytes and build a `Jetro` query handle.
425    /// When the `simd-json` feature is enabled the bytes are not parsed eagerly;
426    /// the tape is built lazily on the first query that needs it.
427    pub fn from_bytes(bytes: Vec<u8>) -> std::result::Result<Self, serde_json::Error> {
428        
429        
430        #[cfg(feature = "simd-json")]
431        {
432            return Ok(Self {
433                document: Value::Null,
434                root_val: OnceCell::new(),
435                objvec_cache: Default::default(),
436                raw_bytes: Some(Arc::from(bytes.into_boxed_slice())),
437                tape: OnceCell::new(),
438                structural_index: OnceCell::new(),
439                vm: RefCell::new(VM::new()),
440            });
441        }
442        #[allow(unreachable_code)]
443        {
444            let document: Value = serde_json::from_slice(&bytes)?;
445            Ok(Self {
446                document,
447                root_val: OnceCell::new(),
448                objvec_cache: Default::default(),
449                raw_bytes: Some(Arc::from(bytes.into_boxed_slice())),
450                tape: OnceCell::new(),
451                structural_index: OnceCell::new(),
452                vm: RefCell::new(VM::new()),
453            })
454        }
455    }
456
457    /// Borrow this document's VM cache, falling back to a temporary VM on re-entrant use.
458    pub(crate) fn with_vm<F, R>(&self, f: F) -> R
459    where
460        F: FnOnce(&mut VM) -> R,
461    {
462        match self.vm.try_borrow_mut() {
463            Ok(mut vm) => f(&mut vm),
464            Err(_) => {
465                let mut vm = VM::new();
466                f(&mut vm)
467            }
468        }
469    }
470
471    /// Return the raw JSON byte slice if this handle was constructed from bytes,
472    /// or `None` if it was constructed from a `serde_json::Value`.
473    pub(crate) fn raw_bytes(&self) -> Option<&[u8]> {
474        self.raw_bytes.as_deref()
475    }
476
477    /// Return a reference to the lazily built `StructuralIndex` for key-presence
478    /// queries, constructing it from raw bytes on first access if available.
479    pub(crate) fn lazy_structural_index(
480        &self,
481    ) -> std::result::Result<Option<&Arc<jetro_experimental::StructuralIndex>>, EvalError> {
482        if let Some(result) = self.structural_index.get() {
483            return result
484                .as_ref()
485                .map(Some)
486                .map_err(|err| EvalError(format!("Invalid JSON: {err}")));
487        }
488        let Some(raw) = self.raw_bytes.as_ref() else {
489            return Ok(None);
490        };
491        let built = jetro_experimental::from_bytes_with(
492            raw.as_ref(),
493            jetro_experimental::BuildOptions::keys_only(),
494        )
495        .map(Arc::new)
496        .map_err(|err| err.to_string());
497        let _ = self.structural_index.set(built);
498        self.structural_index
499            .get()
500            .expect("structural index cache initialized")
501            .as_ref()
502            .map(Some)
503            .map_err(|err| EvalError(format!("Invalid JSON: {err}")))
504    }
505
506    /// Return the root `Val` for the document, building and caching it from the
507    /// tape (simd-json) or from the `serde_json::Value` on first access.
508    pub(crate) fn root_val(&self) -> std::result::Result<Val, EvalError> {
509        if let Some(root) = self.root_val.get() {
510            return Ok(root.clone());
511        }
512        let root = {
513            #[cfg(feature = "simd-json")]
514            {
515                if let Some(tape) = self.lazy_tape()? {
516                    Val::from_tape_data(tape)
517                } else {
518                    Val::from(&self.document)
519                }
520            }
521            #[cfg(not(feature = "simd-json"))]
522            {
523                Val::from(&self.document)
524            }
525        };
526        let _ = self.root_val.set(root);
527        Ok(self.root_val.get().expect("root val initialized").clone())
528    }
529
530    /// Return `true` if the `Val` tree has already been materialised; used in
531    /// tests to assert that lazy evaluation is working correctly.
532    #[cfg(test)]
533    pub(crate) fn root_val_is_materialized(&self) -> bool {
534        self.root_val.get().is_some()
535    }
536
537    #[cfg(test)]
538    pub(crate) fn structural_index_is_built(&self) -> bool {
539        self.structural_index.get().is_some()
540    }
541
542    #[cfg(all(test, feature = "simd-json"))]
543    pub(crate) fn tape_is_built(&self) -> bool {
544        self.tape.get().is_some()
545    }
546
547    #[cfg(all(test, feature = "simd-json"))]
548    pub(crate) fn reset_tape_materialized_subtrees(&self) {
549        if let Ok(Some(tape)) = self.lazy_tape() {
550            tape.reset_materialized_subtrees();
551        }
552    }
553
554    #[cfg(all(test, feature = "simd-json"))]
555    pub(crate) fn tape_materialized_subtrees(&self) -> usize {
556        self.lazy_tape()
557            .ok()
558            .flatten()
559            .map(|tape| tape.materialized_subtrees())
560            .unwrap_or(0)
561    }
562
563    /// Evaluate a Jetro expression against this document and return the result
564    /// as a `serde_json::Value`. Uses this document's VM with compile and
565    /// path-resolution caches for repeated calls.
566    pub fn collect<S: AsRef<str>>(&self, expr: S) -> std::result::Result<Value, EvalError> {
567        exec::router::collect_json(self, expr.as_ref())
568    }
569}
570
571/// Wrap an existing `serde_json::Value` in a `Jetro` handle without raw bytes.
572/// Prefer `Jetro::from_bytes` when you have the original JSON source, as it
573/// enables the tape and structural-index lazy backends.
574impl From<Value> for Jetro {
575    /// Convert a `serde_json::Value` into a `Jetro` query handle.
576    fn from(v: Value) -> Self {
577        Self::new(v)
578    }
579}