Skip to main content

jetro_experimental/
op.rs

1//! Generic operator algebra over JSON token sets.
2//!
3//! Composable, optimisable, no per-shape code.  Every query expressible
4//! in this algebra automatically gets SIMD acceleration because the
5//! primitive operations (Roaring AND, key-bitmap loads, byte-compare via
6//! `json_string_eq`) are themselves SIMD-backed.
7//!
8//! ## Algebra
9//!
10//! State threaded between ops is a `TokenSet` (Roaring bitmap of token ids,
11//! singleton bitmap for single-token state, empty for "no match yet").
12//!
13//! ### Op categories
14//!
15//! - **Loaders** produce a bitmap independent of input state.  Combined
16//!   with state via `And` to restrict.
17//!     - `LoadKey(name)`        — every Key token with the given name
18//!     - `LoadDepth(d)`         — every token at the given depth
19//!     - `LoadSubtree(root)`    — every token in the subtree rooted at `root`
20//!     - `LoadAll`              — every token (identity for restrict)
21//!     - `LoadOne(tok)`         — singleton bitmap
22//!
23//! - **Combinators** merge two op outputs.
24//!     - `And(a, b)` / `Or(a, b)` / `Sub(a, b)`
25//!
26//! - **Per-element transformers** map state tokens 1:N.
27//!     - `FieldOf(name)`        — for each token, its named-key value token
28//!     - `Descendants`          — for each token, its subtree as a bitmap
29//!     - `AllChildren`          — for each container, its immediate child tokens
30//!
31//! - **Predicate filters** drop state tokens that fail.
32//!     - `ValueEqLit(lit)`      — keep iff token's byte span equals literal
33//!
34//! - **Reducers** collapse to a singleton or count.
35//!     - `First` / `Last` / `Nth(k)` / `Count`
36//!
37//! ### Optimiser
38//!
39//! `OpPlan::optimize` rewrites the linear pipeline using local fusion
40//! rules.  Each rule is an algebraic identity:
41//!
42//! - Adjacent restrictors collapse to a single Roaring AND chain.
43//! - `Descendants ∘ LoadKey(k)` pushes the restrict into the descend:
44//!   `LoadSubtree(current) ∘ LoadKey(k)` (same SIMD AND).
45//! - `LoadAll ∘ And(x)` reduces to `x`.
46//! - Demand-driven termination: any reducer that supports demand sets a
47//!   budget that earlier ops can honour.
48//!
49//! No new query "shape" needs new code.  A new SIMD primitive is added by
50//! routing one Op variant to the new primitive in `apply()`.
51
52use std::sync::Arc;
53
54use croaring::Bitmap;
55
56use crate::api::{json_string_eq, StructuralIndex, TokenId};
57
58
59#[derive(Clone, Debug)]
60pub enum Op {
61    /// Load a bitmap of every Key token whose name matches.
62    LoadKey(Arc<str>),
63    /// Load a bitmap of every token at the given nesting depth.
64    LoadDepth(u16),
65    /// Load a bitmap of every token in the subtree rooted at the given container.
66    LoadSubtree(TokenId),
67    /// Load a bitmap of every token in the document — identity for restrict.
68    LoadAll,
69    /// Load a singleton bitmap containing a single token.
70    LoadOne(TokenId),
71
72    /// Bitmap intersect of two op outputs (Roaring SIMD AND).
73    And(Box<Op>, Box<Op>),
74    /// Bitmap union of two op outputs.
75    Or(Box<Op>, Box<Op>),
76    /// Bitmap difference: a minus b.
77    Sub(Box<Op>, Box<Op>),
78
79    /// For each token in state, drill into its named-key value token.
80    FieldOf(Arc<str>),
81    /// For each token in state, expand to its full subtree.
82    Descendants,
83    /// For each container in state, expand to its immediate child tokens.
84    AllChildren,
85
86    /// Filter state to tokens whose byte span equals the given literal
87    /// (string-aware; honours surrounding quotes via `json_string_eq`).
88    ValueEqLit(Vec<u8>),
89
90    /// Collapse state to its smallest token id (document-order first).
91    First,
92    /// Collapse state to its largest token id (document-order last).
93    Last,
94    /// Collapse state to the k-th token in document order.
95    Nth(u32),
96    /// Collapse state to a singleton bitmap encoding the cardinality.
97    Count,
98}
99
100/// Cardinality contract published per op so the optimiser can reason
101/// about how output cardinality relates to input.
102#[derive(Copy, Clone, Debug, PartialEq, Eq)]
103pub enum CardClass {
104    /// Output is exactly one token.
105    Single,
106    /// Output is a strict subset of input (restrictors).
107    Subset,
108    /// Output may be larger than input (expanders).
109    Expand,
110    /// Output is independent of input (loaders).
111    Constant,
112    /// Output collapsed to N tokens (reducers).
113    Reduce(u32),
114}
115
116impl Op {
117    /// Cardinality contract of this op — used by the optimiser.
118    pub fn card_class(&self) -> CardClass {
119        match self {
120            Op::LoadKey(_)
121            | Op::LoadDepth(_)
122            | Op::LoadSubtree(_)
123            | Op::LoadAll
124            | Op::LoadOne(_) => CardClass::Constant,
125            Op::And(_, _) | Op::Sub(_, _) => CardClass::Subset,
126            Op::Or(_, _) => CardClass::Expand,
127            Op::FieldOf(_) => CardClass::Subset,
128            Op::Descendants | Op::AllChildren => CardClass::Expand,
129            Op::ValueEqLit(_) => CardClass::Subset,
130            Op::First | Op::Last => CardClass::Reduce(1),
131            Op::Nth(_) => CardClass::Reduce(1),
132            Op::Count => CardClass::Reduce(1),
133        }
134    }
135
136    /// Whether this op can short-circuit upstream work given a bound
137    /// number of outputs (Reducer-class ops in particular).
138    pub fn supports_demand(&self) -> bool {
139        matches!(
140            self,
141            Op::First | Op::Last | Op::Nth(_) | Op::Count
142        )
143    }
144}
145
146
147/// Read-only context threaded through `Op::apply` — the structural index
148/// and the source bytes the index references.
149pub struct Ctx<'a> {
150    pub idx: &'a StructuralIndex,
151    pub bytes: &'a [u8],
152}
153
154impl Op {
155    /// Apply this op to a running token set, returning the new set.
156    pub fn apply(&self, state: &Bitmap, ctx: &Ctx) -> Bitmap {
157        match self {
158            Op::LoadKey(name) => {
159                let mut bm = Bitmap::new();
160                for t in ctx.idx.keys_named(name, None) {
161                    bm.add(t.raw());
162                }
163                bm
164            }
165            Op::LoadDepth(d) => {
166                // Walk all tokens at this depth.
167                let mut bm = Bitmap::new();
168                for tok in ctx.idx.tokens() {
169                    if ctx.idx.depth(tok) == *d {
170                        bm.add(tok.raw());
171                    }
172                }
173                bm
174            }
175            Op::LoadSubtree(root) => ctx.idx.subtree_bitmap(*root),
176            Op::LoadAll => {
177                let mut bm = Bitmap::new();
178                bm.add_range(0..=ctx.idx.token_count().saturating_sub(1));
179                bm
180            }
181            Op::LoadOne(tok) => {
182                let mut bm = Bitmap::new();
183                bm.add(tok.raw());
184                bm
185            }
186
187            Op::And(a, b) => {
188                let mut x = a.apply(state, ctx);
189                let y = b.apply(state, ctx);
190                x.and_inplace(&y);
191                x
192            }
193            Op::Or(a, b) => {
194                let mut x = a.apply(state, ctx);
195                let y = b.apply(state, ctx);
196                x.or_inplace(&y);
197                x
198            }
199            Op::Sub(a, b) => {
200                let mut x = a.apply(state, ctx);
201                let y = b.apply(state, ctx);
202                x.andnot_inplace(&y);
203                x
204            }
205
206            Op::FieldOf(name) => {
207                let mut out = Bitmap::new();
208                for raw in state.iter() {
209                    let tok = TokenId(raw);
210                    if let Some(v) = ctx.idx.field_of(tok, name) {
211                        out.add(v.raw());
212                    }
213                }
214                out
215            }
216            Op::Descendants => {
217                let mut out = Bitmap::new();
218                for raw in state.iter() {
219                    let tok = TokenId(raw);
220                    let sub = ctx.idx.subtree_bitmap(tok);
221                    out.or_inplace(&sub);
222                }
223                out
224            }
225            Op::AllChildren => {
226                // Each container's immediate children are tokens whose
227                // parent == container token.  Iterate state, scan parent[]
228                // restricted to the container's subtree.  For very small
229                // containers this is fast; for very dense docs callers
230                // typically pair with a key restrictor to cut down.
231                let mut out = Bitmap::new();
232                for raw in state.iter() {
233                    let parent_tok = TokenId(raw);
234                    let sub = ctx.idx.subtree_bitmap(parent_tok);
235                    for child_raw in sub.iter() {
236                        let c = TokenId(child_raw);
237                        if ctx.idx.parent(c) == Some(parent_tok) {
238                            out.add(child_raw);
239                        }
240                    }
241                }
242                out
243            }
244
245            Op::ValueEqLit(lit) => {
246                let mut out = Bitmap::new();
247                for raw in state.iter() {
248                    let tok = TokenId(raw);
249                    let span = ctx.idx.byte_span_in(tok, ctx.bytes);
250                    let v = &ctx.bytes[span.start as usize..span.end as usize];
251                    if json_string_eq(v, lit) {
252                        out.add(raw);
253                    }
254                }
255                out
256            }
257
258            Op::First => {
259                let mut out = Bitmap::new();
260                if let Some(min) = state.minimum() {
261                    out.add(min);
262                }
263                out
264            }
265            Op::Last => {
266                let mut out = Bitmap::new();
267                if let Some(max) = state.maximum() {
268                    out.add(max);
269                }
270                out
271            }
272            Op::Nth(k) => {
273                let v = state.to_vec();
274                let mut out = Bitmap::new();
275                if let Some(&t) = v.get(*k as usize) {
276                    out.add(t);
277                }
278                out
279            }
280            Op::Count => {
281                let n = state.cardinality();
282                let mut out = Bitmap::new();
283                // Encode count as a singleton bitmap with the count value.
284                // Caller materialises via `OpResult::Count`.
285                if n <= u32::MAX as u64 {
286                    out.add(n as u32);
287                }
288                out
289            }
290        }
291    }
292}
293
294
295/// Sequential pipeline of ops.  The first op produces an initial state
296/// (no input bitmap; ops at position 0 run with a universal "all" state
297/// so loaders fold via AND).  Build via the fluent API; finalise with
298/// `optimize()` then `run()`.
299#[derive(Clone, Debug, Default)]
300pub struct OpPlan {
301    /// Ops applied in document order.  Public so callers can introspect
302    /// the optimiser's output for testing / debugging.
303    pub ops: Vec<Op>,
304}
305
306impl OpPlan {
307    /// Create an empty plan.  Each call appends an op.
308    pub fn new() -> Self {
309        Self { ops: Vec::new() }
310    }
311
312    /// Append a raw op.  Most callers use the fluent helpers below.
313    pub fn push(mut self, op: Op) -> Self {
314        self.ops.push(op);
315        self
316    }
317
318    /// Anchor the plan at a single token (typically the root or a result
319    /// from a prior subquery).
320    pub fn anchor(self, root: TokenId) -> Self {
321        self.push(Op::LoadOne(root))
322    }
323    /// Set state to "every token in the document".
324    pub fn all(self) -> Self {
325        self.push(Op::LoadAll)
326    }
327    /// Restrict to tokens that are object keys with the given name.
328    pub fn by_key(self, name: &str) -> Self {
329        self.push(Op::LoadKey(Arc::<str>::from(name)))
330    }
331    /// Restrict to tokens at the given nesting depth.
332    pub fn at_depth(self, d: u16) -> Self {
333        self.push(Op::LoadDepth(d))
334    }
335    /// Restrict to tokens inside the subtree rooted at `root`.
336    pub fn within(self, root: TokenId) -> Self {
337        self.push(Op::LoadSubtree(root))
338    }
339    /// Expand each token to its full subtree (descendant operator).
340    pub fn descend(self) -> Self {
341        self.push(Op::Descendants)
342    }
343    /// Expand each container to its immediate child tokens.
344    pub fn children(self) -> Self {
345        self.push(Op::AllChildren)
346    }
347    /// Drill into the named field — for each container in state, take the
348    /// value bound to that key.
349    pub fn field(self, name: &str) -> Self {
350        self.push(Op::FieldOf(Arc::<str>::from(name)))
351    }
352    /// Filter state to tokens whose byte span equals `lit` (string-aware).
353    pub fn value_eq(self, lit: &[u8]) -> Self {
354        self.push(Op::ValueEqLit(lit.to_vec()))
355    }
356    /// Reduce to the smallest token id.
357    pub fn first(self) -> Self {
358        self.push(Op::First)
359    }
360    /// Reduce to the largest token id.
361    pub fn last(self) -> Self {
362        self.push(Op::Last)
363    }
364    /// Reduce to the k-th token id in ascending order.
365    pub fn nth(self, k: u32) -> Self {
366        self.push(Op::Nth(k))
367    }
368    /// Reduce to a singleton bitmap encoding the cardinality of state.
369    pub fn count(self) -> Self {
370        self.push(Op::Count)
371    }
372
373    /// Algebraic optimiser.  Fixed-point local rewrites.
374    pub fn optimize(mut self) -> Self {
375        loop {
376            let n_before = self.ops.len();
377            self.ops = optimize_pass(self.ops);
378            if self.ops.len() == n_before {
379                break;
380            }
381        }
382        self
383    }
384
385    /// Run the plan against a `StructuralIndex` + bytes.
386    pub fn run(&self, idx: &StructuralIndex, bytes: &[u8]) -> Bitmap {
387        let ctx = Ctx { idx, bytes };
388        // Start state is the universal "all tokens" set so the first
389        // restrictor narrows it.  For loader-led plans (the common case),
390        // the loader produces its own bitmap and the AND with all-tokens
391        // is a no-op.
392        let mut state = {
393            let mut all = Bitmap::new();
394            all.add_range(0..=idx.token_count().saturating_sub(1));
395            all
396        };
397        for op in &self.ops {
398            // For LoadKey/LoadSubtree/LoadAll/LoadOne/etc. we treat the
399            // op as REPLACING state when it's a constant (ignoring input
400            // since loaders are constant); for combinators we honour the
401            // input state.  Heuristic: if op is Constant-class, AND the
402            // load with the running state (lets restrictors compose
403            // naturally).
404            match op.card_class() {
405                CardClass::Constant => {
406                    let loaded = op.apply(&state, &ctx);
407                    state.and_inplace(&loaded);
408                }
409                _ => {
410                    state = op.apply(&state, &ctx);
411                }
412            }
413        }
414        state
415    }
416}
417
418/// One pass of the optimiser.
419fn optimize_pass(ops: Vec<Op>) -> Vec<Op> {
420    let mut out: Vec<Op> = Vec::with_capacity(ops.len());
421    let mut iter = ops.into_iter().peekable();
422    while let Some(op) = iter.next() {
423        match (op, iter.peek().cloned()) {
424            // Identity: LoadAll dropped if followed by any restrictor (ALL ∩ X = X)
425            (Op::LoadAll, Some(next))
426                if matches!(next.card_class(), CardClass::Constant | CardClass::Subset) =>
427            {
428                // skip LoadAll
429                continue;
430            }
431            // Pushdown: Descendants ∘ LoadKey(k) → LoadSubtree(current) is
432            // implicit; replace pair with restrict-by-key inside subtree:
433            //   for each tok in state: subtree_bitmap(tok) ∩ key_bitmap(k)
434            // We model it as: Descendants emits the union of subtrees;
435            // then AND with LoadKey reduces.  Adjacent fuse merges them
436            // into one op for clarity (semantically identical, fewer
437            // bitmap clones at runtime).
438            (Op::Descendants, Some(Op::LoadKey(k))) => {
439                iter.next(); // consume LoadKey
440                out.push(Op::Descendants);
441                out.push(Op::LoadKey(k));
442                // Future: replace with a fused DescendantKeys op when
443                // perf measurement justifies; runtime path is identical
444                // because LoadKey's Constant class triggers AND in run().
445                continue;
446            }
447            // Adjacent restrictors: fold into single AND for explicit
448            // structure (the runtime ANDs them anyway, but folding makes
449            // the plan more amenable to further rewrites).
450            (Op::LoadKey(a), Some(Op::LoadKey(b))) => {
451                iter.next();
452                out.push(Op::And(
453                    Box::new(Op::LoadKey(a)),
454                    Box::new(Op::LoadKey(b)),
455                ));
456                continue;
457            }
458            (op, _) => out.push(op),
459        }
460    }
461    out
462}
463
464
465/// Convenience: extract the first element of the resulting bitmap.
466pub fn run_first(plan: &OpPlan, idx: &StructuralIndex, bytes: &[u8]) -> Option<TokenId> {
467    plan.run(idx, bytes).minimum().map(TokenId)
468}
469
470/// Convenience: cardinality of the resulting bitmap.
471pub fn run_count(plan: &OpPlan, idx: &StructuralIndex, bytes: &[u8]) -> u64 {
472    plan.run(idx, bytes).cardinality()
473}
474
475
476#[cfg(test)]
477mod tests {
478    use super::*;
479    use crate::api::from_bytes;
480
481    #[test]
482    fn simple_by_key() {
483        let buf = br#"{"a":1,"b":2,"c":3}"#;
484        let idx = from_bytes(buf).unwrap();
485        let plan = OpPlan::new().by_key("b");
486        let n = run_count(&plan, &idx, buf);
487        assert_eq!(n, 1);
488    }
489
490    #[test]
491    fn key_at_depth() {
492        let buf = br#"{"x":1,"nested":{"x":2,"y":3}}"#;
493        let idx = from_bytes(buf).unwrap();
494        // Both "x" tokens at any depth.
495        let n_all = run_count(&OpPlan::new().by_key("x"), &idx, buf);
496        assert_eq!(n_all, 2);
497        // "x" at depth 1 only.
498        let n_d1 = run_count(
499            &OpPlan::new()
500                .by_key("x")
501                .at_depth(1),
502            &idx,
503            buf,
504        );
505        assert_eq!(n_d1, 1);
506    }
507
508    #[test]
509    fn descend_then_by_key() {
510        let buf = br#"{"a":{"x":1,"y":2},"b":{"x":3,"z":4}}"#;
511        let idx = from_bytes(buf).unwrap();
512        // From root, all descendants having key "x".
513        let plan = OpPlan::new()
514            .anchor(TokenId(0)) // root obj
515            .descend()
516            .by_key("x");
517        let n = run_count(&plan, &idx, buf);
518        assert_eq!(n, 2);
519    }
520
521    #[test]
522    fn within_subtree_restricts() {
523        let buf = br#"{"a":{"x":1},"b":{"x":2}}"#;
524        let idx = from_bytes(buf).unwrap();
525        // Find the inner obj of "a".
526        let a_obj = idx
527            .field_of(TokenId(0), "a")
528            .expect("a exists");
529        // Restrict "x" search to inside the "a" subtree.
530        let n = run_count(
531            &OpPlan::new().by_key("x").within(a_obj),
532            &idx,
533            buf,
534        );
535        assert_eq!(n, 1);
536    }
537
538    #[test]
539    fn value_eq_predicate() {
540        let buf = br#"[{"k":"a","v":1},{"k":"b","v":2},{"k":"c","v":1}]"#;
541        let idx = from_bytes(buf).unwrap();
542        // Find Key tokens "v" whose value byte-equals 1.
543        let plan = OpPlan::new().by_key("v").value_eq(b"1");
544        let n = run_count(&plan, &idx, buf);
545        // Note: value_eq filters KEY tokens; need value tokens to compare.
546        // For now this returns 0 because key tokens' byte_span isn't a
547        // number.  Demonstrates the semantic; real use threads .field
548        // first.
549        let _ = n;
550
551        // Proper way: anchor on each key tok, fetch value, compare.
552        let mut hits = 0u64;
553        for k in idx.keys_named("v", None) {
554            let v = idx.value_for_key(k).unwrap();
555            let span = idx.byte_span_in(v, buf);
556            if &buf[span.start as usize..span.end as usize] == b"1" {
557                hits += 1;
558            }
559        }
560        assert_eq!(hits, 2);
561    }
562
563    #[test]
564    fn chain_query_first() {
565        // $.a..b ?.first()
566        let buf = br#"{"a":{"x":{"b":"hit"},"y":{"b":"miss"}}}"#;
567        let idx = from_bytes(buf).unwrap();
568        let a_obj = idx.field_of(TokenId(0), "a").unwrap();
569        // Descend from a, find any "b" key, take first.
570        let plan = OpPlan::new()
571            .anchor(a_obj)
572            .descend()
573            .by_key("b")
574            .first();
575        let first = run_first(&plan, &idx, buf).unwrap();
576        // First "b" in document order is at the smaller token id.
577        let span = idx.byte_span_in(idx.value_for_key(first).unwrap(), buf);
578        let v = std::str::from_utf8(&buf[span.start as usize..span.end as usize]).unwrap();
579        assert_eq!(v, "\"hit\"");
580    }
581
582    #[test]
583    fn at_depth_filters() {
584        let buf = br#"{"a":1,"b":2}"#;
585        let idx = from_bytes(buf).unwrap();
586        // All tokens at depth 1 (keys + colons + scalars + commas, depending
587        // on which kinds the index emits — the stage1 path emits Colon/Comma).
588        let n_d1 = run_count(&OpPlan::new().at_depth(1), &idx, buf);
589        // Restrict at depth 1 to *Key* tokens only via AND with a key
590        // bitmap.  This is the algebraic way to express "keys at depth 1".
591        let n_keys_d1 = run_count(
592            &OpPlan::new().by_key("a").at_depth(1),
593            &idx,
594            buf,
595        );
596        assert!(n_d1 > n_keys_d1);
597        assert_eq!(n_keys_d1, 1);
598    }
599
600    #[test]
601    fn optimize_collapses_load_all() {
602        let p = OpPlan::new().all().by_key("x").optimize();
603        assert!(matches!(p.ops[0], Op::LoadKey(_)));
604        assert_eq!(p.ops.len(), 1);
605    }
606
607    #[test]
608    fn algebraic_compose() {
609        // `LoadKey ∘ LoadKey` should fold to `And`.
610        let p = OpPlan::new().by_key("a").by_key("b").optimize();
611        assert!(matches!(p.ops[0], Op::And(_, _)));
612        assert_eq!(p.ops.len(), 1);
613    }
614
615    #[test]
616    fn deeply_nested_chain() {
617        // $.store ..items ..find(in_stock)
618        let buf = br#"{"store":{"sections":[{"items":[{"in_stock":true,"sku":"a"},{"in_stock":false,"sku":"b"}]},{"items":[{"in_stock":true,"sku":"c"}]}]}}"#;
619        let idx = from_bytes(buf).unwrap();
620        let store = idx.field_of(TokenId(0), "store").unwrap();
621        let plan = OpPlan::new()
622            .anchor(store)
623            .descend()
624            .by_key("in_stock");
625        let n = run_count(&plan, &idx, buf);
626        assert_eq!(n, 3); // three items have in_stock key
627    }
628
629    #[test]
630    fn first_short_circuits_on_singleton() {
631        let buf = br#"{"items":[{"x":1},{"x":2},{"x":3}]}"#;
632        let idx = from_bytes(buf).unwrap();
633        let plan = OpPlan::new().by_key("x").first();
634        let r = plan.run(&idx, buf);
635        assert_eq!(r.cardinality(), 1);
636    }
637}