jetro_experimental/op.rs
1//! Generic operator algebra over JSON token sets.
2//!
3//! Composable, optimisable, no per-shape code. Every query expressible
4//! in this algebra automatically gets SIMD acceleration because the
5//! primitive operations (Roaring AND, key-bitmap loads, byte-compare via
6//! `json_string_eq`) are themselves SIMD-backed.
7//!
8//! ## Algebra
9//!
10//! State threaded between ops is a `TokenSet` (Roaring bitmap of token ids,
11//! singleton bitmap for single-token state, empty for "no match yet").
12//!
13//! ### Op categories
14//!
15//! - **Loaders** produce a bitmap independent of input state. Combined
16//! with state via `And` to restrict.
17//! - `LoadKey(name)` — every Key token with the given name
18//! - `LoadDepth(d)` — every token at the given depth
19//! - `LoadSubtree(root)` — every token in the subtree rooted at `root`
20//! - `LoadAll` — every token (identity for restrict)
21//! - `LoadOne(tok)` — singleton bitmap
22//!
23//! - **Combinators** merge two op outputs.
24//! - `And(a, b)` / `Or(a, b)` / `Sub(a, b)`
25//!
26//! - **Per-element transformers** map state tokens 1:N.
27//! - `FieldOf(name)` — for each token, its named-key value token
28//! - `Descendants` — for each token, its subtree as a bitmap
29//! - `AllChildren` — for each container, its immediate child tokens
30//!
31//! - **Predicate filters** drop state tokens that fail.
32//! - `ValueEqLit(lit)` — keep iff token's byte span equals literal
33//!
34//! - **Reducers** collapse to a singleton or count.
35//! - `First` / `Last` / `Nth(k)` / `Count`
36//!
37//! ### Optimiser
38//!
39//! `OpPlan::optimize` rewrites the linear pipeline using local fusion
40//! rules. Each rule is an algebraic identity:
41//!
42//! - Adjacent restrictors collapse to a single Roaring AND chain.
43//! - `Descendants ∘ LoadKey(k)` pushes the restrict into the descend:
44//! `LoadSubtree(current) ∘ LoadKey(k)` (same SIMD AND).
45//! - `LoadAll ∘ And(x)` reduces to `x`.
46//! - Demand-driven termination: any reducer that supports demand sets a
47//! budget that earlier ops can honour.
48//!
49//! No new query "shape" needs new code. A new SIMD primitive is added by
50//! routing one Op variant to the new primitive in `apply()`.
51
52use std::sync::Arc;
53
54use croaring::Bitmap;
55
56use crate::api::{json_string_eq, StructuralIndex, TokenId};
57
58
59#[derive(Clone, Debug)]
60pub enum Op {
61 /// Load a bitmap of every Key token whose name matches.
62 LoadKey(Arc<str>),
63 /// Load a bitmap of every token at the given nesting depth.
64 LoadDepth(u16),
65 /// Load a bitmap of every token in the subtree rooted at the given container.
66 LoadSubtree(TokenId),
67 /// Load a bitmap of every token in the document — identity for restrict.
68 LoadAll,
69 /// Load a singleton bitmap containing a single token.
70 LoadOne(TokenId),
71
72 /// Bitmap intersect of two op outputs (Roaring SIMD AND).
73 And(Box<Op>, Box<Op>),
74 /// Bitmap union of two op outputs.
75 Or(Box<Op>, Box<Op>),
76 /// Bitmap difference: a minus b.
77 Sub(Box<Op>, Box<Op>),
78
79 /// For each token in state, drill into its named-key value token.
80 FieldOf(Arc<str>),
81 /// For each token in state, expand to its full subtree.
82 Descendants,
83 /// For each container in state, expand to its immediate child tokens.
84 AllChildren,
85
86 /// Filter state to tokens whose byte span equals the given literal
87 /// (string-aware; honours surrounding quotes via `json_string_eq`).
88 ValueEqLit(Vec<u8>),
89
90 /// Collapse state to its smallest token id (document-order first).
91 First,
92 /// Collapse state to its largest token id (document-order last).
93 Last,
94 /// Collapse state to the k-th token in document order.
95 Nth(u32),
96 /// Collapse state to a singleton bitmap encoding the cardinality.
97 Count,
98}
99
100/// Cardinality contract published per op so the optimiser can reason
101/// about how output cardinality relates to input.
102#[derive(Copy, Clone, Debug, PartialEq, Eq)]
103pub enum CardClass {
104 /// Output is exactly one token.
105 Single,
106 /// Output is a strict subset of input (restrictors).
107 Subset,
108 /// Output may be larger than input (expanders).
109 Expand,
110 /// Output is independent of input (loaders).
111 Constant,
112 /// Output collapsed to N tokens (reducers).
113 Reduce(u32),
114}
115
116impl Op {
117 /// Cardinality contract of this op — used by the optimiser.
118 pub fn card_class(&self) -> CardClass {
119 match self {
120 Op::LoadKey(_)
121 | Op::LoadDepth(_)
122 | Op::LoadSubtree(_)
123 | Op::LoadAll
124 | Op::LoadOne(_) => CardClass::Constant,
125 Op::And(_, _) | Op::Sub(_, _) => CardClass::Subset,
126 Op::Or(_, _) => CardClass::Expand,
127 Op::FieldOf(_) => CardClass::Subset,
128 Op::Descendants | Op::AllChildren => CardClass::Expand,
129 Op::ValueEqLit(_) => CardClass::Subset,
130 Op::First | Op::Last => CardClass::Reduce(1),
131 Op::Nth(_) => CardClass::Reduce(1),
132 Op::Count => CardClass::Reduce(1),
133 }
134 }
135
136 /// Whether this op can short-circuit upstream work given a bound
137 /// number of outputs (Reducer-class ops in particular).
138 pub fn supports_demand(&self) -> bool {
139 matches!(
140 self,
141 Op::First | Op::Last | Op::Nth(_) | Op::Count
142 )
143 }
144}
145
146
147/// Read-only context threaded through `Op::apply` — the structural index
148/// and the source bytes the index references.
149pub struct Ctx<'a> {
150 pub idx: &'a StructuralIndex,
151 pub bytes: &'a [u8],
152}
153
154impl Op {
155 /// Apply this op to a running token set, returning the new set.
156 pub fn apply(&self, state: &Bitmap, ctx: &Ctx) -> Bitmap {
157 match self {
158 Op::LoadKey(name) => {
159 let mut bm = Bitmap::new();
160 for t in ctx.idx.keys_named(name, None) {
161 bm.add(t.raw());
162 }
163 bm
164 }
165 Op::LoadDepth(d) => {
166 // Walk all tokens at this depth.
167 let mut bm = Bitmap::new();
168 for tok in ctx.idx.tokens() {
169 if ctx.idx.depth(tok) == *d {
170 bm.add(tok.raw());
171 }
172 }
173 bm
174 }
175 Op::LoadSubtree(root) => ctx.idx.subtree_bitmap(*root),
176 Op::LoadAll => {
177 let mut bm = Bitmap::new();
178 bm.add_range(0..=ctx.idx.token_count().saturating_sub(1));
179 bm
180 }
181 Op::LoadOne(tok) => {
182 let mut bm = Bitmap::new();
183 bm.add(tok.raw());
184 bm
185 }
186
187 Op::And(a, b) => {
188 let mut x = a.apply(state, ctx);
189 let y = b.apply(state, ctx);
190 x.and_inplace(&y);
191 x
192 }
193 Op::Or(a, b) => {
194 let mut x = a.apply(state, ctx);
195 let y = b.apply(state, ctx);
196 x.or_inplace(&y);
197 x
198 }
199 Op::Sub(a, b) => {
200 let mut x = a.apply(state, ctx);
201 let y = b.apply(state, ctx);
202 x.andnot_inplace(&y);
203 x
204 }
205
206 Op::FieldOf(name) => {
207 let mut out = Bitmap::new();
208 for raw in state.iter() {
209 let tok = TokenId(raw);
210 if let Some(v) = ctx.idx.field_of(tok, name) {
211 out.add(v.raw());
212 }
213 }
214 out
215 }
216 Op::Descendants => {
217 let mut out = Bitmap::new();
218 for raw in state.iter() {
219 let tok = TokenId(raw);
220 let sub = ctx.idx.subtree_bitmap(tok);
221 out.or_inplace(&sub);
222 }
223 out
224 }
225 Op::AllChildren => {
226 // Each container's immediate children are tokens whose
227 // parent == container token. Iterate state, scan parent[]
228 // restricted to the container's subtree. For very small
229 // containers this is fast; for very dense docs callers
230 // typically pair with a key restrictor to cut down.
231 let mut out = Bitmap::new();
232 for raw in state.iter() {
233 let parent_tok = TokenId(raw);
234 let sub = ctx.idx.subtree_bitmap(parent_tok);
235 for child_raw in sub.iter() {
236 let c = TokenId(child_raw);
237 if ctx.idx.parent(c) == Some(parent_tok) {
238 out.add(child_raw);
239 }
240 }
241 }
242 out
243 }
244
245 Op::ValueEqLit(lit) => {
246 let mut out = Bitmap::new();
247 for raw in state.iter() {
248 let tok = TokenId(raw);
249 let span = ctx.idx.byte_span_in(tok, ctx.bytes);
250 let v = &ctx.bytes[span.start as usize..span.end as usize];
251 if json_string_eq(v, lit) {
252 out.add(raw);
253 }
254 }
255 out
256 }
257
258 Op::First => {
259 let mut out = Bitmap::new();
260 if let Some(min) = state.minimum() {
261 out.add(min);
262 }
263 out
264 }
265 Op::Last => {
266 let mut out = Bitmap::new();
267 if let Some(max) = state.maximum() {
268 out.add(max);
269 }
270 out
271 }
272 Op::Nth(k) => {
273 let v = state.to_vec();
274 let mut out = Bitmap::new();
275 if let Some(&t) = v.get(*k as usize) {
276 out.add(t);
277 }
278 out
279 }
280 Op::Count => {
281 let n = state.cardinality();
282 let mut out = Bitmap::new();
283 // Encode count as a singleton bitmap with the count value.
284 // Caller materialises via `OpResult::Count`.
285 if n <= u32::MAX as u64 {
286 out.add(n as u32);
287 }
288 out
289 }
290 }
291 }
292}
293
294
295/// Sequential pipeline of ops. The first op produces an initial state
296/// (no input bitmap; ops at position 0 run with a universal "all" state
297/// so loaders fold via AND). Build via the fluent API; finalise with
298/// `optimize()` then `run()`.
299#[derive(Clone, Debug, Default)]
300pub struct OpPlan {
301 /// Ops applied in document order. Public so callers can introspect
302 /// the optimiser's output for testing / debugging.
303 pub ops: Vec<Op>,
304}
305
306impl OpPlan {
307 /// Create an empty plan. Each call appends an op.
308 pub fn new() -> Self {
309 Self { ops: Vec::new() }
310 }
311
312 /// Append a raw op. Most callers use the fluent helpers below.
313 pub fn push(mut self, op: Op) -> Self {
314 self.ops.push(op);
315 self
316 }
317
318 /// Anchor the plan at a single token (typically the root or a result
319 /// from a prior subquery).
320 pub fn anchor(self, root: TokenId) -> Self {
321 self.push(Op::LoadOne(root))
322 }
323 /// Set state to "every token in the document".
324 pub fn all(self) -> Self {
325 self.push(Op::LoadAll)
326 }
327 /// Restrict to tokens that are object keys with the given name.
328 pub fn by_key(self, name: &str) -> Self {
329 self.push(Op::LoadKey(Arc::<str>::from(name)))
330 }
331 /// Restrict to tokens at the given nesting depth.
332 pub fn at_depth(self, d: u16) -> Self {
333 self.push(Op::LoadDepth(d))
334 }
335 /// Restrict to tokens inside the subtree rooted at `root`.
336 pub fn within(self, root: TokenId) -> Self {
337 self.push(Op::LoadSubtree(root))
338 }
339 /// Expand each token to its full subtree (descendant operator).
340 pub fn descend(self) -> Self {
341 self.push(Op::Descendants)
342 }
343 /// Expand each container to its immediate child tokens.
344 pub fn children(self) -> Self {
345 self.push(Op::AllChildren)
346 }
347 /// Drill into the named field — for each container in state, take the
348 /// value bound to that key.
349 pub fn field(self, name: &str) -> Self {
350 self.push(Op::FieldOf(Arc::<str>::from(name)))
351 }
352 /// Filter state to tokens whose byte span equals `lit` (string-aware).
353 pub fn value_eq(self, lit: &[u8]) -> Self {
354 self.push(Op::ValueEqLit(lit.to_vec()))
355 }
356 /// Reduce to the smallest token id.
357 pub fn first(self) -> Self {
358 self.push(Op::First)
359 }
360 /// Reduce to the largest token id.
361 pub fn last(self) -> Self {
362 self.push(Op::Last)
363 }
364 /// Reduce to the k-th token id in ascending order.
365 pub fn nth(self, k: u32) -> Self {
366 self.push(Op::Nth(k))
367 }
368 /// Reduce to a singleton bitmap encoding the cardinality of state.
369 pub fn count(self) -> Self {
370 self.push(Op::Count)
371 }
372
373 /// Algebraic optimiser. Fixed-point local rewrites.
374 pub fn optimize(mut self) -> Self {
375 loop {
376 let n_before = self.ops.len();
377 self.ops = optimize_pass(self.ops);
378 if self.ops.len() == n_before {
379 break;
380 }
381 }
382 self
383 }
384
385 /// Run the plan against a `StructuralIndex` + bytes.
386 pub fn run(&self, idx: &StructuralIndex, bytes: &[u8]) -> Bitmap {
387 let ctx = Ctx { idx, bytes };
388 // Start state is the universal "all tokens" set so the first
389 // restrictor narrows it. For loader-led plans (the common case),
390 // the loader produces its own bitmap and the AND with all-tokens
391 // is a no-op.
392 let mut state = {
393 let mut all = Bitmap::new();
394 all.add_range(0..=idx.token_count().saturating_sub(1));
395 all
396 };
397 for op in &self.ops {
398 // For LoadKey/LoadSubtree/LoadAll/LoadOne/etc. we treat the
399 // op as REPLACING state when it's a constant (ignoring input
400 // since loaders are constant); for combinators we honour the
401 // input state. Heuristic: if op is Constant-class, AND the
402 // load with the running state (lets restrictors compose
403 // naturally).
404 match op.card_class() {
405 CardClass::Constant => {
406 let loaded = op.apply(&state, &ctx);
407 state.and_inplace(&loaded);
408 }
409 _ => {
410 state = op.apply(&state, &ctx);
411 }
412 }
413 }
414 state
415 }
416}
417
418/// One pass of the optimiser.
419fn optimize_pass(ops: Vec<Op>) -> Vec<Op> {
420 let mut out: Vec<Op> = Vec::with_capacity(ops.len());
421 let mut iter = ops.into_iter().peekable();
422 while let Some(op) = iter.next() {
423 match (op, iter.peek().cloned()) {
424 // Identity: LoadAll dropped if followed by any restrictor (ALL ∩ X = X)
425 (Op::LoadAll, Some(next))
426 if matches!(next.card_class(), CardClass::Constant | CardClass::Subset) =>
427 {
428 // skip LoadAll
429 continue;
430 }
431 // Pushdown: Descendants ∘ LoadKey(k) → LoadSubtree(current) is
432 // implicit; replace pair with restrict-by-key inside subtree:
433 // for each tok in state: subtree_bitmap(tok) ∩ key_bitmap(k)
434 // We model it as: Descendants emits the union of subtrees;
435 // then AND with LoadKey reduces. Adjacent fuse merges them
436 // into one op for clarity (semantically identical, fewer
437 // bitmap clones at runtime).
438 (Op::Descendants, Some(Op::LoadKey(k))) => {
439 iter.next(); // consume LoadKey
440 out.push(Op::Descendants);
441 out.push(Op::LoadKey(k));
442 // Future: replace with a fused DescendantKeys op when
443 // perf measurement justifies; runtime path is identical
444 // because LoadKey's Constant class triggers AND in run().
445 continue;
446 }
447 // Adjacent restrictors: fold into single AND for explicit
448 // structure (the runtime ANDs them anyway, but folding makes
449 // the plan more amenable to further rewrites).
450 (Op::LoadKey(a), Some(Op::LoadKey(b))) => {
451 iter.next();
452 out.push(Op::And(
453 Box::new(Op::LoadKey(a)),
454 Box::new(Op::LoadKey(b)),
455 ));
456 continue;
457 }
458 (op, _) => out.push(op),
459 }
460 }
461 out
462}
463
464
465/// Convenience: extract the first element of the resulting bitmap.
466pub fn run_first(plan: &OpPlan, idx: &StructuralIndex, bytes: &[u8]) -> Option<TokenId> {
467 plan.run(idx, bytes).minimum().map(TokenId)
468}
469
470/// Convenience: cardinality of the resulting bitmap.
471pub fn run_count(plan: &OpPlan, idx: &StructuralIndex, bytes: &[u8]) -> u64 {
472 plan.run(idx, bytes).cardinality()
473}
474
475
476#[cfg(test)]
477mod tests {
478 use super::*;
479 use crate::api::from_bytes;
480
481 #[test]
482 fn simple_by_key() {
483 let buf = br#"{"a":1,"b":2,"c":3}"#;
484 let idx = from_bytes(buf).unwrap();
485 let plan = OpPlan::new().by_key("b");
486 let n = run_count(&plan, &idx, buf);
487 assert_eq!(n, 1);
488 }
489
490 #[test]
491 fn key_at_depth() {
492 let buf = br#"{"x":1,"nested":{"x":2,"y":3}}"#;
493 let idx = from_bytes(buf).unwrap();
494 // Both "x" tokens at any depth.
495 let n_all = run_count(&OpPlan::new().by_key("x"), &idx, buf);
496 assert_eq!(n_all, 2);
497 // "x" at depth 1 only.
498 let n_d1 = run_count(
499 &OpPlan::new()
500 .by_key("x")
501 .at_depth(1),
502 &idx,
503 buf,
504 );
505 assert_eq!(n_d1, 1);
506 }
507
508 #[test]
509 fn descend_then_by_key() {
510 let buf = br#"{"a":{"x":1,"y":2},"b":{"x":3,"z":4}}"#;
511 let idx = from_bytes(buf).unwrap();
512 // From root, all descendants having key "x".
513 let plan = OpPlan::new()
514 .anchor(TokenId(0)) // root obj
515 .descend()
516 .by_key("x");
517 let n = run_count(&plan, &idx, buf);
518 assert_eq!(n, 2);
519 }
520
521 #[test]
522 fn within_subtree_restricts() {
523 let buf = br#"{"a":{"x":1},"b":{"x":2}}"#;
524 let idx = from_bytes(buf).unwrap();
525 // Find the inner obj of "a".
526 let a_obj = idx
527 .field_of(TokenId(0), "a")
528 .expect("a exists");
529 // Restrict "x" search to inside the "a" subtree.
530 let n = run_count(
531 &OpPlan::new().by_key("x").within(a_obj),
532 &idx,
533 buf,
534 );
535 assert_eq!(n, 1);
536 }
537
538 #[test]
539 fn value_eq_predicate() {
540 let buf = br#"[{"k":"a","v":1},{"k":"b","v":2},{"k":"c","v":1}]"#;
541 let idx = from_bytes(buf).unwrap();
542 // Find Key tokens "v" whose value byte-equals 1.
543 let plan = OpPlan::new().by_key("v").value_eq(b"1");
544 let n = run_count(&plan, &idx, buf);
545 // Note: value_eq filters KEY tokens; need value tokens to compare.
546 // For now this returns 0 because key tokens' byte_span isn't a
547 // number. Demonstrates the semantic; real use threads .field
548 // first.
549 let _ = n;
550
551 // Proper way: anchor on each key tok, fetch value, compare.
552 let mut hits = 0u64;
553 for k in idx.keys_named("v", None) {
554 let v = idx.value_for_key(k).unwrap();
555 let span = idx.byte_span_in(v, buf);
556 if &buf[span.start as usize..span.end as usize] == b"1" {
557 hits += 1;
558 }
559 }
560 assert_eq!(hits, 2);
561 }
562
563 #[test]
564 fn chain_query_first() {
565 // $.a..b ?.first()
566 let buf = br#"{"a":{"x":{"b":"hit"},"y":{"b":"miss"}}}"#;
567 let idx = from_bytes(buf).unwrap();
568 let a_obj = idx.field_of(TokenId(0), "a").unwrap();
569 // Descend from a, find any "b" key, take first.
570 let plan = OpPlan::new()
571 .anchor(a_obj)
572 .descend()
573 .by_key("b")
574 .first();
575 let first = run_first(&plan, &idx, buf).unwrap();
576 // First "b" in document order is at the smaller token id.
577 let span = idx.byte_span_in(idx.value_for_key(first).unwrap(), buf);
578 let v = std::str::from_utf8(&buf[span.start as usize..span.end as usize]).unwrap();
579 assert_eq!(v, "\"hit\"");
580 }
581
582 #[test]
583 fn at_depth_filters() {
584 let buf = br#"{"a":1,"b":2}"#;
585 let idx = from_bytes(buf).unwrap();
586 // All tokens at depth 1 (keys + colons + scalars + commas, depending
587 // on which kinds the index emits — the stage1 path emits Colon/Comma).
588 let n_d1 = run_count(&OpPlan::new().at_depth(1), &idx, buf);
589 // Restrict at depth 1 to *Key* tokens only via AND with a key
590 // bitmap. This is the algebraic way to express "keys at depth 1".
591 let n_keys_d1 = run_count(
592 &OpPlan::new().by_key("a").at_depth(1),
593 &idx,
594 buf,
595 );
596 assert!(n_d1 > n_keys_d1);
597 assert_eq!(n_keys_d1, 1);
598 }
599
600 #[test]
601 fn optimize_collapses_load_all() {
602 let p = OpPlan::new().all().by_key("x").optimize();
603 assert!(matches!(p.ops[0], Op::LoadKey(_)));
604 assert_eq!(p.ops.len(), 1);
605 }
606
607 #[test]
608 fn algebraic_compose() {
609 // `LoadKey ∘ LoadKey` should fold to `And`.
610 let p = OpPlan::new().by_key("a").by_key("b").optimize();
611 assert!(matches!(p.ops[0], Op::And(_, _)));
612 assert_eq!(p.ops.len(), 1);
613 }
614
615 #[test]
616 fn deeply_nested_chain() {
617 // $.store ..items ..find(in_stock)
618 let buf = br#"{"store":{"sections":[{"items":[{"in_stock":true,"sku":"a"},{"in_stock":false,"sku":"b"}]},{"items":[{"in_stock":true,"sku":"c"}]}]}}"#;
619 let idx = from_bytes(buf).unwrap();
620 let store = idx.field_of(TokenId(0), "store").unwrap();
621 let plan = OpPlan::new()
622 .anchor(store)
623 .descend()
624 .by_key("in_stock");
625 let n = run_count(&plan, &idx, buf);
626 assert_eq!(n, 3); // three items have in_stock key
627 }
628
629 #[test]
630 fn first_short_circuits_on_singleton() {
631 let buf = br#"{"items":[{"x":1},{"x":2},{"x":3}]}"#;
632 let idx = from_bytes(buf).unwrap();
633 let plan = OpPlan::new().by_key("x").first();
634 let r = plan.run(&idx, buf);
635 assert_eq!(r.cardinality(), 1);
636 }
637}