Skip to main content

jetro_core/
analysis.rs

1//! Static analysis passes over compiled `Program` IR.
2//!
3//! Forward-flow analyses that produce *abstract domains* for each opcode
4//! position.  Callers (compiler, planner, caller code) can use these to:
5//!
6//! - emit specialised opcodes where a value's type / nullness / cardinality
7//!   is statically known,
8//! - reject ill-typed expressions at compile time,
9//! - enable further peephole passes that require type awareness.
10//!
11//! The analyses run on the flat `Arc<[Opcode]>` IR and are intentionally
12//! conservative — when uncertain, they return the `Unknown` top element of
13//! the lattice.
14
15use std::sync::Arc;
16use std::collections::HashMap;
17
18use super::vm::{Program, Opcode, BuiltinMethod};
19use super::ast::KindType;
20
21// ── Type lattice ──────────────────────────────────────────────────────────────
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
24pub enum VType {
25    /// Bottom: no value (unreachable).
26    Bottom,
27    Null,
28    Bool,
29    Int,
30    Float,
31    /// Numeric — Int or Float (partially refined).
32    Num,
33    Str,
34    Arr,
35    Obj,
36    /// Top: any type possible.
37    Unknown,
38}
39
40impl VType {
41    /// Least upper bound (join) for the lattice.
42    pub fn join(self, other: VType) -> VType {
43        if self == other { return self; }
44        match (self, other) {
45            (VType::Bottom, x) | (x, VType::Bottom) => x,
46            (VType::Int, VType::Float) | (VType::Float, VType::Int)
47                | (VType::Int, VType::Num) | (VType::Num, VType::Int)
48                | (VType::Float, VType::Num) | (VType::Num, VType::Float)
49                => VType::Num,
50            _ => VType::Unknown,
51        }
52    }
53
54    pub fn is_array_like(self) -> bool { matches!(self, VType::Arr) }
55    pub fn is_object_like(self) -> bool { matches!(self, VType::Obj) }
56    pub fn is_numeric(self) -> bool { matches!(self, VType::Int | VType::Float | VType::Num) }
57    pub fn is_string(self) -> bool { matches!(self, VType::Str) }
58}
59
60// ── Null-ness lattice ─────────────────────────────────────────────────────────
61
62#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
63pub enum Nullness {
64    /// Proven null.
65    AlwaysNull,
66    /// Proven non-null.
67    NonNull,
68    /// May or may not be null.
69    MaybeNull,
70}
71
72impl Nullness {
73    pub fn join(self, other: Nullness) -> Nullness {
74        if self == other { return self; }
75        Nullness::MaybeNull
76    }
77}
78
79// ── Cardinality lattice ───────────────────────────────────────────────────────
80
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82pub enum Cardinality {
83    /// Exactly 0 elements (empty).
84    Zero,
85    /// Exactly 1 element (scalar-wrapped or unwrapped).
86    One,
87    /// 0 or 1 (e.g. result of `?` quantifier).
88    ZeroOrOne,
89    /// Multiple elements possible.
90    Many,
91    /// Not an array (scalar domain).
92    NotArray,
93    /// Unknown shape.
94    Unknown,
95}
96
97impl Cardinality {
98    pub fn join(self, other: Cardinality) -> Cardinality {
99        if self == other { return self; }
100        match (self, other) {
101            (Cardinality::Zero, Cardinality::One)
102                | (Cardinality::One, Cardinality::Zero) => Cardinality::ZeroOrOne,
103            _ => Cardinality::Unknown,
104        }
105    }
106}
107
108// ── Abstract value ────────────────────────────────────────────────────────────
109
110#[derive(Debug, Clone, Copy, PartialEq, Eq)]
111pub struct AbstractVal {
112    pub ty:   VType,
113    pub null: Nullness,
114    pub card: Cardinality,
115}
116
117impl AbstractVal {
118    pub const UNKNOWN: Self = Self {
119        ty: VType::Unknown, null: Nullness::MaybeNull, card: Cardinality::Unknown,
120    };
121    pub const NULL: Self = Self {
122        ty: VType::Null, null: Nullness::AlwaysNull, card: Cardinality::NotArray,
123    };
124    pub fn scalar(ty: VType) -> Self {
125        Self { ty, null: Nullness::NonNull, card: Cardinality::NotArray }
126    }
127    pub fn array() -> Self {
128        Self { ty: VType::Arr, null: Nullness::NonNull, card: Cardinality::Many }
129    }
130    pub fn object() -> Self {
131        Self { ty: VType::Obj, null: Nullness::NonNull, card: Cardinality::NotArray }
132    }
133    pub fn join(self, other: AbstractVal) -> AbstractVal {
134        AbstractVal {
135            ty:   self.ty.join(other.ty),
136            null: self.null.join(other.null),
137            card: self.card.join(other.card),
138        }
139    }
140}
141
142// ── Forward type inference ────────────────────────────────────────────────────
143
144/// Walk opcodes of `program` forward, simulating a stack of `AbstractVal`s.
145/// Returns the top-of-stack abstract value at program end (i.e. the result type).
146pub fn infer_result_type(program: &Program) -> AbstractVal {
147    let mut stack: Vec<AbstractVal> = Vec::with_capacity(16);
148    let mut env: HashMap<Arc<str>, AbstractVal> = HashMap::new();
149    for op in program.ops.iter() { apply_op_env(op, &mut stack, &mut env); }
150    stack.pop().unwrap_or(AbstractVal::UNKNOWN)
151}
152
153/// Same as `infer_result_type` but exposes the bindings environment after
154/// the program finishes — useful for debugging the interprocedural flow.
155pub fn infer_result_type_with_env(program: &Program)
156    -> (AbstractVal, HashMap<Arc<str>, AbstractVal>)
157{
158    let mut stack: Vec<AbstractVal> = Vec::with_capacity(16);
159    let mut env: HashMap<Arc<str>, AbstractVal> = HashMap::new();
160    for op in program.ops.iter() { apply_op_env(op, &mut stack, &mut env); }
161    (stack.pop().unwrap_or(AbstractVal::UNKNOWN), env)
162}
163
164fn apply_op_env(op: &Opcode, stack: &mut Vec<AbstractVal>,
165                env: &mut HashMap<Arc<str>, AbstractVal>) {
166    // Handle binding & ident lookup here, delegate scalar cases to apply_op.
167    match op {
168        Opcode::LoadIdent(name) => {
169            let av = env.get(name).copied().unwrap_or(AbstractVal::UNKNOWN);
170            stack.push(av);
171        }
172        Opcode::BindVar(name) => {
173            // TOS preserved; record type.
174            if let Some(top) = stack.last().copied() { env.insert(name.clone(), top); }
175        }
176        Opcode::StoreVar(name) => {
177            let v = stack.pop().unwrap_or(AbstractVal::UNKNOWN);
178            env.insert(name.clone(), v);
179        }
180        Opcode::LetExpr { name, body } => {
181            let init = stack.pop().unwrap_or(AbstractVal::UNKNOWN);
182            let saved = env.get(name).copied();
183            env.insert(name.clone(), init);
184            let mut sub_stack: Vec<AbstractVal> = Vec::with_capacity(8);
185            for op2 in body.ops.iter() { apply_op_env(op2, &mut sub_stack, env); }
186            let res = sub_stack.pop().unwrap_or(AbstractVal::UNKNOWN);
187            // Restore shadowed binding.
188            match saved {
189                Some(v) => { env.insert(name.clone(), v); }
190                None    => { env.remove(name); }
191            }
192            stack.push(res);
193        }
194        _ => apply_op(op, stack),
195    }
196}
197
198fn apply_op(op: &Opcode, stack: &mut Vec<AbstractVal>) {
199    macro_rules! pop2 { () => {{ let r = stack.pop().unwrap_or(AbstractVal::UNKNOWN); let l = stack.pop().unwrap_or(AbstractVal::UNKNOWN); (l, r) }} }
200    macro_rules! pop1 { () => { stack.pop().unwrap_or(AbstractVal::UNKNOWN) } }
201    match op {
202        Opcode::PushNull      => stack.push(AbstractVal::NULL),
203        Opcode::PushBool(_)   => stack.push(AbstractVal::scalar(VType::Bool)),
204        Opcode::PushInt(_)    => stack.push(AbstractVal::scalar(VType::Int)),
205        Opcode::PushFloat(_)  => stack.push(AbstractVal::scalar(VType::Float)),
206        Opcode::PushStr(_)    => stack.push(AbstractVal::scalar(VType::Str)),
207        Opcode::PushRoot | Opcode::PushCurrent | Opcode::LoadIdent(_)
208                              => stack.push(AbstractVal::UNKNOWN),
209        Opcode::GetField(_) | Opcode::OptField(_) | Opcode::FieldChain(_) => {
210            pop1!();
211            stack.push(AbstractVal::UNKNOWN);
212        }
213        Opcode::GetIndex(_) | Opcode::DynIndex(_) => {
214            pop1!();
215            stack.push(AbstractVal::UNKNOWN);
216        }
217        Opcode::GetSlice(_, _) => {
218            pop1!();
219            stack.push(AbstractVal::array());
220        }
221        Opcode::Descendant(_) | Opcode::DescendAll => {
222            pop1!();
223            stack.push(AbstractVal::array());
224        }
225        Opcode::InlineFilter(_) => {
226            pop1!();
227            stack.push(AbstractVal::array());
228        }
229        Opcode::Quantifier(kind) => {
230            pop1!();
231            use super::ast::QuantifierKind;
232            let card = match kind {
233                QuantifierKind::First => Cardinality::ZeroOrOne,
234                QuantifierKind::One   => Cardinality::One,
235            };
236            stack.push(AbstractVal { ty: VType::Unknown, null: Nullness::MaybeNull, card });
237        }
238        Opcode::RootChain(_) => stack.push(AbstractVal::UNKNOWN),
239        Opcode::FilterCount(_) => {
240            pop1!();
241            stack.push(AbstractVal::scalar(VType::Int));
242        }
243        Opcode::FindFirst(_) | Opcode::FindOne(_) => {
244            pop1!();
245            stack.push(AbstractVal::UNKNOWN);
246        }
247        Opcode::FilterMap { .. } | Opcode::FilterFilter { .. }
248            | Opcode::MapMap { .. } | Opcode::MapFilter { .. } => {
249            pop1!();
250            stack.push(AbstractVal::array());
251        }
252        Opcode::MapSum(_) | Opcode::FilterMapSum { .. }
253        | Opcode::MapMin(_) | Opcode::MapMax(_)
254        | Opcode::FilterMapMin { .. } | Opcode::FilterMapMax { .. }
255        | Opcode::MapFieldSum(_) | Opcode::MapFieldMin(_) | Opcode::MapFieldMax(_) => {
256            pop1!();
257            stack.push(AbstractVal::scalar(VType::Num));
258        }
259        Opcode::MapAvg(_) | Opcode::FilterMapAvg { .. } | Opcode::MapFieldAvg(_) => {
260            pop1!();
261            stack.push(AbstractVal::scalar(VType::Float));
262        }
263        Opcode::MapToJsonJoin { .. } => {
264            pop1!();
265            stack.push(AbstractVal::scalar(VType::Str));
266        }
267        Opcode::StrTrimUpper | Opcode::StrTrimLower
268        | Opcode::StrUpperTrim | Opcode::StrLowerTrim
269        | Opcode::StrSplitReverseJoin { .. } => {
270            pop1!();
271            stack.push(AbstractVal::scalar(VType::Str));
272        }
273        Opcode::MapReplaceLit { .. }
274        | Opcode::MapUpperReplaceLit { .. }
275        | Opcode::MapLowerReplaceLit { .. }
276        | Opcode::MapStrConcat { .. } => {
277            pop1!();
278            stack.push(AbstractVal::array());
279        }
280        Opcode::MapSplitCount { .. }
281        | Opcode::MapSplitFirst { .. }
282        | Opcode::MapSplitNth { .. } => {
283            pop1!();
284            stack.push(AbstractVal::array());
285        }
286        Opcode::MapSplitCountSum { .. } => {
287            pop1!();
288            stack.push(AbstractVal::scalar(VType::Int));
289        }
290        Opcode::MapSplitLenSum { .. }
291        | Opcode::MapProject { .. }
292        | Opcode::MapStrSlice { .. }
293        | Opcode::MapFString(_) => {
294            pop1!();
295            stack.push(AbstractVal::array());
296        }
297        Opcode::FilterFieldEqLitCount(_, _)
298        | Opcode::FilterFieldCmpLitCount(_, _, _)
299        | Opcode::FilterFieldCmpFieldCount(_, _, _)
300        | Opcode::FilterFieldsAllEqLitCount(_)
301        | Opcode::FilterFieldsAllCmpLitCount(_)
302        | Opcode::UniqueCount => {
303            pop1!();
304            stack.push(AbstractVal::scalar(VType::Int));
305        }
306        Opcode::TopN { .. } | Opcode::MapFlatten(_) | Opcode::FilterTakeWhile { .. }
307            | Opcode::FilterDropWhile { .. } | Opcode::MapUnique(_)
308            | Opcode::EquiJoin { .. }
309            | Opcode::MapField(_) | Opcode::MapFieldChain(_) | Opcode::MapFieldUnique(_)
310            | Opcode::MapFieldChainUnique(_)
311            | Opcode::FlatMapChain(_)
312            | Opcode::FilterFieldEqLit(_, _) | Opcode::FilterFieldCmpLit(_, _, _)
313            | Opcode::FilterCurrentCmpLit(_, _)
314            | Opcode::FilterStrVecStartsWith(_)
315            | Opcode::FilterStrVecEndsWith(_)
316            | Opcode::FilterStrVecContains(_)
317            | Opcode::MapStrVecUpper
318            | Opcode::MapStrVecLower
319            | Opcode::MapStrVecTrim
320            | Opcode::MapNumVecArith { .. }
321            | Opcode::MapNumVecNeg
322            | Opcode::FilterFieldCmpField(_, _, _)
323            | Opcode::FilterFieldEqLitMapField(_, _, _)
324            | Opcode::FilterFieldCmpLitMapField(_, _, _, _)
325            | Opcode::GroupByField(_)
326            | Opcode::CountByField(_)
327            | Opcode::UniqueByField(_) => {
328            pop1!();
329            stack.push(AbstractVal::array());
330        }
331        Opcode::MapFirst(_) | Opcode::MapLast(_) | Opcode::FilterMapFirst { .. }
332            | Opcode::FilterLast { .. }
333            | Opcode::ArgExtreme { .. } => {
334            pop1!();
335            stack.push(AbstractVal::UNKNOWN);
336        }
337        Opcode::Add | Opcode::Sub | Opcode::Mul | Opcode::Mod => {
338            let (l, r) = pop2!();
339            stack.push(AbstractVal::scalar(l.ty.join(r.ty)));
340        }
341        Opcode::Div => {
342            pop2!();
343            stack.push(AbstractVal::scalar(VType::Float));
344        }
345        Opcode::Eq | Opcode::Neq | Opcode::Lt | Opcode::Lte
346            | Opcode::Gt | Opcode::Gte | Opcode::Fuzzy => {
347            pop2!();
348            stack.push(AbstractVal::scalar(VType::Bool));
349        }
350        Opcode::Not => { pop1!(); stack.push(AbstractVal::scalar(VType::Bool)); }
351        Opcode::Neg => { let v = pop1!(); stack.push(AbstractVal::scalar(v.ty)); }
352        Opcode::AndOp(_) | Opcode::OrOp(_) => {
353            pop1!();
354            stack.push(AbstractVal::scalar(VType::Bool));
355        }
356        Opcode::CoalesceOp(_) => { pop1!(); stack.push(AbstractVal::UNKNOWN); }
357        Opcode::IfElse { .. } => { pop1!(); stack.push(AbstractVal::UNKNOWN); }
358        Opcode::CallMethod(call) | Opcode::CallOptMethod(call) => {
359            pop1!();
360            stack.push(method_result_type(call.method));
361        }
362        Opcode::MakeObj(_) => stack.push(AbstractVal::object()),
363        Opcode::MakeArr(_) => stack.push(AbstractVal::array()),
364        Opcode::FString(_) => stack.push(AbstractVal::scalar(VType::Str)),
365        Opcode::KindCheck { .. } => { pop1!(); stack.push(AbstractVal::scalar(VType::Bool)); }
366        Opcode::SetCurrent => {} // TOS stays as current
367        Opcode::BindVar(_) => {} // TOS preserved
368        Opcode::StoreVar(_) => { pop1!(); }
369        Opcode::BindObjDestructure(_) | Opcode::BindArrDestructure(_) => {} // TOS preserved
370        Opcode::LetExpr { .. } => {
371            pop1!();
372            stack.push(AbstractVal::UNKNOWN);
373        }
374        Opcode::ListComp(_) | Opcode::SetComp(_) => stack.push(AbstractVal::array()),
375        Opcode::DictComp(_) => stack.push(AbstractVal::object()),
376        Opcode::GetPointer(_) => stack.push(AbstractVal::UNKNOWN),
377        Opcode::PatchEval(_) => stack.push(AbstractVal::UNKNOWN),
378        Opcode::CastOp(ty) => {
379            pop1!();
380            use super::ast::CastType;
381            let av = match ty {
382                CastType::Int    => AbstractVal::scalar(VType::Int),
383                CastType::Float  => AbstractVal::scalar(VType::Float),
384                CastType::Number => AbstractVal::scalar(VType::Num),
385                CastType::Str    => AbstractVal::scalar(VType::Str),
386                CastType::Bool   => AbstractVal::scalar(VType::Bool),
387                CastType::Array  => AbstractVal::array(),
388                CastType::Object => AbstractVal::object(),
389                CastType::Null   => AbstractVal::NULL,
390            };
391            stack.push(av);
392        }
393    }
394}
395
396/// Static result-type mapping for builtin methods (conservative).
397pub fn method_result_type(m: BuiltinMethod) -> AbstractVal {
398    use BuiltinMethod::*;
399    match m {
400        // → Int
401        Len | Count | Sum | IndexOf | LastIndexOf => AbstractVal::scalar(VType::Int),
402        // → Bool
403        Any | All | Has | Missing | Includes | StartsWith | EndsWith => AbstractVal::scalar(VType::Bool),
404        // → Str
405        Upper | Lower | Capitalize | TitleCase | Trim | TrimLeft | TrimRight
406            | ToString | ToJson | ToBase64 | FromBase64 | UrlEncode | UrlDecode
407            | HtmlEscape | HtmlUnescape | Repeat | PadLeft | PadRight
408            | Replace | ReplaceAll | StripPrefix | StripSuffix | Indent | Dedent
409            | Join | ToCsv | ToTsv | Type
410            => AbstractVal::scalar(VType::Str),
411        // → Float
412        Avg => AbstractVal::scalar(VType::Float),
413        // → Num (Min/Max depend on input; treat as unknown scalar)
414        Min | Max | ToNumber => AbstractVal::scalar(VType::Num),
415        // → Bool (to_bool)
416        ToBool => AbstractVal::scalar(VType::Bool),
417        // → Arr
418        Keys | Values | Entries | ToPairs | Reverse | Unique | Flatten | Compact
419            | Chars | Lines | Words | Split | Sort | Filter | Map | FlatMap
420            | Enumerate | Pairwise | Window | Chunk | TakeWhile | DropWhile
421            | Accumulate | Zip | ZipLongest | Diff | Intersect | Union
422            | Append | Prepend | Remove | Matches | Scan | Slice
423            => AbstractVal::array(),
424        // → Obj
425        FromPairs | Invert | Pick | Omit | Merge | DeepMerge | Defaults | Rename
426            | TransformKeys | TransformValues | FilterKeys | FilterValues | Pivot
427            | GroupBy | CountBy | IndexBy | Partition | FlattenKeys | UnflattenKeys
428            | SetPath | DelPath | DelPaths | Set | Update
429            => AbstractVal::object(),
430        // → various
431        First | Last | Nth | GetPath => AbstractVal::UNKNOWN,
432        HasPath => AbstractVal::scalar(VType::Bool),
433        FromJson | Or => AbstractVal::UNKNOWN,
434        EquiJoin => AbstractVal::array(),
435        Unknown => AbstractVal::UNKNOWN,
436    }
437}
438
439// ── Alias / use-count analysis ────────────────────────────────────────────────
440
441/// Count `LoadIdent(name)` references across an entire program (including sub-programs).
442pub fn count_ident_uses(program: &Program, name: &str) -> usize {
443    let mut n = 0;
444    count_ident_uses_in_ops(&program.ops, name, &mut n);
445    n
446}
447
448fn count_ident_uses_in_ops(ops: &[Opcode], name: &str, acc: &mut usize) {
449    for op in ops {
450        match op {
451            Opcode::LoadIdent(s) if s.as_ref() == name => *acc += 1,
452            Opcode::AndOp(p) | Opcode::OrOp(p) | Opcode::CoalesceOp(p)
453                | Opcode::InlineFilter(p) | Opcode::FilterCount(p)
454                | Opcode::FindFirst(p) | Opcode::FindOne(p)
455                | Opcode::DynIndex(p)
456                | Opcode::MapSum(p) | Opcode::MapAvg(p)
457                | Opcode::MapMin(p) | Opcode::MapMax(p)
458                | Opcode::MapFlatten(p)
459                | Opcode::MapFirst(p) | Opcode::MapLast(p)
460                | Opcode::FilterLast { pred: p }
461                => count_ident_uses_in_ops(&p.ops, name, acc),
462            Opcode::FilterTakeWhile { pred, stop } => {
463                count_ident_uses_in_ops(&pred.ops, name, acc);
464                count_ident_uses_in_ops(&stop.ops, name, acc);
465            }
466            Opcode::IfElse { then_, else_ } => {
467                count_ident_uses_in_ops(&then_.ops, name, acc);
468                count_ident_uses_in_ops(&else_.ops, name, acc);
469            }
470            Opcode::FilterMap { pred, map }
471                | Opcode::FilterMapSum { pred, map }
472                | Opcode::FilterMapAvg { pred, map }
473                | Opcode::FilterMapFirst { pred, map }
474                | Opcode::FilterMapMin { pred, map }
475                | Opcode::FilterMapMax { pred, map } => {
476                count_ident_uses_in_ops(&pred.ops, name, acc);
477                count_ident_uses_in_ops(&map.ops, name, acc);
478            }
479            Opcode::MapFilter { map, pred } => {
480                count_ident_uses_in_ops(&map.ops, name, acc);
481                count_ident_uses_in_ops(&pred.ops, name, acc);
482            }
483            Opcode::FilterFilter { p1, p2 } => {
484                count_ident_uses_in_ops(&p1.ops, name, acc);
485                count_ident_uses_in_ops(&p2.ops, name, acc);
486            }
487            Opcode::MapMap { f1, f2 } => {
488                count_ident_uses_in_ops(&f1.ops, name, acc);
489                count_ident_uses_in_ops(&f2.ops, name, acc);
490            }
491            Opcode::CallMethod(c) | Opcode::CallOptMethod(c) => {
492                for p in c.sub_progs.iter() { count_ident_uses_in_ops(&p.ops, name, acc); }
493            }
494            Opcode::LetExpr { body, .. } => count_ident_uses_in_ops(&body.ops, name, acc),
495            Opcode::ListComp(spec) | Opcode::SetComp(spec) => {
496                count_ident_uses_in_ops(&spec.expr.ops, name, acc);
497                count_ident_uses_in_ops(&spec.iter.ops, name, acc);
498                if let Some(c) = &spec.cond { count_ident_uses_in_ops(&c.ops, name, acc); }
499            }
500            Opcode::DictComp(spec) => {
501                count_ident_uses_in_ops(&spec.key.ops, name, acc);
502                count_ident_uses_in_ops(&spec.val.ops, name, acc);
503                count_ident_uses_in_ops(&spec.iter.ops, name, acc);
504                if let Some(c) = &spec.cond { count_ident_uses_in_ops(&c.ops, name, acc); }
505            }
506            Opcode::MakeObj(entries) => {
507                use super::vm::CompiledObjEntry;
508                for e in entries.iter() {
509                    match e {
510                        CompiledObjEntry::Short { .. } => {}
511                        CompiledObjEntry::Kv { prog, cond, .. } => {
512                            count_ident_uses_in_ops(&prog.ops, name, acc);
513                            if let Some(c) = cond { count_ident_uses_in_ops(&c.ops, name, acc); }
514                        }
515                        CompiledObjEntry::KvPath { .. } => {}
516                        CompiledObjEntry::Dynamic { key, val } => {
517                            count_ident_uses_in_ops(&key.ops, name, acc);
518                            count_ident_uses_in_ops(&val.ops, name, acc);
519                        }
520                        CompiledObjEntry::Spread(p) => count_ident_uses_in_ops(&p.ops, name, acc),
521                        CompiledObjEntry::SpreadDeep(p) => count_ident_uses_in_ops(&p.ops, name, acc),
522                    }
523                }
524            }
525            Opcode::MakeArr(progs) => {
526                for p in progs.iter() { count_ident_uses_in_ops(&p.ops, name, acc); }
527            }
528            Opcode::FString(parts) => {
529                use super::vm::CompiledFSPart;
530                for p in parts.iter() {
531                    if let CompiledFSPart::Interp { prog, .. } = p {
532                        count_ident_uses_in_ops(&prog.ops, name, acc);
533                    }
534                }
535            }
536            _ => {}
537        }
538    }
539}
540
541// ── Projection set (field access pattern) ─────────────────────────────────────
542
543/// Collect all field names directly accessed from a program.  Used by
544/// projection-pushdown analysis: if an object is later accessed only via these
545/// fields, other fields can be trimmed early.
546pub fn collect_accessed_fields(program: &Program) -> Vec<Arc<str>> {
547    let mut set = Vec::new();
548    collect_fields_in_ops(&program.ops, &mut set);
549    set
550}
551
552fn collect_fields_in_ops(ops: &[Opcode], acc: &mut Vec<Arc<str>>) {
553    for op in ops {
554        match op {
555            Opcode::GetField(k) | Opcode::OptField(k) | Opcode::Descendant(k)
556                | Opcode::MapFieldSum(k) | Opcode::MapFieldAvg(k)
557                | Opcode::MapFieldMin(k) | Opcode::MapFieldMax(k)
558                | Opcode::MapField(k) | Opcode::MapFieldUnique(k)
559                | Opcode::GroupByField(k)
560                | Opcode::CountByField(k)
561                | Opcode::UniqueByField(k)
562                | Opcode::FilterFieldEqLit(k, _) | Opcode::FilterFieldCmpLit(k, _, _)
563                | Opcode::FilterFieldEqLitCount(k, _) | Opcode::FilterFieldCmpLitCount(k, _, _)
564                => {
565                if !acc.iter().any(|a: &Arc<str>| a == k) { acc.push(k.clone()); }
566            }
567            Opcode::FilterFieldCmpField(k1, _, k2)
568                | Opcode::FilterFieldCmpFieldCount(k1, _, k2) => {
569                if !acc.iter().any(|a: &Arc<str>| a == k1) { acc.push(k1.clone()); }
570                if !acc.iter().any(|a: &Arc<str>| a == k2) { acc.push(k2.clone()); }
571            }
572            Opcode::FlatMapChain(ks) => {
573                for k in ks.iter() {
574                    if !acc.iter().any(|a: &Arc<str>| a == k) { acc.push(k.clone()); }
575                }
576            }
577            Opcode::RootChain(chain) => {
578                for k in chain.iter() {
579                    if !acc.iter().any(|a: &Arc<str>| a == k) { acc.push(k.clone()); }
580                }
581            }
582            Opcode::AndOp(p) | Opcode::OrOp(p) | Opcode::CoalesceOp(p)
583                | Opcode::InlineFilter(p) | Opcode::FilterCount(p)
584                | Opcode::FindFirst(p) | Opcode::FindOne(p)
585                | Opcode::DynIndex(p)
586                | Opcode::MapSum(p) | Opcode::MapAvg(p)
587                | Opcode::MapMin(p) | Opcode::MapMax(p)
588                | Opcode::MapFlatten(p)
589                | Opcode::MapFirst(p) | Opcode::MapLast(p)
590                | Opcode::FilterLast { pred: p }
591                => collect_fields_in_ops(&p.ops, acc),
592            Opcode::FilterTakeWhile { pred, stop } => {
593                collect_fields_in_ops(&pred.ops, acc);
594                collect_fields_in_ops(&stop.ops, acc);
595            }
596            Opcode::IfElse { then_, else_ } => {
597                collect_fields_in_ops(&then_.ops, acc);
598                collect_fields_in_ops(&else_.ops, acc);
599            }
600            Opcode::FilterMap { pred, map }
601                | Opcode::FilterMapSum { pred, map }
602                | Opcode::FilterMapAvg { pred, map }
603                | Opcode::FilterMapFirst { pred, map }
604                | Opcode::FilterMapMin { pred, map }
605                | Opcode::FilterMapMax { pred, map } => {
606                collect_fields_in_ops(&pred.ops, acc);
607                collect_fields_in_ops(&map.ops, acc);
608            }
609            Opcode::MapFilter { map, pred } => {
610                collect_fields_in_ops(&map.ops, acc);
611                collect_fields_in_ops(&pred.ops, acc);
612            }
613            Opcode::FilterFilter { p1, p2 } => {
614                collect_fields_in_ops(&p1.ops, acc);
615                collect_fields_in_ops(&p2.ops, acc);
616            }
617            Opcode::MapMap { f1, f2 } => {
618                collect_fields_in_ops(&f1.ops, acc);
619                collect_fields_in_ops(&f2.ops, acc);
620            }
621            Opcode::CallMethod(c) | Opcode::CallOptMethod(c) => {
622                for p in c.sub_progs.iter() { collect_fields_in_ops(&p.ops, acc); }
623            }
624            Opcode::LetExpr { body, .. } => collect_fields_in_ops(&body.ops, acc),
625            _ => {}
626        }
627    }
628}
629
630// ── Structural opcode hashing (for CSE) ───────────────────────────────────────
631
632/// Hash a program's opcode sequence into a stable identifier.  Two programs
633/// with the same opcodes hash to the same value — enables CSE across `let`
634/// initialisers or parallel `->` binds.
635pub fn program_signature(program: &Program) -> u64 {
636    use std::collections::hash_map::DefaultHasher;
637    use std::hash::Hasher;
638    let mut h = DefaultHasher::new();
639    hash_ops(&program.ops, &mut h);
640    h.finish()
641}
642
643fn hash_ops(ops: &[Opcode], h: &mut impl std::hash::Hasher) {
644    use std::hash::Hash;
645    for op in ops {
646        // Discriminant + data for stable signature.
647        std::mem::discriminant(op).hash(h);
648        match op {
649            Opcode::PushInt(n) => n.hash(h),
650            Opcode::PushStr(s) => s.as_bytes().hash(h),
651            Opcode::PushBool(b) => b.hash(h),
652            Opcode::GetField(k) | Opcode::OptField(k) | Opcode::Descendant(k)
653                | Opcode::LoadIdent(k) | Opcode::GetPointer(k)
654                => k.as_bytes().hash(h),
655            Opcode::GetIndex(i) => i.hash(h),
656            Opcode::CallMethod(c) | Opcode::CallOptMethod(c) => {
657                (c.method as u8).hash(h);
658                for p in c.sub_progs.iter() { hash_ops(&p.ops, h); }
659            }
660            Opcode::AndOp(p) | Opcode::OrOp(p) | Opcode::CoalesceOp(p)
661                | Opcode::InlineFilter(p) | Opcode::FilterCount(p)
662                | Opcode::FindFirst(p) | Opcode::FindOne(p)
663                | Opcode::DynIndex(p)
664                | Opcode::MapSum(p) | Opcode::MapAvg(p)
665                | Opcode::MapMin(p) | Opcode::MapMax(p)
666                | Opcode::MapFlatten(p)
667                | Opcode::MapFirst(p) | Opcode::MapLast(p)
668                | Opcode::FilterLast { pred: p }
669                => hash_ops(&p.ops, h),
670            Opcode::FilterTakeWhile { pred, stop } => {
671                hash_ops(&pred.ops, h);
672                hash_ops(&stop.ops, h);
673            }
674            Opcode::IfElse { then_, else_ } => {
675                hash_ops(&then_.ops, h);
676                hash_ops(&else_.ops, h);
677            }
678            Opcode::RootChain(chain) => {
679                for k in chain.iter() { k.as_bytes().hash(h); }
680            }
681            _ => {}
682        }
683    }
684}
685
686// ── Common-subexpression detection ────────────────────────────────────────────
687
688/// Find sub-programs (via `Arc<Program>` pointers inside opcodes) that appear
689/// multiple times across the program tree.  Returns a map of
690/// `signature → count` for analysis / potential reuse.
691pub fn find_common_subexprs(program: &Program) -> HashMap<u64, usize> {
692    let mut map: HashMap<u64, usize> = HashMap::new();
693    walk_subprograms(&program.ops, &mut map);
694    map.retain(|_, &mut n| n >= 2);
695    map
696}
697
698fn walk_subprograms(ops: &[Opcode], map: &mut HashMap<u64, usize>) {
699    for op in ops {
700        let sub_progs: Vec<&Arc<Program>> = match op {
701            Opcode::AndOp(p) | Opcode::OrOp(p) | Opcode::CoalesceOp(p)
702                | Opcode::InlineFilter(p) | Opcode::FilterCount(p)
703                | Opcode::FindFirst(p) | Opcode::FindOne(p)
704                | Opcode::DynIndex(p)
705                | Opcode::MapSum(p) | Opcode::MapAvg(p)
706                | Opcode::MapMin(p) | Opcode::MapMax(p)
707                | Opcode::MapFlatten(p)
708                | Opcode::MapFirst(p) | Opcode::MapLast(p)
709                | Opcode::FilterLast { pred: p }
710                => vec![p],
711            Opcode::FilterTakeWhile { pred, stop } => vec![pred, stop],
712            Opcode::IfElse { then_, else_ } => vec![then_, else_],
713            Opcode::FilterMap { pred, map: m }
714                | Opcode::FilterMapSum { pred, map: m }
715                | Opcode::FilterMapAvg { pred, map: m }
716                | Opcode::FilterMapFirst { pred, map: m }
717                | Opcode::FilterMapMin { pred, map: m }
718                | Opcode::FilterMapMax { pred, map: m } => vec![pred, m],
719            Opcode::MapFilter { map: m, pred } => vec![m, pred],
720            Opcode::FilterFilter { p1, p2 } => vec![p1, p2],
721            Opcode::MapMap { f1, f2 } => vec![f1, f2],
722            Opcode::CallMethod(c) | Opcode::CallOptMethod(c) =>
723                c.sub_progs.iter().collect(),
724            Opcode::LetExpr { body, .. } => vec![body],
725            Opcode::MakeArr(progs) => progs.iter().collect(),
726            Opcode::MakeObj(entries) => {
727                use super::vm::CompiledObjEntry;
728                let mut v = Vec::new();
729                for e in entries.iter() {
730                    match e {
731                        CompiledObjEntry::Short { .. } => {}
732                        CompiledObjEntry::Kv { prog, cond, .. } => {
733                            v.push(prog);
734                            if let Some(c) = cond { v.push(c); }
735                        }
736                        CompiledObjEntry::KvPath { .. } => {}
737                        CompiledObjEntry::Dynamic { key, val } => { v.push(key); v.push(val); }
738                        CompiledObjEntry::Spread(p) => v.push(p),
739                        CompiledObjEntry::SpreadDeep(p) => v.push(p),
740                    }
741                }
742                v
743            }
744            _ => continue,
745        };
746        for p in sub_progs {
747            let sig = program_signature(p);
748            *map.entry(sig).or_insert(0) += 1;
749            walk_subprograms(&p.ops, map);
750        }
751    }
752}
753
754// ── AST-level ident use walker (for dead-let) ────────────────────────────────
755
756/// True if any sub-expression references `name` as a bare identifier.
757/// Walks the AST without compiling.  Shadowing by inner `let` / `lambda` /
758/// comprehension binders is respected: inner bindings with the same name
759/// hide the outer one.
760pub fn expr_uses_ident(expr: &super::ast::Expr, name: &str) -> bool {
761    use super::ast::{Expr, Step, PipeStep, BindTarget, FStringPart, ArrayElem, ObjField, Arg};
762    match expr {
763        Expr::Ident(n) => n == name,
764        Expr::Null | Expr::Bool(_) | Expr::Int(_) | Expr::Float(_)
765            | Expr::Str(_) | Expr::Root | Expr::Current => false,
766        Expr::FString(parts) => parts.iter().any(|p| match p {
767            FStringPart::Lit(_) => false,
768            FStringPart::Interp { expr, .. } => expr_uses_ident(expr, name),
769        }),
770        Expr::Chain(base, steps) => {
771            if expr_uses_ident(base, name) { return true; }
772            steps.iter().any(|s| match s {
773                Step::DynIndex(e) | Step::InlineFilter(e) => expr_uses_ident(e, name),
774                Step::Method(_, args) | Step::OptMethod(_, args) =>
775                    args.iter().any(|a| match a {
776                        Arg::Pos(e) | Arg::Named(_, e) => expr_uses_ident(e, name),
777                    }),
778                _ => false,
779            })
780        }
781        Expr::BinOp(l, _, r) => expr_uses_ident(l, name) || expr_uses_ident(r, name),
782        Expr::UnaryNeg(e) | Expr::Not(e) => expr_uses_ident(e, name),
783        Expr::Kind { expr, .. } => expr_uses_ident(expr, name),
784        Expr::Coalesce(l, r) => expr_uses_ident(l, name) || expr_uses_ident(r, name),
785        Expr::Object(fields) => fields.iter().any(|f| match f {
786            ObjField::Kv { val, cond, .. } =>
787                expr_uses_ident(val, name)
788                || cond.as_ref().map_or(false, |c| expr_uses_ident(c, name)),
789            ObjField::Short(n) => n == name,
790            ObjField::Dynamic { key, val } =>
791                expr_uses_ident(key, name) || expr_uses_ident(val, name),
792            ObjField::Spread(e) => expr_uses_ident(e, name),
793            ObjField::SpreadDeep(e) => expr_uses_ident(e, name),
794        }),
795        Expr::Array(elems) => elems.iter().any(|e| match e {
796            ArrayElem::Expr(e) | ArrayElem::Spread(e) => expr_uses_ident(e, name),
797        }),
798        Expr::Pipeline { base, steps } => {
799            if expr_uses_ident(base, name) { return true; }
800            steps.iter().any(|s| match s {
801                PipeStep::Forward(e) => expr_uses_ident(e, name),
802                PipeStep::Bind(bt) => match bt {
803                    BindTarget::Name(n) => n == name,
804                    BindTarget::Obj { fields, rest } =>
805                        fields.iter().any(|f| f == name)
806                        || rest.as_ref().map_or(false, |r| r == name),
807                    BindTarget::Arr(ns) => ns.iter().any(|n| n == name),
808                },
809            })
810        }
811        Expr::ListComp { expr, vars, iter, cond }
812        | Expr::SetComp  { expr, vars, iter, cond }
813        | Expr::GenComp  { expr, vars, iter, cond } => {
814            if expr_uses_ident(iter, name) { return true; }
815            if vars.iter().any(|v| v == name) { return false; } // shadowed
816            expr_uses_ident(expr, name)
817                || cond.as_ref().map_or(false, |c| expr_uses_ident(c, name))
818        }
819        Expr::DictComp { key, val, vars, iter, cond } => {
820            if expr_uses_ident(iter, name) { return true; }
821            if vars.iter().any(|v| v == name) { return false; }
822            expr_uses_ident(key, name) || expr_uses_ident(val, name)
823                || cond.as_ref().map_or(false, |c| expr_uses_ident(c, name))
824        }
825        Expr::Lambda { params, body } => {
826            if params.iter().any(|p| p == name) { return false; }
827            expr_uses_ident(body, name)
828        }
829        Expr::Let { name: n, init, body } => {
830            if expr_uses_ident(init, name) { return true; }
831            if n == name { return false; } // inner let shadows
832            expr_uses_ident(body, name)
833        }
834        Expr::IfElse { cond, then_, else_ } => {
835            expr_uses_ident(cond, name)
836                || expr_uses_ident(then_, name)
837                || expr_uses_ident(else_, name)
838        }
839        Expr::GlobalCall { args, .. } => args.iter().any(|a| match a {
840            Arg::Pos(e) | Arg::Named(_, e) => expr_uses_ident(e, name),
841        }),
842        Expr::Cast { expr, .. } => expr_uses_ident(expr, name),
843        Expr::Patch { root, ops } => {
844            use super::ast::PathStep;
845            if expr_uses_ident(root, name) { return true; }
846            ops.iter().any(|op| {
847                op.path.iter().any(|s| match s {
848                    PathStep::WildcardFilter(e) => expr_uses_ident(e, name),
849                    _ => false,
850                })
851                || expr_uses_ident(&op.val, name)
852                || op.cond.as_ref().map_or(false, |c| expr_uses_ident(c, name))
853            })
854        }
855        Expr::DeleteMark => false,
856    }
857}
858
859/// True if the expression is pure — no side-effecting global calls or
860/// unknown methods.  Enables dropping unused `let` initialisers safely.
861pub fn expr_is_pure(expr: &super::ast::Expr) -> bool {
862    use super::ast::{Expr, Step, Arg};
863    match expr {
864        Expr::Null | Expr::Bool(_) | Expr::Int(_) | Expr::Float(_)
865            | Expr::Str(_) | Expr::Root | Expr::Current | Expr::Ident(_) => true,
866        Expr::FString(_) => true,
867        Expr::Chain(base, steps) => {
868            if !expr_is_pure(base) { return false; }
869            steps.iter().all(|s| match s {
870                Step::DynIndex(e) | Step::InlineFilter(e) => expr_is_pure(e),
871                Step::Method(_, args) | Step::OptMethod(_, args) =>
872                    args.iter().all(|a| match a {
873                        Arg::Pos(e) | Arg::Named(_, e) => expr_is_pure(e),
874                    }),
875                _ => true,
876            })
877        }
878        Expr::BinOp(l, _, r) | Expr::Coalesce(l, r) =>
879            expr_is_pure(l) && expr_is_pure(r),
880        Expr::UnaryNeg(e) | Expr::Not(e) | Expr::Kind { expr: e, .. } =>
881            expr_is_pure(e),
882        // All Jetro exprs are pure in practice; global calls may throw but no side effects.
883        _ => true,
884    }
885}
886
887// ── CSE: canonicalise identical sub-programs ─────────────────────────────────
888
889/// Walk `program` and replace every `Arc<Program>` inside opcodes with a
890/// canonical `Arc` keyed by `program_signature`.  Structurally-identical
891/// sub-programs end up pointing at the same allocation, reducing memory
892/// and enabling downstream caches to hit on the same key.
893///
894/// Returns a new `Program` with deduplicated sub-programs.
895pub fn dedup_subprograms(program: &Program) -> Arc<Program> {
896    let mut cache: HashMap<u64, Arc<Program>> = HashMap::new();
897    dedup_rec(program, &mut cache)
898}
899
900fn dedup_rec(program: &Program, cache: &mut HashMap<u64, Arc<Program>>) -> Arc<Program> {
901    let sig = program_signature(program);
902    if let Some(a) = cache.get(&sig) { return Arc::clone(a); }
903    let new_ops: Vec<Opcode> = program.ops.iter().map(|op| rewrite_op(op, cache)).collect();
904    let ics = crate::vm::fresh_ics(new_ops.len());
905    let out = Arc::new(Program {
906        ops:           new_ops.into(),
907        source:        program.source.clone(),
908        id:            program.id,
909        is_structural: program.is_structural,
910        ics,
911    });
912    cache.insert(sig, Arc::clone(&out));
913    out
914}
915
916fn rewrite_op(op: &Opcode, cache: &mut HashMap<u64, Arc<Program>>) -> Opcode {
917    use super::vm::{CompiledObjEntry, CompiledFSPart, CompSpec, DictCompSpec};
918    match op {
919        Opcode::AndOp(p)        => Opcode::AndOp(dedup_rec(p, cache)),
920        Opcode::OrOp(p)         => Opcode::OrOp(dedup_rec(p, cache)),
921        Opcode::CoalesceOp(p)   => Opcode::CoalesceOp(dedup_rec(p, cache)),
922        Opcode::IfElse { then_, else_ } => Opcode::IfElse {
923            then_: dedup_rec(then_, cache),
924            else_: dedup_rec(else_, cache),
925        },
926        Opcode::InlineFilter(p) => Opcode::InlineFilter(dedup_rec(p, cache)),
927        Opcode::FilterCount(p)  => Opcode::FilterCount(dedup_rec(p, cache)),
928        Opcode::MapSum(p)       => Opcode::MapSum(dedup_rec(p, cache)),
929        Opcode::MapAvg(p)       => Opcode::MapAvg(dedup_rec(p, cache)),
930        Opcode::MapMin(p)       => Opcode::MapMin(dedup_rec(p, cache)),
931        Opcode::MapMax(p)       => Opcode::MapMax(dedup_rec(p, cache)),
932        Opcode::MapFlatten(p)   => Opcode::MapFlatten(dedup_rec(p, cache)),
933        Opcode::MapFirst(p)     => Opcode::MapFirst(dedup_rec(p, cache)),
934        Opcode::MapLast(p)      => Opcode::MapLast(dedup_rec(p, cache)),
935        Opcode::FilterLast  { pred } => Opcode::FilterLast  { pred: dedup_rec(pred, cache) },
936        Opcode::FilterTakeWhile { pred, stop } => Opcode::FilterTakeWhile {
937            pred: dedup_rec(pred, cache),
938            stop: dedup_rec(stop, cache),
939        },
940        Opcode::FindFirst(p)    => Opcode::FindFirst(dedup_rec(p, cache)),
941        Opcode::FindOne(p)      => Opcode::FindOne(dedup_rec(p, cache)),
942        Opcode::DynIndex(p)     => Opcode::DynIndex(dedup_rec(p, cache)),
943        Opcode::FilterMap { pred, map } => Opcode::FilterMap {
944            pred: dedup_rec(pred, cache),
945            map:  dedup_rec(map,  cache),
946        },
947        Opcode::FilterMapSum { pred, map } => Opcode::FilterMapSum {
948            pred: dedup_rec(pred, cache),
949            map:  dedup_rec(map,  cache),
950        },
951        Opcode::FilterMapAvg { pred, map } => Opcode::FilterMapAvg {
952            pred: dedup_rec(pred, cache),
953            map:  dedup_rec(map,  cache),
954        },
955        Opcode::FilterMapFirst { pred, map } => Opcode::FilterMapFirst {
956            pred: dedup_rec(pred, cache),
957            map:  dedup_rec(map,  cache),
958        },
959        Opcode::FilterMapMin { pred, map } => Opcode::FilterMapMin {
960            pred: dedup_rec(pred, cache),
961            map:  dedup_rec(map,  cache),
962        },
963        Opcode::FilterMapMax { pred, map } => Opcode::FilterMapMax {
964            pred: dedup_rec(pred, cache),
965            map:  dedup_rec(map,  cache),
966        },
967        Opcode::MapFilter { map, pred } => Opcode::MapFilter {
968            map:  dedup_rec(map,  cache),
969            pred: dedup_rec(pred, cache),
970        },
971        Opcode::FilterFilter { p1, p2 } => Opcode::FilterFilter {
972            p1: dedup_rec(p1, cache),
973            p2: dedup_rec(p2, cache),
974        },
975        Opcode::MapMap { f1, f2 } => Opcode::MapMap {
976            f1: dedup_rec(f1, cache),
977            f2: dedup_rec(f2, cache),
978        },
979        Opcode::LetExpr { name, body } => Opcode::LetExpr {
980            name: name.clone(),
981            body: dedup_rec(body, cache),
982        },
983        Opcode::CallMethod(c) => Opcode::CallMethod(rewrite_call(c, cache)),
984        Opcode::CallOptMethod(c) => Opcode::CallOptMethod(rewrite_call(c, cache)),
985        Opcode::MakeArr(progs) => {
986            let new_progs: Vec<Arc<Program>> = progs.iter().map(|p| dedup_rec(p, cache)).collect();
987            Opcode::MakeArr(new_progs.into())
988        }
989        Opcode::MakeObj(entries) => {
990            let new_entries: Vec<CompiledObjEntry> = entries.iter().map(|e| match e {
991                CompiledObjEntry::Short { name, ic } =>
992                    CompiledObjEntry::Short { name: name.clone(), ic: ic.clone() },
993                CompiledObjEntry::Kv { key, prog, optional, cond } => CompiledObjEntry::Kv {
994                    key: key.clone(),
995                    prog: dedup_rec(prog, cache),
996                    optional: *optional,
997                    cond: cond.as_ref().map(|c| dedup_rec(c, cache)),
998                },
999                CompiledObjEntry::KvPath { key, steps, optional, ics } => CompiledObjEntry::KvPath {
1000                    key: key.clone(),
1001                    steps: steps.clone(),
1002                    optional: *optional,
1003                    ics: ics.clone(),
1004                },
1005                CompiledObjEntry::Dynamic { key, val } => CompiledObjEntry::Dynamic {
1006                    key: dedup_rec(key, cache),
1007                    val: dedup_rec(val, cache),
1008                },
1009                CompiledObjEntry::Spread(p) => CompiledObjEntry::Spread(dedup_rec(p, cache)),
1010                CompiledObjEntry::SpreadDeep(p) => CompiledObjEntry::SpreadDeep(dedup_rec(p, cache)),
1011            }).collect();
1012            Opcode::MakeObj(new_entries.into())
1013        }
1014        Opcode::FString(parts) => {
1015            let new_parts: Vec<CompiledFSPart> = parts.iter().map(|p| match p {
1016                CompiledFSPart::Lit(s) => CompiledFSPart::Lit(s.clone()),
1017                CompiledFSPart::Interp { prog, fmt } => CompiledFSPart::Interp {
1018                    prog: dedup_rec(prog, cache),
1019                    fmt:  fmt.clone(),
1020                },
1021            }).collect();
1022            Opcode::FString(new_parts.into())
1023        }
1024        Opcode::ListComp(spec) => {
1025            let new_spec = CompSpec {
1026                expr:  dedup_rec(&spec.expr, cache),
1027                iter:  dedup_rec(&spec.iter, cache),
1028                cond:  spec.cond.as_ref().map(|c| dedup_rec(c, cache)),
1029                vars:  spec.vars.clone(),
1030            };
1031            Opcode::ListComp(Arc::new(new_spec))
1032        }
1033        Opcode::SetComp(spec) => {
1034            let new_spec = CompSpec {
1035                expr:  dedup_rec(&spec.expr, cache),
1036                iter:  dedup_rec(&spec.iter, cache),
1037                cond:  spec.cond.as_ref().map(|c| dedup_rec(c, cache)),
1038                vars:  spec.vars.clone(),
1039            };
1040            Opcode::SetComp(Arc::new(new_spec))
1041        }
1042        Opcode::DictComp(spec) => {
1043            let new_spec = DictCompSpec {
1044                key:   dedup_rec(&spec.key, cache),
1045                val:   dedup_rec(&spec.val, cache),
1046                iter:  dedup_rec(&spec.iter, cache),
1047                cond:  spec.cond.as_ref().map(|c| dedup_rec(c, cache)),
1048                vars:  spec.vars.clone(),
1049            };
1050            Opcode::DictComp(Arc::new(new_spec))
1051        }
1052        _ => op.clone(),
1053    }
1054}
1055
1056fn rewrite_call(c: &Arc<super::vm::CompiledCall>, cache: &mut HashMap<u64, Arc<Program>>) -> Arc<super::vm::CompiledCall> {
1057    use super::vm::CompiledCall;
1058    let new_subs: Vec<Arc<Program>> = c.sub_progs.iter().map(|p| dedup_rec(p, cache)).collect();
1059    Arc::new(CompiledCall {
1060        method:    c.method,
1061        name:      c.name.clone(),
1062        sub_progs: new_subs.into(),
1063        orig_args: c.orig_args.clone(),
1064    })
1065}
1066
1067// ── Cost model ────────────────────────────────────────────────────────────────
1068
1069/// Rough, ordinal cost estimate per opcode.  Not a wall-clock number — only
1070/// useful to compare relative cost (e.g. for AndOp operand reordering).
1071pub fn opcode_cost(op: &Opcode) -> u32 {
1072    match op {
1073        Opcode::PushNull | Opcode::PushBool(_) | Opcode::PushInt(_)
1074            | Opcode::PushFloat(_) | Opcode::PushStr(_)
1075            | Opcode::PushRoot | Opcode::PushCurrent | Opcode::LoadIdent(_) => 1,
1076        Opcode::GetField(_) | Opcode::OptField(_) | Opcode::GetIndex(_)
1077            | Opcode::RootChain(_) | Opcode::FieldChain(_)
1078            | Opcode::GetPointer(_) => 2,
1079        Opcode::GetSlice(..) | Opcode::Descendant(_) => 5,
1080        Opcode::DescendAll => 20,
1081        Opcode::Not | Opcode::Neg | Opcode::SetCurrent
1082            | Opcode::BindVar(_) | Opcode::StoreVar(_)
1083            | Opcode::BindObjDestructure(_) | Opcode::BindArrDestructure(_) => 1,
1084        Opcode::Add | Opcode::Sub | Opcode::Mul | Opcode::Div | Opcode::Mod
1085            | Opcode::Eq | Opcode::Neq | Opcode::Lt | Opcode::Lte
1086            | Opcode::Gt | Opcode::Gte | Opcode::Fuzzy => 2,
1087        Opcode::KindCheck { .. } => 2,
1088        Opcode::AndOp(p) | Opcode::OrOp(p) | Opcode::CoalesceOp(p) => 2 + program_cost(p),
1089        Opcode::IfElse { then_, else_ } => 2 + program_cost(then_) + program_cost(else_),
1090        Opcode::InlineFilter(p) | Opcode::FilterCount(p)
1091            | Opcode::FindFirst(p) | Opcode::FindOne(p)
1092            | Opcode::MapSum(p) | Opcode::MapAvg(p)
1093            | Opcode::MapMin(p) | Opcode::MapMax(p)
1094            | Opcode::MapFlatten(p)
1095            | Opcode::MapFirst(p) | Opcode::MapLast(p)
1096            | Opcode::FilterLast { pred: p }
1097            | Opcode::DynIndex(p) => 10 + program_cost(p),
1098        Opcode::FilterTakeWhile { pred, stop } => 10 + program_cost(pred) + program_cost(stop),
1099        Opcode::FilterDropWhile { pred, drop } => 10 + program_cost(pred) + program_cost(drop),
1100        Opcode::MapUnique(p) => 15 + program_cost(p),
1101        Opcode::EquiJoin { rhs, .. } => 25 + program_cost(rhs),
1102        Opcode::FilterMap { pred, map }
1103            | Opcode::FilterMapSum { pred, map }
1104            | Opcode::FilterMapAvg { pred, map }
1105            | Opcode::FilterMapFirst { pred, map }
1106            | Opcode::FilterMapMin { pred, map }
1107            | Opcode::FilterMapMax { pred, map } => 10 + program_cost(pred) + program_cost(map),
1108        Opcode::MapFilter { map, pred } => 10 + program_cost(map) + program_cost(pred),
1109        Opcode::FilterFilter { p1, p2 } => 10 + program_cost(p1) + program_cost(p2),
1110        Opcode::MapMap { f1, f2 } => 10 + program_cost(f1) + program_cost(f2),
1111        Opcode::TopN { n, .. } => 15 + *n as u32,
1112        Opcode::CallMethod(c) | Opcode::CallOptMethod(c) => {
1113            let base = match c.method {
1114                BuiltinMethod::Filter | BuiltinMethod::Map | BuiltinMethod::FlatMap => 10,
1115                BuiltinMethod::Sort   => 30,
1116                BuiltinMethod::GroupBy | BuiltinMethod::IndexBy => 25,
1117                BuiltinMethod::Len    | BuiltinMethod::Count => 2,
1118                _ => 8,
1119            };
1120            base + c.sub_progs.iter().map(|p| program_cost(p)).sum::<u32>()
1121        }
1122        Opcode::MakeObj(entries) => {
1123            use super::vm::CompiledObjEntry;
1124            entries.iter().map(|e| match e {
1125                CompiledObjEntry::Short { .. } => 2,
1126                CompiledObjEntry::Kv { prog, cond, .. } =>
1127                    2 + program_cost(prog) + cond.as_ref().map_or(0, |c| program_cost(c)),
1128                CompiledObjEntry::KvPath { steps, .. } => 2 + steps.len() as u32,
1129                CompiledObjEntry::Dynamic { key, val } => 3 + program_cost(key) + program_cost(val),
1130                CompiledObjEntry::Spread(p) => 5 + program_cost(p),
1131                CompiledObjEntry::SpreadDeep(p) => 8 + program_cost(p),
1132            }).sum()
1133        }
1134        Opcode::MakeArr(progs) => progs.iter().map(|p| 1 + program_cost(p)).sum(),
1135        Opcode::FString(parts) => {
1136            use super::vm::CompiledFSPart;
1137            parts.iter().map(|p| match p {
1138                CompiledFSPart::Lit(_) => 1,
1139                CompiledFSPart::Interp { prog, .. } => 3 + program_cost(prog),
1140            }).sum()
1141        }
1142        Opcode::ListComp(s) | Opcode::SetComp(s) =>
1143            15 + program_cost(&s.expr) + program_cost(&s.iter)
1144               + s.cond.as_ref().map_or(0, |c| program_cost(c)),
1145        Opcode::DictComp(s) =>
1146            15 + program_cost(&s.key) + program_cost(&s.val) + program_cost(&s.iter)
1147               + s.cond.as_ref().map_or(0, |c| program_cost(c)),
1148        Opcode::LetExpr { body, .. } => 2 + program_cost(body),
1149        Opcode::Quantifier(_) => 2,
1150        Opcode::CastOp(_) => 2,
1151        Opcode::PatchEval(_) => 50,
1152        Opcode::MapFieldSum(_) | Opcode::MapFieldAvg(_)
1153            | Opcode::MapFieldMin(_) | Opcode::MapFieldMax(_)
1154            | Opcode::MapField(_) => 5,
1155        Opcode::MapFieldChain(ks) => 5 + ks.len() as u32 * 2,
1156        Opcode::MapFieldChainUnique(ks) => 8 + ks.len() as u32 * 2,
1157        Opcode::MapFieldUnique(_) => 8,
1158        Opcode::FlatMapChain(ks) => 5 + ks.len() as u32 * 3,
1159        Opcode::FilterFieldEqLit(_, _) | Opcode::FilterFieldCmpLit(_, _, _)
1160            | Opcode::FilterCurrentCmpLit(_, _)
1161            | Opcode::FilterStrVecStartsWith(_)
1162            | Opcode::FilterStrVecEndsWith(_)
1163            | Opcode::FilterStrVecContains(_)
1164            | Opcode::MapStrVecUpper
1165            | Opcode::MapStrVecLower
1166            | Opcode::MapStrVecTrim
1167            | Opcode::MapNumVecArith { .. }
1168            | Opcode::MapNumVecNeg
1169            | Opcode::FilterFieldCmpField(_, _, _) => 5,
1170        Opcode::FilterFieldEqLitMapField(_, _, _)
1171            | Opcode::FilterFieldCmpLitMapField(_, _, _, _) => 6,
1172        Opcode::FilterFieldEqLitCount(_, _) | Opcode::FilterFieldCmpLitCount(_, _, _)
1173            | Opcode::FilterFieldCmpFieldCount(_, _, _) => 4,
1174        Opcode::FilterFieldsAllEqLitCount(pairs) => 4 + pairs.len() as u32 * 2,
1175        Opcode::FilterFieldsAllCmpLitCount(triples) => 4 + triples.len() as u32 * 2,
1176        Opcode::GroupByField(_) => 15,
1177        Opcode::CountByField(_) => 12,
1178        Opcode::UniqueByField(_) => 14,
1179        Opcode::UniqueCount => 6,
1180        Opcode::ArgExtreme { .. } => 12,
1181        Opcode::MapToJsonJoin { .. } => 8,
1182        Opcode::StrTrimUpper | Opcode::StrTrimLower
1183        | Opcode::StrUpperTrim | Opcode::StrLowerTrim => 2,
1184        Opcode::StrSplitReverseJoin { .. } => 4,
1185        Opcode::MapReplaceLit { .. } => 10,
1186        Opcode::MapUpperReplaceLit { .. } | Opcode::MapLowerReplaceLit { .. } => 11,
1187        Opcode::MapStrConcat { .. } => 6,
1188        Opcode::MapSplitLenSum { .. } => 4,
1189        Opcode::MapProject { .. } => 7,
1190        Opcode::MapStrSlice { .. } => 4,
1191        Opcode::MapFString(_) => 8,
1192        Opcode::MapSplitCount { .. } => 3,
1193        Opcode::MapSplitFirst { .. } => 3,
1194        Opcode::MapSplitNth   { .. } => 3,
1195        Opcode::MapSplitCountSum { .. } => 3,
1196    }
1197}
1198
1199/// Total cost of a program (sum of per-op costs).
1200pub fn program_cost(program: &Program) -> u32 {
1201    program.ops.iter().map(opcode_cost).sum()
1202}
1203
1204// ── Monotonicity lattice ─────────────────────────────────────────────────────
1205
1206/// Tracks ordering properties of array-like values through the pipeline.
1207#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1208pub enum Monotonicity {
1209    /// Order unknown.
1210    Unknown,
1211    /// Ascending by natural comparator.
1212    Asc,
1213    /// Descending by natural comparator.
1214    Desc,
1215    /// Not an ordered collection.
1216    NotArray,
1217}
1218
1219impl Monotonicity {
1220    pub fn after(self, op: &Opcode) -> Monotonicity {
1221        match op {
1222            Opcode::CallMethod(c) if c.sub_progs.is_empty() => match c.method {
1223                BuiltinMethod::Sort    => Monotonicity::Asc,
1224                BuiltinMethod::Reverse => match self {
1225                    Monotonicity::Asc  => Monotonicity::Desc,
1226                    Monotonicity::Desc => Monotonicity::Asc,
1227                    x => x,
1228                },
1229                BuiltinMethod::Filter  => self, // order preserved
1230                BuiltinMethod::Map     => Monotonicity::Unknown, // key changes
1231                _ => Monotonicity::Unknown,
1232            }
1233            Opcode::TopN { asc, .. } => if *asc { Monotonicity::Asc } else { Monotonicity::Desc },
1234            Opcode::MakeArr(_) | Opcode::ListComp(_) => Monotonicity::Unknown,
1235            _ => self,
1236        }
1237    }
1238}
1239
1240/// Walk program and determine monotonicity of the final result.
1241pub fn infer_monotonicity(program: &Program) -> Monotonicity {
1242    let mut m = Monotonicity::Unknown;
1243    for op in program.ops.iter() { m = m.after(op); }
1244    m
1245}
1246
1247// ── Escape analysis ───────────────────────────────────────────────────────────
1248
1249/// Simple escape check: does the program's final value contain references to
1250/// the input document (i.e. survive returning)?  If not, the compiler may
1251/// emit value-copying ops rather than Arc-sharing to free the original doc.
1252///
1253/// Returns `false` only when the result is a scalar or newly-constructed
1254/// object/array with no root references.
1255pub fn escapes_doc(program: &Program) -> bool {
1256    for op in program.ops.iter() {
1257        match op {
1258            Opcode::PushRoot | Opcode::PushCurrent | Opcode::RootChain(_)
1259                | Opcode::GetField(_) | Opcode::GetIndex(_) | Opcode::GetSlice(..)
1260                | Opcode::Descendant(_) | Opcode::DescendAll
1261                | Opcode::GetPointer(_) | Opcode::OptField(_) => return true,
1262            Opcode::CallMethod(c) | Opcode::CallOptMethod(c)
1263                if c.sub_progs.iter().any(|p| escapes_doc(p)) => return true,
1264            _ => {}
1265        }
1266    }
1267    false
1268}
1269
1270// ── Selectivity scoring ──────────────────────────────────────────────────────
1271
1272/// Rough selectivity estimate for AST predicates.  Lower score → more
1273/// selective (filters out more rows).  Used to reorder `and` operands so
1274/// cheaper / more-selective predicate runs first (short-circuit friendly).
1275pub fn selectivity_score(expr: &super::ast::Expr) -> u32 {
1276    use super::ast::{Expr, BinOp};
1277    match expr {
1278        Expr::Bool(true) => 1000,        // no filtering
1279        Expr::Bool(false) => 0,          // max filtering
1280        Expr::BinOp(_, BinOp::Eq, _)    => 1,
1281        Expr::BinOp(_, BinOp::Neq, _)   => 5,
1282        Expr::BinOp(_, BinOp::Lt, _) | Expr::BinOp(_, BinOp::Gt, _)
1283            | Expr::BinOp(_, BinOp::Lte, _) | Expr::BinOp(_, BinOp::Gte, _) => 3,
1284        Expr::BinOp(_, BinOp::Fuzzy, _) => 2,
1285        Expr::BinOp(l, BinOp::And, r) =>
1286            selectivity_score(l).min(selectivity_score(r)),
1287        Expr::BinOp(l, BinOp::Or, r)  =>
1288            selectivity_score(l) + selectivity_score(r),
1289        Expr::Not(e) => 10u32.saturating_sub(selectivity_score(e)),
1290        Expr::Kind { .. } => 2,
1291        _ => 5,
1292    }
1293}
1294
1295// ── Kind-check specialisation ────────────────────────────────────────────────
1296
1297/// When a `KindCheck` is applied to a value with a statically-known type,
1298/// the check can be constant-folded to true/false at compile time.
1299pub fn fold_kind_check(val_ty: VType, target: KindType, negate: bool) -> Option<bool> {
1300    let matches = match (val_ty, target) {
1301        (VType::Null, KindType::Null)   => true,
1302        (VType::Bool, KindType::Bool)   => true,
1303        (VType::Int | VType::Float | VType::Num, KindType::Number) => true,
1304        (VType::Str,  KindType::Str)    => true,
1305        (VType::Arr,  KindType::Array)  => true,
1306        (VType::Obj,  KindType::Object) => true,
1307        (VType::Unknown, _)             => return None,
1308        _                               => false,
1309    };
1310    Some(if negate { !matches } else { matches })
1311}