// runmat_ignition/vm.rs

1use crate::functions::{Bytecode, ExecutionContext, UserFunction};
2use crate::gc_roots::InterpretContext;
3use crate::instr::Instr;
4#[cfg(feature = "native-accel")]
5use runmat_accelerate::fusion_exec::{
6    execute_centered_gram, execute_elementwise, execute_explained_variance,
7    execute_image_normalize, execute_matmul_epilogue, execute_power_step_normalize,
8    execute_reduction, FusionExecutionRequest,
9};
10#[cfg(feature = "native-accel")]
11use runmat_accelerate::{
12    activate_fusion_plan, deactivate_fusion_plan, fusion_residency, prepare_fusion_plan,
13    set_current_pc,
14};
15#[cfg(feature = "native-accel")]
16use runmat_accelerate::{
17    active_group_plan_clone, value_is_all_keyword, FusionKind, ReductionAxes, ShapeInfo,
18    ValueOrigin, VarKind,
19};
20use runmat_builtins::{Type, Value};
21use runmat_runtime::{
22    builtins::common::tensor,
23    builtins::stats::random::stochastic_evolution::stochastic_evolution_host,
24    call_builtin, gather_if_needed,
25    workspace::{self as runtime_workspace, WorkspaceResolver},
26};
27use std::cell::{Cell, RefCell};
28use std::collections::{HashMap, HashSet};
29use std::convert::TryInto;
30use std::sync::Once;
31#[cfg(feature = "native-accel")]
32use std::sync::OnceLock;
33
thread_local! {
    // Per-thread program counter of the instruction currently being
    // interpreted; read by `mex` so error messages can report the faulting pc.
    static CURRENT_PC: Cell<usize> = const { Cell::new(0) };
}
37
38#[inline]
39fn set_vm_pc(pc: usize) {
40    CURRENT_PC.with(|cell| cell.set(pc));
41}
42
43#[inline]
44fn current_pc() -> usize {
45    CURRENT_PC.with(|cell| cell.get())
46}
47
// RAII guard for an active fusion plan: while alive a plan is considered
// active; dropping it on any exit path (including early returns and panics)
// deactivates the plan so state cannot leak across interpreter runs.
#[cfg(feature = "native-accel")]
struct FusionPlanGuard;

#[cfg(feature = "native-accel")]
impl Drop for FusionPlanGuard {
    fn drop(&mut self) {
        deactivate_fusion_plan();
    }
}
57
/// Optional wall-clock instrumentation for spans of host-executed interpreter
/// instructions, gated by the `RUNMAT_INTERPRETER_TIMING` env var.
struct InterpreterTiming {
    // Whether timing is collected at all (decided once at construction).
    enabled: bool,
    // Start instant and starting pc of the currently open host span, if any.
    host_span_start: Option<(std::time::Instant, usize)>,
    // Last pc observed while the current span was open.
    host_span_last_pc: Option<usize>,
    // Number of instructions counted within the open span.
    host_span_instrs: u64,
    // Monotonic sequence number for emitted span log records.
    seq: u64,
}
65
66impl InterpreterTiming {
67    fn new() -> Self {
68        let enabled = std::env::var("RUNMAT_INTERPRETER_TIMING")
69            .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes"))
70            .unwrap_or(false);
71        Self {
72            enabled,
73            host_span_start: None,
74            host_span_last_pc: None,
75            host_span_instrs: 0,
76            seq: 0,
77        }
78    }
79
80    fn note_host_instr(&mut self, pc: usize) {
81        if !self.enabled {
82            return;
83        }
84        if self.host_span_start.is_none() {
85            self.host_span_start = Some((std::time::Instant::now(), pc));
86            self.host_span_instrs = 0;
87        }
88        self.host_span_instrs += 1;
89        self.host_span_last_pc = Some(pc);
90    }
91
92    fn flush_host_span(&mut self, reason: &str, detail: Option<&str>) {
93        if !self.enabled {
94            return;
95        }
96        let Some((start, start_pc)) = self.host_span_start.take() else {
97            return;
98        };
99        let duration = start.elapsed();
100        let end_pc = self.host_span_last_pc.unwrap_or(start_pc);
101        let instrs = self.host_span_instrs.max(1);
102        if let Some(extra) = detail {
103            log::debug!(
104                "interpreter_host_span seq={} reason={} detail={} pc_span=[{}..{}] instrs={} duration_ns={}",
105                self.seq,
106                reason,
107                extra,
108                start_pc,
109                end_pc,
110                instrs,
111                duration.as_nanos()
112            );
113        } else {
114            log::debug!(
115                "interpreter_host_span seq={} reason={} pc_span=[{}..{}] instrs={} duration_ns={}",
116                self.seq,
117                reason,
118                start_pc,
119                end_pc,
120                instrs,
121                duration.as_nanos()
122            );
123        }
124        self.seq += 1;
125        self.host_span_last_pc = None;
126        self.host_span_instrs = 0;
127    }
128}
129
impl Drop for InterpreterTiming {
    fn drop(&mut self) {
        // Flush any still-open span on teardown so the trailing instructions
        // are not silently dropped from the timing log.
        self.flush_host_span("drop", None);
    }
}
135
/// Binary operation categories whose operands may be promoted through the
/// accelerate layer before dispatch (see `accel_promote_binary`).
#[derive(Clone, Copy)]
enum AutoBinaryOp {
    Elementwise,
    MatMul,
}

/// Unary operation categories eligible for accelerate-layer promotion
/// (see `accel_promote_unary`).
#[derive(Clone, Copy)]
enum AutoUnaryOp {
    Transpose,
}
146
/// Promote both operands of a binary op through the accelerate layer, mapping
/// the local op tag onto `runmat_accelerate::BinaryOp`.
#[cfg(feature = "native-accel")]
fn accel_promote_binary(op: AutoBinaryOp, a: &Value, b: &Value) -> Result<(Value, Value), String> {
    use runmat_accelerate::{promote_binary, BinaryOp};
    let mapped = match op {
        AutoBinaryOp::Elementwise => BinaryOp::Elementwise,
        AutoBinaryOp::MatMul => BinaryOp::MatMul,
    };
    promote_binary(mapped, a, b).map_err(|e| e.to_string())
}

/// Fallback when acceleration is compiled out: operands pass through as clones.
#[cfg(not(feature = "native-accel"))]
fn accel_promote_binary(_op: AutoBinaryOp, a: &Value, b: &Value) -> Result<(Value, Value), String> {
    Ok((a.clone(), b.clone()))
}

/// Promote a unary op operand through the accelerate layer, mapping the local
/// op tag onto `runmat_accelerate::UnaryOp`.
#[cfg(feature = "native-accel")]
fn accel_promote_unary(op: AutoUnaryOp, value: &Value) -> Result<Value, String> {
    use runmat_accelerate::{promote_unary, UnaryOp};
    let mapped = match op {
        AutoUnaryOp::Transpose => UnaryOp::Transpose,
    };
    promote_unary(mapped, value).map_err(|e| e.to_string())
}

/// Fallback when acceleration is compiled out: value passes through as a clone.
#[cfg(not(feature = "native-accel"))]
fn accel_promote_unary(_op: AutoUnaryOp, value: &Value) -> Result<Value, String> {
    Ok(value.clone())
}

/// Let the accelerate layer rewrite builtin arguments (e.g. residency) before
/// the builtin is invoked.
#[cfg(feature = "native-accel")]
fn accel_prepare_args(name: &str, args: &[Value]) -> Result<Vec<Value>, String> {
    runmat_accelerate::prepare_builtin_args(name, args).map_err(|e| e.to_string())
}

/// Fallback when acceleration is compiled out: arguments are copied verbatim.
#[cfg(not(feature = "native-accel"))]
fn accel_prepare_args(_name: &str, args: &[Value]) -> Result<Vec<Value>, String> {
    Ok(args.to_vec())
}
185
186fn call_builtin_auto(name: &str, args: &[Value]) -> Result<Value, String> {
187    let prepared = accel_prepare_args(name, args)?;
188    runmat_runtime::call_builtin(name, &prepared)
189}
190
/// Whether `RUNMAT_DEBUG_FUSION` requests verbose fusion logging; the env var
/// is consulted once and the answer cached for the life of the process.
#[cfg(feature = "native-accel")]
#[inline]
fn fusion_debug_enabled() -> bool {
    static FLAG: OnceLock<bool> = OnceLock::new();
    *FLAG.get_or_init(|| {
        std::env::var("RUNMAT_DEBUG_FUSION")
            .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes"))
            .unwrap_or(false)
    })
}
200
/// Debug helper: log the instructions surrounding a fusion plan's span
/// (±3 instructions), tagging the current pc and the span boundaries.
///
/// Only active when both `RUNMAT_DEBUG_FUSION` and debug-level logging are
/// enabled; a no-op otherwise.
#[cfg(feature = "native-accel")]
fn log_fusion_span_window(
    plan: &runmat_accelerate::FusionGroupPlan,
    bytecode: &Bytecode,
    pc: usize,
) {
    if !fusion_debug_enabled() || !log::log_enabled!(log::Level::Debug) {
        return;
    }
    if bytecode.instructions.is_empty() {
        return;
    }
    let window = 3usize;
    let span = plan.group.span.clone();
    let total = bytecode.instructions.len();
    let start = span.start.saturating_sub(window);
    // Clamp strictly to the last valid instruction index. The previous code
    // re-expanded `end` back up to `span.end` after clamping, which indexed
    // past `instructions` and panicked whenever a plan span referenced pcs
    // beyond the current bytecode. If `start > end` the range below is simply
    // empty and nothing is logged.
    let end = (span.end + window).min(total - 1);
    let mut ops: Vec<String> = Vec::new();
    for idx in start..=end {
        let instr = &bytecode.instructions[idx];
        // Annotate interesting positions within the window.
        let mut tags: Vec<&'static str> = Vec::new();
        if idx == pc {
            tags.push("pc");
        }
        if idx == span.start {
            tags.push("start");
        }
        if idx == span.end {
            tags.push("end");
        }
        let tag_str = if tags.is_empty() {
            String::new()
        } else {
            format!("<{}>", tags.join(","))
        };
        ops.push(format!("{}{} {:?}", idx, tag_str, instr));
    }
    log::debug!(
        "fusion plan {} span window [{}..{}]: {}",
        plan.index,
        start,
        end,
        ops.join(" | ")
    );
}
252
253// Namespace used for error identifiers (e.g., "MATLAB:..." or "RunMat:...")
254const ERROR_NAMESPACE: &str = "MATLAB";
255
256#[inline]
257fn mex(id: &str, msg: &str) -> String {
258    // Normalize identifier to always use the configured namespace prefix.
259    // If caller passes "Namespace:suffix", strip the namespace and re-prefix with ERROR_NAMESPACE.
260    let suffix = match id.find(':') {
261        Some(pos) => &id[pos + 1..],
262        None => id,
263    };
264    let ident = format!("{ERROR_NAMESPACE}:{suffix}");
265    let pc = current_pc();
266    format!("{ident} (pc={pc}): {msg}")
267}
268
/// Per-dimension index selection produced while resolving a slice subscript.
#[derive(Clone)]
enum SliceSelector {
    // `:` — select every index along the dimension.
    Colon,
    // A single 1-based index.
    Scalar(usize),
    // An explicit list of 1-based indices.
    Indices(Vec<usize>),
}

/// Fully resolved slicing plan: flat 0-based element indices into the base
/// array plus the shape metadata of the selected output.
#[derive(Debug, Clone)]
struct SlicePlan {
    // 0-based linear (column-major) indices of the selected elements.
    indices: Vec<u32>,
    // Shape of the gathered result.
    output_shape: Vec<usize>,
    // Number of selected indices along each subscript dimension.
    selection_lengths: Vec<usize>,
    // Number of subscript dimensions the slice was expressed with.
    dims: usize,
}
283
/// Invoke `f` once per combination of one element drawn from each list in
/// `lists`, with the first dimension varying fastest (column-major order).
///
/// Each callback argument is a slice of `lists.len()` chosen values. If
/// `lists` is empty, or any inner list is empty (an empty selection has no
/// combinations), `f` is never called.
fn cartesian_product<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
    let dims = lists.len();
    // Guard empty selections: without this, `lists[d][idx[d]]` below would
    // panic on an empty inner list. (`cartesian_positions` already has the
    // analogous zero-length guard.)
    if dims == 0 || lists.iter().any(|l| l.is_empty()) {
        return;
    }
    let mut idx = vec![0usize; dims];
    loop {
        let current: Vec<usize> = (0..dims).map(|d| lists[d][idx[d]]).collect();
        f(&current);
        // Odometer increment: bump dimension 0 and carry into higher dims.
        let mut d = 0usize;
        while d < dims {
            idx[d] += 1;
            if idx[d] < lists[d].len() {
                break;
            }
            idx[d] = 0;
            d += 1;
        }
        // Carried off the last dimension: every combination has been visited.
        if d == dims {
            break;
        }
    }
}
307
/// Invoke `f` with every multi-dimensional position (0-based) of a grid with
/// the given per-dimension `lengths`, first dimension varying fastest.
///
/// Does nothing when `lengths` is empty or any dimension has length zero.
fn cartesian_positions<F: FnMut(&[usize])>(lengths: &[usize], mut f: F) {
    if lengths.is_empty() || lengths.contains(&0) {
        return;
    }
    let dims = lengths.len();
    let mut pos = vec![0usize; dims];
    'grid: loop {
        f(&pos);
        // Odometer increment: bump the lowest dimension, carrying upward.
        for d in 0..dims {
            pos[d] += 1;
            if pos[d] < lengths[d] {
                continue 'grid;
            }
            pos[d] = 0;
        }
        // Every dimension carried over: the grid is exhausted.
        break;
    }
}
330
/// Total element count implied by `shape`.
///
/// An empty shape denotes a scalar, whose element count is 1 — which is
/// exactly what `product()` over an empty iterator yields.
fn total_len_from_shape(shape: &[usize]) -> usize {
    shape.iter().product()
}
338
/// Resolve an index `Value` into a list of 1-based linear indices for an
/// array with `total_len` elements.
///
/// Scalars select a single element; numeric tensors are index vectors;
/// logical arrays are element masks. Returns `MATLAB:IndexOutOfBounds` /
/// `MATLAB:IndexShape` / `MATLAB:UnsupportedIndexType` errors via `mex`.
fn indices_from_value_linear(value: &Value, total_len: usize) -> Result<Vec<usize>, String> {
    match value {
        Value::Num(n) => {
            // Double scalar index: truncate toward zero, then bounds-check.
            let idx = *n as isize;
            if idx < 1 || (idx as usize) > total_len {
                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
            }
            Ok(vec![idx as usize])
        }
        Value::Int(int_val) => {
            // Integer scalar index.
            let idx = int_val.to_i64();
            if idx < 1 || (idx as usize) > total_len {
                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
            }
            Ok(vec![idx as usize])
        }
        Value::Tensor(idx_t) => {
            let len = idx_t.shape.iter().product::<usize>();
            if len == total_len {
                // NOTE(review): a numeric tensor whose length equals the
                // target's treats nonzero entries as a mask rather than as an
                // index vector — presumably a heuristic for logical-valued
                // doubles; confirm against the compiler's emitted index ops.
                let mut indices = Vec::new();
                for (i, &val) in idx_t.data.iter().enumerate() {
                    if val != 0.0 {
                        indices.push(i + 1);
                    }
                }
                Ok(indices)
            } else {
                // Otherwise every element is an explicit 1-based index.
                let mut indices = Vec::with_capacity(len);
                for &val in &idx_t.data {
                    let idx = val as isize;
                    if idx < 1 || (idx as usize) > total_len {
                        return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                    }
                    indices.push(idx as usize);
                }
                Ok(indices)
            }
        }
        Value::LogicalArray(la) => {
            // Logical mask: must cover the whole array; true positions win.
            if la.data.len() != total_len {
                return Err(mex(
                    "IndexShape",
                    "Logical mask length mismatch for linear indexing",
                ));
            }
            let mut indices = Vec::new();
            for (i, &b) in la.data.iter().enumerate() {
                if b != 0 {
                    indices.push(i + 1);
                }
            }
            Ok(indices)
        }
        _ => Err(mex(
            "UnsupportedIndexType",
            "Unsupported index type for linear indexing",
        )),
    }
}
398
/// Resolve an index `Value` into a `SliceSelector` for one dimension of
/// length `dim_len` (same resolution rules as `indices_from_value_linear`,
/// but scoped to a single dimension).
fn selector_from_value_dim(value: &Value, dim_len: usize) -> Result<SliceSelector, String> {
    match value {
        Value::Num(n) => {
            // Double scalar index: truncate toward zero, then bounds-check.
            let idx = *n as isize;
            if idx < 1 || (idx as usize) > dim_len {
                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
            }
            Ok(SliceSelector::Scalar(idx as usize))
        }
        Value::Int(int_val) => {
            // Integer scalar index.
            let idx = int_val.to_i64();
            if idx < 1 || (idx as usize) > dim_len {
                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
            }
            Ok(SliceSelector::Scalar(idx as usize))
        }
        Value::Tensor(idx_t) => {
            let len = idx_t.shape.iter().product::<usize>();
            if len == dim_len {
                // NOTE(review): as in `indices_from_value_linear`, a numeric
                // tensor exactly as long as the dimension is treated as a
                // nonzero mask rather than an index vector — confirm this
                // heuristic is intended.
                let mut indices = Vec::new();
                for (i, &val) in idx_t.data.iter().enumerate() {
                    if val != 0.0 {
                        indices.push(i + 1);
                    }
                }
                Ok(SliceSelector::Indices(indices))
            } else {
                // Otherwise every element is an explicit 1-based index.
                let mut indices = Vec::with_capacity(len);
                for &val in &idx_t.data {
                    let idx = val as isize;
                    if idx < 1 || (idx as usize) > dim_len {
                        return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                    }
                    indices.push(idx as usize);
                }
                Ok(SliceSelector::Indices(indices))
            }
        }
        Value::LogicalArray(la) => {
            // Logical mask over the dimension: must match its length exactly.
            if la.data.len() != dim_len {
                return Err(mex(
                    "IndexShape",
                    "Logical mask length mismatch for dimension",
                ));
            }
            let mut indices = Vec::new();
            for (i, &b) in la.data.iter().enumerate() {
                if b != 0 {
                    indices.push(i + 1);
                }
            }
            Ok(SliceSelector::Indices(indices))
        }
        _ => Err(mex(
            "UnsupportedIndexType",
            "Unsupported index type for slicing",
        )),
    }
}
458
/// Build one `SliceSelector` per subscript dimension from compiler-encoded
/// masks and the numeric index operands.
///
/// `colon_mask` and `end_mask` carry one bit per dimension `d` (bit
/// `1 << d`) marking `:` and bare `end` subscripts respectively; every other
/// dimension consumes the next value from `numeric` in order. The 1-D case
/// is linear indexing over the flattened `base_shape`.
fn build_slice_selectors(
    dims: usize,
    colon_mask: u32,
    end_mask: u32,
    numeric: &[Value],
    base_shape: &[usize],
) -> Result<Vec<SliceSelector>, String> {
    let mut selectors = Vec::with_capacity(dims);
    if dims == 1 {
        // Linear indexing: a single subscript addresses the flattened array.
        let total_len = total_len_from_shape(base_shape);
        if (colon_mask & 1u32) != 0 {
            // `A(:)` selects every element.
            selectors.push(SliceSelector::Indices((1..=total_len).collect()));
            return Ok(selectors);
        }
        if (end_mask & 1u32) != 0 {
            // `A(end)` — `.max(1)` keeps the index valid-looking for empty
            // arrays; the bounds check happens when the plan is built.
            selectors.push(SliceSelector::Scalar(total_len.max(1)));
            return Ok(selectors);
        }
        let value = numeric.first().ok_or_else(|| {
            mex(
                "MissingNumericIndex",
                "missing numeric index for linear slice",
            )
        })?;
        let idxs = indices_from_value_linear(value, total_len)?;
        selectors.push(SliceSelector::Indices(idxs));
        return Ok(selectors);
    }

    // N-D: walk the dimensions, consuming numeric operands only for
    // dimensions not claimed by the colon/end masks.
    let mut numeric_iter = 0usize;
    for d in 0..dims {
        let is_colon = (colon_mask & (1u32 << d)) != 0;
        if is_colon {
            selectors.push(SliceSelector::Colon);
            continue;
        }
        // Dimensions beyond the base shape behave as length-1 (trailing
        // singleton) dimensions.
        let dim_len = base_shape.get(d).copied().unwrap_or(1);
        let is_end = (end_mask & (1u32 << d)) != 0;
        if is_end {
            selectors.push(SliceSelector::Scalar(dim_len));
            continue;
        }
        let value = numeric
            .get(numeric_iter)
            .ok_or_else(|| mex("MissingNumericIndex", "missing numeric index for slice"))?;
        numeric_iter += 1;
        selectors.push(selector_from_value_dim(value, dim_len)?);
    }
    Ok(selectors)
}
509
/// Expand per-dimension selectors into a concrete `SlicePlan` of 0-based
/// linear (column-major) indices into the base array.
///
/// 1-D plans index the flattened array directly. N-D plans combine the
/// per-dimension index lists using the base array's column-major strides.
/// 2-D vector selections keep MATLAB-style orientation (column vs. row) and
/// a single selected element collapses to a 1x1 shape.
fn build_slice_plan(
    selectors: &[SliceSelector],
    dims: usize,
    base_shape: &[usize],
) -> Result<SlicePlan, String> {
    let total_len = total_len_from_shape(base_shape);
    if dims == 1 {
        // Linear indexing path: a single selector over the flattened array.
        let list = selectors
            .first()
            .cloned()
            .unwrap_or(SliceSelector::Indices(Vec::new()));
        let indices = match list {
            SliceSelector::Colon => (1..=total_len).collect::<Vec<usize>>(),
            SliceSelector::Scalar(i) => vec![i],
            SliceSelector::Indices(v) => v,
        };
        if indices.iter().any(|&i| i == 0 || i > total_len) {
            return Err(mex("IndexOutOfBounds", "Index out of bounds"));
        }
        let zero_based: Vec<u32> = indices.iter().map(|&i| (i - 1) as u32).collect();
        let count = zero_based.len();
        // Linear gathers come back as a column; scalars (and empty results)
        // are shaped 1x1.
        let shape = if count <= 1 {
            vec![1, 1]
        } else {
            vec![count, 1]
        };
        return Ok(SlicePlan {
            indices: zero_based,
            output_shape: shape,
            selection_lengths: vec![count],
            dims,
        });
    }

    // Materialize per-dimension 1-based index lists and validate bounds.
    let mut selection_lengths = Vec::with_capacity(dims);
    let mut per_dim_lists: Vec<Vec<usize>> = Vec::with_capacity(dims);
    for (d, sel) in selectors.iter().enumerate().take(dims) {
        let dim_len = base_shape.get(d).copied().unwrap_or(1);
        let idxs = match sel {
            SliceSelector::Colon => (1..=dim_len).collect::<Vec<usize>>(),
            SliceSelector::Scalar(i) => vec![*i],
            SliceSelector::Indices(v) => v.clone(),
        };
        if idxs.iter().any(|&i| i == 0 || i > dim_len) {
            return Err(mex("IndexOutOfBounds", "Index out of bounds"));
        }
        selection_lengths.push(idxs.len());
        per_dim_lists.push(idxs);
    }

    // Any empty dimension selects nothing: return an empty plan, preserving
    // 2-D row/column orientation in the reported shape.
    if selection_lengths.contains(&0) {
        let mut out_shape = selection_lengths.clone();
        if dims == 2 {
            if selection_lengths[0] > 1 && selection_lengths[1] == 1 {
                out_shape = vec![selection_lengths[0], 1];
            } else if selection_lengths[0] == 1 && selection_lengths[1] > 1 {
                out_shape = vec![1, selection_lengths[1]];
            }
        }
        return Ok(SlicePlan {
            indices: Vec::new(),
            output_shape: out_shape,
            selection_lengths,
            dims,
        });
    }

    // Pad the base shape with trailing singleton dimensions as needed.
    let mut base_norm = base_shape.to_vec();
    if base_norm.len() < dims {
        base_norm.resize(dims, 1);
    }

    // Column-major strides of the base array.
    let mut strides = vec![1usize; dims];
    for d in 1..dims {
        strides[d] = strides[d - 1] * base_norm[d - 1].max(1);
    }

    // Enumerate all selected positions (first dim fastest) and convert each
    // to a 0-based linear index.
    let mut indices = Vec::new();
    cartesian_product(&per_dim_lists, |multi| {
        let mut lin = 0usize;
        for d in 0..dims {
            let idx = multi[d] - 1;
            lin += idx * strides[d];
        }
        indices.push(lin as u32);
    });

    // Output shape: selection lengths, with 2-D vector orientation kept and
    // single-element results collapsed to 1x1.
    let mut out_shape = selection_lengths.clone();
    if dims == 2 {
        if selection_lengths[0] > 1 && selection_lengths[1] == 1 {
            out_shape = vec![selection_lengths[0], 1];
        } else if selection_lengths[0] == 1 && selection_lengths[1] > 1 {
            out_shape = vec![1, selection_lengths[1]];
        }
    }
    let total_out: usize = selection_lengths.iter().product();
    if total_out == 1 {
        out_shape = vec![1, 1];
    }

    Ok(SlicePlan {
        indices,
        output_shape: out_shape,
        selection_lengths,
        dims,
    })
}
617
618fn gather_string_slice(
619    sa: &runmat_builtins::StringArray,
620    plan: &SlicePlan,
621) -> Result<Value, String> {
622    if plan.indices.is_empty() {
623        let empty = runmat_builtins::StringArray::new(Vec::new(), plan.output_shape.clone())
624            .map_err(|e| format!("Slice error: {e}"))?;
625        return Ok(Value::StringArray(empty));
626    }
627    if plan.indices.len() == 1 {
628        let lin = plan.indices[0] as usize;
629        let value = sa
630            .data
631            .get(lin)
632            .cloned()
633            .ok_or_else(|| "Slice error: string index out of bounds".to_string())?;
634        return Ok(Value::String(value));
635    }
636    let mut out = Vec::with_capacity(plan.indices.len());
637    for &lin in &plan.indices {
638        let idx = lin as usize;
639        let value = sa
640            .data
641            .get(idx)
642            .cloned()
643            .ok_or_else(|| "Slice error: string index out of bounds".to_string())?;
644        out.push(value);
645    }
646    let out_sa = runmat_builtins::StringArray::new(out, plan.output_shape.clone())
647        .map_err(|e| format!("Slice error: {e}"))?;
648    Ok(Value::StringArray(out_sa))
649}
650
/// RHS of a string slice assignment, pre-normalized for per-element lookup.
enum StringAssignView {
    // A single string broadcast to every selected element.
    Scalar(String),
    // Element-wise replacement data with broadcast-aware shape and
    // column-major strides.
    Array {
        data: Vec<String>,
        shape: Vec<usize>,
        strides: Vec<usize>,
    },
}
659
660fn build_string_rhs_view(
661    rhs: &Value,
662    selection_lengths: &[usize],
663) -> Result<StringAssignView, String> {
664    let dims = selection_lengths.len().max(1);
665    match rhs {
666        Value::String(s) => Ok(StringAssignView::Scalar(s.clone())),
667        Value::Num(n) => Ok(StringAssignView::Scalar(n.to_string())),
668        Value::Int(i) => Ok(StringAssignView::Scalar(i.to_i64().to_string())),
669        Value::Tensor(t) => {
670            let mut shape = t.shape.clone();
671            if shape.len() < dims {
672                shape.resize(dims, 1);
673            } else if shape.len() > dims {
674                if shape.iter().skip(dims).any(|&s| s != 1) {
675                    return Err("shape mismatch for slice assign".to_string());
676                }
677                shape.truncate(dims);
678            }
679            for (rhs_len, sel_len) in shape.iter().zip(selection_lengths.iter()) {
680                if !(*rhs_len == 1 || *rhs_len == *sel_len) {
681                    return Err("shape mismatch for slice assign".to_string());
682                }
683            }
684            let mut strides = vec![1usize; dims];
685            for d in 1..dims {
686                strides[d] = strides[d - 1] * shape[d - 1].max(1);
687            }
688            let data = t.data.iter().map(|v| v.to_string()).collect();
689            Ok(StringAssignView::Array {
690                data,
691                shape,
692                strides,
693            })
694        }
695        Value::StringArray(sa) => {
696            let mut shape = sa.shape.clone();
697            if shape.len() < dims {
698                shape.resize(dims, 1);
699            } else if shape.len() > dims {
700                if shape.iter().skip(dims).any(|&s| s != 1) {
701                    return Err("shape mismatch for slice assign".to_string());
702                }
703                shape.truncate(dims);
704            }
705            for (rhs_len, sel_len) in shape.iter().zip(selection_lengths.iter()) {
706                if !(*rhs_len == 1 || *rhs_len == *sel_len) {
707                    return Err("shape mismatch for slice assign".to_string());
708                }
709            }
710            let mut strides = vec![1usize; dims];
711            for d in 1..dims {
712                strides[d] = strides[d - 1] * shape[d - 1].max(1);
713            }
714            Ok(StringAssignView::Array {
715                data: sa.data.clone(),
716                shape,
717                strides,
718            })
719        }
720        _ => Err("rhs must be string or string array".to_string()),
721    }
722}
723
/// Write the values described by `view` into `sa` at the elements selected
/// by `plan`.
///
/// Walks the selection positions in the same column-major order used when
/// the plan's linear indices were generated, keeping `plan.indices` and the
/// generated positions zipped 1:1. RHS dimensions of length 1 broadcast.
fn scatter_string_with_plan(
    sa: &mut runmat_builtins::StringArray,
    plan: &SlicePlan,
    view: &StringAssignView,
) -> Result<(), String> {
    // Nothing selected => nothing to write.
    if plan.indices.is_empty() {
        return Ok(());
    }
    let mut idx_iter = plan.indices.iter();
    cartesian_positions(&plan.selection_lengths, |position| {
        if let Some(&lin) = idx_iter.next() {
            let replacement = match view {
                StringAssignView::Scalar(s) => s.clone(),
                StringAssignView::Array {
                    data,
                    shape,
                    strides,
                } => {
                    // Map the selection position into the RHS, collapsing
                    // broadcast (length-1) dimensions to index 0.
                    let mut rlin = 0usize;
                    for (d, &pos_val) in position.iter().enumerate() {
                        let rhs_len = shape.get(d).copied().unwrap_or(1);
                        let pos = if rhs_len == 1 { 0 } else { pos_val };
                        rlin += pos * strides.get(d).copied().unwrap_or(1);
                    }
                    data.get(rlin).cloned().unwrap_or_default()
                }
            };
            // Destination indices were validated when the plan was built;
            // anything out of range here is silently skipped.
            if let Some(slot) = sa.data.get_mut(lin as usize) {
                *slot = replacement;
            }
        }
    });
    Ok(())
}
758
/// Rewrite `end - k` placeholders among the numeric index operands.
///
/// Each entry of `end_offsets` is (position within `numeric`, offset `k`);
/// the referenced operand is replaced with `dim_len - k` for the dimension
/// that operand binds to. Dimensions claimed by `colon_mask`/`end_mask` bits
/// are skipped when counting numeric operands, mirroring the consumption
/// order in `build_slice_selectors`.
fn apply_end_offsets_to_numeric(
    numeric: &[Value],
    dims: usize,
    colon_mask: u32,
    end_mask: u32,
    end_offsets: &[(usize, i64)],
    base_shape: &[usize],
) -> Vec<Value> {
    let mut adjusted = numeric.to_vec();
    for (position, offset) in end_offsets {
        if let Some(value) = adjusted.get_mut(*position) {
            // Find which dimension this numeric operand addresses: walk the
            // dims, skipping colon/end dims, until the position-th remaining
            // one is reached.
            let mut seen_numeric = 0usize;
            let mut dim_for_pos = 0usize;
            for d in 0..dims {
                let is_colon = (colon_mask & (1u32 << d)) != 0;
                let is_end = (end_mask & (1u32 << d)) != 0;
                if is_colon || is_end {
                    continue;
                }
                if seen_numeric == *position {
                    dim_for_pos = d;
                    break;
                }
                seen_numeric += 1;
            }
            // NOTE(review): if `position` exceeds the number of numeric dims,
            // `dim_for_pos` silently falls back to dimension 0 — presumably
            // unreachable for well-formed bytecode; confirm with the compiler
            // that emits these offsets.
            let dim_len = base_shape.get(dim_for_pos).copied().unwrap_or(1);
            let idx_val = (dim_len as isize) - (*offset as isize);
            *value = Value::Num(idx_val as f64);
        }
    }
    adjusted
}
791
/// Flatten an assignment RHS into exactly `count` f64 values for a linear
/// slice assignment, gathering device-resident values to the host first.
///
/// Scalars (and 1-element tensors/logical arrays) broadcast to `count`;
/// otherwise the RHS element count must match `count` exactly.
fn materialize_rhs_linear(rhs: &Value, count: usize) -> Result<Vec<f64>, String> {
    if count == 0 {
        return Ok(Vec::new());
    }
    // Ensure the value lives on the host before reading its elements.
    let host_rhs = runmat_runtime::gather_if_needed(rhs)?;
    match host_rhs {
        Value::Num(n) => Ok(vec![n; count]),
        Value::Int(int_val) => Ok(vec![int_val.to_f64(); count]),
        Value::Bool(b) => Ok(vec![if b { 1.0 } else { 0.0 }; count]),
        Value::Tensor(t) => {
            if t.data.len() == count {
                Ok(t.data)
            } else if t.data.len() == 1 {
                // Single-element tensor broadcasts like a scalar.
                Ok(vec![t.data[0]; count])
            } else {
                Err("shape mismatch for slice assign".to_string())
            }
        }
        Value::LogicalArray(la) => {
            if la.data.len() == count {
                // Logical values coerce to 0.0 / 1.0.
                let out: Vec<f64> = la
                    .data
                    .into_iter()
                    .map(|b| if b != 0 { 1.0 } else { 0.0 })
                    .collect();
                Ok(out)
            } else if la.data.len() == 1 {
                let val = if la.data[0] != 0 { 1.0 } else { 0.0 };
                Ok(vec![val; count])
            } else {
                Err("shape mismatch for slice assign".to_string())
            }
        }
        other => Err(format!("slice assign: unsupported RHS type {:?}", other)),
    }
}
828
/// Flatten an assignment RHS into one f64 per selected element of an N-D
/// slice, in the same column-major order the scatter will consume.
///
/// The RHS may be a scalar (broadcast everywhere) or a tensor/logical array
/// whose shape is broadcast-compatible with `selection_lengths`: each RHS
/// dimension must be 1 or equal the corresponding selection length, and any
/// extra trailing RHS dimensions must be 1.
fn materialize_rhs_nd(rhs: &Value, selection_lengths: &[usize]) -> Result<Vec<f64>, String> {
    let total: usize = selection_lengths.iter().copied().product();
    if total == 0 {
        return Ok(Vec::new());
    }
    // Ensure the value lives on the host before reading its elements.
    let rhs_host = runmat_runtime::gather_if_needed(rhs)?;
    // Internal broadcast-aware view of the RHS data.
    enum RhsView {
        Scalar(f64),
        Tensor {
            data: Vec<f64>,
            shape: Vec<usize>,
            strides: Vec<usize>,
        },
    }
    let view = match rhs_host {
        Value::Num(n) => RhsView::Scalar(n),
        Value::Int(iv) => RhsView::Scalar(iv.to_f64()),
        Value::Bool(b) => RhsView::Scalar(if b { 1.0 } else { 0.0 }),
        Value::Tensor(t) => {
            // Pad with trailing singleton dims, or trim trailing dims that
            // must all be 1, so the rank matches the selection.
            let mut shape = t.shape.clone();
            if shape.len() < selection_lengths.len() {
                shape.resize(selection_lengths.len(), 1);
            }
            if shape.len() > selection_lengths.len() {
                if shape.iter().skip(selection_lengths.len()).any(|&s| s != 1) {
                    return Err("shape mismatch for slice assign".to_string());
                }
                shape.truncate(selection_lengths.len());
            }
            // Each dim must broadcast (1) or match the selection exactly.
            for (dim_len, &sel_len) in shape.iter().zip(selection_lengths.iter()) {
                if *dim_len != 1 && *dim_len != sel_len {
                    return Err("shape mismatch for slice assign".to_string());
                }
            }
            // Column-major strides over the normalized shape.
            let mut strides = vec![1usize; selection_lengths.len()];
            for d in 1..selection_lengths.len() {
                strides[d] = strides[d - 1] * shape[d - 1].max(1);
            }
            // Sanity-check that the data length matches the claimed shape.
            if t.data.len()
                != shape
                    .iter()
                    .copied()
                    .fold(1usize, |acc, len| acc.saturating_mul(len.max(1)))
            {
                return Err("shape mismatch for slice assign".to_string());
            }
            RhsView::Tensor {
                data: t.data,
                shape,
                strides,
            }
        }
        Value::LogicalArray(la) => {
            // Same normalization as the tensor arm, with logical -> 0/1
            // coercion at the end.
            if la.shape.len() > selection_lengths.len()
                && la
                    .shape
                    .iter()
                    .skip(selection_lengths.len())
                    .any(|&s| s != 1)
            {
                return Err("shape mismatch for slice assign".to_string());
            }
            let mut shape = la.shape.clone();
            if shape.len() < selection_lengths.len() {
                shape.resize(selection_lengths.len(), 1);
            } else {
                shape.truncate(selection_lengths.len());
            }
            for (dim_len, &sel_len) in shape.iter().zip(selection_lengths.iter()) {
                if *dim_len != 1 && *dim_len != sel_len {
                    return Err("shape mismatch for slice assign".to_string());
                }
            }
            let mut strides = vec![1usize; selection_lengths.len()];
            for d in 1..selection_lengths.len() {
                strides[d] = strides[d - 1] * shape[d - 1].max(1);
            }
            if la.data.len()
                != shape
                    .iter()
                    .copied()
                    .fold(1usize, |acc, len| acc.saturating_mul(len.max(1)))
            {
                return Err("shape mismatch for slice assign".to_string());
            }
            let data: Vec<f64> = la
                .data
                .into_iter()
                .map(|b| if b != 0 { 1.0 } else { 0.0 })
                .collect();
            RhsView::Tensor {
                data,
                shape,
                strides,
            }
        }
        other => return Err(format!("slice assign: unsupported RHS type {:?}", other)),
    };

    // Emit one value per selection position, collapsing broadcast (length-1)
    // RHS dimensions to index 0.
    let mut out = Vec::with_capacity(total);
    cartesian_positions(selection_lengths, |positions| match &view {
        RhsView::Scalar(val) => out.push(*val),
        RhsView::Tensor {
            data,
            shape,
            strides,
        } => {
            let mut rlin = 0usize;
            for d in 0..positions.len() {
                let rhs_len = shape[d];
                let pos = if rhs_len == 1 { 0 } else { positions[d] };
                rlin += pos * strides[d];
            }
            let value = data.get(rlin).copied().unwrap_or(0.0);
            out.push(value);
        }
    });
    Ok(out)
}
948
// Per-thread table of `global` variables. Keys are either the synthesized
// slot form `var_<idx>` (written through on `StoreVar`) or a user-visible
// alias name recorded in `global_aliases`.
thread_local! {
    static GLOBALS: RefCell<HashMap<String, Value>> = RefCell::new(HashMap::new());
}

// Per-thread `persistent` storage keyed by (function name, local slot index);
// written through on `StoreLocal` when the slot is a declared persistent.
thread_local! {
    static PERSISTENTS: RefCell<HashMap<(String, usize), Value>> = RefCell::new(HashMap::new());
}

// Secondary `persistent` table keyed by (function name, variable name).
// May duplicate entries in `PERSISTENTS`; harmless when both are rooted.
thread_local! {
    static PERSISTENTS_BY_NAME: RefCell<HashMap<(String, String), Value>> = RefCell::new(HashMap::new());
}
960
/// Live view of the interpreter's variable table, exposed to runtime
/// workspace callbacks (`workspace_lookup` / `workspace_snapshot`).
///
/// `data_ptr`/`len` form a raw, non-owning view over the VM's `vars` vector.
/// They MUST be refreshed via `refresh_workspace_state` whenever that vector
/// may have reallocated, otherwise the pointer dangles.
struct WorkspaceState {
    names: HashMap<String, usize>, // variable name -> slot index into `vars`
    assigned: HashSet<String>,     // names that have actually been assigned a value
    data_ptr: *const Value,        // raw pointer to the live `vars` buffer
    len: usize,                    // element count of that buffer
}

/// (name -> slot index, set of assigned names) pair handed between executions.
type WorkspaceSnapshot = (HashMap<String, usize>, HashSet<String>);
969
thread_local! {
    // Raw view over the active execution's variable table; Some only while
    // a program installed it via `set_workspace_state`.
    static WORKSPACE_STATE: RefCell<Option<WorkspaceState>> = const { RefCell::new(None) };
    // Snapshot queued by `push_pending_workspace` for the next execution.
    static PENDING_WORKSPACE: RefCell<Option<WorkspaceSnapshot>> = const { RefCell::new(None) };
    // Name/assignment maps left behind by the most recently finished execution.
    static LAST_WORKSPACE_STATE: RefCell<Option<WorkspaceSnapshot>> = const { RefCell::new(None) };
}
975
976struct WorkspaceStateGuard;
977
978impl Drop for WorkspaceStateGuard {
979    fn drop(&mut self) {
980        WORKSPACE_STATE.with(|state| {
981            let mut state_mut = state.borrow_mut();
982            if let Some(ws) = state_mut.take() {
983                LAST_WORKSPACE_STATE.with(|slot| {
984                    *slot.borrow_mut() = Some((ws.names, ws.assigned));
985                });
986            }
987        });
988    }
989}
990
991fn set_workspace_state(
992    names: HashMap<String, usize>,
993    assigned: HashSet<String>,
994    vars: &[Value],
995) -> WorkspaceStateGuard {
996    WORKSPACE_STATE.with(|state| {
997        *state.borrow_mut() = Some(WorkspaceState {
998            names,
999            assigned,
1000            data_ptr: vars.as_ptr(),
1001            len: vars.len(),
1002        });
1003    });
1004    WorkspaceStateGuard
1005}
1006
1007fn refresh_workspace_state(vars: &[Value]) {
1008    WORKSPACE_STATE.with(|state| {
1009        if let Some(ws) = state.borrow_mut().as_mut() {
1010            ws.data_ptr = vars.as_ptr();
1011            ws.len = vars.len();
1012        }
1013    });
1014}
1015
/// Resolve a workspace variable by name for runtime builtins.
///
/// Returns `None` when no execution is active, the name is unknown, the
/// variable was never assigned, or its slot index is out of bounds.
fn workspace_lookup(name: &str) -> Option<Value> {
    WORKSPACE_STATE.with(|state| {
        let state_ref = state.borrow();
        let ws = state_ref.as_ref()?;
        let idx = ws.names.get(name)?;
        // A name can be registered without ever being assigned; treat it
        // as undefined rather than exposing the placeholder slot value.
        if !ws.assigned.contains(name) {
            return None;
        }
        if *idx >= ws.len {
            return None;
        }
        // SAFETY: `*idx < ws.len` was checked above, and `data_ptr`/`len`
        // are kept in sync with the live `vars` buffer by
        // `set_workspace_state`/`refresh_workspace_state`; the guard clears
        // the state before that buffer goes away.
        unsafe {
            let ptr = ws.data_ptr.add(*idx);
            Some((*ptr).clone())
        }
    })
}
1033
1034fn workspace_snapshot() -> Vec<(String, Value)> {
1035    WORKSPACE_STATE.with(|state| {
1036        if let Some(ws) = state.borrow().as_ref() {
1037            let mut entries: Vec<(String, Value)> = ws
1038                .names
1039                .iter()
1040                .filter_map(|(name, idx)| {
1041                    if *idx >= ws.len {
1042                        return None;
1043                    }
1044                    if !ws.assigned.contains(name) {
1045                        return None;
1046                    }
1047                    unsafe {
1048                        let ptr = ws.data_ptr.add(*idx);
1049                        Some((name.clone(), (*ptr).clone()))
1050                    }
1051                })
1052                .collect();
1053            entries.sort_by(|a, b| a.0.cmp(&b.0));
1054            entries
1055        } else {
1056            Vec::new()
1057        }
1058    })
1059}
1060
1061fn workspace_global_names() -> Vec<String> {
1062    let mut names = Vec::new();
1063    GLOBALS.with(|globals| {
1064        let map = globals.borrow();
1065        for key in map.keys() {
1066            if !key.starts_with("var_") {
1067                names.push(key.clone());
1068            }
1069        }
1070    });
1071    names.sort();
1072    names
1073}
1074
1075fn set_workspace_variable(name: &str, value: Value, vars: &mut Vec<Value>) -> Result<(), String> {
1076    let mut result = Ok(());
1077    WORKSPACE_STATE.with(|state| {
1078        let mut state_mut = state.borrow_mut();
1079        match state_mut.as_mut() {
1080            Some(ws) => {
1081                let idx = if let Some(idx) = ws.names.get(name).copied() {
1082                    idx
1083                } else {
1084                    let idx = vars.len();
1085                    ws.names.insert(name.to_string(), idx);
1086                    idx
1087                };
1088                if idx >= vars.len() {
1089                    vars.resize(idx + 1, Value::Num(0.0));
1090                }
1091                vars[idx] = value;
1092                ws.data_ptr = vars.as_ptr();
1093                ws.len = vars.len();
1094                ws.assigned.insert(name.to_string());
1095            }
1096            None => {
1097                result = Err("load: workspace state unavailable".to_string());
1098            }
1099        }
1100    });
1101    result
1102}
1103
1104fn assign_loaded_variables(
1105    vars: &mut Vec<Value>,
1106    entries: &[(String, Value)],
1107) -> Result<(), String> {
1108    for (name, value) in entries {
1109        set_workspace_variable(name, value.clone(), vars)?;
1110    }
1111    refresh_workspace_state(vars);
1112    Ok(())
1113}
1114
1115fn ensure_workspace_resolver_registered() {
1116    static REGISTER: Once = Once::new();
1117    REGISTER.call_once(|| {
1118        runtime_workspace::register_workspace_resolver(WorkspaceResolver {
1119            lookup: workspace_lookup,
1120            snapshot: workspace_snapshot,
1121            globals: workspace_global_names,
1122        });
1123    });
1124}
1125
1126pub struct PendingWorkspaceGuard;
1127
1128impl Drop for PendingWorkspaceGuard {
1129    fn drop(&mut self) {
1130        PENDING_WORKSPACE.with(|slot| {
1131            slot.borrow_mut().take();
1132        });
1133    }
1134}
1135
1136pub fn push_pending_workspace(
1137    names: HashMap<String, usize>,
1138    assigned: HashSet<String>,
1139) -> PendingWorkspaceGuard {
1140    PENDING_WORKSPACE.with(|slot| {
1141        *slot.borrow_mut() = Some((names, assigned));
1142    });
1143    PendingWorkspaceGuard
1144}
1145
1146pub fn take_updated_workspace_state() -> Option<(HashMap<String, usize>, HashSet<String>)> {
1147    LAST_WORKSPACE_STATE.with(|slot| slot.borrow_mut().take())
1148}
1149
thread_local! {
    // Stack of (nargin, nargout) pairs, one per in-flight call frame on
    // this thread; the top entry describes the current call.
    static CALL_COUNTS: RefCell<Vec<(usize, usize)>> = const { RefCell::new(Vec::new()) };
}
1154
// Shared implementation for relational binary operators. Pops `b` then `a`
// from `$stack` and pushes `a $op b` as a numeric 0.0/1.0. If either operand
// is a class object, dispatch to its overloaded comparison method `$name`
// via the `call_method` builtin first (using the reflected method name when
// the object is on the right), falling back to scalar numeric comparison if
// that dispatch fails.
macro_rules! handle_rel_binary { ($op:tt, $name:literal, $stack:ident) => {{
    let b = $stack.pop().ok_or(mex("StackUnderflow","stack underflow"))?; let a = $stack.pop().ok_or(mex("StackUnderflow","stack underflow"))?;
    match (&a, &b) {
        // Left operand is an object: try `a.$name(b)`.
        (Value::Object(obj), _) => { let args = vec![Value::Object(obj.clone()), Value::String($name.to_string()), b.clone()]; match call_builtin("call_method", &args) { Ok(v) => $stack.push(v), Err(_) => { let aa: f64 = (&a).try_into()?; let bb: f64 = (&b).try_into()?; $stack.push(Value::Num(if aa $op bb {1.0}else{0.0})) } } }
        // Right operand is an object: swap to the reflected comparison
        // (e.g. `a < b` becomes `b > a`) so the object receives the call.
        (_, Value::Object(obj)) => { let rev = match $name { "lt" => "gt", "le" => "ge", "gt" => "lt", "ge" => "le", other => other };
            let args = vec![Value::Object(obj.clone()), Value::String(rev.to_string()), a.clone()]; match call_builtin("call_method", &args) { Ok(v) => $stack.push(v), Err(_) => { let aa: f64 = (&a).try_into()?; let bb: f64 = (&b).try_into()?; $stack.push(Value::Num(if aa $op bb {1.0}else{0.0})) } } }
        // Plain numeric comparison path.
        _ => { let bb: f64 = (&b).try_into()?; let aa: f64 = (&a).try_into()?; $stack.push(Value::Num(if aa $op bb {1.0}else{0.0})) }
    }
}}; }
1164pub fn interpret_with_vars(
1165    bytecode: &Bytecode,
1166    initial_vars: &mut [Value],
1167    current_function_name: Option<&str>,
1168) -> Result<Vec<Value>, String> {
1169    ensure_workspace_resolver_registered();
1170    #[cfg(feature = "native-accel")]
1171    let fusion_plan = prepare_fusion_plan(bytecode.accel_graph.as_ref(), &bytecode.fusion_groups);
1172    #[cfg(feature = "native-accel")]
1173    activate_fusion_plan(fusion_plan.clone());
1174    #[cfg(feature = "native-accel")]
1175    let _fusion_guard = FusionPlanGuard;
1176    let mut stack: Vec<Value> = Vec::new();
1177    let mut vars = initial_vars.to_vec();
1178    if vars.len() < bytecode.var_count {
1179        vars.resize(bytecode.var_count, Value::Num(0.0));
1180    }
1181    let pending_state = PENDING_WORKSPACE.with(|slot| slot.borrow_mut().take());
1182    let _workspace_guard = pending_state.map(|(names, assigned)| {
1183        let filtered_assigned: HashSet<String> = assigned
1184            .into_iter()
1185            .filter(|name| names.contains_key(name))
1186            .collect();
1187        set_workspace_state(names, filtered_assigned, &vars)
1188    });
1189    refresh_workspace_state(&vars);
1190    let mut pc: usize = 0;
1191    let mut context = ExecutionContext {
1192        call_stack: Vec::new(),
1193        locals: Vec::new(),
1194        instruction_pointer: 0,
1195        functions: bytecode.functions.clone(),
1196    };
1197    let mut _gc_context = InterpretContext::new(&stack, &vars)?;
1198    // Register thread-local globals/persistents as GC roots for the duration of this execution
1199    let mut thread_roots: Vec<Value> = Vec::new();
1200    GLOBALS.with(|g| {
1201        for v in g.borrow().values() {
1202            thread_roots.push(v.clone());
1203        }
1204    });
1205    PERSISTENTS.with(|p| {
1206        for v in p.borrow().values() {
1207            thread_roots.push(v.clone());
1208        }
1209    });
1210    // Name-based table may duplicate persistents; harmless if included
1211    PERSISTENTS_BY_NAME.with(|p| {
1212        for v in p.borrow().values() {
1213            thread_roots.push(v.clone());
1214        }
1215    });
1216    let _ = _gc_context.register_global_values(thread_roots, "thread_globals_persistents");
1217    let current_func_name_str: String = current_function_name
1218        .map(|s| s.to_string())
1219        .unwrap_or_else(|| "<main>".to_string());
1220    // Track per-execution alias maps for globals/persistents
1221    let mut global_aliases: HashMap<usize, String> = HashMap::new();
1222    let mut persistent_aliases: HashMap<usize, String> = HashMap::new();
1223    // Stack of (catch_pc, catch_var_global_index)
1224    let mut try_stack: Vec<(usize, Option<usize>)> = Vec::new();
1225    // Track last caught exception for possible rethrow handling
1226    let mut last_exception: Option<runmat_builtins::MException> = None;
1227    // Runtime import registry for this execution
1228    let mut imports: Vec<(Vec<String>, bool)> = Vec::new();
1229    // Helper to resolve unqualified static accesses if Class.* is imported
1230    let _resolve_static =
1231        |imports: &Vec<(Vec<String>, bool)>, name: &str| -> Option<(String, String)> {
1232            // Return (class_name, member) for unqualified 'member' where Class.* imported
1233            for (path, wildcard) in imports {
1234                if !*wildcard {
1235                    continue;
1236                }
1237                if path.len() == 1 {
1238                    // Class.* style
1239                    let class_name = path[0].clone();
1240                    // We cannot know member names here; VM paths for LoadMember/CallMethod will enforce static
1241                    return Some((class_name, name.to_string()));
1242                }
1243            }
1244            None
1245        };
1246    #[inline]
1247    fn bench_start() -> Option<std::time::Instant> {
1248        None
1249    }
1250    #[inline]
1251    fn bench_end(_label: &str, _start: Option<std::time::Instant>) {}
1252    let debug_stack = std::env::var("RUNMAT_DEBUG_STACK")
1253        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
1254        .unwrap_or(false);
1255    let mut interpreter_timing = InterpreterTiming::new();
1256    macro_rules! vm_bail {
1257        ($err:expr) => {{
1258            let e: String = $err.to_string();
1259            if let Some((catch_pc, catch_var)) = try_stack.pop() {
1260                if let Some(var_idx) = catch_var {
1261                    if var_idx >= vars.len() {
1262                        vars.resize(var_idx + 1, Value::Num(0.0));
1263                        refresh_workspace_state(&vars);
1264                    }
1265                    let mex = parse_exception(&e);
1266                    last_exception = Some(mex.clone());
1267                    vars[var_idx] = Value::MException(mex);
1268                }
1269                pc = catch_pc;
1270                continue;
1271            } else {
1272                return Err(e);
1273            }
1274        }};
1275    }
1276    while pc < bytecode.instructions.len() {
1277        set_vm_pc(pc);
1278        #[cfg(feature = "native-accel")]
1279        set_current_pc(pc);
1280        #[cfg(feature = "native-accel")]
1281        if let (Some(plan), Some(graph)) =
1282            (active_group_plan_clone(), bytecode.accel_graph.as_ref())
1283        {
1284            if plan.group.span.start == pc {
1285                #[cfg(feature = "native-accel")]
1286                {
1287                    let detail = format!(
1288                        "plan={} kind={:?} span=[{}..{}]",
1289                        plan.index, plan.group.kind, plan.group.span.start, plan.group.span.end
1290                    );
1291                    interpreter_timing.flush_host_span("before_fusion", Some(detail.as_str()));
1292                }
1293                #[cfg(feature = "native-accel")]
1294                log_fusion_span_window(&plan, bytecode, pc);
1295                match try_execute_fusion_group(&plan, graph, &mut stack, &mut vars, &context) {
1296                    Ok(result) => {
1297                        stack.push(result);
1298                        pc = plan.group.span.end + 1;
1299                        continue;
1300                    }
1301                    Err(err) => {
1302                        log::debug!("fusion fallback at pc {}: {}", pc, err);
1303                    }
1304                }
1305            }
1306        }
1307        interpreter_timing.note_host_instr(pc);
1308        if debug_stack {
1309            eprintln!(
1310                "Instr pc={} {:?} stack_len={}",
1311                pc,
1312                &bytecode.instructions[pc],
1313                stack.len()
1314            );
1315        }
1316        match bytecode.instructions[pc].clone() {
1317            Instr::AndAnd(target) => {
1318                let lhs: f64 = (&stack
1319                    .pop()
1320                    .ok_or(mex("StackUnderflow", "stack underflow"))?)
1321                    .try_into()?;
1322                if lhs == 0.0 {
1323                    pc = target;
1324                    continue;
1325                }
1326            }
1327            Instr::OrOr(target) => {
1328                let lhs: f64 = (&stack
1329                    .pop()
1330                    .ok_or(mex("StackUnderflow", "stack underflow"))?)
1331                    .try_into()?;
1332                if lhs != 0.0 {
1333                    pc = target;
1334                    continue;
1335                }
1336            }
1337            Instr::Swap => {
1338                let a = stack
1339                    .pop()
1340                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
1341                let b = stack
1342                    .pop()
1343                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
1344                stack.push(a);
1345                stack.push(b);
1346            }
1347            Instr::CallFeval(argc) => {
1348                // Pop explicit args
1349                let mut args = Vec::with_capacity(argc);
1350                for _ in 0..argc {
1351                    args.push(
1352                        stack
1353                            .pop()
1354                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
1355                    );
1356                }
1357                args.reverse();
1358                // Pop function value
1359                let func_val = stack
1360                    .pop()
1361                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
1362                match func_val {
1363                    Value::Closure(c) => {
1364                        // User-defined function via closure: prepend captures then dispatch like CallFunction
1365                        let name = c.function_name;
1366                        let mut call_args = c.captures.clone();
1367                        call_args.extend(args);
1368                        // Try runtime builtin target for closures (e.g., call_method)
1369                        if let Ok(result) = runmat_runtime::call_builtin(&name, &call_args) {
1370                            stack.push(result);
1371                            pc += 1;
1372                            continue;
1373                        }
1374                        let func: UserFunction = match context
1375                            .functions
1376                            .get(&name)
1377                            .or_else(|| bytecode.functions.get(&name))
1378                        {
1379                            Some(f) => f.clone(),
1380                            None => vm_bail!(mex(
1381                                "UndefinedFunction",
1382                                &format!("Undefined function: {name}")
1383                            )),
1384                        };
1385                        let arg_count = call_args.len();
1386                        if !func.has_varargin {
1387                            if arg_count < func.params.len() {
1388                                vm_bail!(mex(
1389                                    "NotEnoughInputs",
1390                                    &format!(
1391                                        "Function '{name}' expects {} inputs, got {arg_count}",
1392                                        func.params.len()
1393                                    )
1394                                ));
1395                            }
1396                            if arg_count > func.params.len() {
1397                                vm_bail!(mex(
1398                                    "TooManyInputs",
1399                                    &format!(
1400                                        "Function '{name}' expects {} inputs, got {arg_count}",
1401                                        func.params.len()
1402                                    )
1403                                ));
1404                            }
1405                        }
1406                        let var_map = runmat_hir::remapping::create_complete_function_var_map(
1407                            &func.params,
1408                            &func.outputs,
1409                            &func.body,
1410                        );
1411                        let local_var_count = var_map.len();
1412                        let remapped_body =
1413                            runmat_hir::remapping::remap_function_body(&func.body, &var_map);
1414                        let func_vars_count = local_var_count.max(func.params.len());
1415                        let mut func_vars = vec![Value::Num(0.0); func_vars_count];
1416                        if func.has_varargin {
1417                            let fixed = func.params.len().saturating_sub(1);
1418                            for i in 0..fixed {
1419                                if i < call_args.len() && i < func_vars.len() {
1420                                    func_vars[i] = call_args[i].clone();
1421                                }
1422                            }
1423                            let mut rest: Vec<Value> = if call_args.len() > fixed {
1424                                call_args[fixed..].to_vec()
1425                            } else {
1426                                Vec::new()
1427                            };
1428                            let cell = runmat_builtins::CellArray::new(
1429                                std::mem::take(&mut rest),
1430                                1,
1431                                if call_args.len() > fixed {
1432                                    call_args.len() - fixed
1433                                } else {
1434                                    0
1435                                },
1436                            )
1437                            .map_err(|e| format!("varargin: {e}"))?;
1438                            if fixed < func_vars.len() {
1439                                func_vars[fixed] = Value::Cell(cell);
1440                            }
1441                        } else {
1442                            for (i, _param_id) in func.params.iter().enumerate() {
1443                                if i < call_args.len() && i < func_vars.len() {
1444                                    func_vars[i] = call_args[i].clone();
1445                                }
1446                            }
1447                        }
1448                        // Copy referenced globals into local frame
1449                        for (original_var_id, local_var_id) in &var_map {
1450                            let local_index = local_var_id.0;
1451                            let global_index = original_var_id.0;
1452                            if local_index < func_vars.len() && global_index < vars.len() {
1453                                let is_parameter = func
1454                                    .params
1455                                    .iter()
1456                                    .any(|param_id| param_id == original_var_id);
1457                                if !is_parameter {
1458                                    func_vars[local_index] = vars[global_index].clone();
1459                                }
1460                            }
1461                        }
1462                        // Initialize varargout if needed
1463                        if func.has_varargout {
1464                            if let Some(varargout_oid) = func.outputs.last() {
1465                                if let Some(local_id) = var_map.get(varargout_oid) {
1466                                    if local_id.0 < func_vars.len() {
1467                                        let empty = runmat_builtins::CellArray::new(vec![], 1, 0)
1468                                            .map_err(|e| format!("varargout init: {e}"))?;
1469                                        func_vars[local_id.0] = Value::Cell(empty);
1470                                    }
1471                                }
1472                            }
1473                        }
1474                        let mut func_var_types = func.var_types.clone();
1475                        if func_var_types.len() < local_var_count {
1476                            func_var_types.resize(local_var_count, Type::Unknown);
1477                        }
1478                        let func_program = runmat_hir::HirProgram {
1479                            body: remapped_body,
1480                            var_types: func_var_types,
1481                        };
1482                        let func_bytecode =
1483                            crate::compile_with_functions(&func_program, &bytecode.functions)?;
1484                        // Merge nested functions into current execution context for future closure calls
1485                        for (k, v) in func_bytecode.functions.iter() {
1486                            context.functions.insert(k.clone(), v.clone());
1487                        }
1488                        let func_result_vars = match interpret_function(&func_bytecode, func_vars) {
1489                            Ok(v) => v,
1490                            Err(e) => vm_bail!(e),
1491                        };
1492                        if let Some(output_var_id) = func.outputs.first() {
1493                            let local_output_index =
1494                                var_map.get(output_var_id).map(|id| id.0).unwrap_or(0);
1495                            if local_output_index < func_result_vars.len() {
1496                                stack.push(func_result_vars[local_output_index].clone());
1497                            } else {
1498                                stack.push(Value::Num(0.0));
1499                            }
1500                        } else {
1501                            stack.push(Value::Num(0.0));
1502                        }
1503                    }
1504                    other => {
1505                        // Forward to runtime feval for string/char handles and builtins
1506                        let mut argv = Vec::with_capacity(1 + args.len());
1507                        argv.push(other);
1508                        argv.extend(args);
1509                        match runmat_runtime::call_builtin("feval", &argv) {
1510                            Ok(result) => stack.push(result),
1511                            Err(err) => vm_bail!(err),
1512                        }
1513                    }
1514                }
1515            }
1516            Instr::CallFevalExpandMulti(_specs) => {
1517                vm_bail!("feval expand not supported in this execution mode".to_string());
1518            }
1519            Instr::LoadConst(c) => {
1520                stack.push(Value::Num(c));
1521                if debug_stack {
1522                    eprintln!("  -> LoadConst pushed {}, new_len={}", c, stack.len());
1523                }
1524            }
1525            Instr::LoadComplex(re, im) => {
1526                stack.push(Value::Complex(re, im));
1527                if debug_stack {
1528                    eprintln!(
1529                        "  -> LoadComplex pushed ({}, {}), new_len={}",
1530                        re,
1531                        im,
1532                        stack.len()
1533                    );
1534                }
1535            }
1536            Instr::LoadBool(b) => stack.push(Value::Bool(b)),
1537            Instr::LoadString(s) => stack.push(Value::String(s)),
1538            Instr::LoadCharRow(s) => {
1539                let ca = runmat_builtins::CharArray::new(s.chars().collect(), 1, s.chars().count())
1540                    .map_err(|e| mex("CharError", &e))?;
1541                stack.push(Value::CharArray(ca));
1542            }
1543            Instr::LoadVar(i) => {
1544                let v = vars[i].clone();
1545                if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
1546                    match &v {
1547                        Value::GpuTensor(h) => {
1548                            eprintln!(
1549                                "LoadVar pc={} var={} => GpuTensor shape={:?}",
1550                                pc, i, h.shape
1551                            );
1552                        }
1553                        Value::Tensor(t) => {
1554                            eprintln!("LoadVar pc={} var={} => Tensor shape={:?}", pc, i, t.shape);
1555                        }
1556                        _ => {}
1557                    }
1558                }
1559                stack.push(v)
1560            }
1561            Instr::StoreVar(i) => {
1562                let val = stack
1563                    .pop()
1564                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
1565                if let Ok(filter) = std::env::var("RUNMAT_DEBUG_STORE_VAR") {
1566                    let log_this = if filter.trim().eq_ignore_ascii_case("*") {
1567                        true
1568                    } else if let Ok(target) = filter.trim().parse::<usize>() {
1569                        target == i
1570                    } else {
1571                        false
1572                    };
1573                    if log_this {
1574                        eprintln!("StoreVar pc={} var={} value={:?}", pc, i, val);
1575                    }
1576                }
1577                if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
1578                    match &val {
1579                        Value::GpuTensor(h) => {
1580                            eprintln!(
1581                                "StoreVar pc={} var={} := GpuTensor shape={:?}",
1582                                pc, i, h.shape
1583                            );
1584                        }
1585                        Value::Tensor(t) => {
1586                            eprintln!("StoreVar pc={} var={} := Tensor shape={:?}", pc, i, t.shape);
1587                        }
1588                        _ => {}
1589                    }
1590                }
1591                if i < vars.len() {
1592                    #[cfg(feature = "native-accel")]
1593                    clear_residency(&vars[i]);
1594                }
1595                if i >= vars.len() {
1596                    vars.resize(i + 1, Value::Num(0.0));
1597                    refresh_workspace_state(&vars);
1598                }
1599                vars[i] = val;
1600                // If this var is declared global, update the global table entry
1601                // We optimistically write-through whenever StoreVar happens and a global exists for this name
1602                let key = format!("var_{i}");
1603                GLOBALS.with(|g| {
1604                    let mut m = g.borrow_mut();
1605                    if m.contains_key(&key) {
1606                        m.insert(key, vars[i].clone());
1607                    }
1608                });
1609                if let Some(name) = global_aliases.get(&i) {
1610                    GLOBALS.with(|g| {
1611                        g.borrow_mut().insert(name.clone(), vars[i].clone());
1612                    });
1613                }
1614            }
            // Load a function-local slot onto the operand stack. Inside a call
            // frame the slot lives at `locals_start + offset` in the shared
            // `context.locals` arena; at top level (no active frame) the same
            // offset indexes the script-level `vars` array instead.
            Instr::LoadLocal(offset) => {
                if let Some(current_frame) = context.call_stack.last() {
                    let local_index = current_frame.locals_start + offset;
                    if local_index >= context.locals.len() {
                        vm_bail!("Local variable index out of bounds".to_string());
                    }
                    stack.push(context.locals[local_index].clone());
                } else if offset < vars.len() {
                    stack.push(vars[offset].clone());
                } else {
                    // Never-written top-level slot reads as numeric zero.
                    stack.push(Value::Num(0.0));
                }
            }
            // Pop the top of stack into a function-local slot (frame path) or a
            // script-level var slot (no-frame path), clearing any GPU residency
            // metadata attached to the value being overwritten first.
            Instr::StoreLocal(offset) => {
                let val = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                if let Some(current_frame) = context.call_stack.last() {
                    let local_index = current_frame.locals_start + offset;
                    // Grow the locals arena on demand, zero-filling the gap.
                    while context.locals.len() <= local_index {
                        context.locals.push(Value::Num(0.0));
                    }
                    #[cfg(feature = "native-accel")]
                    if local_index < context.locals.len() {
                        // Drop device residency of the value we are about to replace.
                        clear_residency(&context.locals[local_index]);
                    }
                    context.locals[local_index] = val;
                } else {
                    if offset >= vars.len() {
                        vars.resize(offset + 1, Value::Num(0.0));
                        // Keep the workspace mirror in sync after resizing.
                        refresh_workspace_state(&vars);
                    }
                    #[cfg(feature = "native-accel")]
                    if offset < vars.len() {
                        clear_residency(&vars[offset]);
                    }
                    vars[offset] = val;
                    // write-through to persistents if this local is a declared persistent for current function
                    // NOTE(review): this branch only runs when `call_stack` is empty
                    // (we are in the `else` of the frame check), so `func_name` is
                    // always "<main>" here — the `.map()` arm is effectively dead.
                    let func_name = context
                        .call_stack
                        .last()
                        .map(|f| f.function_name.clone())
                        .unwrap_or_else(|| "<main>".to_string());
                    let key = (func_name, offset);
                    PERSISTENTS.with(|p| {
                        let mut m = p.borrow_mut();
                        // Only write through for slots previously declared persistent.
                        if m.contains_key(&key) {
                            m.insert(key, vars[offset].clone());
                        }
                    });
                }
            }
            // Reserve `local_count` zero-initialized slots on the locals arena
            // for a newly entered lexical scope.
            Instr::EnterScope(local_count) => {
                for _ in 0..local_count {
                    context.locals.push(Value::Num(0.0));
                }
            }
            // Pop the scope's locals again, releasing any GPU residency held by
            // each dropped value.
            Instr::ExitScope(local_count) => {
                for _ in 0..local_count {
                    if let Some(val) = context.locals.pop() {
                        #[cfg(feature = "native-accel")]
                        clear_residency(&val);
                    }
                }
            }
            // Record an import declaration (path + wildcard flag) for later
            // name resolution; no immediate effect on execution.
            Instr::RegisterImport { path, wildcard } => {
                imports.push((path, wildcard));
            }
            Instr::DeclareGlobal(indices) => {
                // Bind local var slots to global table entries by name (var_N).
                // If a global already exists for the slot, copy its current value
                // into the local slot; otherwise leave the slot untouched.
                for i in indices.into_iter() {
                    let key = format!("var_{i}");
                    let val_opt = GLOBALS.with(|g| g.borrow().get(&key).cloned());
                    if let Some(v) = val_opt {
                        if i >= vars.len() {
                            vars.resize(i + 1, Value::Num(0.0));
                            refresh_workspace_state(&vars);
                        }
                        vars[i] = v;
                    }
                }
            }
            // Like DeclareGlobal, but the global table is keyed by source-level
            // names. Seeds the slot from the named global, mirrors the value
            // under the synthetic "var_N" key, and remembers the alias so later
            // StoreVar instructions write through to the named global.
            Instr::DeclareGlobalNamed(indices, names) => {
                for (pos, i) in indices.into_iter().enumerate() {
                    // Fall back to the synthetic slot name when no name was recorded.
                    let name = names
                        .get(pos)
                        .cloned()
                        .unwrap_or_else(|| format!("var_{i}"));
                    let val_opt = GLOBALS.with(|g| g.borrow().get(&name).cloned());
                    if let Some(v) = val_opt {
                        if i >= vars.len() {
                            vars.resize(i + 1, Value::Num(0.0));
                            refresh_workspace_state(&vars);
                        }
                        vars[i] = v;
                    }
                    // Keep the positional "var_N" entry in sync with the named one.
                    GLOBALS.with(|g| {
                        let mut m = g.borrow_mut();
                        if let Some(v) = m.get(&name).cloned() {
                            m.insert(format!("var_{i}"), v);
                        }
                    });
                    global_aliases.insert(i, name);
                }
            }
            Instr::DeclarePersistent(indices) => {
                // Initialize locals from persistent table if present.
                // Persistents are keyed by (function name, slot index).
                let func_name = current_func_name_str.clone();
                for i in indices.into_iter() {
                    let key = (func_name.clone(), i);
                    let val_opt = PERSISTENTS.with(|p| p.borrow().get(&key).cloned());
                    if let Some(v) = val_opt {
                        if i >= vars.len() {
                            vars.resize(i + 1, Value::Num(0.0));
                            refresh_workspace_state(&vars);
                        }
                        vars[i] = v;
                    }
                }
            }
            // Named variant: prefer the (function, name)-keyed table, falling
            // back to the positional (function, index) table, then record the
            // name alias for later write-through.
            Instr::DeclarePersistentNamed(indices, names) => {
                let func_name = current_func_name_str.clone();
                for (pos, i) in indices.into_iter().enumerate() {
                    let name = names
                        .get(pos)
                        .cloned()
                        .unwrap_or_else(|| format!("var_{i}"));
                    let key = (func_name.clone(), i);
                    let val_opt = PERSISTENTS_BY_NAME
                        .with(|p| p.borrow().get(&(func_name.clone(), name.clone())).cloned())
                        .or_else(|| PERSISTENTS.with(|p| p.borrow().get(&key).cloned()));
                    if let Some(v) = val_opt {
                        if i >= vars.len() {
                            vars.resize(i + 1, Value::Num(0.0));
                            refresh_workspace_state(&vars);
                        }
                        vars[i] = v;
                    }
                    persistent_aliases.insert(i, name);
                }
            }
            Instr::Add => {
                // If either operand is an object, try operator overloading:
                // dispatch "plus" through call_method on the object operand,
                // falling back to the builtin "plus" if no method exists.
                // Operands are popped right-to-left (b first, then a).
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("plus".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // No user-defined plus: use the builtin.
                                let v = call_builtin("plus", &[a.clone(), b.clone()])?;
                                stack.push(v)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // Object on the right: dispatch on it with the other
                        // operand as the argument (addition is commutative).
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("plus".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let v = call_builtin("plus", &[a.clone(), b.clone()])?;
                                stack.push(v)
                            }
                        }
                    }
                    _ => {
                        // Plain values: optionally promote operands for the
                        // accelerator, then evaluate the builtin element-wise add.
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        let v = call_builtin("plus", &[a_acc, b_acc])?;
                        stack.push(v)
                    }
                }
            }
            // Binary subtraction with object-overload attempts and a builtin
            // "minus" fallback for plain values.
            Instr::Sub => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        // NOTE(review): unlike Add/Mul, this arm calls the
                        // "minus" builtin directly instead of routing through
                        // "call_method" — confirm this dispatches the user
                        // overload as intended.
                        let args = vec![Value::Object(obj.clone()), b.clone()];
                        match call_builtin("minus", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let v = call_builtin("minus", &[a.clone(), b.clone()])?;
                                stack.push(v)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // NOTE(review): this tries "uminus" (nominally a unary
                        // op) with two args [obj, a] — presumably intended as
                        // the overload hook for `a - obj`; verify against the
                        // runtime's builtin signatures.
                        let args = vec![Value::Object(obj.clone()), a.clone()];
                        match call_builtin("uminus", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let v = call_builtin("minus", &[a.clone(), b.clone()])?;
                                stack.push(v)
                            }
                        }
                    }
                    _ => {
                        // Plain values: accelerator promotion + builtin minus.
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        let v = call_builtin("minus", &[a_acc, b_acc])?;
                        stack.push(v)
                    }
                }
            }
            // Matrix multiplication (`*`): tries the "mtimes" method on an
            // object operand, otherwise promotes for the accelerator and calls
            // the runtime matmul.
            Instr::Mul => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("mtimes".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback preserves the original a*b order.
                                let v = runmat_runtime::matrix::value_matmul(&a, &b)?;
                                stack.push(v)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // NOTE(review): with the object on the right this
                        // invokes obj.mtimes(a), i.e. effectively b*a — matmul
                        // is non-commutative, so confirm call_method/overload
                        // semantics account for the reversed order.
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("mtimes".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let v = runmat_runtime::matrix::value_matmul(&a, &b)?;
                                stack.push(v)
                            }
                        }
                    }
                    _ => {
                        // Plain values: promote with matmul-specific policy.
                        let (a_acc, b_acc) = accel_promote_binary(AutoBinaryOp::MatMul, &a, &b)?;
                        let v = runmat_runtime::matrix::value_matmul(&a_acc, &b_acc)?;
                        stack.push(v)
                    }
                }
            }
            // Right division (`/`): tries the "mrdivide" method on an object
            // operand; all non-object paths evaluate the element-wise "rdivide"
            // builtin.
            // NOTE(review): the fallbacks use element-wise rdivide rather than a
            // matrix right-division (mrdivide) solve — confirm the compiler
            // never emits Instr::Div for matrix/matrix operands, or that
            // rdivide handles that case.
            Instr::Div => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("mrdivide".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                let v = runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?;
                                stack.push(v)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // Object on the right: dispatch mrdivide on it with the
                        // left operand as the argument.
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("mrdivide".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                let v = runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?;
                                stack.push(v)
                            }
                        }
                    }
                    _ => {
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        let v = runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?;
                        stack.push(v)
                    }
                }
            }
            // Power (`^`): if either operand is an object, try its "power"
            // method with the other operand as argument; otherwise promote and
            // call the runtime power routine.
            Instr::Pow => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) | (_, Value::Object(obj)) => {
                        // Pass whichever operand is NOT the object as the arg.
                        // NOTE(review): when the object is the exponent (b),
                        // this invokes obj.power(a), reversing the operand
                        // order of `a ^ obj` — confirm intended.
                        let arg_val = if matches!(&a, Value::Object(_)) {
                            b.clone()
                        } else {
                            a.clone()
                        };
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("power".to_string()),
                            arg_val,
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback preserves the original a^b order.
                                let v = runmat_runtime::power(&a, &b)?;
                                stack.push(v)
                            }
                        }
                    }
                    _ => {
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        let v = runmat_runtime::power(&a_acc, &b_acc)?;
                        stack.push(v)
                    }
                }
            }
            // Unary negation: objects get a "uminus" overload attempt; all
            // other values (and the object fallback) are negated by multiplying
            // with -1 via the "times" builtin.
            Instr::Neg => {
                let value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match &value {
                    Value::Object(obj) => {
                        let args = vec![Value::Object(obj.clone())];
                        match call_builtin("uminus", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // No uminus overload: fall back to value * -1.
                                let result = runmat_runtime::call_builtin(
                                    "times",
                                    &[value.clone(), runmat_builtins::Value::Num(-1.0)],
                                )?;
                                stack.push(result)
                            }
                        }
                    }
                    _ => {
                        let result = runmat_runtime::call_builtin(
                            "times",
                            &[value.clone(), runmat_builtins::Value::Num(-1.0)],
                        )?;
                        stack.push(result);
                    }
                }
            }
            // Unary plus: objects may overload "uplus"; every other value (and
            // a failed overload) is passed through unchanged.
            Instr::UPlus => {
                let value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match &value {
                    Value::Object(obj) => {
                        let args = vec![Value::Object(obj.clone())];
                        match call_builtin("uplus", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => stack.push(value),
                        }
                    }
                    _ => stack.push(value),
                }
            }
            // Non-conjugate transpose (`.'`): promote for the accelerator's
            // transpose path, then delegate to the "transpose" builtin.
            Instr::Transpose => {
                let value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let promoted = accel_promote_unary(AutoUnaryOp::Transpose, &value)?;
                let args = [promoted];
                let result = runmat_runtime::call_builtin("transpose", &args)?;
                stack.push(result);
            }
            // Conjugate transpose (`'`): same promotion path, but dispatches
            // the "ctranspose" builtin (conjugates complex data).
            Instr::ConjugateTranspose => {
                let value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let promoted = accel_promote_unary(AutoUnaryOp::Transpose, &value)?;
                let args = [promoted];
                let result = runmat_runtime::call_builtin("ctranspose", &args)?;
                stack.push(result);
            }
            // Element-wise multiply (`.*`): try the "times" method on an object
            // operand, otherwise promote and call the "times" builtin.
            Instr::ElemMul => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("times".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                stack.push(runmat_runtime::call_builtin("times", &[a_acc, b_acc])?)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // Object on the right: times is commutative, so
                        // dispatching obj.times(a) is order-safe.
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("times".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                stack.push(runmat_runtime::call_builtin("times", &[a_acc, b_acc])?)
                            }
                        }
                    }
                    _ => {
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        stack.push(runmat_runtime::call_builtin("times", &[a_acc, b_acc])?)
                    }
                }
            }
            // Element-wise right division (`./`): object overload via the
            // "rdivide" method, builtin "rdivide" otherwise.
            Instr::ElemDiv => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("rdivide".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                stack
                                    .push(runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // NOTE(review): with the object on the right this
                        // invokes obj.rdivide(a), i.e. effectively b./a —
                        // rdivide is non-commutative; confirm the overload
                        // dispatch accounts for the reversed order (the
                        // non-object fallback below keeps a./b).
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("rdivide".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                stack
                                    .push(runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?)
                            }
                        }
                    }
                    _ => {
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        stack.push(runmat_runtime::call_builtin("rdivide", &[a_acc, b_acc])?)
                    }
                }
            }
            // Element-wise power (`.^`): object operands get a direct "power"
            // builtin attempt with the object first; plain values promote and
            // call the "power" builtin in a,b order.
            Instr::ElemPow => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) | (_, Value::Object(obj)) => {
                        // Object first, then whichever operand is not the object.
                        // NOTE(review): when the object is the exponent this
                        // yields power(b, a) — reversed relative to `a .^ b`;
                        // confirm intended.
                        let args = vec![
                            Value::Object(obj.clone()),
                            if matches!(&a, Value::Object(_)) {
                                b.clone()
                            } else {
                                a.clone()
                            },
                        ];
                        match call_builtin("power", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (a_acc, b_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                                stack.push(runmat_runtime::call_builtin("power", &[a_acc, b_acc])?)
                            }
                        }
                    }
                    _ => {
                        let (a_acc, b_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &a, &b)?;
                        stack.push(runmat_runtime::call_builtin("power", &[a_acc, b_acc])?)
                    }
                }
            }
            // Element-wise left division (`.\`), defined as a .\ b == b ./ a.
            // Object overload tries the "ldivide" method; all fallbacks swap
            // the operands and reuse the "rdivide" builtin.
            Instr::ElemLeftDiv => {
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("ldivide".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Swap: a.\b computed as rdivide(b, a) == b./a.
                                let (b_acc, a_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &b, &a)?;
                                stack
                                    .push(runmat_runtime::call_builtin("rdivide", &[b_acc, a_acc])?)
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // NOTE(review): obj.ldivide(a) evaluates b.\a, the
                        // reverse of `a .\ obj` — confirm dispatch semantics
                        // (the fallback below computes the correct b./a).
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("ldivide".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let (b_acc, a_acc) =
                                    accel_promote_binary(AutoBinaryOp::Elementwise, &b, &a)?;
                                stack
                                    .push(runmat_runtime::call_builtin("rdivide", &[b_acc, a_acc])?)
                            }
                        }
                    }
                    _ => {
                        let (b_acc, a_acc) =
                            accel_promote_binary(AutoBinaryOp::Elementwise, &b, &a)?;
                        stack.push(runmat_runtime::call_builtin("rdivide", &[b_acc, a_acc])?)
                    }
                }
            }
2195            Instr::LessEqual => {
2196                let b = stack
2197                    .pop()
2198                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
2199                let a = stack
2200                    .pop()
2201                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
2202                match (&a, &b) {
2203                    (Value::Object(obj), _) => {
2204                        let args = vec![
2205                            Value::Object(obj.clone()),
2206                            Value::String("le".to_string()),
2207                            b.clone(),
2208                        ];
2209                        match call_builtin("call_method", &args) {
2210                            Ok(v) => stack.push(v),
2211                            Err(_) => {
2212                                // Fallback: le(a,b) = ~gt(a,b)
2213                                let args2 = vec![
2214                                    Value::Object(obj.clone()),
2215                                    Value::String("gt".to_string()),
2216                                    b.clone(),
2217                                ];
2218                                match call_builtin("call_method", &args2) {
2219                                    Ok(v) => {
2220                                        let truth: f64 = (&v).try_into()?;
2221                                        stack.push(Value::Num(if truth == 0.0 {
2222                                            1.0
2223                                        } else {
2224                                            0.0
2225                                        }));
2226                                    }
2227                                    Err(_) => {
2228                                        let aa: f64 = (&a).try_into()?;
2229                                        let bb: f64 = (&b).try_into()?;
2230                                        stack.push(Value::Num(if aa <= bb { 1.0 } else { 0.0 }));
2231                                    }
2232                                }
2233                            }
2234                        }
2235                    }
2236                    (_, Value::Object(obj)) => {
2237                        let args = vec![
2238                            Value::Object(obj.clone()),
2239                            Value::String("ge".to_string()),
2240                            a.clone(),
2241                        ];
2242                        match call_builtin("call_method", &args) {
2243                            Ok(v) => stack.push(v),
2244                            Err(_) => {
2245                                // Fallback: ge(b,a) = ~lt(b,a) hence le(a,b) = ge(b,a)
2246                                let args2 = vec![
2247                                    Value::Object(obj.clone()),
2248                                    Value::String("lt".to_string()),
2249                                    a.clone(),
2250                                ];
2251                                match call_builtin("call_method", &args2) {
2252                                    Ok(v) => {
2253                                        let truth: f64 = (&v).try_into()?;
2254                                        stack.push(Value::Num(if truth == 0.0 {
2255                                            1.0
2256                                        } else {
2257                                            0.0
2258                                        }));
2259                                    }
2260                                    Err(_) => {
2261                                        let aa: f64 = (&a).try_into()?;
2262                                        let bb: f64 = (&b).try_into()?;
2263                                        stack.push(Value::Num(if aa <= bb { 1.0 } else { 0.0 }));
2264                                    }
2265                                }
2266                            }
2267                        }
2268                    }
2269                    _ => {
2270                        let bb: f64 = (&b).try_into()?;
2271                        let aa: f64 = (&a).try_into()?;
2272                        stack.push(Value::Num(if aa <= bb { 1.0 } else { 0.0 }));
2273                    }
2274                }
2275            }
            Instr::Less => {
                // Strict less-than. Delegated to the shared relational macro,
                // which (per its other call sites) dispatches Object operands
                // through the overloaded "lt" method and otherwise compares as
                // scalar f64.
                handle_rel_binary!(<, "lt", stack);
            }
            Instr::Greater => {
                // Strict greater-than; same macro, mirrored via "gt".
                handle_rel_binary!(>, "gt", stack);
            }
            Instr::GreaterEqual => {
                // a >= b. Operands were pushed a-then-b, so b is popped first.
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    // LHS is an object: dispatch to its overloaded ge(a, b).
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("ge".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback: ge(a,b) = ~lt(a,b)
                                let args2 = vec![
                                    Value::Object(obj.clone()),
                                    Value::String("lt".to_string()),
                                    b.clone(),
                                ];
                                match call_builtin("call_method", &args2) {
                                    Ok(v) => {
                                        // Negate the scalar truth value of lt.
                                        let truth: f64 = (&v).try_into()?;
                                        stack.push(Value::Num(if truth == 0.0 {
                                            1.0
                                        } else {
                                            0.0
                                        }));
                                    }
                                    Err(_) => {
                                        // Last resort: coerce both sides to f64.
                                        let aa: f64 = (&a).try_into()?;
                                        let bb: f64 = (&b).try_into()?;
                                        stack.push(Value::Num(if aa >= bb { 1.0 } else { 0.0 }));
                                    }
                                }
                            }
                        }
                    }
                    // RHS is an object: a >= b is equivalent to b <= a, so
                    // dispatch to the object's le method with a as argument.
                    (_, Value::Object(obj)) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("le".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback: le(b,a) = ~gt(b,a); hence ge(a,b) = le(b,a)
                                let args2 = vec![
                                    Value::Object(obj.clone()),
                                    Value::String("gt".to_string()),
                                    a.clone(),
                                ];
                                match call_builtin("call_method", &args2) {
                                    Ok(v) => {
                                        let truth: f64 = (&v).try_into()?;
                                        stack.push(Value::Num(if truth == 0.0 {
                                            1.0
                                        } else {
                                            0.0
                                        }));
                                    }
                                    Err(_) => {
                                        let aa: f64 = (&a).try_into()?;
                                        let bb: f64 = (&b).try_into()?;
                                        stack.push(Value::Num(if aa >= bb { 1.0 } else { 0.0 }));
                                    }
                                }
                            }
                        }
                    }
                    // Plain numeric (or numeric-coercible) operands.
                    _ => {
                        let bb: f64 = (&b).try_into()?;
                        let aa: f64 = (&a).try_into()?;
                        stack.push(Value::Num(if aa >= bb { 1.0 } else { 0.0 }));
                    }
                }
            }
            Instr::Equal => {
                // a == b, pushing 1.0/0.0 (or an element-wise logical tensor).
                // Operands were pushed a-then-b, so b is popped first.
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    // Object on either side: dispatch to its overloaded eq
                    // method; on failure fall back to scalar f64 comparison.
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("eq".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let aa: f64 = (&a).try_into()?;
                                let bb: f64 = (&b).try_into()?;
                                stack.push(Value::Num(if aa == bb { 1.0 } else { 0.0 }))
                            }
                        }
                    }
                    (_, Value::Object(obj)) => {
                        // eq is symmetric, so the RHS object's eq(b, a) works.
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("eq".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                let aa: f64 = (&a).try_into()?;
                                let bb: f64 = (&b).try_into()?;
                                stack.push(Value::Num(if aa == bb { 1.0 } else { 0.0 }))
                            }
                        }
                    }
                    (Value::HandleObject(_), _) | (_, Value::HandleObject(_)) => {
                        // Delegate to runtime eq builtin which implements identity semantics
                        let v = runmat_runtime::call_builtin("eq", &[a.clone(), b.clone()])?;
                        stack.push(v);
                    }
                    (Value::Tensor(ta), Value::Tensor(tb)) => {
                        // Element-wise eq; shapes must match
                        if ta.shape != tb.shape {
                            return Err(mex(
                                "ShapeMismatch",
                                "shape mismatch for element-wise comparison",
                            ));
                        }
                        // Elements within 1e-12 of each other count as equal
                        // (tolerance-based, not exact IEEE equality).
                        let mut out = Vec::with_capacity(ta.data.len());
                        for i in 0..ta.data.len() {
                            out.push(if (ta.data[i] - tb.data[i]).abs() < 1e-12 {
                                1.0
                            } else {
                                0.0
                            });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, ta.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    // Tensor vs scalar: broadcast the scalar across elements.
                    (Value::Tensor(t), Value::Num(_)) | (Value::Tensor(t), Value::Int(_)) => {
                        let s = match &b {
                            Value::Num(n) => *n,
                            Value::Int(i) => i.to_f64(),
                            _ => 0.0, // unreachable given the outer pattern; defensive default
                        };
                        let out: Vec<f64> = t
                            .data
                            .iter()
                            .map(|x| if (*x - s).abs() < 1e-12 { 1.0 } else { 0.0 })
                            .collect();
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, t.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    // Scalar vs tensor: mirror of the case above.
                    (Value::Num(_), Value::Tensor(t)) | (Value::Int(_), Value::Tensor(t)) => {
                        let s = match &a {
                            Value::Num(n) => *n,
                            Value::Int(i) => i.to_f64(),
                            _ => 0.0, // unreachable given the outer pattern; defensive default
                        };
                        let out: Vec<f64> = t
                            .data
                            .iter()
                            .map(|x| if (s - *x).abs() < 1e-12 { 1.0 } else { 0.0 })
                            .collect();
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, t.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    // String arrays compare element-wise into a logical tensor.
                    (Value::StringArray(sa), Value::StringArray(sb)) => {
                        if sa.shape != sb.shape {
                            return Err(mex(
                                "ShapeMismatch",
                                "shape mismatch for string array comparison",
                            ));
                        }
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if sa.data[i] == sb.data[i] { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    // String array vs scalar string: broadcast the string.
                    (Value::StringArray(sa), Value::String(s)) => {
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if sa.data[i] == *s { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    (Value::String(s), Value::StringArray(sa)) => {
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if *s == sa.data[i] { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("eq: {e}"))?,
                        ));
                    }
                    // Two scalar strings: exact comparison.
                    (Value::String(a_s), Value::String(b_s)) => {
                        stack.push(Value::Num(if a_s == b_s { 1.0 } else { 0.0 }));
                    }
                    // Anything else: coerce both sides to f64 and compare
                    // exactly (no tolerance for the scalar-scalar path).
                    _ => {
                        let bb: f64 = (&b).try_into()?;
                        let aa: f64 = (&a).try_into()?;
                        stack.push(Value::Num(if aa == bb { 1.0 } else { 0.0 }));
                    }
                }
            }
            Instr::NotEqual => {
                // a ~= b. Structured as the exact complement of Instr::Equal:
                // same case analysis, inverted predicates (the tensor paths use
                // ">= 1e-12" where Equal uses "< 1e-12").
                let b = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let a = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match (&a, &b) {
                    // LHS object: dispatch to its overloaded ne method.
                    (Value::Object(obj), _) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("ne".to_string()),
                            b.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback: ne(a,b) = ~eq(a,b)
                                let args2 = vec![
                                    Value::Object(obj.clone()),
                                    Value::String("eq".to_string()),
                                    b.clone(),
                                ];
                                match call_builtin("call_method", &args2) {
                                    Ok(v) => {
                                        // Negate eq's scalar truth value.
                                        let truth: f64 = (&v).try_into()?;
                                        stack.push(Value::Num(if truth == 0.0 {
                                            1.0
                                        } else {
                                            0.0
                                        }));
                                    }
                                    Err(_) => {
                                        // Last resort: coerce both sides to f64.
                                        let aa: f64 = (&a).try_into()?;
                                        let bb: f64 = (&b).try_into()?;
                                        stack.push(Value::Num(if aa != bb { 1.0 } else { 0.0 }));
                                    }
                                }
                            }
                        }
                    }
                    // RHS object: ne is symmetric, so dispatch ne(b, a).
                    (_, Value::Object(obj)) => {
                        let args = vec![
                            Value::Object(obj.clone()),
                            Value::String("ne".to_string()),
                            a.clone(),
                        ];
                        match call_builtin("call_method", &args) {
                            Ok(v) => stack.push(v),
                            Err(_) => {
                                // Fallback: ne(b,a) = ~eq(b,a)
                                let args2 = vec![
                                    Value::Object(obj.clone()),
                                    Value::String("eq".to_string()),
                                    a.clone(),
                                ];
                                match call_builtin("call_method", &args2) {
                                    Ok(v) => {
                                        let truth: f64 = (&v).try_into()?;
                                        stack.push(Value::Num(if truth == 0.0 {
                                            1.0
                                        } else {
                                            0.0
                                        }));
                                    }
                                    Err(_) => {
                                        let aa: f64 = (&a).try_into()?;
                                        let bb: f64 = (&b).try_into()?;
                                        stack.push(Value::Num(if aa != bb { 1.0 } else { 0.0 }));
                                    }
                                }
                            }
                        }
                    }
                    // Handle objects delegate to the runtime ne builtin
                    // (identity semantics, mirroring the eq case).
                    (Value::HandleObject(_), _) | (_, Value::HandleObject(_)) => {
                        let v = runmat_runtime::call_builtin("ne", &[a.clone(), b.clone()])?;
                        stack.push(v);
                    }
                    // Element-wise ne; shapes must match exactly.
                    (Value::Tensor(ta), Value::Tensor(tb)) => {
                        if ta.shape != tb.shape {
                            return Err(mex(
                                "ShapeMismatch",
                                "shape mismatch for element-wise comparison",
                            ));
                        }
                        // Complement of Equal's tolerance: differ by >= 1e-12.
                        let mut out = Vec::with_capacity(ta.data.len());
                        for i in 0..ta.data.len() {
                            out.push(if (ta.data[i] - tb.data[i]).abs() >= 1e-12 {
                                1.0
                            } else {
                                0.0
                            });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, ta.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    // Tensor vs scalar: broadcast the scalar.
                    (Value::Tensor(t), Value::Num(_)) | (Value::Tensor(t), Value::Int(_)) => {
                        let s = match &b {
                            Value::Num(n) => *n,
                            Value::Int(i) => i.to_f64(),
                            _ => 0.0, // unreachable given the outer pattern; defensive default
                        };
                        let out: Vec<f64> = t
                            .data
                            .iter()
                            .map(|x| if (*x - s).abs() >= 1e-12 { 1.0 } else { 0.0 })
                            .collect();
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, t.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    // Scalar vs tensor: mirror of the case above.
                    (Value::Num(_), Value::Tensor(t)) | (Value::Int(_), Value::Tensor(t)) => {
                        let s = match &a {
                            Value::Num(n) => *n,
                            Value::Int(i) => i.to_f64(),
                            _ => 0.0, // unreachable given the outer pattern; defensive default
                        };
                        let out: Vec<f64> = t
                            .data
                            .iter()
                            .map(|x| if (s - *x).abs() >= 1e-12 { 1.0 } else { 0.0 })
                            .collect();
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, t.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    // String arrays compare element-wise into a logical tensor.
                    (Value::StringArray(sa), Value::StringArray(sb)) => {
                        if sa.shape != sb.shape {
                            return Err(mex(
                                "ShapeMismatch",
                                "shape mismatch for string array comparison",
                            ));
                        }
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if sa.data[i] != sb.data[i] { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    // String array vs scalar string: broadcast the string.
                    (Value::StringArray(sa), Value::String(s)) => {
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if sa.data[i] != *s { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    (Value::String(s), Value::StringArray(sa)) => {
                        let mut out = Vec::with_capacity(sa.data.len());
                        for i in 0..sa.data.len() {
                            out.push(if *s != sa.data[i] { 1.0 } else { 0.0 });
                        }
                        stack.push(Value::Tensor(
                            runmat_builtins::Tensor::new(out, sa.shape.clone())
                                .map_err(|e| format!("ne: {e}"))?,
                        ));
                    }
                    // Two scalar strings: exact comparison.
                    (Value::String(a_s), Value::String(b_s)) => {
                        stack.push(Value::Num(if a_s != b_s { 1.0 } else { 0.0 }));
                    }
                    // Anything else: coerce to f64 and compare exactly.
                    _ => {
                        let bb: f64 = (&b).try_into()?;
                        let aa: f64 = (&a).try_into()?;
                        stack.push(Value::Num(if aa != bb { 1.0 } else { 0.0 }));
                    }
                }
            }
            Instr::JumpIfFalse(target) => {
                // Conditional branch: pop the condition, coerce to f64, and
                // jump only when it is exactly 0.0. Any nonzero value (NaN
                // included) falls through, consistent with MATLAB's
                // "nonzero is true" rule. `continue` skips the loop's pc
                // increment since we set pc explicitly.
                let cond: f64 = (&stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?)
                    .try_into()?;
                if cond == 0.0 {
                    pc = target;
                    continue;
                }
            }
            Instr::Jump(target) => {
                // Unconditional branch; `continue` bypasses the normal pc
                // increment at the bottom of the dispatch loop.
                pc = target;
                continue;
            }
            Instr::StochasticEvolution => {
                // Pops four operands in reverse push order (steps was pushed
                // last): state, drift, scale, steps. The actual evolution is
                // delegated to stochastic_evolution_dispatch, defined elsewhere
                // in this file; presumably it routes to the runtime's
                // stochastic_evolution_host (imported at the top) — confirm
                // against that definition.
                let steps_value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let scale_value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let drift_value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let state_value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let evolved = stochastic_evolution_dispatch(
                    state_value,
                    drift_value,
                    scale_value,
                    steps_value,
                )?;
                stack.push(evolved);
            }
            Instr::CallBuiltin(name, arg_count) => {
                // Call a builtin by name with arg_count stack operands.
                // Resolution order: direct name -> specific imports
                // (import pkg.foo) -> wildcard imports (import pkg.*) ->
                // rethrow special case -> try/catch unwinding -> error.
                if debug_stack {
                    eprintln!(
                        "CallBuiltin pc={} name={} arg_count={} stack_len={} top={:?}",
                        pc,
                        name,
                        arg_count,
                        stack.len(),
                        stack.last()
                    );
                }
                // Fast path: nargin/nargout read the innermost call frame's
                // recorded argument counts rather than invoking a builtin.
                // Both manage pc themselves and `continue` past the dispatch.
                if name == "nargin" {
                    if arg_count != 0 {
                        vm_bail!(mex("TooManyInputs", "nargin takes no arguments").to_string());
                    }
                    let (nin, _) =
                        CALL_COUNTS.with(|cc| cc.borrow().last().cloned().unwrap_or((0, 0)));
                    stack.push(Value::Num(nin as f64));
                    pc += 1;
                    continue;
                }
                if name == "nargout" {
                    if arg_count != 0 {
                        vm_bail!(mex("TooManyInputs", "nargout takes no arguments").to_string());
                    }
                    let (_, nout) =
                        CALL_COUNTS.with(|cc| cc.borrow().last().cloned().unwrap_or((0, 0)));
                    stack.push(Value::Num(nout as f64));
                    pc += 1;
                    continue;
                }
                // Pop arguments (they were pushed left-to-right, so reverse
                // after popping to restore call order).
                let mut args = Vec::new();

                for _ in 0..arg_count {
                    args.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                args.reverse();

                // Let the accelerator adjust/gather args before dispatch.
                let prepared_primary = accel_prepare_args(&name, &args)?;
                match runmat_runtime::call_builtin(&name, &prepared_primary) {
                    Ok(result) => stack.push(result),
                    Err(e) => {
                        // Specific-import matches: import pkg.foo; name == foo
                        let mut specific_matches: Vec<(String, Vec<Value>, Value)> = Vec::new();
                        for (path, wildcard) in &imports {
                            if *wildcard {
                                continue;
                            }
                            if path.last().map(|s| s.as_str()) == Some(name.as_str()) {
                                let qual = path.join(".");
                                let qual_args = accel_prepare_args(&qual, &prepared_primary)?;
                                if let Ok(value) = runmat_runtime::call_builtin(&qual, &qual_args) {
                                    specific_matches.push((qual, qual_args, value));
                                }
                            }
                        }
                        // More than one specific import resolving the same
                        // short name is an ambiguity error.
                        if specific_matches.len() > 1 {
                            let msg = specific_matches
                                .iter()
                                .map(|(q, _, _)| q.clone())
                                .collect::<Vec<_>>()
                                .join(", ");
                            vm_bail!(format!("ambiguous builtin '{}' via imports: {}", name, msg)
                                .to_string());
                        }
                        if let Some((_, _, value)) = specific_matches.pop() {
                            stack.push(value);
                        } else {
                            // Wildcard-import matches: import pkg.*; try pkg.name
                            let mut wildcard_matches: Vec<(String, Vec<Value>, Value)> = Vec::new();
                            for (path, wildcard) in &imports {
                                if !*wildcard {
                                    continue;
                                }
                                if path.is_empty() {
                                    continue;
                                }
                                // Build "pkg.sub.name" from the import path.
                                let mut qual = String::new();
                                for (i, part) in path.iter().enumerate() {
                                    if i > 0 {
                                        qual.push('.');
                                    }
                                    qual.push_str(part);
                                }
                                qual.push('.');
                                qual.push_str(&name);
                                let qual_args = accel_prepare_args(&qual, &prepared_primary)?;
                                if let Ok(value) = runmat_runtime::call_builtin(&qual, &qual_args) {
                                    wildcard_matches.push((qual, qual_args, value));
                                }
                            }
                            if wildcard_matches.len() > 1 {
                                let msg = wildcard_matches
                                    .iter()
                                    .map(|(q, _, _)| q.clone())
                                    .collect::<Vec<_>>()
                                    .join(", ");
                                vm_bail!(format!(
                                    "ambiguous builtin '{}' via wildcard imports: {}",
                                    name, msg
                                )
                                .to_string());
                            }
                            if let Some((_, _, value)) = wildcard_matches.pop() {
                                stack.push(value);
                            } else {
                                // Special-case: rethrow() without explicit e uses last caught
                                if name == "rethrow" && args.is_empty() {
                                    if let Some(le) = &last_exception {
                                        vm_bail!(format!("{}: {}", le.identifier, le.message)
                                            .to_string());
                                    }
                                }
                                // Unwind to the innermost enclosing try/catch,
                                // binding the exception to the catch variable
                                // (if one was declared) before jumping.
                                if let Some((catch_pc, catch_var)) = try_stack.pop() {
                                    if let Some(var_idx) = catch_var {
                                        // Grow the variable slots if the catch
                                        // variable index is beyond current size.
                                        if var_idx >= vars.len() {
                                            vars.resize(var_idx + 1, Value::Num(0.0));
                                            refresh_workspace_state(&vars);
                                        }
                                        // NB: `mex` here shadows the error-
                                        // constructor function of the same name.
                                        let mex = parse_exception(&e);
                                        last_exception = Some(mex.clone());
                                        vars[var_idx] = Value::MException(mex);
                                    }
                                    pc = catch_pc;
                                    continue;
                                } else {
                                    // No handler: propagate the original error.
                                    return Err(e);
                                }
                            }
                        }
                    }
                }
            }
            Instr::CallBuiltinExpandLast(name, fixed_argc, num_indices) => {
                // Call builtin `name` where the LAST argument position is a cell
                // brace-expansion (`c{...}`) that may expand to zero or more values
                // (a MATLAB comma-list). Fixed args precede the expansion.
                // Stack layout: [..., a1, a2, ..., a_fixed, base_for_cell, idx1, idx2, ...]
                // Build args vector by first collecting fixed args, then expanding cell indexing into comma-list
                // Evaluate indices and base
                let mut indices = Vec::with_capacity(num_indices);
                for _ in 0..num_indices {
                    let v = stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?;
                    indices.push(v);
                }
                // Pops yield last-pushed-first; reverse to restore source order.
                indices.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // Collect fixed args
                let mut fixed = Vec::with_capacity(fixed_argc);
                for _ in 0..fixed_argc {
                    fixed.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                fixed.reverse();
                // Evaluate cell indexing, then flatten cell contents to extend args
                let expanded = match (base, indices.len()) {
                    // Linear indexing into a cell with a single subscript.
                    (Value::Cell(ca), 1) => {
                        match &indices[0] {
                            Value::Num(n) => {
                                // Indices are 1-based; 0 and > len are out of bounds.
                                let i = *n as usize;
                                if i == 0 || i > ca.data.len() {
                                    return Err(mex(
                                        "CellIndexOutOfBounds",
                                        "Cell index out of bounds",
                                    ));
                                }
                                vec![(*ca.data[i - 1]).clone()]
                            }
                            Value::Int(i) => {
                                let iu = i.to_i64() as usize;
                                if iu == 0 || iu > ca.data.len() {
                                    return Err(mex(
                                        "CellIndexOutOfBounds",
                                        "Cell index out of bounds",
                                    ));
                                }
                                vec![(*ca.data[iu - 1]).clone()]
                            }
                            Value::Tensor(t) => {
                                // Treat as list of 1-based indices; expand each
                                let mut out: Vec<Value> = Vec::with_capacity(t.data.len());
                                for &val in &t.data {
                                    let iu = val as usize;
                                    if iu == 0 || iu > ca.data.len() {
                                        return Err(mex(
                                            "CellIndexOutOfBounds",
                                            "Cell index out of bounds",
                                        ));
                                    }
                                    out.push((*ca.data[iu - 1]).clone());
                                }
                                out
                            }
                            _ => return Err(mex("CellIndexType", "Unsupported cell index type")),
                        }
                    }
                    // Two-subscript (row, col) indexing into a 2-D cell array.
                    (Value::Cell(ca), 2) => {
                        let r: f64 = (&indices[0]).try_into()?;
                        let c: f64 = (&indices[1]).try_into()?;
                        let (ir, ic) = (r as usize, c as usize);
                        if ir == 0 || ir > ca.rows || ic == 0 || ic > ca.cols {
                            return Err(mex(
                                "CellSubscriptOutOfBounds",
                                "Cell subscript out of bounds",
                            ));
                        }
                        // Row-major flattening of the 1-based (ir, ic) subscript.
                        vec![(*ca.data[(ir - 1) * ca.cols + (ic - 1)]).clone()]
                    }
                    (other, _) => {
                        // Route to subsref(obj,'{}',{indices...}) if object
                        match other {
                            Value::Object(obj) => {
                                // Package the raw index values into a 1xN cell and
                                // dispatch through the class's subsref method.
                                let cell = runmat_builtins::CellArray::new(
                                    indices.clone(),
                                    1,
                                    indices.len(),
                                )
                                .map_err(|e| format!("subsref build error: {e}"))?;
                                let v = match runmat_runtime::call_builtin(
                                    "call_method",
                                    &[
                                        Value::Object(obj),
                                        Value::String("subsref".to_string()),
                                        Value::String("{}".to_string()),
                                        Value::Cell(cell),
                                    ],
                                ) {
                                    Ok(v) => v,
                                    Err(e) => vm_bail!(e),
                                };
                                vec![v]
                            }
                            _ => {
                                return Err(mex(
                                    "ExpandError",
                                    "CallBuiltinExpandLast requires cell or object cell access",
                                ))
                            }
                        }
                    }
                };
                // Final argument list: fixed args followed by the expanded comma-list.
                let mut args = fixed;
                args.extend(expanded.into_iter());
                match call_builtin_auto(&name, &args) {
                    Ok(v) => stack.push(v),
                    Err(e) => vm_bail!(e),
                }
            }
            Instr::CallBuiltinExpandAt(name, before_count, num_indices, after_count) => {
                // Call builtin `name` where a cell brace-expansion appears in the
                // MIDDLE of the argument list: `before_count` fixed args, then the
                // expansion, then `after_count` fixed args.
                // Stack layout: [..., a1..abefore, base, idx..., a_after...]
                let mut after: Vec<Value> = Vec::with_capacity(after_count);
                for _ in 0..after_count {
                    after.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                // Pops yield last-pushed-first; reverse each group back to source order.
                after.reverse();
                let mut indices = Vec::with_capacity(num_indices);
                for _ in 0..num_indices {
                    indices.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                indices.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let mut before: Vec<Value> = Vec::with_capacity(before_count);
                for _ in 0..before_count {
                    before.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                before.reverse();
                // Expand the cell access into zero or more values (comma-list).
                let expanded = match (base, indices.len()) {
                    // Linear 1-based indexing into a cell with a single subscript.
                    (Value::Cell(ca), 1) => match &indices[0] {
                        Value::Num(n) => {
                            let idx = *n as usize;
                            if idx == 0 || idx > ca.data.len() {
                                return Err(mex(
                                    "CellIndexOutOfBounds",
                                    "Cell index out of bounds",
                                ));
                            }
                            vec![(*ca.data[idx - 1]).clone()]
                        }
                        Value::Int(i) => {
                            let idx = i.to_i64() as usize;
                            if idx == 0 || idx > ca.data.len() {
                                return Err(mex(
                                    "CellIndexOutOfBounds",
                                    "Cell index out of bounds",
                                ));
                            }
                            vec![(*ca.data[idx - 1]).clone()]
                        }
                        Value::Tensor(t) => {
                            // Tensor index: treat each element as a 1-based index and
                            // expand every selected cell entry.
                            let mut out: Vec<Value> = Vec::with_capacity(t.data.len());
                            for &val in &t.data {
                                let iu = val as usize;
                                if iu == 0 || iu > ca.data.len() {
                                    return Err(mex(
                                        "CellIndexOutOfBounds",
                                        "Cell index out of bounds",
                                    ));
                                }
                                out.push((*ca.data[iu - 1]).clone());
                            }
                            out
                        }
                        _ => return Err(mex("CellIndexType", "Unsupported cell index type")),
                    },
                    // Two-subscript (row, col) indexing into a 2-D cell array.
                    (Value::Cell(ca), 2) => {
                        let r: f64 = (&indices[0]).try_into()?;
                        let c: f64 = (&indices[1]).try_into()?;
                        let (ir, ic) = (r as usize, c as usize);
                        if ir == 0 || ir > ca.rows || ic == 0 || ic > ca.cols {
                            return Err(mex(
                                "CellSubscriptOutOfBounds",
                                "Cell subscript out of bounds",
                            ));
                        }
                        vec![(*ca.data[(ir - 1) * ca.cols + (ic - 1)]).clone()]
                    }
                    // Object base: forward to subsref(obj, '{}', {indices...}).
                    (Value::Object(obj), _) => {
                        // NOTE(review): indices are coerced to Num with
                        // `unwrap_or(0.0)`, so a non-numeric index silently becomes 0
                        // here, and CallBuiltinExpandLast's object path passes the raw
                        // index values instead — confirm which behavior is intended.
                        let idx_vals: Vec<Value> = indices
                            .iter()
                            .map(|v| Value::Num((v).try_into().unwrap_or(0.0)))
                            .collect();
                        let cell = runmat_runtime::call_builtin("__make_cell", &idx_vals)?;
                        let v = match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsref".to_string()),
                                Value::String("{}".to_string()),
                                cell,
                            ],
                        ) {
                            Ok(v) => v,
                            Err(e) => vm_bail!(e),
                        };
                        vec![v]
                    }
                    _ => {
                        return Err(mex(
                            "ExpandError",
                            "CallBuiltinExpandAt requires cell or object cell access",
                        ))
                    }
                };
                // Final argument list: before-args, expanded comma-list, after-args.
                let mut args = before;
                args.extend(expanded.into_iter());
                args.extend(after.into_iter());
                match call_builtin_auto(&name, &args) {
                    Ok(v) => stack.push(v),
                    Err(e) => vm_bail!(e),
                }
            }
            Instr::CallBuiltinExpandMulti(name, specs) => {
                // General form: call builtin `name` with an argument list described by
                // `specs`, where each spec is either a plain stack value or a cell
                // brace-expansion (indexed, or `expand_all` for `c{:}`). Specs are
                // walked right-to-left to match LIFO stack order, then reversed.
                // Build final args by walking specs left-to-right and popping from stack accordingly.
                let mut args: Vec<Value> = Vec::with_capacity(specs.len());
                // We'll reconstruct by first collecting a temporary vector and then reversing (since stack is LIFO)
                let mut temp: Vec<Value> = Vec::new();
                for spec in specs.iter().rev() {
                    if spec.is_expand {
                        let mut indices = Vec::with_capacity(spec.num_indices);
                        for _ in 0..spec.num_indices {
                            indices.push(
                                stack
                                    .pop()
                                    .ok_or(mex("StackUnderflow", "stack underflow"))?,
                            );
                        }
                        indices.reverse();
                        let base = stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
                        // With native acceleration, drop any device residency tracking
                        // for the base before expanding it on the host.
                        #[cfg(feature = "native-accel")]
                        clear_residency(&base);
                        let expanded = if spec.expand_all {
                            // `c{:}` — expand every element of the cell in order.
                            match base {
                                Value::Cell(ca) => {
                                    ca.data.iter().map(|p| (*(*p)).clone()).collect()
                                }
                                Value::Object(obj) => {
                                    // subsref(obj,'{}', {}) with empty indices; expect a cell or value
                                    let empty = runmat_builtins::CellArray::new(vec![], 1, 0)
                                        .map_err(|e| format!("subsref build error: {e}"))?;
                                    let v = match runmat_runtime::call_builtin(
                                        "call_method",
                                        &[
                                            Value::Object(obj),
                                            Value::String("subsref".to_string()),
                                            Value::String("{}".to_string()),
                                            Value::Cell(empty),
                                        ],
                                    ) {
                                        Ok(v) => v,
                                        Err(e) => vm_bail!(e),
                                    };
                                    // A cell result is flattened into a comma-list;
                                    // anything else contributes a single value.
                                    match v {
                                        Value::Cell(ca) => {
                                            ca.data.iter().map(|p| (*(*p)).clone()).collect()
                                        }
                                        other => vec![other],
                                    }
                                }
                                _ => return Err(mex(
                                    "ExpandError",
                                    "CallBuiltinExpandMulti requires cell or object for expand_all",
                                )),
                            }
                        } else {
                            // Indexed expansion `c{idx...}` — same rules as the
                            // single-expansion instructions above.
                            match (base, indices.len()) {
                                // Linear 1-based indexing with a single subscript.
                                (Value::Cell(ca), 1) => match &indices[0] {
                                    Value::Num(n) => {
                                        let idx = *n as usize;
                                        if idx == 0 || idx > ca.data.len() {
                                            return Err(mex(
                                                "CellIndexOutOfBounds",
                                                "Cell index out of bounds",
                                            ));
                                        }
                                        vec![(*ca.data[idx - 1]).clone()]
                                    }
                                    Value::Int(i) => {
                                        let idx = i.to_i64() as usize;
                                        if idx == 0 || idx > ca.data.len() {
                                            return Err(mex(
                                                "CellIndexOutOfBounds",
                                                "Cell index out of bounds",
                                            ));
                                        }
                                        vec![(*ca.data[idx - 1]).clone()]
                                    }
                                    Value::Tensor(t) => {
                                        // Each tensor element is a 1-based index.
                                        let mut out: Vec<Value> = Vec::with_capacity(t.data.len());
                                        for &val in &t.data {
                                            let iu = val as usize;
                                            if iu == 0 || iu > ca.data.len() {
                                                return Err(mex(
                                                    "CellIndexOutOfBounds",
                                                    "Cell index out of bounds",
                                                ));
                                            }
                                            out.push((*ca.data[iu - 1]).clone());
                                        }
                                        out
                                    }
                                    _ => {
                                        return Err(mex(
                                            "CellIndexType",
                                            "Unsupported cell index type",
                                        ))
                                    }
                                },
                                // Two-subscript (row, col) indexing into a 2-D cell.
                                (Value::Cell(ca), 2) => {
                                    let r: f64 = (&indices[0]).try_into()?;
                                    let c: f64 = (&indices[1]).try_into()?;
                                    let (ir, ic) = (r as usize, c as usize);
                                    if ir == 0 || ir > ca.rows || ic == 0 || ic > ca.cols {
                                        return Err(mex(
                                            "CellSubscriptOutOfBounds",
                                            "Cell subscript out of bounds",
                                        ));
                                    }
                                    vec![(*ca.data[(ir - 1) * ca.cols + (ic - 1)]).clone()]
                                }
                                // Object base: forward to subsref(obj,'{}',{indices}).
                                (Value::Object(obj), _) => {
                                    // NOTE(review): like CallBuiltinExpandAt, indices
                                    // are coerced via `unwrap_or(0.0)` — non-numeric
                                    // indices silently become 0; confirm intended.
                                    let idx_vals: Vec<Value> = indices
                                        .iter()
                                        .map(|v| Value::Num((v).try_into().unwrap_or(0.0)))
                                        .collect();
                                    let cell =
                                        runmat_runtime::call_builtin("__make_cell", &idx_vals)?;
                                    let v = match runmat_runtime::call_builtin(
                                        "call_method",
                                        &[
                                            Value::Object(obj),
                                            Value::String("subsref".to_string()),
                                            Value::String("{}".to_string()),
                                            cell,
                                        ],
                                    ) {
                                        Ok(v) => v,
                                        Err(e) => vm_bail!(e),
                                    };
                                    vec![v]
                                }
                                _ => return Err(mex(
                                    "ExpandError",
                                    "CallBuiltinExpandMulti requires cell or object cell access",
                                )),
                            }
                        };
                        for v in expanded {
                            temp.push(v);
                        }
                    } else {
                        // Plain (non-expanding) argument: pop a single value.
                        temp.push(
                            stack
                                .pop()
                                .ok_or(mex("StackUnderflow", "stack underflow"))?,
                        );
                    }
                }
                // temp was built right-to-left; reverse to recover source order.
                temp.reverse();
                args.extend(temp.into_iter());
                match call_builtin_auto(&name, &args) {
                    Ok(v) => stack.push(v),
                    Err(e) => vm_bail!(e),
                }
            }
3244            Instr::PackToRow(count) => {
3245                // Pop count values and build a 1xN numeric tensor (Num only; others error)
3246                let mut vals: Vec<f64> = Vec::with_capacity(count);
3247                let mut tmp: Vec<Value> = Vec::with_capacity(count);
3248                for _ in 0..count {
3249                    tmp.push(
3250                        stack
3251                            .pop()
3252                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3253                    );
3254                }
3255                tmp.reverse();
3256                for v in tmp {
3257                    let n: f64 = (&v).try_into()?;
3258                    vals.push(n);
3259                }
3260                let tens = runmat_builtins::Tensor::new(vals, vec![1, count])
3261                    .map_err(|e| format!("PackToRow: {e}"))?;
3262                stack.push(Value::Tensor(tens));
3263            }
3264            Instr::PackToCol(count) => {
3265                let mut vals: Vec<f64> = Vec::with_capacity(count);
3266                let mut tmp: Vec<Value> = Vec::with_capacity(count);
3267                for _ in 0..count {
3268                    tmp.push(
3269                        stack
3270                            .pop()
3271                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3272                    );
3273                }
3274                tmp.reverse();
3275                for v in tmp {
3276                    let n: f64 = (&v).try_into()?;
3277                    vals.push(n);
3278                }
3279                let tens = runmat_builtins::Tensor::new(vals, vec![count, 1])
3280                    .map_err(|e| format!("PackToCol: {e}"))?;
3281                stack.push(Value::Tensor(tens));
3282            }
3283            Instr::CallFunctionExpandMulti(name, specs) => {
3284                // Build args via specs, then invoke user function similar to CallFunction
3285                let mut temp: Vec<Value> = Vec::new();
3286                for spec in specs.iter().rev() {
3287                    if spec.is_expand {
3288                        let mut indices = Vec::with_capacity(spec.num_indices);
3289                        for _ in 0..spec.num_indices {
3290                            indices.push(
3291                                stack
3292                                    .pop()
3293                                    .ok_or(mex("StackUnderflow", "stack underflow"))?,
3294                            );
3295                        }
3296                        indices.reverse();
3297                        let base = stack
3298                            .pop()
3299                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
3300                        let expanded = if spec.expand_all {
3301                            match base {
3302                                Value::Cell(ca) => ca.data.iter().map(|p| (*(*p)).clone()).collect::<Vec<Value>>(),
3303                                Value::Object(obj) => {
3304                                    let empty = runmat_builtins::CellArray::new(vec![], 1, 0).map_err(|e| format!("subsref build error: {e}"))?;
3305                                    let v = match runmat_runtime::call_builtin("call_method", &[
3306                                        Value::Object(obj),
3307                                        Value::String("subsref".to_string()),
3308                                        Value::String("{}".to_string()),
3309                                        Value::Cell(empty),
3310                                    ]) { Ok(v) => v, Err(e) => vm_bail!(e) };
3311                                    match v { Value::Cell(ca) => ca.data.iter().map(|p| (*(*p)).clone()).collect::<Vec<Value>>(), other => vec![other] }
3312                                }
3313                                _ => return Err("CallFunctionExpandMulti requires cell or object for expand_all".to_string()),
3314                            }
3315                        } else {
3316                            match (base, indices.len()) {
3317                                (Value::Cell(ca), 1) => match &indices[0] {
3318                                    Value::Num(n) => {
3319                                        let idx = *n as usize;
3320                                        if idx == 0 || idx > ca.data.len() {
3321                                            return Err(mex(
3322                                                "CellIndexOutOfBounds",
3323                                                "Cell index out of bounds",
3324                                            ));
3325                                        }
3326                                        vec![(*ca.data[idx - 1]).clone()]
3327                                    }
3328                                    Value::Int(i) => {
3329                                        let idx = i.to_i64() as usize;
3330                                        if idx == 0 || idx > ca.data.len() {
3331                                            return Err(mex(
3332                                                "CellIndexOutOfBounds",
3333                                                "Cell index out of bounds",
3334                                            ));
3335                                        }
3336                                        vec![(*ca.data[idx - 1]).clone()]
3337                                    }
3338                                    Value::Tensor(t) => {
3339                                        let mut out: Vec<Value> = Vec::with_capacity(t.data.len());
3340                                        for &val in &t.data {
3341                                            let iu = val as usize;
3342                                            if iu == 0 || iu > ca.data.len() {
3343                                                return Err(mex(
3344                                                    "CellIndexOutOfBounds",
3345                                                    "Cell index out of bounds",
3346                                                ));
3347                                            }
3348                                            out.push((*ca.data[iu - 1]).clone());
3349                                        }
3350                                        out
3351                                    }
3352                                    _ => {
3353                                        return Err(mex(
3354                                            "CellIndexType",
3355                                            "Unsupported cell index type",
3356                                        ))
3357                                    }
3358                                },
3359                                (Value::Cell(ca), 2) => {
3360                                    let r: f64 = (&indices[0]).try_into()?;
3361                                    let c: f64 = (&indices[1]).try_into()?;
3362                                    let (ir, ic) = (r as usize, c as usize);
3363                                    if ir == 0 || ir > ca.rows || ic == 0 || ic > ca.cols {
3364                                        return Err(mex(
3365                                            "CellSubscriptOutOfBounds",
3366                                            "Cell subscript out of bounds",
3367                                        ));
3368                                    }
3369                                    vec![(*ca.data[(ir - 1) * ca.cols + (ic - 1)]).clone()]
3370                                }
3371                                (Value::Object(obj), _) => {
3372                                    let cell = runmat_builtins::CellArray::new(
3373                                        indices.clone(),
3374                                        1,
3375                                        indices.len(),
3376                                    )
3377                                    .map_err(|e| format!("subsref build error: {e}"))?;
3378                                    let v = match runmat_runtime::call_builtin(
3379                                        "call_method",
3380                                        &[
3381                                            Value::Object(obj),
3382                                            Value::String("subsref".to_string()),
3383                                            Value::String("{}".to_string()),
3384                                            Value::Cell(cell),
3385                                        ],
3386                                    ) {
3387                                        Ok(v) => v,
3388                                        Err(e) => vm_bail!(e),
3389                                    };
3390                                    vec![v]
3391                                }
3392                                _ => return Err(
3393                                    "CallFunctionExpandMulti requires cell or object cell access"
3394                                        .to_string(),
3395                                ),
3396                            }
3397                        };
3398                        for v in expanded {
3399                            temp.push(v);
3400                        }
3401                    } else {
3402                        temp.push(
3403                            stack
3404                                .pop()
3405                                .ok_or(mex("StackUnderflow", "stack underflow"))?,
3406                        );
3407                    }
3408                }
3409                temp.reverse();
3410                let args = temp;
3411                let func: UserFunction = match bytecode.functions.get(&name) {
3412                    Some(f) => f.clone(),
3413                    None => vm_bail!(mex(
3414                        "UndefinedFunction",
3415                        &format!("Undefined function: {name}")
3416                    )),
3417                };
3418                let var_map = runmat_hir::remapping::create_complete_function_var_map(
3419                    &func.params,
3420                    &func.outputs,
3421                    &func.body,
3422                );
3423                let local_var_count = var_map.len();
3424                let remapped_body =
3425                    runmat_hir::remapping::remap_function_body(&func.body, &var_map);
3426                let func_vars_count = local_var_count.max(func.params.len());
3427                let mut func_vars = vec![Value::Num(0.0); func_vars_count];
3428                for (i, _param_id) in func.params.iter().enumerate() {
3429                    if i < args.len() && i < func_vars.len() {
3430                        func_vars[i] = args[i].clone();
3431                    }
3432                }
3433                for (original_var_id, local_var_id) in &var_map {
3434                    let local_index = local_var_id.0;
3435                    let global_index = original_var_id.0;
3436                    if local_index < func_vars.len() && global_index < vars.len() {
3437                        let is_parameter = func
3438                            .params
3439                            .iter()
3440                            .any(|param_id| param_id == original_var_id);
3441                        if !is_parameter {
3442                            func_vars[local_index] = vars[global_index].clone();
3443                        }
3444                    }
3445                }
3446                let mut func_var_types = func.var_types.clone();
3447                if func_var_types.len() < local_var_count {
3448                    func_var_types.resize(local_var_count, Type::Unknown);
3449                }
3450                let func_program = runmat_hir::HirProgram {
3451                    body: remapped_body,
3452                    var_types: func_var_types,
3453                };
3454                let func_bytecode =
3455                    crate::compile_with_functions(&func_program, &bytecode.functions)?;
3456                // Make nested closures visible to outer frames
3457                for (k, v) in func_bytecode.functions.iter() {
3458                    context.functions.insert(k.clone(), v.clone());
3459                }
3460                let func_result_vars = match interpret_function(&func_bytecode, func_vars) {
3461                    Ok(v) => v,
3462                    Err(e) => vm_bail!(e),
3463                };
3464                if let Some(output_var_id) = func.outputs.first() {
3465                    let local_output_index = var_map.get(output_var_id).map(|id| id.0).unwrap_or(0);
3466                    if local_output_index < func_result_vars.len() {
3467                        stack.push(func_result_vars[local_output_index].clone());
3468                    } else {
3469                        stack.push(Value::Num(0.0));
3470                    }
3471                } else {
3472                    stack.push(Value::Num(0.0));
3473                }
3474            }
            // Call `name` with `arg_count` stack arguments and exactly one
            // requested output, which is pushed back onto the VM stack.
            // Resolution order: runtime builtin first, then a user-defined
            // function compiled from HIR in `bytecode.functions`.
            Instr::CallFunction(name, arg_count) => {
                // First, try runtime builtin fallback (some helpers like call_method)
                {
                    let mut args = Vec::new();
                    for _ in 0..arg_count {
                        args.push(
                            stack
                                .pop()
                                .ok_or(mex("StackUnderflow", "stack underflow"))?,
                        );
                    }
                    // Arguments were pushed left-to-right, so popping yields them
                    // reversed; restore call order.
                    args.reverse();
                    let prepared_primary = accel_prepare_args(&name, &args)?;
                    // NOTE(review): a builtin that exists but returns Err is silently
                    // swallowed here; we fall through to user-function lookup, which
                    // may mask the builtin's real error as "UndefinedFunction" —
                    // confirm this fallback is intentional.
                    if let Ok(result) = runmat_runtime::call_builtin(&name, &prepared_primary) {
                        stack.push(result);
                        pc += 1;
                        continue;
                    }
                    // Put args back if not a builtin: we'll handle as user function below
                    // NOTE(review): the accel-prepared values are pushed back, not the
                    // originals — presumably equivalent for user functions; verify.
                    for v in prepared_primary.into_iter().rev() {
                        stack.push(v);
                    }
                }
                let func: UserFunction = match bytecode.functions.get(&name) {
                    Some(f) => f.clone(),
                    None => vm_bail!(mex(
                        "UndefinedFunction",
                        &format!("Undefined function: {name}")
                    )),
                };
                // Re-pop the arguments that the builtin-attempt block pushed back.
                let mut args = Vec::new();
                for _ in 0..arg_count {
                    args.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                args.reverse();
                // Arity validation: exact match for fixed-arity functions; at
                // least `params.len() - 1` when the last parameter is varargin.
                if !func.has_varargin {
                    if arg_count < func.params.len() {
                        vm_bail!(mex(
                            "NotEnoughInputs",
                            &format!(
                                "Function '{name}' expects {} inputs, got {arg_count}",
                                func.params.len()
                            )
                        ));
                    }
                    if arg_count > func.params.len() {
                        vm_bail!(mex(
                            "TooManyInputs",
                            &format!(
                                "Function '{name}' expects {} inputs, got {arg_count}",
                                func.params.len()
                            )
                        ));
                    }
                } else {
                    let min_args = func.params.len().saturating_sub(1);
                    if arg_count < min_args {
                        vm_bail!(mex(
                            "NotEnoughInputs",
                            &format!("Function '{name}' expects at least {min_args} inputs, got {arg_count}")
                        ));
                    }
                }
                // Build the callee frame: remap the function body's variable ids
                // into a dense local index space.
                let var_map = runmat_hir::remapping::create_complete_function_var_map(
                    &func.params,
                    &func.outputs,
                    &func.body,
                );
                let local_var_count = var_map.len();
                let remapped_body =
                    runmat_hir::remapping::remap_function_body(&func.body, &var_map);
                let func_vars_count = local_var_count.max(func.params.len());
                let mut func_vars = vec![Value::Num(0.0); func_vars_count];
                if func.has_varargin {
                    // All fixed parameters except the last (varargin placeholder) are positional; pack the rest into a cell
                    let fixed = func.params.len().saturating_sub(1);
                    for i in 0..fixed {
                        if i < args.len() && i < func_vars.len() {
                            func_vars[i] = args[i].clone();
                        }
                    }
                    let mut rest: Vec<Value> = if args.len() > fixed {
                        args[fixed..].to_vec()
                    } else {
                        Vec::new()
                    };
                    // Create row cell for varargin
                    let cell = runmat_builtins::CellArray::new(
                        std::mem::take(&mut rest),
                        1,
                        if args.len() > fixed {
                            args.len() - fixed
                        } else {
                            0
                        },
                    )
                    .map_err(|e| format!("varargin: {e}"))?;
                    if fixed < func_vars.len() {
                        func_vars[fixed] = Value::Cell(cell);
                    }
                } else {
                    for (i, _param_id) in func.params.iter().enumerate() {
                        if i < args.len() && i < func_vars.len() {
                            func_vars[i] = args[i].clone();
                        }
                    }
                }
                // Copy referenced globals into local frame
                // (parameters are skipped so positional arguments are not clobbered).
                for (original_var_id, local_var_id) in &var_map {
                    let local_index = local_var_id.0;
                    let global_index = original_var_id.0;
                    if local_index < func_vars.len() && global_index < vars.len() {
                        let is_parameter = func
                            .params
                            .iter()
                            .any(|param_id| param_id == original_var_id);
                        if !is_parameter {
                            func_vars[local_index] = vars[global_index].clone();
                        }
                    }
                }
                // Initialize varargout cell if needed
                if func.has_varargout {
                    if let Some(varargout_oid) = func.outputs.last() {
                        if let Some(local_id) = var_map.get(varargout_oid) {
                            if local_id.0 < func_vars.len() {
                                let empty = runmat_builtins::CellArray::new(vec![], 1, 0)
                                    .map_err(|e| format!("varargout init: {e}"))?;
                                func_vars[local_id.0] = Value::Cell(empty);
                            }
                        }
                    }
                }
                // Pad the callee's static type table so every local has an entry.
                let mut func_var_types = func.var_types.clone();
                if func_var_types.len() < local_var_count {
                    func_var_types.resize(local_var_count, Type::Unknown);
                }
                let func_program = runmat_hir::HirProgram {
                    body: remapped_body,
                    var_types: func_var_types,
                };
                let func_bytecode =
                    crate::compile_with_functions(&func_program, &bytecode.functions)?;
                // Execute the callee with nargout = 1; on error, unwind to the
                // nearest enclosing try/catch frame if one is active.
                let func_result_vars = match interpret_function_with_counts(
                    &func_bytecode,
                    func_vars,
                    &name,
                    1,
                    arg_count,
                ) {
                    Ok(v) => v,
                    Err(e) => {
                        if let Some((catch_pc, catch_var)) = try_stack.pop() {
                            if let Some(var_idx) = catch_var {
                                if var_idx >= vars.len() {
                                    vars.resize(var_idx + 1, Value::Num(0.0));
                                    refresh_workspace_state(&vars);
                                }
                                // Shadows the `mex` helper locally; bind the parsed
                                // exception for the catch variable.
                                let mex = parse_exception(&e);
                                last_exception = Some(mex.clone());
                                vars[var_idx] = Value::MException(mex);
                            }
                            pc = catch_pc;
                            continue;
                        } else {
                            vm_bail!(e);
                        }
                    }
                };
                if func.has_varargout {
                    // Single-output call: return first varargout element if any, else 0
                    // For true multi-assign we already have CallFunctionMulti path
                    let first = func
                        .outputs
                        .first()
                        .and_then(|oid| var_map.get(oid))
                        .map(|lid| lid.0)
                        .unwrap_or(0);
                    if let Some(Value::Cell(ca)) = func_result_vars.get(first) {
                        if !ca.data.is_empty() {
                            stack.push((*ca.data[0]).clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    } else if let Some(v) = func_result_vars.get(first) {
                        stack.push(v.clone());
                    } else {
                        stack.push(Value::Num(0.0));
                    }
                } else if let Some(output_var_id) = func.outputs.first() {
                    // Fixed outputs: push the first declared output (or 0.0 if the
                    // callee never assigned it).
                    let local_output_index = var_map.get(output_var_id).map(|id| id.0).unwrap_or(0);
                    if local_output_index < func_result_vars.len() {
                        stack.push(func_result_vars[local_output_index].clone());
                    } else {
                        stack.push(Value::Num(0.0));
                    }
                } else {
                    vm_bail!(mex(
                        "TooManyOutputs",
                        &format!("Function '{name}' does not return outputs")
                    ));
                }
            }
3682            Instr::CallFunctionExpandAt(name, before_count, num_indices, after_count) => {
3683                // Assemble argument list with expansion at position
3684                let mut after: Vec<Value> = Vec::with_capacity(after_count);
3685                for _ in 0..after_count {
3686                    after.push(
3687                        stack
3688                            .pop()
3689                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3690                    );
3691                }
3692                after.reverse();
3693                let mut indices = Vec::with_capacity(num_indices);
3694                for _ in 0..num_indices {
3695                    indices.push(
3696                        stack
3697                            .pop()
3698                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3699                    );
3700                }
3701                indices.reverse();
3702                let base = stack
3703                    .pop()
3704                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
3705                let mut before: Vec<Value> = Vec::with_capacity(before_count);
3706                for _ in 0..before_count {
3707                    before.push(
3708                        stack
3709                            .pop()
3710                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3711                    );
3712                }
3713                before.reverse();
3714                let expanded = match (base, indices.len()) {
3715                    (Value::Cell(ca), 1) => match &indices[0] {
3716                        Value::Num(n) => {
3717                            let idx = *n as usize;
3718                            if idx == 0 || idx > ca.data.len() {
3719                                return Err(mex(
3720                                    "CellIndexOutOfBounds",
3721                                    "Cell index out of bounds",
3722                                ));
3723                            }
3724                            vec![(*ca.data[idx - 1]).clone()]
3725                        }
3726                        Value::Int(i) => {
3727                            let idx = i.to_i64() as usize;
3728                            if idx == 0 || idx > ca.data.len() {
3729                                return Err(mex(
3730                                    "CellIndexOutOfBounds",
3731                                    "Cell index out of bounds",
3732                                ));
3733                            }
3734                            vec![(*ca.data[idx - 1]).clone()]
3735                        }
3736                        Value::Tensor(t) => {
3737                            let mut out: Vec<Value> = Vec::with_capacity(t.data.len());
3738                            for &val in &t.data {
3739                                let iu = val as usize;
3740                                if iu == 0 || iu > ca.data.len() {
3741                                    return Err(mex(
3742                                        "CellIndexOutOfBounds",
3743                                        "Cell index out of bounds",
3744                                    ));
3745                                }
3746                                out.push((*ca.data[iu - 1]).clone());
3747                            }
3748                            out
3749                        }
3750                        _ => return Err(mex("CellIndexType", "Unsupported cell index type")),
3751                    },
3752                    (Value::Cell(ca), 2) => {
3753                        let r: f64 = (&indices[0]).try_into()?;
3754                        let c: f64 = (&indices[1]).try_into()?;
3755                        let (ir, ic) = (r as usize, c as usize);
3756                        if ir == 0 || ir > ca.rows || ic == 0 || ic > ca.cols {
3757                            return Err(mex(
3758                                "CellSubscriptOutOfBounds",
3759                                "Cell subscript out of bounds",
3760                            ));
3761                        }
3762                        vec![(*ca.data[(ir - 1) * ca.cols + (ic - 1)]).clone()]
3763                    }
3764                    (Value::Object(obj), _) => {
3765                        let idx_vals: Vec<Value> = indices
3766                            .iter()
3767                            .map(|v| Value::Num((v).try_into().unwrap_or(0.0)))
3768                            .collect();
3769                        let cell = runmat_runtime::call_builtin("__make_cell", &idx_vals)?;
3770                        let v = match runmat_runtime::call_builtin(
3771                            "call_method",
3772                            &[
3773                                Value::Object(obj),
3774                                Value::String("subsref".to_string()),
3775                                Value::String("{}".to_string()),
3776                                cell,
3777                            ],
3778                        ) {
3779                            Ok(v) => v,
3780                            Err(e) => vm_bail!(e),
3781                        };
3782                        vec![v]
3783                    }
3784                    _ => {
3785                        return Err(mex(
3786                            "ExpandError",
3787                            "CallBuiltinExpandAt requires cell or object cell access",
3788                        ))
3789                    }
3790                };
3791                let mut args = before;
3792                args.extend(expanded.into_iter());
3793                args.extend(after.into_iter());
3794                match call_builtin(&name, &args) {
3795                    Ok(v) => stack.push(v),
3796                    Err(e) => vm_bail!(e),
3797                }
3798            }
3799            Instr::CallFunctionMulti(name, arg_count, out_count) => {
3800                let func: UserFunction = match bytecode.functions.get(&name) {
3801                    Some(f) => f.clone(),
3802                    None => vm_bail!(format!("undefined function: {name}")),
3803                };
3804                let mut args = Vec::new();
3805                for _ in 0..arg_count {
3806                    args.push(
3807                        stack
3808                            .pop()
3809                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
3810                    );
3811                }
3812                args.reverse();
3813                if !func.has_varargin {
3814                    if arg_count < func.params.len() {
3815                        vm_bail!(mex(
3816                            "NotEnoughInputs",
3817                            &format!(
3818                                "Function '{name}' expects {} inputs, got {arg_count}",
3819                                func.params.len()
3820                            )
3821                        ));
3822                    }
3823                    if arg_count > func.params.len() {
3824                        vm_bail!(mex(
3825                            "TooManyInputs",
3826                            &format!(
3827                                "Function '{name}' expects {} inputs, got {arg_count}",
3828                                func.params.len()
3829                            )
3830                        ));
3831                    }
3832                } else if arg_count + 1 < func.params.len() {
3833                    vm_bail!(mex(
3834                        "NotEnoughInputs",
3835                        &format!(
3836                            "Function '{name}' expects at least {} inputs, got {arg_count}",
3837                            func.params.len() - 1
3838                        )
3839                    ));
3840                }
3841                let var_map = runmat_hir::remapping::create_complete_function_var_map(
3842                    &func.params,
3843                    &func.outputs,
3844                    &func.body,
3845                );
3846                let local_var_count = var_map.len();
3847                let remapped_body =
3848                    runmat_hir::remapping::remap_function_body(&func.body, &var_map);
3849                let func_vars_count = local_var_count.max(func.params.len());
3850                let mut func_vars = vec![Value::Num(0.0); func_vars_count];
3851                if func.has_varargin {
3852                    let fixed = func.params.len().saturating_sub(1);
3853                    for i in 0..fixed {
3854                        if i < args.len() && i < func_vars.len() {
3855                            func_vars[i] = args[i].clone();
3856                        }
3857                    }
3858                    let mut rest: Vec<Value> = if args.len() > fixed {
3859                        args[fixed..].to_vec()
3860                    } else {
3861                        Vec::new()
3862                    };
3863                    let cell = runmat_builtins::CellArray::new(
3864                        std::mem::take(&mut rest),
3865                        1,
3866                        if args.len() > fixed {
3867                            args.len() - fixed
3868                        } else {
3869                            0
3870                        },
3871                    )
3872                    .map_err(|e| format!("varargin: {e}"))?;
3873                    if fixed < func_vars.len() {
3874                        func_vars[fixed] = Value::Cell(cell);
3875                    }
3876                } else {
3877                    for (i, _param_id) in func.params.iter().enumerate() {
3878                        if i < args.len() && i < func_vars.len() {
3879                            func_vars[i] = args[i].clone();
3880                        }
3881                    }
3882                }
3883                for (original_var_id, local_var_id) in &var_map {
3884                    let local_index = local_var_id.0;
3885                    let global_index = original_var_id.0;
3886                    if local_index < func_vars.len() && global_index < vars.len() {
3887                        let is_parameter = func
3888                            .params
3889                            .iter()
3890                            .any(|param_id| param_id == original_var_id);
3891                        if !is_parameter {
3892                            func_vars[local_index] = vars[global_index].clone();
3893                        }
3894                    }
3895                }
3896                // Initialize varargout cell if needed
3897                if func.has_varargout {
3898                    if let Some(varargout_oid) = func.outputs.last() {
3899                        if let Some(local_id) = var_map.get(varargout_oid) {
3900                            if local_id.0 < func_vars.len() {
3901                                let empty = runmat_builtins::CellArray::new(vec![], 1, 0)
3902                                    .map_err(|e| format!("varargout init: {e}"))?;
3903                                func_vars[local_id.0] = Value::Cell(empty);
3904                            }
3905                        }
3906                    }
3907                }
3908                let mut func_var_types = func.var_types.clone();
3909                if func_var_types.len() < local_var_count {
3910                    func_var_types.resize(local_var_count, Type::Unknown);
3911                }
3912                let func_program = runmat_hir::HirProgram {
3913                    body: remapped_body,
3914                    var_types: func_var_types,
3915                };
3916                let func_bytecode =
3917                    crate::compile_with_functions(&func_program, &bytecode.functions)?;
3918                let func_result_vars = match interpret_function_with_counts(
3919                    &func_bytecode,
3920                    func_vars,
3921                    &name,
3922                    out_count,
3923                    arg_count,
3924                ) {
3925                    Ok(v) => v,
3926                    Err(e) => {
3927                        if let Some((catch_pc, catch_var)) = try_stack.pop() {
3928                            if let Some(var_idx) = catch_var {
3929                                if var_idx >= vars.len() {
3930                                    vars.resize(var_idx + 1, Value::Num(0.0));
3931                                    refresh_workspace_state(&vars);
3932                                }
3933                                let mex = parse_exception(&e);
3934                                last_exception = Some(mex.clone());
3935                                vars[var_idx] = Value::MException(mex);
3936                            }
3937                            pc = catch_pc;
3938                            continue;
3939                        } else {
3940                            vm_bail!(e);
3941                        }
3942                    }
3943                };
3944                if func.has_varargout {
3945                    // Push named outputs first (excluding varargout itself), then fill from varargout cell, then pad with 0.0
3946                    let total_named = func.outputs.len().saturating_sub(1);
3947                    let mut pushed = 0usize;
3948                    // Push named outputs in order
3949                    for i in 0..total_named.min(out_count) {
3950                        if let Some(oid) = func.outputs.get(i) {
3951                            if let Some(local_id) = var_map.get(oid) {
3952                                let idx = local_id.0;
3953                                let v = func_result_vars
3954                                    .get(idx)
3955                                    .cloned()
3956                                    .unwrap_or(Value::Num(0.0));
3957                                stack.push(v);
3958                                pushed += 1;
3959                            }
3960                        }
3961                    }
3962                    if pushed < out_count {
3963                        // Now consume from varargout cell (last output)
3964                        if let Some(varargout_oid) = func.outputs.last() {
3965                            if let Some(local_id) = var_map.get(varargout_oid) {
3966                                if let Some(Value::Cell(ca)) = func_result_vars.get(local_id.0) {
3967                                    let available = ca.data.len();
3968                                    let need = out_count - pushed;
3969                                    if need > available {
3970                                        vm_bail!(mex("VarargoutMismatch", &format!("Function '{name}' returned {available} varargout values, {need} requested")));
3971                                    }
3972                                    for vi in 0..need {
3973                                        stack.push((*ca.data[vi]).clone());
3974                                    }
3975                                }
3976                            }
3977                        }
3978                    }
3979                    // No padding
3980                } else {
3981                    // Push out_count values; error if requesting more than defined
3982                    let defined = func.outputs.len();
3983                    if out_count > defined {
3984                        vm_bail!(mex(
3985                            "TooManyOutputs",
3986                            &format!("Function '{name}' defines {defined} outputs, {out_count} requested")
3987                        ));
3988                    }
3989                    for i in 0..out_count {
3990                        let v = func
3991                            .outputs
3992                            .get(i)
3993                            .and_then(|oid| var_map.get(oid))
3994                            .map(|lid| lid.0)
3995                            .and_then(|idx| func_result_vars.get(idx))
3996                            .cloned()
3997                            .unwrap_or(Value::Num(0.0));
3998                        stack.push(v);
3999                    }
4000                }
4001            }
4002            Instr::CallBuiltinMulti(name, arg_count, out_count) => {
4003                // Default behavior: try to call builtin; if success, use first output; pad rest with 0.0
4004                let mut args = Vec::new();
4005                for _ in 0..arg_count {
4006                    args.push(
4007                        stack
4008                            .pop()
4009                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
4010                    );
4011                }
4012                args.reverse();
4013                if name == "gather" {
4014                    let eval = match runmat_runtime::builtins::acceleration::gpu::gather::evaluate(
4015                        &args,
4016                    ) {
4017                        Ok(eval) => eval,
4018                        Err(err) => vm_bail!(err),
4019                    };
4020                    let len = eval.len();
4021                    if out_count == 0 {
4022                        continue;
4023                    }
4024                    if len == 1 {
4025                        if out_count > 1 {
4026                            vm_bail!(mex("TooManyOutputs", "gather: too many output arguments"));
4027                        }
4028                        stack.push(eval.into_first());
4029                        continue;
4030                    }
4031                    if out_count != len {
4032                        vm_bail!(mex(
4033                            "TooManyOutputs",
4034                            "gather: number of outputs must match number of inputs"
4035                        ));
4036                    }
4037                    for value in eval.into_outputs() {
4038                        stack.push(value);
4039                    }
4040                    continue;
4041                }
                // `meshgrid`: up to three outputs; the builtin reports how many it can
                // produce (`output_count`, 2 or 3 depending on input arity) and each
                // requested output is fetched and pushed in order.
                if name == "meshgrid" {
                    let eval = match runmat_runtime::builtins::array::creation::meshgrid::evaluate(
                        &args,
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let available = eval.output_count();
                    if out_count > available {
                        // Tailor the error message to the input arity.
                        let msg = if available == 2 {
                            "meshgrid with two inputs supports at most two outputs"
                        } else {
                            "meshgrid supports at most three outputs"
                        };
                        vm_bail!(mex("TooManyOutputs", msg));
                    }
                    // First output is always produced when any output is requested.
                    let first = match eval.first() {
                        Ok(value) => value,
                        Err(err) => vm_bail!(err),
                    };
                    stack.push(first);
                    if out_count >= 2 {
                        let second = match eval.second() {
                            Ok(value) => value,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(second);
                    }
                    if out_count >= 3 {
                        let third = match eval.third() {
                            Ok(value) => value,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(third);
                    }
                    continue;
                }
4082                if name == "load" {
4083                    let eval = match runmat_runtime::builtins::io::mat::load::evaluate(&args) {
4084                        Ok(eval) => eval,
4085                        Err(err) => vm_bail!(err),
4086                    };
4087                    if out_count == 0 {
4088                        if let Err(err) = assign_loaded_variables(&mut vars, eval.variables()) {
4089                            vm_bail!(err);
4090                        }
4091                        continue;
4092                    }
4093                    if out_count > 1 {
4094                        vm_bail!(mex(
4095                            "TooManyOutputs",
4096                            "load supports at most one output argument"
4097                        ));
4098                    }
4099                    stack.push(eval.first_output());
4100                    for _ in 1..out_count {
4101                        stack.push(Value::Num(0.0));
4102                    }
4103                    continue;
4104                }
                // The following IO/filesystem builtins all share the same multi-output
                // protocol: evaluate, then push each requested output from the
                // builtin's `outputs()` slice, padding any missing positions with
                // numeric zero so the stack always receives exactly `out_count`
                // values.
                if name == "fopen" {
                    let eval = match runmat_runtime::builtins::io::filetext::fopen::evaluate(&args)
                    {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            // Pad: more outputs requested than the builtin produced.
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "fgets" {
                    if args.is_empty() {
                        vm_bail!(mex(
                            "RuntimeError",
                            "fgets requires at least one input argument"
                        ));
                    }
                    // First argument (the file id) is passed separately from the rest.
                    let eval = match runmat_runtime::builtins::io::filetext::fgets::evaluate(
                        &args[0],
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "fclose" {
                    let eval = match runmat_runtime::builtins::io::filetext::fclose::evaluate(&args)
                    {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "mkdir" {
                    let eval = match runmat_runtime::builtins::io::repl_fs::mkdir::evaluate(&args) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "setenv" {
                    let eval = match runmat_runtime::builtins::io::repl_fs::setenv::evaluate(&args)
                    {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "savepath" {
                    let eval =
                        match runmat_runtime::builtins::io::repl_fs::savepath::evaluate(&args) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "copyfile" {
                    let eval =
                        match runmat_runtime::builtins::io::repl_fs::copyfile::evaluate(&args) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "movefile" {
                    let eval =
                        match runmat_runtime::builtins::io::repl_fs::movefile::evaluate(&args) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                if name == "rmdir" {
                    let eval = match runmat_runtime::builtins::io::repl_fs::rmdir::evaluate(&args) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let outputs = eval.outputs();
                    for i in 0..out_count {
                        if let Some(value) = outputs.get(i) {
                            stack.push(value.clone());
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `orderfields`: [ordered, permutation] = orderfields(s, ...). The
                // first input is consumed by value; extra requested outputs beyond the
                // second are padded with numeric zero.
                if name == "orderfields" && !args.is_empty() {
                    let eval = match runmat_runtime::builtins::structs::core::orderfields::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let (ordered, permutation) = eval.into_values();
                    stack.push(ordered);
                    if out_count >= 2 {
                        stack.push(permutation);
                    }
                    if out_count > 2 {
                        // Pad any further requested outputs.
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `chol`: single-output form must be positive definite (errors
                // otherwise); two-output form [R, p] returns the factor plus a flag
                // instead of erroring.
                if name == "chol" {
                    if args.is_empty() {
                        vm_bail!(mex("NotEnoughInputs", "chol requires an input matrix"));
                    }
                    let eval = match runmat_runtime::builtins::math::linalg::factor::chol::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(v) => v,
                        Err(err) => vm_bail!(err),
                    };
                    match out_count {
                        0 => continue,
                        1 => {
                            // Single-output chol errors on non-PD input.
                            if !eval.is_positive_definite() {
                                vm_bail!("Matrix must be positive definite.".to_string());
                            }
                            stack.push(eval.factor());
                            continue;
                        }
                        2 => {
                            stack.push(eval.factor());
                            stack.push(eval.flag());
                            continue;
                        }
                        _ => vm_bail!(mex(
                            "TooManyOutputs",
                            "chol currently supports at most two outputs"
                        )),
                    }
                }
                // `lu`: 1 output = combined LU matrix, 2 = [L, U], 3 = [L, U, P].
                if name == "lu" {
                    if args.is_empty() {
                        vm_bail!(mex("NotEnoughInputs", "lu requires an input matrix"));
                    }
                    let eval = match runmat_runtime::builtins::math::linalg::factor::lu::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(v) => v,
                        Err(err) => vm_bail!(err),
                    };
                    match out_count {
                        0 => continue,
                        1 => {
                            stack.push(eval.combined());
                            continue;
                        }
                        2 => {
                            stack.push(eval.lower());
                            stack.push(eval.upper());
                            continue;
                        }
                        3 => {
                            stack.push(eval.lower());
                            stack.push(eval.upper());
                            stack.push(eval.permutation());
                            continue;
                        }
                        _ => vm_bail!(mex(
                            "TooManyOutputs",
                            "lu currently supports at most three outputs"
                        )),
                    }
                }
                // `linsolve`: 1 output = solution X, 2 = [X, rcond].
                if name == "linsolve" {
                    if args.len() < 2 {
                        vm_bail!(mex(
                            "NotEnoughInputs",
                            "linsolve requires coefficient and right-hand side inputs"
                        ));
                    }
                    let eval =
                        match runmat_runtime::builtins::math::linalg::solve::linsolve::evaluate_args(
                            args[0].clone(),
                            args[1].clone(),
                            &args[2..],
                        ) {
                            Ok(v) => v,
                            Err(err) => vm_bail!(err),
                        };
                    match out_count {
                        0 => continue,
                        1 => {
                            stack.push(eval.solution());
                            continue;
                        }
                        2 => {
                            stack.push(eval.solution());
                            stack.push(eval.reciprocal_condition());
                            continue;
                        }
                        _ => vm_bail!(mex(
                            "TooManyOutputs",
                            "linsolve currently supports at most two outputs"
                        )),
                    }
                }
4403                if name == "qr" {
4404                    if args.is_empty() {
4405                        vm_bail!(mex("NotEnoughInputs", "qr requires an input matrix"));
4406                    }
4407                    let eval = match runmat_runtime::builtins::math::linalg::factor::qr::evaluate(
4408                        args[0].clone(),
4409                        &args[1..],
4410                    ) {
4411                        Ok(v) => v,
4412                        Err(err) => vm_bail!(err),
4413                    };
4414                    match out_count {
4415                        0 => {
4416                            pc += 1;
4417                            continue;
4418                        }
4419                        1 => {
4420                            stack.push(eval.r());
4421                            pc += 1;
4422                            continue;
4423                        }
4424                        2 => {
4425                            stack.push(eval.q());
4426                            stack.push(eval.r());
4427                            pc += 1;
4428                            continue;
4429                        }
4430                        3 => {
4431                            stack.push(eval.q());
4432                            stack.push(eval.r());
4433                            stack.push(eval.permutation());
4434                            pc += 1;
4435                            continue;
4436                        }
4437                        _ => vm_bail!(mex(
4438                            "TooManyOutputs",
4439                            "qr currently supports at most three outputs"
4440                        )),
4441                    }
4442                }
                // `svd`: 1 output = singular values vector, 2 = [U, S], 3 = [U, S, V].
                if name == "svd" {
                    if args.is_empty() {
                        vm_bail!(mex("NotEnoughInputs", "svd requires an input matrix"));
                    }
                    let eval = match runmat_runtime::builtins::math::linalg::factor::svd::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(v) => v,
                        Err(err) => vm_bail!(err),
                    };
                    match out_count {
                        0 => continue,
                        1 => {
                            stack.push(eval.singular_values());
                            continue;
                        }
                        2 => {
                            stack.push(eval.u());
                            stack.push(eval.sigma());
                            continue;
                        }
                        3 => {
                            stack.push(eval.u());
                            stack.push(eval.sigma());
                            stack.push(eval.v());
                            continue;
                        }
                        _ => vm_bail!(mex(
                            "TooManyOutputs",
                            "svd currently supports at most three outputs"
                        )),
                    }
                }
                // `eig`: 1 output = eigenvalues, 2 = [V, D], 3 = [V, D, W]. Left
                // eigenvectors are only computed when the third output is requested
                // (`require_left`), since they are more expensive to produce.
                if name == "eig" {
                    if args.is_empty() {
                        vm_bail!(mex("NotEnoughInputs", "eig requires an input matrix"));
                    }
                    let require_left = out_count >= 3;
                    let eval = match runmat_runtime::builtins::math::linalg::factor::eig::evaluate(
                        args[0].clone(),
                        &args[1..],
                        require_left,
                    ) {
                        Ok(v) => v,
                        Err(err) => vm_bail!(err),
                    };
                    match out_count {
                        0 => continue,
                        1 => {
                            stack.push(eval.eigenvalues());
                            continue;
                        }
                        2 => {
                            stack.push(eval.right());
                            stack.push(eval.diagonal());
                            continue;
                        }
                        3 => {
                            stack.push(eval.right());
                            stack.push(eval.diagonal());
                            // Fetching left eigenvectors can itself fail.
                            let left = match eval.left() {
                                Ok(value) => value,
                                Err(err) => vm_bail!(err),
                            };
                            stack.push(left);
                            continue;
                        }
                        _ => vm_bail!(mex(
                            "TooManyOutputs",
                            "eig currently supports at most three outputs"
                        )),
                    }
                }
                // Special-case for 'find' to support [i,j,v] = find(A)
                // One output yields linear indices; two or more yield row/column
                // subscripts, with the matched values as an optional third output.
                // Requested outputs beyond what find produces are zero-padded.
                if name == "find" && !args.is_empty() {
                    let eval = match runmat_runtime::builtins::array::indexing::find::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count <= 1 {
                        // Single-output form: linear indices.
                        let linear = match eval.linear_value() {
                            Ok(v) => v,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(linear);
                        for _ in 1..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    } else {
                        // Multi-output form: rows, columns, then (optionally) values.
                        let rows = match eval.row_value() {
                            Ok(v) => v,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(rows);
                        let cols = match eval.column_value() {
                            Ok(v) => v,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(cols);
                        if out_count >= 3 {
                            let vals = match eval.values_value() {
                                Ok(v) => v,
                                Err(err) => vm_bail!(err),
                            };
                            stack.push(vals);
                        }
                        if out_count > 3 {
                            for _ in 3..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                    }
                    continue;
                }
                // `regexp`: the builtin returns an ordered list of outputs for the
                // multi-assignment form; requested outputs are drained from the front
                // and any shortfall is zero-padded.
                if name == "regexp" && args.len() >= 2 {
                    let eval = match runmat_runtime::builtins::strings::regex::regexp::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        &args[2..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    let mut values = match eval.outputs_for_multi() {
                        Ok(values) => values,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    for _ in 0..out_count {
                        if !values.is_empty() {
                            // Consume outputs in order.
                            stack.push(values.remove(0));
                        } else {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `deconv`: [q, r] = deconv(b, a); extra outputs are zero-padded.
                if name == "deconv" {
                    if args.len() < 2 {
                        vm_bail!(mex("MATLAB:minrhs", "Not enough input arguments."));
                    }
                    let eval = match runmat_runtime::builtins::math::signal::deconv::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    stack.push(eval.quotient());
                    if out_count >= 2 {
                        stack.push(eval.remainder());
                    }
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `polyder`: with <=1 output, computes the derivative of a single
                // polynomial or of a product of two; with 2 outputs, computes the
                // quotient-rule form [numerator, denominator], which requires exactly
                // two inputs.
                if name == "polyder" {
                    if args.is_empty() {
                        vm_bail!(mex("MATLAB:minrhs", "Not enough input arguments."));
                    }
                    if out_count <= 1 {
                        // Dispatch on input arity: single polynomial vs product rule.
                        let result = match args.len() {
                            1 => runmat_runtime::builtins::math::poly::polyder::derivative_single(
                                args[0].clone(),
                            ),
                            2 => runmat_runtime::builtins::math::poly::polyder::derivative_product(
                                args[0].clone(),
                                args[1].clone(),
                            ),
                            _ => vm_bail!("polyder: too many input arguments.".to_string()),
                        };
                        match result {
                            Ok(value) => {
                                if out_count == 0 {
                                    continue;
                                }
                                stack.push(value);
                            }
                            Err(err) => vm_bail!(err),
                        }
                        if out_count > 1 {
                            for _ in 1..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                        continue;
                    }
                    // Two-output (quotient) form requires exactly two inputs.
                    if args.len() != 2 {
                        vm_bail!(mex(
                            "MATLAB:minrhs",
                            "Not enough input arguments for quotient form."
                        ));
                    }
                    let eval =
                        match runmat_runtime::builtins::math::poly::polyder::evaluate_quotient(
                            args[0].clone(),
                            args[1].clone(),
                        ) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err),
                        };
                    stack.push(eval.numerator());
                    stack.push(eval.denominator());
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `polyval`: [y, delta] = polyval(p, x, ...). The builtin is told
                // up front (out_count >= 2) whether the delta output will be needed.
                if name == "polyval" {
                    if args.len() < 2 {
                        vm_bail!(mex("MATLAB:minrhs", "Not enough input arguments."));
                    }
                    let eval = match runmat_runtime::builtins::math::poly::polyval::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        &args[2..],
                        out_count >= 2,
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    stack.push(eval.value());
                    if out_count >= 2 {
                        // Fetching delta can fail independently of the main value.
                        let delta = match eval.delta() {
                            Ok(v) => v,
                            Err(err) => vm_bail!(err),
                        };
                        stack.push(delta);
                    }
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `polyfit`: [p, S, mu] = polyfit(x, y, n, ...); extra requested
                // outputs beyond three are zero-padded.
                if name == "polyfit" {
                    if args.len() < 3 {
                        vm_bail!(mex("MATLAB:minrhs", "Not enough input arguments."));
                    }
                    let eval = match runmat_runtime::builtins::math::poly::polyfit::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        args[2].clone(),
                        &args[3..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    stack.push(eval.coefficients());
                    if out_count >= 2 {
                        stack.push(eval.stats());
                    }
                    if out_count >= 3 {
                        stack.push(eval.mu());
                    }
                    if out_count > 3 {
                        for _ in 3..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `filter`: [y, zf] = filter(b, a, x, ...). The single-output path
                // consumes the result by value; the multi-output path splits it into
                // the filtered output and the final state.
                if name == "filter" {
                    if args.len() < 3 {
                        vm_bail!(mex("MATLAB:minrhs", "Not enough input arguments."));
                    }
                    let eval = match runmat_runtime::builtins::math::signal::filter::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        args[2].clone(),
                        &args[3..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_value());
                    } else {
                        let (output, final_state) = eval.into_pair();
                        stack.push(output);
                        stack.push(final_state);
                        if out_count > 2 {
                            for _ in 2..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                    }
                    continue;
                }
                // `sort`: [sorted, indices] = sort(A, ...); extra requested outputs
                // beyond two are zero-padded.
                if name == "sort" && !args.is_empty() {
                    let eval = match runmat_runtime::builtins::array::sorting_sets::sort::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    let (sorted, indices) = eval.into_values();
                    stack.push(sorted);
                    if out_count >= 2 {
                        stack.push(indices);
                    }
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
4782                if name == "cummin" && !args.is_empty() {
4783                    let eval = match runmat_runtime::builtins::math::reduction::evaluate_cummin(
4784                        args[0].clone(),
4785                        &args[1..],
4786                    ) {
4787                        Ok(eval) => eval,
4788                        Err(err) => vm_bail!(err),
4789                    };
4790                    if out_count == 0 {
4791                        continue;
4792                    }
4793                    let (values, indices) = eval.into_pair();
4794                    stack.push(values);
4795                    if out_count >= 2 {
4796                        stack.push(indices);
4797                    }
4798                    if out_count > 2 {
4799                        for _ in 2..out_count {
4800                            stack.push(Value::Num(0.0));
4801                        }
4802                    }
4803                    continue;
4804                }
4805                if name == "min" && !args.is_empty() {
4806                    let eval = match runmat_runtime::builtins::math::reduction::evaluate_min(
4807                        args[0].clone(),
4808                        &args[1..],
4809                    ) {
4810                        Ok(eval) => eval,
4811                        Err(err) => vm_bail!(err),
4812                    };
4813                    if out_count == 0 {
4814                        continue;
4815                    }
4816                    let (values, indices) = eval.into_pair();
4817                    stack.push(values);
4818                    if out_count >= 2 {
4819                        stack.push(indices);
4820                    }
4821                    if out_count > 2 {
4822                        for _ in 2..out_count {
4823                            stack.push(Value::Num(0.0));
4824                        }
4825                    }
4826                    continue;
4827                }
                // `sortrows`: row-wise sort; pushes sorted matrix, then the row
                // permutation indices when >= 2 outputs were requested.
                // NOTE(review): handlers from here on use `vm_bail!(err.to_string())`
                // while the handlers above use `vm_bail!(err)` — presumably the
                // evaluators return different error types; confirm and unify.
                if name == "sortrows" && !args.is_empty() {
                    let eval =
                        match runmat_runtime::builtins::array::sorting_sets::sortrows::evaluate(
                            args[0].clone(),
                            &args[1..],
                        ) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err.to_string()),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    let (sorted, indices) = eval.into_values();
                    stack.push(sorted);
                    if out_count >= 2 {
                        stack.push(indices);
                    }
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `ismember`: one output pushes only the membership mask; two or
                // more push (mask, location-index) with zero padding beyond that.
                if name == "ismember" && args.len() >= 2 {
                    let eval =
                        match runmat_runtime::builtins::array::sorting_sets::ismember::evaluate(
                            args[0].clone(),
                            args[1].clone(),
                            &args[2..],
                        ) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err.to_string()),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_mask_value());
                        continue;
                    }
                    let (mask, loc) = eval.into_pair();
                    stack.push(mask);
                    stack.push(loc);
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `intersect`: up to three outputs (values, ia, ib); the evaluator
                // exposes progressively wider accessors so only the requested
                // amount of work is materialized on the stack.
                if name == "intersect" && args.len() >= 2 {
                    let eval =
                        match runmat_runtime::builtins::array::sorting_sets::intersect::evaluate(
                            args[0].clone(),
                            args[1].clone(),
                            &args[2..],
                        ) {
                            Ok(eval) => eval,
                            Err(err) => vm_bail!(err.to_string()),
                        };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_values_value());
                        continue;
                    }
                    if out_count == 2 {
                        let (values, ia) = eval.into_pair();
                        stack.push(values);
                        stack.push(ia);
                        continue;
                    }
                    let (values, ia, ib) = eval.into_triple();
                    stack.push(values);
                    stack.push(ia);
                    stack.push(ib);
                    if out_count > 3 {
                        for _ in 3..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `union`: same three-output shape as `intersect` — (values, ia, ib).
                if name == "union" && args.len() >= 2 {
                    let eval = match runmat_runtime::builtins::array::sorting_sets::union::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        &args[2..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err.to_string()),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_values_value());
                        continue;
                    }
                    if out_count == 2 {
                        let (values, ia) = eval.into_pair();
                        stack.push(values);
                        stack.push(ia);
                        continue;
                    }
                    let (values, ia, ib) = eval.into_triple();
                    stack.push(values);
                    stack.push(ia);
                    stack.push(ib);
                    if out_count > 3 {
                        for _ in 3..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `histcounts`: one output pushes the counts only; two or more
                // push (counts, edges), zero-padding beyond that.
                if name == "histcounts" && !args.is_empty() {
                    let eval = match runmat_runtime::builtins::stats::hist::histcounts::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err.to_string()),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_counts_value());
                        continue;
                    }
                    let (counts, edges) = eval.into_pair();
                    stack.push(counts);
                    stack.push(edges);
                    if out_count > 2 {
                        for _ in 2..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `histcounts2`: 2-D variant — up to three outputs
                // (counts, xedges, yedges).
                if name == "histcounts2" && args.len() >= 2 {
                    let eval = match runmat_runtime::builtins::stats::hist::histcounts2::evaluate(
                        args[0].clone(),
                        args[1].clone(),
                        &args[2..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err.to_string()),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_counts_value());
                        continue;
                    }
                    if out_count == 2 {
                        let (counts, xedges) = eval.into_pair();
                        stack.push(counts);
                        stack.push(xedges);
                        continue;
                    }
                    let (counts, xedges, yedges) = eval.into_triple();
                    stack.push(counts);
                    stack.push(xedges);
                    stack.push(yedges);
                    if out_count > 3 {
                        for _ in 3..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // `unique`: up to three outputs (values, ia, ic) following the
                // same widening-accessor pattern as `intersect`/`union`.
                if name == "unique" && !args.is_empty() {
                    let eval = match runmat_runtime::builtins::array::sorting_sets::unique::evaluate(
                        args[0].clone(),
                        &args[1..],
                    ) {
                        Ok(eval) => eval,
                        Err(err) => vm_bail!(err.to_string()),
                    };
                    if out_count == 0 {
                        continue;
                    }
                    if out_count == 1 {
                        stack.push(eval.into_values_value());
                        continue;
                    }
                    if out_count == 2 {
                        let (values, ia) = eval.into_pair();
                        stack.push(values);
                        stack.push(ia);
                        continue;
                    }
                    let (values, ia, ic) = eval.into_triple();
                    stack.push(values);
                    stack.push(ia);
                    stack.push(ic);
                    if out_count > 3 {
                        for _ in 3..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    continue;
                }
                // Generic fallback: no specialized multi-output handler matched,
                // so dispatch through the ordinary builtin registry. A single
                // returned value is fanned out across the requested outputs:
                //   * Tensor  -> one scalar output per element, in storage order;
                //   * Cell    -> one output per cell element (cloned out of the box);
                //   * other   -> pushed as the first output;
                // with Value::Num(0.0) padding any outputs left unfilled.
                match call_builtin(&name, &args) {
                    Ok(v) => match v {
                        Value::Tensor(t) => {
                            let mut pushed = 0usize;
                            for &val in t.data.iter() {
                                if pushed >= out_count {
                                    break;
                                }
                                stack.push(Value::Num(val));
                                pushed += 1;
                            }
                            for _ in pushed..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                        Value::Cell(ca) => {
                            let mut pushed = 0usize;
                            for v in &ca.data {
                                if pushed >= out_count {
                                    break;
                                }
                                stack.push((**v).clone());
                                pushed += 1;
                            }
                            for _ in pushed..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                        other => {
                            stack.push(other);
                            for _ in 1..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                    },
                    Err(e) => {
                        // Try wildcard imports resolution similar to CallBuiltin:
                        // for each `import pkg.*`, retry the call with the fully
                        // qualified name "pkg.<name>" and use the first success.
                        let mut resolved = None;
                        for (path, wildcard) in &imports {
                            if !*wildcard {
                                continue;
                            }
                            // Rebuild the dotted package prefix from its segments.
                            let mut qual = String::new();
                            for (i, part) in path.iter().enumerate() {
                                if i > 0 {
                                    qual.push('.');
                                }
                                qual.push_str(part);
                            }
                            qual.push('.');
                            qual.push_str(&name);
                            if let Ok(v) = call_builtin(&qual, &args) {
                                resolved = Some(v);
                                break;
                            }
                        }
                        if let Some(v) = resolved {
                            // Same fan-out rules as the direct-hit path above.
                            match v {
                                Value::Tensor(t) => {
                                    let mut pushed = 0usize;
                                    for &val in t.data.iter() {
                                        if pushed >= out_count {
                                            break;
                                        }
                                        stack.push(Value::Num(val));
                                        pushed += 1;
                                    }
                                    for _ in pushed..out_count {
                                        stack.push(Value::Num(0.0));
                                    }
                                }
                                Value::Cell(ca) => {
                                    let mut pushed = 0usize;
                                    for v in &ca.data {
                                        if pushed >= out_count {
                                            break;
                                        }
                                        stack.push((**v).clone());
                                        pushed += 1;
                                    }
                                    for _ in pushed..out_count {
                                        stack.push(Value::Num(0.0));
                                    }
                                }
                                other => {
                                    stack.push(other);
                                    for _ in 1..out_count {
                                        stack.push(Value::Num(0.0));
                                    }
                                }
                            }
                        } else {
                            // No wildcard import resolved the name either:
                            // surface the original dispatch error.
                            vm_bail!(e.to_string());
                        }
                    }
                }
5132            }
            // try/catch bookkeeping: EnterTry records where to jump (and which
            // variable receives the exception) if an error unwinds while the
            // protected region executes; PopTry discards the record on normal
            // exit from the region.
            Instr::EnterTry(catch_pc, catch_var) => {
                try_stack.push((catch_pc, catch_var));
            }
            Instr::PopTry => {
                try_stack.pop();
            }
            // Build a rows x cols numeric matrix from rows*cols scalars on the
            // stack. Elements were pushed in row-major order, so popping yields
            // them reversed; after the reverse() we hold row-major data, which is
            // then transposed into the Tensor's column-major storage.
            // NOTE(review): errors here propagate via `?` rather than `vm_bail!`,
            // which presumably bypasses the try_stack handlers — confirm whether
            // a malformed element should be catchable by MATLAB-level try/catch.
            Instr::CreateMatrix(rows, cols) => {
                let total_elements = rows * cols;
                let mut row_major = Vec::with_capacity(total_elements);
                for _ in 0..total_elements {
                    let val: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    row_major.push(val);
                }
                row_major.reverse();
                // Reorder to column-major storage: cm[r + c*rows] = rm[r*cols + c]
                let mut data = vec![0.0; total_elements];
                for r in 0..rows {
                    for c in 0..cols {
                        data[r + c * rows] = row_major[r * cols + c];
                    }
                }
                let matrix = runmat_builtins::Tensor::new_2d(data, rows, cols)
                    .map_err(|e| format!("Matrix creation error: {e}"))?;
                stack.push(Value::Tensor(matrix));
            }
            // Build a matrix whose per-row element counts are only known at run
            // time. Stack layout (top first): one length per row (last row's
            // length on top), beneath which sit all row values. Lengths are
            // popped and reversed into source order; rows are then popped
            // last-row-first (hence `iter().rev()` over the lengths), each row's
            // values reversed into order, and finally the whole row list is
            // reversed so rows_data ends up row 1 .. row N. Concatenation
            // semantics (mixing scalars/matrices per row) are delegated to
            // `create_matrix_from_values`.
            Instr::CreateMatrixDynamic(num_rows) => {
                let mut row_lengths = Vec::new();
                for _ in 0..num_rows {
                    let row_len: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    row_lengths.push(row_len as usize);
                }
                row_lengths.reverse();
                let mut rows_data = Vec::new();
                for &row_len in row_lengths.iter().rev() {
                    let mut row_values = Vec::new();
                    for _ in 0..row_len {
                        row_values.push(
                            stack
                                .pop()
                                .ok_or(mex("StackUnderflow", "stack underflow"))?,
                        );
                    }
                    row_values.reverse();
                    rows_data.push(row_values);
                }
                rows_data.reverse();
                let result = runmat_runtime::create_matrix_from_values(&rows_data)?;
                stack.push(result);
            }
            // Materialize a range expression (start:end or start:step:end).
            // Operands were pushed start [, step], end — so they pop in reverse:
            // end first, then the optional step, then start. Range construction
            // (including empty-range and direction handling) is delegated to
            // runmat_runtime::create_range.
            Instr::CreateRange(has_step) => {
                if has_step {
                    let end: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    let step: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    let start: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    let range_result = runmat_runtime::create_range(start, Some(step), end)?;
                    stack.push(range_result);
                } else {
                    let end: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    let start: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    let range_result = runmat_runtime::create_range(start, None, end)?;
                    stack.push(range_result);
                }
            }
            // Basic indexing: pop `num_indices` numeric subscripts (pushed in
            // order, so reversed after popping), then the base value beneath
            // them. Object bases are routed through the class system as a
            // `subsref` call with '()' indexing; everything else goes through
            // the runtime's generic perform_indexing.
            Instr::Index(num_indices) => {
                let mut indices = Vec::new();
                let count = num_indices;
                for _ in 0..count {
                    // Each subscript must coerce to f64; non-numeric values fail
                    // the try_into conversion.
                    let index_val: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    indices.push(index_val);
                }
                indices.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // GPU residency is invalidated before host-side indexing so we
                // never read a stale host copy (accel builds only).
                #[cfg(feature = "native-accel")]
                clear_residency(&base);
                match base {
                    Value::Object(obj) => {
                        // Package the subscripts into a 1xN cell for subsref.
                        let cell = runmat_builtins::CellArray::new(
                            indices.iter().map(|n| Value::Num(*n)).collect(),
                            1,
                            indices.len(),
                        )
                        .map_err(|e| format!("subsref build error: {e}"))?;
                        match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsref".to_string()),
                                Value::String("()".to_string()),
                                Value::Cell(cell),
                            ],
                        ) {
                            Ok(v) => stack.push(v),
                            Err(e) => vm_bail!(e.to_string()),
                        }
                    }
                    other => {
                        let result = match runmat_runtime::perform_indexing(&other, &indices) {
                            Ok(v) => v,
                            Err(e) => vm_bail!(e.to_string()),
                        };
                        stack.push(result);
                    }
                }
            }
5263            Instr::IndexSlice(dims, numeric_count, colon_mask, end_mask) => {
5264                let __b = bench_start();
5265                // Pop numeric indices in reverse order (they were pushed in order), then base
5266                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
5267                for _ in 0..numeric_count {
5268                    numeric.push(
5269                        stack
5270                            .pop()
5271                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
5272                    );
5273                }
5274                numeric.reverse();
5275                let mut base = stack
5276                    .pop()
5277                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
5278                let mut logical_base = false;
5279                base = match base {
5280                    Value::LogicalArray(la) => {
5281                        logical_base = true;
5282                        let data: Vec<f64> = la
5283                            .data
5284                            .iter()
5285                            .map(|&b| if b != 0 { 1.0 } else { 0.0 })
5286                            .collect();
5287                        let tensor = runmat_builtins::Tensor::new(data, la.shape.clone())
5288                            .map_err(|e| format!("slice: {e}"))?;
5289                        Value::Tensor(tensor)
5290                    }
5291                    other => other,
5292                };
5293                match base {
5294                    Value::Object(obj) => {
5295                        let cell =
5296                            runmat_builtins::CellArray::new(numeric.to_vec(), 1, numeric.len())
5297                                .map_err(|e| format!("subsref build error: {e}"))?;
5298                        match runmat_runtime::call_builtin(
5299                            "call_method",
5300                            &[
5301                                Value::Object(obj),
5302                                Value::String("subsref".to_string()),
5303                                Value::String("()".to_string()),
5304                                Value::Cell(cell),
5305                            ],
5306                        ) {
5307                            Ok(v) => stack.push(v),
5308                            Err(e) => vm_bail!(e.to_string()),
5309                        }
5310                    }
5311                    Value::Tensor(t) => {
5312                        let rank = t.shape.len();
5313                        // Build per-dimension selectors
5314                        #[derive(Clone)]
5315                        enum Sel {
5316                            Colon,
5317                            Scalar(usize),
5318                            Indices(Vec<usize>),
5319                        }
5320                        let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
5321                        let mut num_iter = 0usize;
5322                        if dims == 1 {
5323                            let total = t.data.len();
5324                            let mut idxs: Vec<usize> = Vec::new();
5325                            let is_colon = (colon_mask & 1u32) != 0;
5326                            let is_end = (end_mask & 1u32) != 0;
5327                            if is_colon {
5328                                idxs = (1..=total).collect();
5329                            } else if is_end {
5330                                idxs = vec![total];
5331                            } else if let Some(v) = numeric.first() {
5332                                match v {
5333                                    Value::Num(n) => {
5334                                        let i = *n as isize;
5335                                        if i < 1 {
5336                                            vm_bail!(mex(
5337                                                "IndexOutOfBounds",
5338                                                "Index out of bounds"
5339                                            ));
5340                                        }
5341                                        idxs = vec![i as usize];
5342                                    }
5343                                    Value::Tensor(idx_t) => {
5344                                        let len = idx_t.shape.iter().product::<usize>();
5345                                        if len == total {
5346                                            for (i, &val) in idx_t.data.iter().enumerate() {
5347                                                if val != 0.0 {
5348                                                    idxs.push(i + 1);
5349                                                }
5350                                            }
5351                                        } else {
5352                                            for &val in &idx_t.data {
5353                                                let i = val as isize;
5354                                                if i < 1 {
5355                                                    vm_bail!(mex(
5356                                                        "IndexOutOfBounds",
5357                                                        "Index out of bounds"
5358                                                    ));
5359                                                }
5360                                                idxs.push(i as usize);
5361                                            }
5362                                        }
5363                                    }
5364                                    _ => vm_bail!(mex(
5365                                        "UnsupportedIndexType",
5366                                        "Unsupported index type"
5367                                    )),
5368                                }
5369                            } else {
5370                                vm_bail!(mex("MissingNumericIndex", "missing numeric index"));
5371                            }
                            // Linear (1-D) indexing tail: every 1-based index must lie in 1..=numel.
                            if idxs.iter().any(|&i| i == 0 || i > total) {
                                vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
                            }
                            if idxs.len() == 1 {
                                // Single linear index -> collapse to a scalar value.
                                stack.push(Value::Num(t.data[idxs[0] - 1]));
                            } else {
                                // Multiple linear indices -> gather into a column vector.
                                let mut out = Vec::with_capacity(idxs.len());
                                for &i in &idxs {
                                    out.push(t.data[i - 1]);
                                }
                                let tens = runmat_builtins::Tensor::new(out, vec![idxs.len(), 1])
                                    .map_err(|e| format!("Slice error: {e}"))?;
                                stack.push(Value::Tensor(tens));
                            }
                        } else {
                            // N-D subscript indexing: build one selector per dimension from the
                            // colon/end bit masks; numeric operands are consumed in order.
                            for d in 0..dims {
                                let is_colon = (colon_mask & (1u32 << d)) != 0;
                                let is_end = (end_mask & (1u32 << d)) != 0;
                                if is_colon {
                                    selectors.push(Sel::Colon);
                                } else if is_end {
                                    // Plain 'end' -> scalar size of this dim
                                    let dim_len = *t.shape.get(d).unwrap_or(&1);
                                    selectors.push(Sel::Scalar(dim_len));
                                } else {
                                    // Consume the next numeric index operand for this dimension.
                                    let v = numeric.get(num_iter).ok_or(mex(
                                        "MissingNumericIndex",
                                        "missing numeric index",
                                    ))?;
                                    num_iter += 1;
                                    match v {
                                        Value::Num(n) => {
                                            // Scalar subscript; 1-based, must be positive.
                                            let idx = *n as isize;
                                            if idx < 1 {
                                                return Err(mex(
                                                    "IndexOutOfBounds",
                                                    "Index out of bounds",
                                                ));
                                            }
                                            selectors.push(Sel::Scalar(idx as usize));
                                        }
                                        Value::Tensor(idx_t) => {
                                            // Logical mask if length matches dimension
                                            let dim_len = *t.shape.get(d).unwrap_or(&1);
                                            let len = idx_t.shape.iter().product::<usize>();
                                            if len == dim_len {
                                                // Mask interpretation: keep 1-based positions of nonzeros.
                                                let mut indices = Vec::new();
                                                for (i, &val) in idx_t.data.iter().enumerate() {
                                                    if val != 0.0 {
                                                        indices.push(i + 1);
                                                    }
                                                }
                                                selectors.push(Sel::Indices(indices));
                                            } else {
                                                // Treat as explicit indices (1-based)
                                                let mut indices = Vec::with_capacity(len);
                                                for &val in &idx_t.data {
                                                    let idx = val as isize;
                                                    if idx < 1 {
                                                        return Err(mex(
                                                            "IndexOutOfBounds",
                                                            "Index out of bounds",
                                                        ));
                                                    }
                                                    indices.push(idx as usize);
                                                }
                                                selectors.push(Sel::Indices(indices));
                                            }
                                        }
                                        Value::LogicalArray(la) => {
                                            // Logical arrays must match the dimension length exactly.
                                            let dim_len = *t.shape.get(d).unwrap_or(&1);
                                            if la.data.len() == dim_len {
                                                let mut indices = Vec::new();
                                                for (i, &b) in la.data.iter().enumerate() {
                                                    if b != 0 {
                                                        indices.push(i + 1);
                                                    }
                                                }
                                                selectors.push(Sel::Indices(indices));
                                            } else {
                                                return Err(mex(
                                                    "IndexShape",
                                                    "Logical mask shape mismatch",
                                                ));
                                            }
                                        }
                                        _ => {
                                            return Err(mex(
                                                "UnsupportedIndexType",
                                                "Unsupported index type",
                                            ))
                                        }
                                    }
                                }
                            }
                            // 2-D fast paths
                            // Each path pushes its result, records a bench span, advances pc
                            // and `continue`s the dispatch loop, bypassing the generic gather.
                            if dims == 2 {
                                let rows = if rank >= 1 { t.shape[0] } else { 1 };
                                let cols = if rank >= 2 { t.shape[1] } else { 1 };
                                match (&selectors[0], &selectors[1]) {
                                    // Full column
                                    (Sel::Colon, Sel::Scalar(j)) => {
                                        let j0 = *j - 1;
                                        if j0 >= cols {
                                            return Err(mex(
                                                "IndexOutOfBounds",
                                                "Index out of bounds",
                                            ));
                                        }
                                        // Column-major layout: a full column is contiguous.
                                        let start = j0 * rows;
                                        let out = t.data[start..start + rows].to_vec();
                                        if out.len() == 1 {
                                            stack.push(Value::Num(out[0]));
                                        } else {
                                            let tens =
                                                runmat_builtins::Tensor::new(out, vec![rows, 1])
                                                    .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        }
                                        bench_end("IndexSlice2D.fast_col", __b);
                                        pc += 1;
                                        continue;
                                    }
                                    // Full row
                                    (Sel::Scalar(i), Sel::Colon) => {
                                        let i0 = *i - 1;
                                        if i0 >= rows {
                                            return Err(mex(
                                                "IndexOutOfBounds",
                                                "Index out of bounds",
                                            ));
                                        }
                                        // Row elements are strided by `rows` in column-major storage.
                                        let mut out: Vec<f64> = Vec::with_capacity(cols);
                                        for c in 0..cols {
                                            out.push(t.data[i0 + c * rows]);
                                        }
                                        if out.len() == 1 {
                                            stack.push(Value::Num(out[0]));
                                        } else {
                                            let tens =
                                                runmat_builtins::Tensor::new(out, vec![1, cols])
                                                    .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        }
                                        bench_end("IndexSlice2D.fast_row", __b);
                                        pc += 1;
                                        continue;
                                    }
                                    // Full columns subset: A(:, J)
                                    (Sel::Colon, Sel::Indices(js)) => {
                                        // Gather selected full columns into a [rows, |J|] tensor
                                        if js.is_empty() {
                                            // Empty column selection -> [rows, 0] empty tensor.
                                            let tens = runmat_builtins::Tensor::new(
                                                Vec::new(),
                                                vec![rows, 0],
                                            )
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        } else {
                                            let mut out: Vec<f64> =
                                                Vec::with_capacity(rows * js.len());
                                            for &j in js {
                                                let j0 = j - 1;
                                                if j0 >= cols {
                                                    return Err(mex(
                                                        "IndexOutOfBounds",
                                                        "Index out of bounds",
                                                    ));
                                                }
                                                let start = j0 * rows;
                                                out.extend_from_slice(&t.data[start..start + rows]);
                                            }
                                            let tens = runmat_builtins::Tensor::new(
                                                out,
                                                vec![rows, js.len()],
                                            )
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        }
                                        bench_end("IndexSlice2D.fast_cols", __b);
                                        pc += 1;
                                        continue;
                                    }
                                    // Selected rows full: A(I, :)
                                    (Sel::Indices(is), Sel::Colon) => {
                                        // Gather selected rows across all columns into [|I|, cols]
                                        if is.is_empty() {
                                            let tens = runmat_builtins::Tensor::new(
                                                Vec::new(),
                                                vec![0, cols],
                                            )
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        } else {
                                            let mut out: Vec<f64> =
                                                Vec::with_capacity(is.len() * cols);
                                            // Column-major output: columns outer, selected rows inner.
                                            for c in 0..cols {
                                                for &i in is {
                                                    let i0 = i - 1;
                                                    if i0 >= rows {
                                                        return Err(mex(
                                                            "IndexOutOfBounds",
                                                            "Index out of bounds",
                                                        ));
                                                    }
                                                    out.push(t.data[i0 + c * rows]);
                                                }
                                            }
                                            let tens = runmat_builtins::Tensor::new(
                                                out,
                                                vec![is.len(), cols],
                                            )
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                            stack.push(Value::Tensor(tens));
                                        }
                                        bench_end("IndexSlice2D.fast_rows_multi", __b);
                                        pc += 1;
                                        continue;
                                    }
                                    _ => {}
                                }
                            }
5594                            {
5595                                // Compute output shape and gather
5596                                let mut out_dims: Vec<usize> = Vec::new();
5597                                let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
5598                                for (d, sel) in selectors.iter().enumerate().take(dims) {
5599                                    let dim_len = *t.shape.get(d).unwrap_or(&1);
5600                                    let idxs = match sel {
5601                                        Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
5602                                        Sel::Scalar(i) => vec![*i],
5603                                        Sel::Indices(v) => v.clone(),
5604                                    };
5605                                    if idxs.iter().any(|&i| i == 0 || i > dim_len) {
5606                                        return Err(mex("IndexOutOfBounds", "Index out of bounds"));
5607                                    }
5608                                    if idxs.len() > 1 {
5609                                        out_dims.push(idxs.len());
5610                                    } else {
5611                                        out_dims.push(1);
5612                                    }
5613                                    per_dim_indices.push(idxs);
5614                                }
5615                                let mut out_dims: Vec<usize> =
5616                                    per_dim_indices.iter().map(|v| v.len()).collect();
                                // 2D mixed selectors shape correction to match MATLAB:
                                // (I, scalar) => column vector [len(I), 1]; (scalar, J) => row vector [1, len(J)]
                                if dims == 2 {
                                    match (
                                        &per_dim_indices[0].as_slice(),
                                        &per_dim_indices[1].as_slice(),
                                    ) {
                                        // I (len>1), scalar
                                        (i_list, j_list)
                                            if i_list.len() > 1 && j_list.len() == 1 =>
                                        {
                                            out_dims = vec![i_list.len(), 1];
                                        }
                                        // scalar, J (len>1)
                                        (i_list, j_list)
                                            if i_list.len() == 1 && j_list.len() > 1 =>
                                        {
                                            out_dims = vec![1, j_list.len()];
                                        }
                                        _ => {}
                                    }
                                }
                                // Strides for column-major order (first dimension fastest)
                                let mut strides: Vec<usize> = vec![0; dims];
                                let full_shape: Vec<usize> = if rank < dims {
                                    // Indexing with more subscripts than rank: pad trailing
                                    // dimensions with extent 1.
                                    let mut s = t.shape.clone();
                                    s.resize(dims, 1);
                                    s
                                } else {
                                    t.shape.clone()
                                };
                                let mut acc = 1usize;
                                for (d, stride) in strides.iter_mut().enumerate().take(dims) {
                                    *stride = acc;
                                    acc *= full_shape[d];
                                }
                                // Cartesian product gather
                                let total_out: usize = out_dims.iter().product();
                                let mut out_data: Vec<f64> = Vec::with_capacity(total_out);
                                if out_dims.contains(&0)
                                    || per_dim_indices.iter().any(|v| v.is_empty())
                                {
                                    // Empty selection on some dimension -> empty tensor
                                    let out_tensor =
                                        runmat_builtins::Tensor::new(out_data, out_dims)
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                    stack.push(Value::Tensor(out_tensor));
                                } else {
5665                                    fn cartesian<F: FnMut(&[usize])>(
5666                                        lists: &[Vec<usize>],
5667                                        mut f: F,
5668                                    ) {
5669                                        let dims = lists.len();
5670                                        let mut idx = vec![0usize; dims];
5671                                        loop {
5672                                            let current: Vec<usize> =
5673                                                (0..dims).map(|d| lists[d][idx[d]]).collect();
5674                                            f(&current);
5675                                            // Increment first dimension fastest (column-major order)
5676                                            let mut d = 0usize;
5677                                            while d < dims {
5678                                                idx[d] += 1;
5679                                                if idx[d] < lists[d].len() {
5680                                                    break;
5681                                                }
5682                                                idx[d] = 0;
5683                                                d += 1;
5684                                            }
5685                                            if d == dims {
5686                                                break;
5687                                            }
5688                                        }
5689                                    }
                                    cartesian(&per_dim_indices, |multi| {
                                        // Convert the 1-based multi-index to a column-major
                                        // linear offset using the precomputed strides.
                                        let mut lin = 0usize;
                                        for d in 0..dims {
                                            let i0 = multi[d] - 1;
                                            lin += i0 * strides[d];
                                        }
                                        out_data.push(t.data[lin]);
                                    });
                                    if out_data.len() == 1 {
                                        // Single gathered element collapses to a scalar.
                                        stack.push(Value::Num(out_data[0]));
                                    } else {
                                        let out_tensor =
                                            runmat_builtins::Tensor::new(out_data, out_dims)
                                                .map_err(|e| format!("Slice error: {e}"))?;
                                        stack.push(Value::Tensor(out_tensor));
                                    }
                                }
                            }
                        }
                    }
5710                    Value::GpuTensor(handle) => {
5711                        let provider = runmat_accelerate_api::provider()
5712                            .ok_or_else(|| "No acceleration provider registered".to_string())?;
5713                        let base_shape = handle.shape.clone();
5714                        let selectors = build_slice_selectors(
5715                            dims,
5716                            colon_mask,
5717                            end_mask,
5718                            &numeric,
5719                            &base_shape,
5720                        )
5721                        .map_err(|e| format!("slice: {e}"))?;
5722                        let plan =
5723                            build_slice_plan(&selectors, dims, &base_shape).map_err(|e| {
5724                                if e.contains("IndexOutOfBounds") {
5725                                    e.clone()
5726                                } else {
5727                                    format!("slice: {e}")
5728                                }
5729                            })?;
5730                        if plan.indices.is_empty() {
5731                            let zeros = provider
5732                                .zeros(&plan.output_shape)
5733                                .map_err(|e| format!("slice: {e}"))?;
5734                            stack.push(Value::GpuTensor(zeros));
5735                        } else {
5736                            let result = provider
5737                                .gather_linear(&handle, &plan.indices, &plan.output_shape)
5738                                .map_err(|e| format!("slice: {e}"))?;
5739                            stack.push(Value::GpuTensor(result));
5740                        }
5741                    }
                    Value::StringArray(sa) => {
                        // String-array indexing mirrors the Tensor path but gathers owned
                        // String elements instead of f64 data.
                        let rank = sa.shape.len();
                        #[derive(Clone)]
                        enum Sel {
                            Colon,
                            Scalar(usize),
                            Indices(Vec<usize>),
                        }
                        let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
                        let mut num_iter = 0usize;
                        if dims == 1 {
                            // Linear (1-D) indexing over the flattened string array.
                            let total = sa.data.len();
                            let mut idxs: Vec<usize> = Vec::new();
                            let is_colon = (colon_mask & 1u32) != 0;
                            let is_end = (end_mask & 1u32) != 0;
                            if is_colon {
                                // A(:) -> every element in storage order.
                                idxs = (1..=total).collect();
                            } else if is_end {
                                // A(end) -> last element.
                                idxs = vec![total];
                            } else if let Some(v) = numeric.first() {
                                match v {
                                    Value::Num(n) => {
                                        let i = *n as isize;
                                        if i < 1 {
                                            vm_bail!(mex(
                                                "IndexOutOfBounds",
                                                "Index out of bounds"
                                            ));
                                        }
                                        idxs = vec![i as usize];
                                    }
                                    Value::Tensor(idx_t) => {
                                        // Length-matching tensor acts as a logical mask;
                                        // otherwise its values are explicit 1-based indices.
                                        let len = idx_t.shape.iter().product::<usize>();
                                        if len == total {
                                            for (i, &val) in idx_t.data.iter().enumerate() {
                                                if val != 0.0 {
                                                    idxs.push(i + 1);
                                                }
                                            }
                                        } else {
                                            for &val in &idx_t.data {
                                                let i = val as isize;
                                                if i < 1 {
                                                    vm_bail!(mex(
                                                        "IndexOutOfBounds",
                                                        "Index out of bounds"
                                                    ));
                                                }
                                                idxs.push(i as usize);
                                            }
                                        }
                                    }
                                    _ => vm_bail!(mex(
                                        "UnsupportedIndexType",
                                        "Unsupported index type"
                                    )),
                                }
                            } else {
                                vm_bail!(mex("MissingNumericIndex", "missing numeric index"));
                            }
                            if idxs.iter().any(|&i| i == 0 || i > total) {
                                vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
                            }
                            if idxs.len() == 1 {
                                // MATLAB semantics: string array indexing returns a String (double-quoted)
                                stack.push(Value::String(sa.data[idxs[0] - 1].clone()));
                            } else {
                                // Multiple indices -> column-vector StringArray of clones.
                                let mut out: Vec<String> = Vec::with_capacity(idxs.len());
                                for &i in &idxs {
                                    out.push(sa.data[i - 1].clone());
                                }
                                let out_sa =
                                    runmat_builtins::StringArray::new(out, vec![idxs.len(), 1])
                                        .map_err(|e| format!("Slice error: {e}"))?;
                                stack.push(Value::StringArray(out_sa));
                            }
                        } else {
                            // N-D subscript indexing: one selector per dimension, numeric
                            // operands consumed in order (same scheme as the Tensor path).
                            for d in 0..dims {
                                let is_colon = (colon_mask & (1u32 << d)) != 0;
                                let is_end = (end_mask & (1u32 << d)) != 0;
                                if is_colon {
                                    selectors.push(Sel::Colon);
                                } else if is_end {
                                    // Plain 'end' -> scalar size of this dim.
                                    let dim_len = *sa.shape.get(d).unwrap_or(&1);
                                    selectors.push(Sel::Scalar(dim_len));
                                } else {
                                    let v = numeric.get(num_iter).ok_or(mex(
                                        "MissingNumericIndex",
                                        "missing numeric index",
                                    ))?;
                                    num_iter += 1;
                                    match v {
                                        Value::Num(n) => {
                                            let idx = *n as isize;
                                            if idx < 1 {
                                                return Err(mex(
                                                    "IndexOutOfBounds",
                                                    "Index out of bounds",
                                                ));
                                            }
                                            selectors.push(Sel::Scalar(idx as usize));
                                        }
                                        Value::Tensor(idx_t) => {
                                            let dim_len = *sa.shape.get(d).unwrap_or(&1);
                                            let len = idx_t.shape.iter().product::<usize>();
                                            // NOTE(review): stricter mask detection than the
                                            // Tensor path — requires all values to be 0/1 in
                                            // addition to a length match; confirm intentional.
                                            let is_binary_mask = len == dim_len
                                                && idx_t.data.iter().all(|&x| x == 0.0 || x == 1.0);
                                            if is_binary_mask {
                                                let mut v = Vec::new();
                                                for (i, &val) in idx_t.data.iter().enumerate() {
                                                    if val != 0.0 {
                                                        v.push(i + 1);
                                                    }
                                                }
                                                selectors.push(Sel::Indices(v));
                                            } else {
                                                // Explicit 1-based index list.
                                                let mut v = Vec::with_capacity(len);
                                                for &val in &idx_t.data {
                                                    let idx = val as isize;
                                                    if idx < 1 {
                                                        vm_bail!(mex(
                                                            "IndexOutOfBounds",
                                                            "Index out of bounds"
                                                        ));
                                                    }
                                                    v.push(idx as usize);
                                                }
                                                selectors.push(Sel::Indices(v));
                                            }
                                        }
                                        _ => vm_bail!(mex(
                                            "UnsupportedIndexType",
                                            "Unsupported index type"
                                        )),
                                    }
                                }
                            }
                            // Expand selectors to explicit index lists, validate bounds and
                            // accumulate the output shape.
                            // NOTE(review): unlike the Tensor path, an empty selection here
                            // contributes extent 1 (not 0) to out_dims — confirm intentional.
                            let mut out_dims: Vec<usize> = Vec::new();
                            let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
                            for (d, sel) in selectors.iter().enumerate().take(dims) {
                                let dim_len = *sa.shape.get(d).unwrap_or(&1);
                                let idxs = match sel {
                                    Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
                                    Sel::Scalar(i) => vec![*i],
                                    Sel::Indices(v) => v.clone(),
                                };
                                if idxs.iter().any(|&i| i == 0 || i > dim_len) {
                                    return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                                }
                                if idxs.len() > 1 {
                                    out_dims.push(idxs.len());
                                } else {
                                    out_dims.push(1);
                                }
                                per_dim_indices.push(idxs);
                            }
                            // 2D mixed-selector shape correction to match MATLAB:
                            // (I, scalar) -> [len(I), 1]; (scalar, J) -> [1, len(J)].
                            if dims == 2 {
                                match (
                                    &per_dim_indices[0].as_slice(),
                                    &per_dim_indices[1].as_slice(),
                                ) {
                                    (i_list, j_list) if i_list.len() > 1 && j_list.len() == 1 => {
                                        out_dims = vec![i_list.len(), 1];
                                    }
                                    (i_list, j_list) if i_list.len() == 1 && j_list.len() > 1 => {
                                        out_dims = vec![1, j_list.len()];
                                    }
                                    _ => {}
                                }
                            }
                            // Column-major strides; pad trailing dims with 1 when indexing
                            // beyond the array's rank.
                            let mut strides: Vec<usize> = vec![0; dims];
                            let full_shape: Vec<usize> = if rank < dims {
                                let mut s = sa.shape.clone();
                                s.resize(dims, 1);
                                s
                            } else {
                                sa.shape.clone()
                            };
                            let mut acc = 1usize;
                            for (d, stride) in strides.iter_mut().enumerate().take(dims) {
5922                                *stride = acc;
5923                                acc *= full_shape[d];
5924                            }
5925                            let total_out: usize = out_dims.iter().product();
5926                            if total_out == 0 {
5927                                stack.push(Value::StringArray(
5928                                    runmat_builtins::StringArray::new(Vec::new(), out_dims)
5929                                        .map_err(|e| format!("Slice error: {e}"))?,
5930                                ));
5931                            } else {
5932                                fn cartesian<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
5933                                    let dims = lists.len();
5934                                    let mut idx = vec![0usize; dims];
5935                                    loop {
5936                                        let current: Vec<usize> =
5937                                            (0..dims).map(|d| lists[d][idx[d]]).collect();
5938                                        f(&current);
5939                                        let mut d = 0usize;
5940                                        while d < dims {
5941                                            idx[d] += 1;
5942                                            if idx[d] < lists[d].len() {
5943                                                break;
5944                                            }
5945                                            idx[d] = 0;
5946                                            d += 1;
5947                                        }
5948                                        if d == dims {
5949                                            break;
5950                                        }
5951                                    }
5952                                }
                                let mut out_data: Vec<String> = Vec::with_capacity(total_out);
                                // Walk the cartesian product of per-dimension indices in
                                // column-major order, mapping each 1-based multi-index to
                                // a linear offset via the precomputed strides.
                                cartesian(&per_dim_indices, |multi| {
                                    let mut lin = 0usize;
                                    for d in 0..dims {
                                        let i0 = multi[d] - 1;
                                        lin += i0 * strides[d];
                                    }
                                    out_data.push(sa.data[lin].clone());
                                });
                                // A single selected element decays to a scalar string value;
                                // anything larger becomes a StringArray with shape out_dims.
                                if out_data.len() == 1 {
                                    stack.push(Value::String(out_data[0].clone()));
                                } else {
                                    let out_sa =
                                        runmat_builtins::StringArray::new(out_data, out_dims)
                                            .map_err(|e| format!("Slice error: {e}"))?;
                                    stack.push(Value::StringArray(out_sa));
                                }
5970                            }
5971                        }
5972                    }
                    other => {
                        // Fallback for bases that are neither tensors nor string arrays:
                        // only 1-D indexing with a scalar subscript is supported here;
                        // everything else raises SliceNonTensor.
                        // Support 1-D linear indexing and scalar(1) on non-tensors
                        if dims == 1 {
                            let is_colon = (colon_mask & 1u32) != 0;
                            let is_end = (end_mask & 1u32) != 0;
                            // A bare colon over a non-tensor base is not supported.
                            if is_colon {
                                vm_bail!(mex(
                                    "SliceNonTensor",
                                    "Slicing only supported on tensors"
                                ));
                            }
                            // `end` resolves to 1.0 here — presumably because the base is
                            // treated as scalar-like (TODO confirm for multi-element bases).
                            // Otherwise take the first popped numeric index, defaulting to
                            // 1.0 for any non-numeric value.
                            let idx_val: f64 = if is_end {
                                1.0
                            } else {
                                match numeric.first() {
                                    Some(Value::Num(n)) => *n,
                                    Some(Value::Int(i)) => i.to_f64(),
                                    _ => 1.0,
                                }
                            };
                            // Delegate to the runtime's generic indexing; any failure is
                            // surfaced uniformly as SliceNonTensor rather than the raw error.
                            let v = match runmat_runtime::perform_indexing(&other, &[idx_val]) {
                                Ok(v) => v,
                                Err(_e) => vm_bail!(mex(
                                    "SliceNonTensor",
                                    "Slicing only supported on tensors"
                                )),
                            };
                            stack.push(v);
                        } else {
                            vm_bail!(mex("SliceNonTensor", "Slicing only supported on tensors"));
                        }
                    }
6005                }
                if logical_base {
                    // NOTE(review): `logical_base` is set before this chunk — it appears
                    // to mark that the sliced base was logical, so the numeric gather
                    // result is converted back to logical form here; confirm at the flag's
                    // definition site.
                    let result = stack
                        .pop()
                        .ok_or(mex("SliceNonTensor", "logical slice missing result"))?;
                    let converted = match result {
                        Value::Tensor(t) => {
                            // Re-binarize: nonzero -> 1, zero -> 0.
                            let logical_data: Vec<u8> = t
                                .data
                                .iter()
                                .map(|&v| if v != 0.0 { 1 } else { 0 })
                                .collect();
                            if logical_data.len() <= 1 {
                                // Scalar (or empty) result collapses to a Bool; an empty
                                // result defaults to false.
                                Value::Bool(logical_data.first().copied().unwrap_or(0) != 0)
                            } else {
                                let logical = runmat_builtins::LogicalArray::new(
                                    logical_data,
                                    t.shape.clone(),
                                )
                                .map_err(|e| mex("SliceNonTensor", &format!("slice: {e}")))?;
                                Value::LogicalArray(logical)
                            }
                        }
                        Value::Num(n) => Value::Bool(n != 0.0),
                        // Already logical: pass through unchanged.
                        Value::Bool(_) | Value::LogicalArray(_) => result,
                        other => other,
                    };
                    stack.push(converted);
                }
6034                bench_end("IndexSlice", __b);
6035            }
            Instr::IndexRangeEnd {
                dims,
                numeric_count,
                colon_mask,
                end_mask,
                range_dims,
                range_has_step,
                end_offsets,
            } => {
                // N-D read indexing where one or more subscripts are ranges using `end`
                // arithmetic (e.g. `a(2:end-1)`). Stack layout (top first): numeric
                // scalar indices, then per-range [step (if present), start] operands,
                // then the base value.
                // Pop any numeric scalar indices (reverse), then for each range in reverse push step (if has), start; then base
                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
                for _ in 0..numeric_count {
                    numeric.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                numeric.reverse();
                // Gather per-range params in reverse order of pushes
                let mut range_params: Vec<(f64, f64)> = Vec::with_capacity(range_dims.len());
                for i in (0..range_dims.len()).rev() {
                    let has_step = range_has_step[i];
                    let step = if has_step {
                        let v = stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
                        // Non-numeric step operands silently default to 1.0.
                        match v {
                            Value::Num(n) => n,
                            Value::Int(i) => i.to_f64(),
                            Value::Tensor(t) if !t.data.is_empty() => t.data[0],
                            _ => 1.0,
                        }
                    } else {
                        1.0
                    };
                    let v = stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?;
                    // Same coercion for the range start: default 1.0 when non-numeric.
                    let start: f64 = match v {
                        Value::Num(n) => n,
                        Value::Int(i) => i.to_f64(),
                        Value::Tensor(t) if !t.data.is_empty() => t.data[0],
                        _ => 1.0,
                    };
                    range_params.push((start, step));
                }
                range_params.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                #[cfg(feature = "native-accel")]
                clear_residency(&base);
                match base {
                    Value::Tensor(t) => {
                        let rank = t.shape.len();
                        // Per-dimension selector, resolved in two passes: symbolic
                        // selectors first, then concrete 1-based index lists.
                        #[derive(Clone)]
                        enum Sel {
                            Colon,
                            Scalar(usize),
                            Indices(Vec<usize>),
                            Range { start: i64, step: i64, end_off: i64 },
                        }
                        let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
                        let mut num_iter = 0usize;
                        let mut rp_iter = 0usize;
                        for d in 0..dims {
                            let is_colon = (colon_mask & (1u32 << d)) != 0;
                            let is_end = (end_mask & (1u32 << d)) != 0;
                            if is_colon {
                                selectors.push(Sel::Colon);
                            } else if is_end {
                                // A bare `end` subscript selects the last element of dim d
                                // (dims past the rank behave as singletons).
                                selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
                            } else if let Some(pos) = range_dims.iter().position(|&rd| rd == d) {
                                // This dimension carries a range; consume its (start, step)
                                // pair and its compile-time `end` offset.
                                let (st, sp) = range_params[rp_iter];
                                rp_iter += 1;
                                let off = end_offsets[pos];
                                selectors.push(Sel::Range {
                                    start: st as i64,
                                    // Truncate the step toward zero, keeping its sign.
                                    step: if sp >= 0.0 {
                                        sp as i64
                                    } else {
                                        -(sp.abs() as i64)
                                    },
                                    end_off: off,
                                });
                            } else {
                                let v = numeric
                                    .get(num_iter)
                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
                                num_iter += 1;
                                match v {
                                    Value::Num(n) => {
                                        let idx = *n as isize;
                                        if idx < 1 {
                                            vm_bail!(mex(
                                                "IndexOutOfBounds",
                                                "Index out of bounds"
                                            ));
                                        }
                                        selectors.push(Sel::Scalar(idx as usize));
                                    }
                                    Value::Tensor(idx_t) => {
                                        let dim_len = *t.shape.get(d).unwrap_or(&1);
                                        let len = idx_t.shape.iter().product::<usize>();
                                        if len == dim_len {
                                            // Element count matches the dimension: treat as
                                            // a logical mask (nonzero selects the element).
                                            let mut v = Vec::new();
                                            for (i, &val) in idx_t.data.iter().enumerate() {
                                                if val != 0.0 {
                                                    v.push(i + 1);
                                                }
                                            }
                                            selectors.push(Sel::Indices(v));
                                        } else {
                                            // Otherwise interpret entries as explicit
                                            // 1-based indices.
                                            let mut v = Vec::with_capacity(len);
                                            for &val in &idx_t.data {
                                                let idx = val as isize;
                                                if idx < 1 {
                                                    vm_bail!(mex(
                                                        "IndexOutOfBounds",
                                                        "Index out of bounds"
                                                    ));
                                                }
                                                v.push(idx as usize);
                                            }
                                            selectors.push(Sel::Indices(v));
                                        }
                                    }
                                    _ => vm_bail!(mex(
                                        "UnsupportedIndexType",
                                        "Unsupported index type"
                                    )),
                                }
                            }
                        }
                        // Materialize per-dim indices, resolving ranges with end_off
                        let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
                        let full_shape: Vec<usize> = if rank < dims {
                            // More subscripts than rank: pad trailing dims with 1.
                            let mut s = t.shape.clone();
                            s.resize(dims, 1);
                            s
                        } else {
                            t.shape.clone()
                        };
                        for (d, sel) in selectors.iter().enumerate().take(dims) {
                            let dim_len = full_shape[d] as i64;
                            let idxs: Vec<usize> = match sel {
                                Sel::Colon => (1..=full_shape[d]).collect(),
                                Sel::Scalar(i) => vec![*i],
                                Sel::Indices(v) => v.clone(),
                                Sel::Range {
                                    start,
                                    step,
                                    end_off,
                                } => {
                                    // The range's upper bound is `end - end_off`, i.e.
                                    // dim_len - end_off; out-of-range positions are simply
                                    // skipped by the bounds break below.
                                    let mut v = Vec::new();
                                    let mut cur = *start;
                                    let stp = *step;
                                    let end_i = dim_len - *end_off;
                                    if stp == 0 {
                                        vm_bail!(mex("IndexStepZero", "Index step cannot be zero"));
                                    }
                                    if stp > 0 {
                                        while cur <= end_i {
                                            if cur < 1 || cur > dim_len {
                                                break;
                                            }
                                            v.push(cur as usize);
                                            cur += stp;
                                        }
                                    } else {
                                        // Negative step: walk downward toward the bound.
                                        while cur >= end_i {
                                            if cur < 1 || cur > dim_len {
                                                break;
                                            }
                                            v.push(cur as usize);
                                            cur += stp;
                                        }
                                    }
                                    v
                                }
                            };
                            if idxs.iter().any(|&i| i == 0 || i > full_shape[d]) {
                                vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
                            }
                            per_dim_indices.push(idxs);
                        }
                        // Strides and gather
                        // Column-major strides over the padded shape.
                        let mut strides: Vec<usize> = vec![0; dims];
                        let mut acc = 1usize;
                        for (d, stride) in strides.iter_mut().enumerate().take(dims) {
                            *stride = acc;
                            acc *= full_shape[d];
                        }
                        let total_out: usize = per_dim_indices.iter().map(|v| v.len()).product();
                        if total_out == 0 {
                            // Empty selection: push an empty 0x0 tensor and move on to the
                            // next instruction without gathering.
                            stack.push(Value::Tensor(
                                runmat_builtins::Tensor::new(Vec::new(), vec![0, 0])
                                    .map_err(|e| format!("Slice error: {e}"))?,
                            ));
                            continue;
                        }
                        let mut out_data: Vec<f64> = Vec::with_capacity(total_out);
6239                        fn cartesian<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
6240                            let dims = lists.len();
6241                            let mut idx = vec![0usize; dims];
6242                            loop {
6243                                let current: Vec<usize> =
6244                                    (0..dims).map(|d| lists[d][idx[d]]).collect();
6245                                f(&current);
6246                                let mut d = 0usize;
6247                                while d < dims {
6248                                    idx[d] += 1;
6249                                    if idx[d] < lists[d].len() {
6250                                        break;
6251                                    }
6252                                    idx[d] = 0;
6253                                    d += 1;
6254                                }
6255                                if d == dims {
6256                                    break;
6257                                }
6258                            }
6259                        }
                        // Gather in column-major order: map each 1-based multi-index to a
                        // linear offset using the strides computed above.
                        cartesian(&per_dim_indices, |multi| {
                            let mut lin = 0usize;
                            for d in 0..dims {
                                let i0 = multi[d] - 1;
                                lin += i0 * strides[d];
                            }
                            out_data.push(t.data[lin]);
                        });
                        // A single selected element decays to a scalar Num; otherwise the
                        // result tensor's shape is the per-dimension selection lengths
                        // (clamped to at least 1 per dimension).
                        if out_data.len() == 1 {
                            stack.push(Value::Num(out_data[0]));
                        } else {
                            let shape: Vec<usize> =
                                per_dim_indices.iter().map(|v| v.len().max(1)).collect();
                            let tens = runmat_builtins::Tensor::new(out_data, shape)
                                .map_err(|e| format!("Slice error: {e}"))?;
                            stack.push(Value::Tensor(tens));
                        }
                    }
                    Value::StringArray(sa) => {
                        // String arrays go through the shared selector/plan helpers rather
                        // than the inline tensor path above.
                        let selectors =
                            build_slice_selectors(dims, colon_mask, end_mask, &numeric, &sa.shape)
                                .map_err(|e| format!("slice: {e}"))?;
                        let plan = build_slice_plan(&selectors, dims, &sa.shape).map_err(|e| {
                            // Preserve IndexOutOfBounds messages verbatim so callers can
                            // match on the identifier; wrap anything else generically.
                            if e.contains("IndexOutOfBounds") {
                                e.clone()
                            } else {
                                format!("slice: {e}")
                            }
                        })?;
                        let result =
                            gather_string_slice(&sa, &plan).map_err(|e| format!("slice: {e}"))?;
                        stack.push(result);
                    }
                    _ => vm_bail!(mex("SliceNonTensor", "Slicing only supported on tensors")),
                }
            }
6296
6297            Instr::IndexSliceEx(dims, numeric_count, colon_mask, end_mask, end_offsets) => {
6298                // Like IndexSlice, but apply end arithmetic to specified numeric indices
6299                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
6300                for _ in 0..numeric_count {
6301                    numeric.push(
6302                        stack
6303                            .pop()
6304                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
6305                    );
6306                }
6307                numeric.reverse();
6308                let mut base = stack
6309                    .pop()
6310                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
6311                let mut numeric_values = numeric.clone();
6312                if let Value::GpuTensor(handle) = &base {
6313                    let adjusted = apply_end_offsets_to_numeric(
6314                        &numeric_values,
6315                        dims,
6316                        colon_mask,
6317                        end_mask,
6318                        &end_offsets,
6319                        &handle.shape,
6320                    );
6321                    if let Some(provider) = runmat_accelerate_api::provider() {
6322                        if let Ok(selectors) = build_slice_selectors(
6323                            dims,
6324                            colon_mask,
6325                            end_mask,
6326                            &adjusted,
6327                            &handle.shape,
6328                        ) {
6329                            if let Ok(plan) = build_slice_plan(&selectors, dims, &handle.shape) {
6330                                if plan.indices.is_empty() {
6331                                    let zeros = provider
6332                                        .zeros(&plan.output_shape)
6333                                        .map_err(|e| format!("slice: {e}"))?;
6334                                    stack.push(Value::GpuTensor(zeros));
6335                                    pc += 1;
6336                                    continue;
6337                                } else {
6338                                    let result = provider
6339                                        .gather_linear(handle, &plan.indices, &plan.output_shape)
6340                                        .map_err(|e| format!("slice: {e}"))?;
6341                                    stack.push(Value::GpuTensor(result));
6342                                    pc += 1;
6343                                    continue;
6344                                }
6345                            }
6346                        }
6347                        let host = provider
6348                            .download(handle)
6349                            .map_err(|e| format!("slice: {e}"))?;
6350                        let tensor = runmat_builtins::Tensor::new(host.data, host.shape)
6351                            .map_err(|e| format!("slice: {e}"))?;
6352                        base = Value::Tensor(tensor);
6353                        numeric_values = adjusted;
6354                    } else {
6355                        return Err("No acceleration provider registered".to_string());
6356                    }
6357                }
6358                match base {
6359                    Value::Tensor(t) => {
6360                        let adjusted = apply_end_offsets_to_numeric(
6361                            &numeric_values,
6362                            dims,
6363                            colon_mask,
6364                            end_mask,
6365                            &end_offsets,
6366                            &t.shape,
6367                        );
6368                        // Build selectors identical to IndexSlice path
6369                        let mut tmp_stack = Vec::new();
6370                        tmp_stack.push(Value::Tensor(t));
6371                        for v in adjusted {
6372                            tmp_stack.push(v);
6373                        }
6374                        // Swap stacks for reuse: assign and then fallthrough to IndexSlice body via small duplication
6375                        let mut numeric_vals: Vec<Value> = Vec::new();
6376                        let count = numeric_count;
6377                        let mut idx_iter = tmp_stack.into_iter();
6378                        let base = idx_iter
6379                            .next()
6380                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
6381                        for _ in 0..count {
6382                            match idx_iter.next() {
6383                                Some(v) => numeric_vals.push(v),
6384                                None => return Err(mex("StackUnderflow", "stack underflow")),
6385                            }
6386                        }
6387                        match base {
6388                            Value::Tensor(t2) => {
6389                                // Inline small subset of IndexSlice gather for t2
6390                                let rank = t2.shape.len();
6391                                #[derive(Clone)]
6392                                enum Sel {
6393                                    Colon,
6394                                    Scalar(usize),
6395                                    Indices(Vec<usize>),
6396                                }
6397                                let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
6398                                let mut num_iter = 0usize;
6399                                if dims == 1 {
6400                                    let total = t2.data.len();
6401                                    let mut idxs: Vec<usize> = Vec::new();
6402                                    let is_colon = (colon_mask & 1u32) != 0;
6403                                    let is_end = (end_mask & 1u32) != 0;
6404                                    if is_colon {
6405                                        idxs = (1..=total).collect();
6406                                    } else if is_end {
6407                                        idxs = vec![total];
6408                                    } else if let Some(v) = numeric_vals.first() {
6409                                        match v {
6410                                            Value::Num(n) => {
6411                                                let i = *n as isize;
6412                                                if i < 1 {
6413                                                    vm_bail!(mex(
6414                                                        "IndexOutOfBounds",
6415                                                        "Index out of bounds"
6416                                                    ));
6417                                                }
6418                                                idxs = vec![i as usize];
6419                                            }
6420                                            Value::Tensor(idx_t) => {
6421                                                let len = idx_t.shape.iter().product::<usize>();
6422                                                if len == total {
6423                                                    for (i, &val) in idx_t.data.iter().enumerate() {
6424                                                        if val != 0.0 {
6425                                                            idxs.push(i + 1);
6426                                                        }
6427                                                    }
6428                                                } else {
6429                                                    for &val in &idx_t.data {
6430                                                        let i = val as isize;
6431                                                        if i < 1 {
6432                                                            vm_bail!(mex(
6433                                                                "IndexOutOfBounds",
6434                                                                "Index out of bounds"
6435                                                            ));
6436                                                        }
6437                                                        idxs.push(i as usize);
6438                                                    }
6439                                                }
6440                                            }
6441                                            _ => vm_bail!(mex(
6442                                                "UnsupportedIndexType",
6443                                                "Unsupported index type"
6444                                            )),
6445                                        }
6446                                    } else {
6447                                        vm_bail!(mex(
6448                                            "MissingNumericIndex",
6449                                            "missing numeric index"
6450                                        ));
6451                                    }
6452                                    if idxs.iter().any(|&i| i == 0 || i > total) {
6453                                        vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
6454                                    }
6455                                    if idxs.len() == 1 {
6456                                        stack.push(Value::Num(t2.data[idxs[0] - 1]));
6457                                    } else {
6458                                        let mut out = Vec::with_capacity(idxs.len());
6459                                        for &i in &idxs {
6460                                            out.push(t2.data[i - 1]);
6461                                        }
6462                                        let tens =
6463                                            runmat_builtins::Tensor::new(out, vec![idxs.len(), 1])
6464                                                .map_err(|e| format!("Slice error: {e}"))?;
6465                                        stack.push(Value::Tensor(tens));
6466                                    }
6467                                } else {
6468                                    for d in 0..dims {
6469                                        let is_colon = (colon_mask & (1u32 << d)) != 0;
6470                                        let is_end = (end_mask & (1u32 << d)) != 0;
6471                                        if is_colon {
6472                                            selectors.push(Sel::Colon);
6473                                        } else if is_end {
6474                                            let dim_len = *t2.shape.get(d).unwrap_or(&1);
6475                                            selectors.push(Sel::Scalar(dim_len));
6476                                        } else {
6477                                            let v = numeric_vals.get(num_iter).ok_or(mex(
6478                                                "MissingNumericIndex",
6479                                                "missing numeric index",
6480                                            ))?;
6481                                            num_iter += 1;
6482                                            match v {
6483                                                Value::Num(n) => {
6484                                                    let idx = *n as isize;
6485                                                    if idx < 1 {
6486                                                        return Err(mex(
6487                                                            "IndexOutOfBounds",
6488                                                            "Index out of bounds",
6489                                                        ));
6490                                                    }
6491                                                    selectors.push(Sel::Scalar(idx as usize));
6492                                                }
6493                                                Value::Tensor(idx_t) => {
6494                                                    let dim_len = *t2.shape.get(d).unwrap_or(&1);
6495                                                    let len = idx_t.shape.iter().product::<usize>();
6496                                                    if len == dim_len {
6497                                                        let mut indices = Vec::new();
6498                                                        for (i, &val) in
6499                                                            idx_t.data.iter().enumerate()
6500                                                        {
6501                                                            if val != 0.0 {
6502                                                                indices.push(i + 1);
6503                                                            }
6504                                                        }
6505                                                        selectors.push(Sel::Indices(indices));
6506                                                    } else {
6507                                                        let mut indices = Vec::with_capacity(len);
6508                                                        for &val in &idx_t.data {
6509                                                            let idx = val as isize;
6510                                                            if idx < 1 {
6511                                                                return Err(mex(
6512                                                                    "IndexOutOfBounds",
6513                                                                    "Index out of bounds",
6514                                                                ));
6515                                                            }
6516                                                            indices.push(idx as usize);
6517                                                        }
6518                                                        selectors.push(Sel::Indices(indices));
6519                                                    }
6520                                                }
6521                                                Value::LogicalArray(la) => {
6522                                                    let dim_len = *t2.shape.get(d).unwrap_or(&1);
6523                                                    if la.data.len() == dim_len {
6524                                                        let mut indices = Vec::new();
6525                                                        for (i, &b) in la.data.iter().enumerate() {
6526                                                            if b != 0 {
6527                                                                indices.push(i + 1);
6528                                                            }
6529                                                        }
6530                                                        selectors.push(Sel::Indices(indices));
6531                                                    } else {
6532                                                        return Err(mex(
6533                                                            "IndexShape",
6534                                                            "Logical mask shape mismatch",
6535                                                        ));
6536                                                    }
6537                                                }
6538                                                _ => {
6539                                                    return Err(mex(
6540                                                        "UnsupportedIndexType",
6541                                                        "Unsupported index type",
6542                                                    ))
6543                                                }
6544                                            }
6545                                        }
6546                                    }
6547                                    let mut out_dims: Vec<usize> = Vec::new();
6548                                    let mut per_dim_indices: Vec<Vec<usize>> =
6549                                        Vec::with_capacity(dims);
6550                                    for (d, sel) in selectors.iter().enumerate().take(dims) {
6551                                        let dim_len = *t2.shape.get(d).unwrap_or(&1);
6552                                        let idxs = match sel {
6553                                            Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
6554                                            Sel::Scalar(i) => vec![*i],
6555                                            Sel::Indices(v) => v.clone(),
6556                                        };
6557                                        if idxs.iter().any(|&i| i == 0 || i > dim_len) {
6558                                            return Err(mex(
6559                                                "IndexOutOfBounds",
6560                                                "Index out of bounds",
6561                                            ));
6562                                        }
6563                                        if idxs.len() > 1 {
6564                                            out_dims.push(idxs.len());
6565                                        } else {
6566                                            out_dims.push(1);
6567                                        }
6568                                        per_dim_indices.push(idxs);
6569                                    }
6570                                    if dims == 2 {
6571                                        match (
6572                                            &per_dim_indices[0].as_slice(),
6573                                            &per_dim_indices[1].as_slice(),
6574                                        ) {
6575                                            (i_list, j_list)
6576                                                if i_list.len() > 1 && j_list.len() == 1 =>
6577                                            {
6578                                                out_dims = vec![i_list.len(), 1];
6579                                            }
6580                                            (i_list, j_list)
6581                                                if i_list.len() == 1 && j_list.len() > 1 =>
6582                                            {
6583                                                out_dims = vec![1, j_list.len()];
6584                                            }
6585                                            _ => {}
6586                                        }
6587                                    }
6588                                    let mut strides: Vec<usize> = vec![0; dims];
6589                                    let full_shape: Vec<usize> = if rank < dims {
6590                                        let mut s = t2.shape.clone();
6591                                        s.resize(dims, 1);
6592                                        s
6593                                    } else {
6594                                        t2.shape.clone()
6595                                    };
6596                                    let mut acc = 1usize;
6597                                    for d in 0..dims {
6598                                        strides[d] = acc;
6599                                        acc *= full_shape[d];
6600                                    }
6601                                    let total_out: usize = out_dims.iter().product();
6602                                    let mut out_data: Vec<f64> = Vec::with_capacity(total_out);
6603                                    if out_dims.contains(&0) {
6604                                        let out_tensor =
6605                                            runmat_builtins::Tensor::new(out_data, out_dims)
6606                                                .map_err(|e| format!("Slice error: {e}"))?;
6607                                        stack.push(Value::Tensor(out_tensor));
6608                                    } else {
6609                                        fn cartesian<F: FnMut(&[usize])>(
6610                                            lists: &[Vec<usize>],
6611                                            mut f: F,
6612                                        ) {
6613                                            let dims = lists.len();
6614                                            let mut idx = vec![0usize; dims];
6615                                            loop {
6616                                                let current: Vec<usize> =
6617                                                    (0..dims).map(|d| lists[d][idx[d]]).collect();
6618                                                f(&current);
6619                                                let mut d = 0usize;
6620                                                while d < dims {
6621                                                    idx[d] += 1;
6622                                                    if idx[d] < lists[d].len() {
6623                                                        break;
6624                                                    }
6625                                                    idx[d] = 0;
6626                                                    d += 1;
6627                                                }
6628                                                if d == dims {
6629                                                    break;
6630                                                }
6631                                            }
6632                                        }
6633                                        cartesian(&per_dim_indices, |multi| {
6634                                            let mut lin = 0usize;
6635                                            for d in 0..dims {
6636                                                let i0 = multi[d] - 1;
6637                                                lin += i0 * strides[d];
6638                                            }
6639                                            out_data.push(t2.data[lin]);
6640                                        });
6641                                        if out_data.len() == 1 {
6642                                            stack.push(Value::Num(out_data[0]));
6643                                        } else {
6644                                            let out_tensor =
6645                                                runmat_builtins::Tensor::new(out_data, out_dims)
6646                                                    .map_err(|e| format!("Slice error: {e}"))?;
6647                                            stack.push(Value::Tensor(out_tensor));
6648                                        }
6649                                    }
6650                                }
6651                            }
6652                            other => {
6653                                stack.push(other);
6654                            }
6655                        }
6656                    }
6657                    other => {
6658                        vm_bail!(mex(
6659                            "SliceNonTensor",
6660                            &format!("Slicing only supported on tensors: got {other:?}")
6661                        ));
6662                    }
6663                }
6664            }
            Instr::Index1DRangeEnd { has_step, offset } => {
                // Legacy 1-D path for end arithmetic: evaluates A(start:step:end-offset)
                // via linear indexing over the flattened element buffer.
                // Operand stack, top first: optional step (when `has_step`), start, base.
                let step_val: f64 = if has_step {
                    let v: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    v
                } else {
                    1.0 // implicit step of 1 when the range has no step expression
                };
                let start_val: f64 = (&stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?)
                    .try_into()?;
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match base {
                    Value::Tensor(t) => {
                        let total = t.data.len();
                        // `offset` is how far below `end` the range stops, so the
                        // inclusive upper bound is numel(A) - offset.
                        let end_idx = (total as i64) - offset; // inclusive
                        let mut out: Vec<f64> = Vec::new();
                        let mut cur = start_val as i64;
                        // Truncate a possibly fractional step toward zero while
                        // preserving its sign.
                        let step_i = if step_val >= 0.0 {
                            step_val as i64
                        } else {
                            -(step_val.abs() as i64)
                        };
                        if step_i == 0 {
                            return Err(mex("IndexStepZero", "Index step cannot be zero"));
                        }
                        if step_i > 0 {
                            // Ascending range: walk up to the inclusive end bound.
                            while cur as i64 <= end_idx {
                                let idx0 = cur as usize;
                                // 1-based indexing: 0 or > numel is out of range;
                                // the walk stops (break) rather than raising, which
                                // yields the empty/partial range semantics.
                                if idx0 == 0 || idx0 > total {
                                    break;
                                }
                                out.push(t.data[idx0 - 1]);
                                cur += step_i;
                            }
                        } else {
                            // Descending range: walk down to the inclusive end bound.
                            // NOTE(review): a negative `cur` converts to a huge usize
                            // here, which trips the bounds break below — presumably
                            // the intended termination; confirm.
                            while (cur as i64) >= end_idx {
                                let idx0 = cur as usize;
                                if idx0 == 0 || idx0 > total {
                                    break;
                                }
                                out.push(t.data[idx0 - 1]);
                                cur += step_i;
                            }
                        }
                        // Result shaping: a single gathered element collapses to a
                        // scalar; otherwise the values form an out.len()-by-1 tensor.
                        if out.len() == 1 {
                            stack.push(Value::Num(out[0]));
                        } else {
                            let tens =
                                runmat_builtins::Tensor::new(out.clone(), vec![out.len(), 1])
                                    .map_err(|e| format!("Range slice error: {e}"))?;
                            stack.push(Value::Tensor(tens));
                        }
                    }
                    _ => vm_bail!(mex("SliceNonTensor", "Slicing only supported on tensors")),
                }
            }
6728            Instr::StoreSlice(dims, numeric_count, colon_mask, end_mask) => {
6729                let __b = bench_start();
6730                // RHS value to scatter, then numeric indices, then base
6731                let rhs = stack
6732                    .pop()
6733                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
6734                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
6735                for _ in 0..numeric_count {
6736                    numeric.push(
6737                        stack
6738                            .pop()
6739                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
6740                    );
6741                }
6742                numeric.reverse();
6743                let base = stack
6744                    .pop()
6745                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
6746                match base {
6747                    Value::Object(obj) => {
6748                        let cell =
6749                            runmat_builtins::CellArray::new(numeric.clone(), 1, numeric.len())
6750                                .map_err(|e| format!("subsasgn build error: {e}"))?;
6751                        match runmat_runtime::call_builtin(
6752                            "call_method",
6753                            &[
6754                                Value::Object(obj.clone()),
6755                                Value::String("subsasgn".to_string()),
6756                                Value::String("()".to_string()),
6757                                Value::Cell(cell.clone()),
6758                                rhs.clone(),
6759                            ],
6760                        ) {
6761                            Ok(v) => stack.push(v),
6762                            Err(_e) => {
6763                                // Fallback to direct builtin OverIdx.subsasgn if class method isn't registered
6764                                // Determine class name and call fully qualified builtin if present
6765                                let qualified = format!("{}.subsasgn", obj.class_name);
6766                                match runmat_runtime::call_builtin(
6767                                    &qualified,
6768                                    &[
6769                                        Value::Object(obj),
6770                                        Value::String("()".to_string()),
6771                                        Value::Cell(cell),
6772                                        rhs,
6773                                    ],
6774                                ) {
6775                                    Ok(v2) => stack.push(v2),
6776                                    Err(e2) => vm_bail!(e2),
6777                                }
6778                            }
6779                        }
6780                    }
6781                    Value::Tensor(mut t) => {
6782                        // F4: write barrier hook (placeholder) – in a full GC integration, call into GC pre/post here
6783                        // Linear 1-D indexing assignment: A(I) = rhs
6784                        if dims == 1 {
6785                            let total = t.data.len();
6786                            // Build linear index list
6787                            let mut lin_indices: Vec<usize> = Vec::new();
6788                            let is_colon = (colon_mask & 1u32) != 0;
6789                            let is_end = (end_mask & 1u32) != 0;
6790                            if is_colon {
6791                                lin_indices = (1..=total).collect();
6792                            } else if is_end {
6793                                lin_indices = vec![total];
6794                            } else {
6795                                let v = numeric
6796                                    .first()
6797                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
6798                                match v {
6799                                    Value::Num(n) => {
6800                                        let i = *n as isize;
6801                                        if i < 1 || (i as usize) > total {
6802                                            vm_bail!(mex(
6803                                                "IndexOutOfBounds",
6804                                                "Index out of bounds"
6805                                            ));
6806                                        }
6807                                        lin_indices.push(i as usize);
6808                                    }
6809                                    Value::Tensor(idx_t) => {
6810                                        let len = idx_t.shape.iter().product::<usize>();
6811                                        if len == total {
6812                                            for (i, &val) in idx_t.data.iter().enumerate() {
6813                                                if val != 0.0 {
6814                                                    lin_indices.push(i + 1);
6815                                                }
6816                                            }
6817                                        } else {
6818                                            for &val in &idx_t.data {
6819                                                let i = val as isize;
6820                                                if i < 1 || (i as usize) > total {
6821                                                    vm_bail!(mex(
6822                                                        "IndexOutOfBounds",
6823                                                        "Index out of bounds"
6824                                                    ));
6825                                                }
6826                                                lin_indices.push(i as usize);
6827                                            }
6828                                        }
6829                                    }
6830                                    _ => vm_bail!(mex(
6831                                        "UnsupportedIndexType",
6832                                        "Unsupported index type"
6833                                    )),
6834                                }
6835                            }
6836                            // Scatter RHS
6837                            match rhs {
6838                                Value::Num(v) => {
6839                                    for &li in &lin_indices {
6840                                        t.data[li - 1] = v;
6841                                    }
6842                                }
6843                                Value::Tensor(rt) => {
6844                                    if rt.data.len() == 1 {
6845                                        let v = rt.data[0];
6846                                        for &li in &lin_indices {
6847                                            t.data[li - 1] = v;
6848                                        }
6849                                    } else if rt.data.len() == lin_indices.len() {
6850                                        for (k, &li) in lin_indices.iter().enumerate() {
6851                                            t.data[li - 1] = rt.data[k];
6852                                        }
6853                                    } else {
6854                                        vm_bail!(
6855                                            "shape mismatch for linear slice assign".to_string()
6856                                        );
6857                                    }
6858                                }
6859                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
6860                            }
6861                            stack.push(Value::Tensor(t));
6862                        } else {
6863                            let rank = t.shape.len();
6864                            #[derive(Clone)]
6865                            enum Sel {
6866                                Colon,
6867                                Scalar(usize),
6868                                Indices(Vec<usize>),
6869                            }
6870                            let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
6871                            let mut num_iter = 0usize;
6872                            for d in 0..dims {
6873                                let is_colon = (colon_mask & (1u32 << d)) != 0;
6874                                let is_end = (end_mask & (1u32 << d)) != 0;
6875                                if is_colon {
6876                                    selectors.push(Sel::Colon);
6877                                } else if is_end {
6878                                    selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
6879                                } else {
6880                                    let v = numeric.get(num_iter).ok_or(mex(
6881                                        "MissingNumericIndex",
6882                                        "missing numeric index",
6883                                    ))?;
6884                                    num_iter += 1;
6885                                    match v {
6886                                        Value::Num(n) => {
6887                                            let idx = *n as isize;
6888                                            if idx < 1 {
6889                                                vm_bail!(mex(
6890                                                    "IndexOutOfBounds",
6891                                                    "Index out of bounds"
6892                                                ));
6893                                            }
6894                                            selectors.push(Sel::Scalar(idx as usize));
6895                                        }
6896                                        Value::Tensor(idx_t) => {
6897                                            let dim_len = *t.shape.get(d).unwrap_or(&1);
6898                                            let len = idx_t.shape.iter().product::<usize>();
6899                                            if len == dim_len {
6900                                                let mut v = Vec::new();
6901                                                for (i, &val) in idx_t.data.iter().enumerate() {
6902                                                    if val != 0.0 {
6903                                                        v.push(i + 1);
6904                                                    }
6905                                                }
6906                                                selectors.push(Sel::Indices(v));
6907                                            } else {
6908                                                let mut v = Vec::with_capacity(len);
6909                                                for &val in &idx_t.data {
6910                                                    let idx = val as isize;
6911                                                    if idx < 1 {
6912                                                        vm_bail!(mex(
6913                                                            "IndexOutOfBounds",
6914                                                            "Index out of bounds"
6915                                                        ));
6916                                                    }
6917                                                    v.push(idx as usize);
6918                                                }
6919                                                selectors.push(Sel::Indices(v));
6920                                            }
6921                                        }
6922                                        _ => vm_bail!(mex(
6923                                            "UnsupportedIndexType",
6924                                            "Unsupported index type"
6925                                        )),
6926                                    }
6927                                }
6928                            }
                            // 2-D write fast paths (full column/row) with strict broadcast checks
                            if dims == 2 {
                                let rows = if rank >= 1 { t.shape[0] } else { 1 };
                                let cols = if rank >= 2 { t.shape[1] } else { 1 };
                                match (&selectors[0], &selectors[1]) {
                                    // A(:, j) = rhs
                                    // Data is column-major (see the stride construction in the
                                    // generic writer below), so column j occupies the contiguous
                                    // range [j0 * rows, j0 * rows + rows).
                                    (Sel::Colon, Sel::Scalar(j)) => {
                                        // Convert the 1-based subscript to a 0-based column.
                                        let j0 = *j - 1;
                                        // Size growth semantics: extend columns if needed
                                        if j0 >= cols {
                                            let new_cols = j0 + 1;
                                            let new_rows = rows;
                                            // Newly created elements are zero-filled.
                                            let mut new_data = vec![0.0f64; new_rows * new_cols];
                                            for c in 0..cols {
                                                let src_off = c * rows;
                                                let dst_off = c * new_rows;
                                                new_data[dst_off..dst_off + rows].copy_from_slice(
                                                    &t.data[src_off..src_off + rows],
                                                );
                                            }
                                            t.data = new_data;
                                            t.shape = vec![new_rows, new_cols];
                                            t.rows = new_rows;
                                            t.cols = new_cols;
                                        }
                                        // Column growth leaves `rows` unchanged, so this offset
                                        // remains valid even after the resize above.
                                        let start = j0 * rows;
                                        match rhs {
                                            // Scalar RHS broadcasts down the whole column.
                                            Value::Num(v) => {
                                                for r in 0..rows {
                                                    t.data[start + r] = v;
                                                }
                                            }
                                            Value::Tensor(rt) => {
                                                let len = rt.data.len();
                                                if len == rows {
                                                    // Element count matches the column exactly.
                                                    for r in 0..rows {
                                                        t.data[start + r] = rt.data[r];
                                                    }
                                                } else if len == 1 {
                                                    // A 1-element tensor broadcasts like a scalar.
                                                    for r in 0..rows {
                                                        t.data[start + r] = rt.data[0];
                                                    }
                                                } else {
                                                    vm_bail!("shape mismatch for slice assign"
                                                        .to_string());
                                                }
                                            }
                                            _ => {
                                                vm_bail!("rhs must be numeric or tensor".to_string())
                                            }
                                        }
                                        stack.push(Value::Tensor(t));
                                        bench_end("StoreSlice2D.fast_col", __b);
                                        pc += 1;
                                        continue;
                                    }
6985                                    // A(i, :) = rhs
6986                                    (Sel::Scalar(i), Sel::Colon) => {
6987                                        let i0 = *i - 1;
6988                                        // Size growth semantics: extend rows if needed
6989                                        if i0 >= rows {
6990                                            let new_rows = i0 + 1;
6991                                            let new_cols = cols;
6992                                            let mut new_data = vec![0.0f64; new_rows * new_cols];
6993                                            for c in 0..cols {
6994                                                for r in 0..rows {
6995                                                    new_data[r + c * new_rows] =
6996                                                        t.data[r + c * rows];
6997                                                }
6998                                            }
6999                                            t.data = new_data;
7000                                            t.shape = vec![new_rows, new_cols];
7001                                            t.rows = new_rows;
7002                                            t.cols = new_cols;
7003                                        }
7004                                        match rhs {
7005                                            Value::Num(v) => {
7006                                                for c in 0..cols {
7007                                                    t.data[i0 + c * rows] = v;
7008                                                }
7009                                            }
7010                                            Value::Tensor(rt) => {
7011                                                let len = rt.data.len();
7012                                                if len == cols {
7013                                                    for c in 0..cols {
7014                                                        t.data[i0 + c * rows] = rt.data[c];
7015                                                    }
7016                                                } else if len == 1 {
7017                                                    for c in 0..cols {
7018                                                        t.data[i0 + c * rows] = rt.data[0];
7019                                                    }
7020                                                } else {
7021                                                    vm_bail!("shape mismatch for slice assign"
7022                                                        .to_string());
7023                                                }
7024                                            }
7025                                            _ => {
7026                                                vm_bail!("rhs must be numeric or tensor".to_string())
7027                                            }
7028                                        }
7029                                        stack.push(Value::Tensor(t));
7030                                        bench_end("StoreSlice2D.fast_row", __b);
7031                                        pc += 1;
7032                                        continue;
7033                                    }
                                    _ => {}
                                }
                            }
                            // Generic N-D writer path
                            // Build per-dim index lists and strides
                            let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
                            // Pad the shape with trailing singleton dimensions when the
                            // subscript count exceeds the tensor rank.
                            let full_shape: Vec<usize> = if rank < dims {
                                let mut s = t.shape.clone();
                                s.resize(dims, 1);
                                s
                            } else {
                                t.shape.clone()
                            };
                            for d in 0..dims {
                                let dim_len = full_shape[d];
                                // Expand each selector into an explicit 1-based index list.
                                let idxs = match &selectors[d] {
                                    Sel::Colon => (1..=dim_len).collect(),
                                    Sel::Scalar(i) => vec![*i],
                                    Sel::Indices(v) => v.clone(),
                                };
                                // Unlike the 2-D fast paths above, this generic path performs
                                // no automatic growth: out-of-range subscripts are an error.
                                if idxs.iter().any(|&i| i == 0 || i > dim_len) {
                                    vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
                                }
                                per_dim_indices.push(idxs);
                            }
                            // Column-major strides (first dimension fastest)
                            let mut strides: Vec<usize> = vec![0; dims];
                            let mut acc = 1usize;
                            for d in 0..dims {
                                strides[d] = acc;
                                acc *= full_shape[d];
                            }
                            let total_out: usize =
                                per_dim_indices.iter().map(|v| v.len()).product();
                            // Prepare RHS values
                            // RhsView abstracts the right-hand side: either a scalar that
                            // broadcasts to every destination, or a tensor addressed with its
                            // own column-major strides (a length-1 dimension broadcasts).
                            enum RhsView {
                                Scalar(f64),
                                Tensor {
                                    data: Vec<f64>,
                                    shape: Vec<usize>,
                                    strides: Vec<usize>,
                                },
                            }
                            let rhs_view = match rhs {
                                Value::Num(n) => RhsView::Scalar(n),
                                Value::Tensor(rt) => {
                                    // Allow exact match or N-D broadcasting where rhs_dim is 1 or equals out_dim
                                    let mut shape = rt.shape.clone();
                                    if shape.len() < dims {
                                        shape.resize(dims, 1);
                                    }
                                    if shape.len() > dims {
                                        // Extra trailing RHS dimensions are only tolerated when
                                        // they are singletons.
                                        if shape.iter().skip(dims).any(|&s| s != 1) {
                                            vm_bail!("shape mismatch for slice assign".to_string());
                                        }
                                        shape.truncate(dims);
                                    }
                                    let mut ok = true;
                                    for d in 0..dims {
                                        let out_len = per_dim_indices[d].len();
                                        let rhs_len = shape[d];
                                        if !(rhs_len == 1 || rhs_len == out_len) {
                                            ok = false;
                                            break;
                                        }
                                    }
                                    if !ok {
                                        vm_bail!("shape mismatch for slice assign".to_string());
                                    }
                                    let mut rstrides = vec![0usize; dims];
                                    let mut racc = 1usize;
                                    for d in 0..dims {
                                        rstrides[d] = racc;
                                        racc *= shape[d];
                                    }
                                    RhsView::Tensor {
                                        data: rt.data,
                                        shape,
                                        strides: rstrides,
                                    }
                                }
                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
                            };
                            // Iterate and scatter
                            let mut _k = 0usize;
                            // `idx` is an odometer over selected positions per dimension
                            // (0-based positions into per_dim_indices).
                            let mut idx = vec![0usize; dims];
                            if total_out == 0 {
                                // Empty selection: nothing to write, push the base unchanged.
                                stack.push(Value::Tensor(t));
                            } else {
                                loop {
                                    // Linear destination offset for the current odometer state.
                                    let mut lin = 0usize;
                                    for d in 0..dims {
                                        let i0 = per_dim_indices[d][idx[d]] - 1;
                                        lin += i0 * strides[d];
                                    }
                                    match &rhs_view {
                                        RhsView::Scalar(val) => t.data[lin] = *val,
                                        RhsView::Tensor {
                                            data,
                                            shape,
                                            strides,
                                        } => {
                                            let mut rlin = 0usize;
                                            for d in 0..dims {
                                                let rhs_len = shape[d];
                                                // Broadcast: a singleton RHS dimension always
                                                // reads position 0.
                                                let pos = if rhs_len == 1 { 0 } else { idx[d] };
                                                rlin += pos * strides[d];
                                            }
                                            t.data[lin] = data[rlin];
                                        }
                                    }
                                    _k += 1;
                                    // Increment first dim fastest
                                    let mut d = 0usize;
                                    while d < dims {
                                        idx[d] += 1;
                                        if idx[d] < per_dim_indices[d].len() {
                                            break;
                                        }
                                        idx[d] = 0;
                                        d += 1;
                                    }
                                    if d == dims {
                                        // Odometer wrapped in every dimension: done.
                                        break;
                                    }
                                }
                                stack.push(Value::Tensor(t));
                            }
                        }
                    }
                    Value::GpuTensor(handle) => {
                        // Device-resident base: first try a fully on-device scatter through
                        // the provider. Every failure in this nested ladder falls through
                        // silently to the slower paths below.
                        if let Some(provider) = runmat_accelerate_api::provider() {
                            let base_shape = handle.shape.clone();
                            if let Ok(selectors) = build_slice_selectors(
                                dims,
                                colon_mask,
                                end_mask,
                                &numeric,
                                &base_shape,
                            ) {
                                if let Ok(plan) = build_slice_plan(&selectors, dims, &base_shape) {
                                    if plan.indices.is_empty() {
                                        // Empty selection: base is unchanged.
                                        stack.push(Value::GpuTensor(handle));
                                        bench_end("StoreSlice", __b);
                                        pc += 1;
                                        continue;
                                    }
                                    // Materialize the RHS on the host as a flat value list in
                                    // the planned scatter order.
                                    let values_result = if plan.dims == 1 {
                                        let count =
                                            plan.selection_lengths.first().copied().unwrap_or(0);
                                        materialize_rhs_linear(&rhs, count)
                                    } else {
                                        materialize_rhs_nd(&rhs, &plan.selection_lengths)
                                    };
                                    if let Ok(values) = values_result {
                                        if values.len() == plan.indices.len() {
                                            let value_shape = vec![values.len().max(1), 1];
                                            let upload_result = if values.is_empty() {
                                                provider.zeros(&[0, 1])
                                            } else {
                                                provider.upload(
                                                    &runmat_accelerate_api::HostTensorView {
                                                        data: &values,
                                                        shape: &value_shape,
                                                    },
                                                )
                                            };
                                            if let Ok(values_handle) = upload_result {
                                                if provider
                                                    .scatter_linear(
                                                        &handle,
                                                        &plan.indices,
                                                        &values_handle,
                                                    )
                                                    .is_ok()
                                                {
                                                    // On-device scatter succeeded; the handle's
                                                    // buffer was updated in place.
                                                    stack.push(Value::GpuTensor(handle));
                                                    bench_end("StoreSlice", __b);
                                                    pc += 1;
                                                    continue;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        let h = handle;
                        // Attempt provider fast-paths for contiguous 2D row/col writes with GPU RHS
                        if dims == 2 {
                            let rows = h.shape.first().copied().unwrap_or(1);
                            let cols = h.shape.get(1).copied().unwrap_or(1);
                            // Build minimal selectors using handle shape for 'end'
                            #[derive(Clone)]
                            enum Sel {
                                Colon,
                                Scalar(usize),
                            }
                            #[allow(unused_assignments)]
                            let mut num_iter_fast = 0usize;
                            let sel0;
                            let sel1;
                            // d=0
                            let is_colon0 = (colon_mask & (1u32 << 0)) != 0;
                            let is_end0 = (end_mask & (1u32 << 0)) != 0;
                            if is_colon0 {
                                sel0 = Sel::Colon;
                            } else if is_end0 {
                                sel0 = Sel::Scalar(rows);
                            } else {
                                let v = numeric
                                    .get(num_iter_fast)
                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
                                num_iter_fast += 1;
                                let n: f64 = v.try_into()?;
                                if n < 1.0 {
                                    return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                                }
                                sel0 = Sel::Scalar(n as usize);
                            }
                            // d=1
                            let is_colon1 = (colon_mask & (1u32 << 1)) != 0;
                            let is_end1 = (end_mask & (1u32 << 1)) != 0;
                            if is_colon1 {
                                sel1 = Sel::Colon;
                            } else if is_end1 {
                                sel1 = Sel::Scalar(cols);
                            } else {
                                let v = numeric
                                    .get(num_iter_fast)
                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
                                // NOTE(review): unlike the d=0 branch, num_iter_fast is not
                                // incremented here. Harmless today since it is unused after
                                // this point, but worth fixing if another read is added.
                                let n: f64 = v.try_into()?;
                                if n < 1.0 {
                                    return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                                }
                                sel1 = Sel::Scalar(n as usize);
                            }
                            // silence unused-assignment lint in builds with two scalar indices
                            let _ = num_iter_fast;
                            // Column write A(:, j) = rhs (gpu)
                            // Only taken when the RHS is already device-resident with a
                            // matching row count; no size growth on this fast path.
                            if let (Sel::Colon, Sel::Scalar(j)) = (&sel0, &sel1) {
                                let j0 = *j - 1;
                                if j0 < cols {
                                    if let Value::GpuTensor(vh) = &rhs {
                                        let v_rows = match vh.shape.len() {
                                            1 | 2 => vh.shape[0],
                                            _ => 0,
                                        };
                                        if v_rows == rows {
                                            if let Some(p) = runmat_accelerate_api::provider() {
                                                match p.scatter_column(&h, j0, vh) {
                                                    Ok(new_h) => {
                                                        stack.push(Value::GpuTensor(new_h));
                                                        bench_end("StoreSlice2D.fast_col", __b);
                                                        pc += 1;
                                                        continue;
                                                    }
                                                    Err(_) => { /* fall through to gather path */ }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                            // Row write A(i, :) = rhs (gpu)
                            // Same constraints as the column write: device RHS with a
                            // matching column count, in-bounds row, no growth.
                            if let (Sel::Scalar(i), Sel::Colon) = (&sel0, &sel1) {
                                let i0 = *i - 1;
                                if i0 < rows {
                                    if let Value::GpuTensor(vh) = &rhs {
                                        let v_cols = match vh.shape.len() {
                                            1 => vh.shape[0],
                                            2 => vh.shape[1],
                                            _ => 0,
                                        };
                                        if v_cols == cols {
                                            if let Some(p) = runmat_accelerate_api::provider() {
                                                match p.scatter_row(&h, i0, vh) {
                                                    Ok(new_h) => {
                                                        stack.push(Value::GpuTensor(new_h));
                                                        bench_end("StoreSlice2D.fast_row", __b);
                                                        pc += 1;
                                                        continue;
                                                    }
                                                    Err(_) => { /* fall through */ }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        // Gather–mutate–reupload fallback for slice assignment on GPU bases
                        let provider = runmat_accelerate_api::provider()
                            .ok_or_else(|| "No acceleration provider registered".to_string())?;
                        let host = provider
                            .download(&h)
                            .map_err(|e| format!("gather for slice assign: {e}"))?;
                        let mut t = runmat_builtins::Tensor::new(host.data, host.shape)
                            .map_err(|e| format!("slice assign: {e}"))?;
                        // Linear 1-D indexing assignment: A(I) = rhs
                        if dims == 1 {
                            let total = t.data.len();
                            // Build linear index list
                            // Entries stay 1-based until the final scatter below.
                            let mut lin_indices: Vec<usize> = Vec::new();
                            let is_colon = (colon_mask & 1u32) != 0;
                            let is_end = (end_mask & 1u32) != 0;
                            if is_colon {
                                // A(:) = rhs selects every element.
                                lin_indices = (1..=total).collect();
                            } else if is_end {
                                // A(end) = rhs selects the last element only.
                                lin_indices = vec![total];
                            } else {
                                let v = numeric
                                    .first()
                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
                                match v {
                                    Value::Num(n) => {
                                        let i = *n as isize;
                                        if i < 1 || (i as usize) > total {
                                            vm_bail!(mex(
                                                "IndexOutOfBounds",
                                                "Index out of bounds"
                                            ));
                                        }
                                        lin_indices.push(i as usize);
                                    }
                                    Value::Tensor(idx_t) => {
                                        let len = idx_t.shape.iter().product::<usize>();
                                        if len == total {
                                            // Heuristic: an index tensor with as many elements
                                            // as the base is treated as a logical mask (nonzero
                                            // selects) rather than a position list.
                                            for (i, &val) in idx_t.data.iter().enumerate() {
                                                if val != 0.0 {
                                                    lin_indices.push(i + 1);
                                                }
                                            }
                                        } else {
                                            for &val in &idx_t.data {
                                                let i = val as isize;
                                                if i < 1 || (i as usize) > total {
                                                    vm_bail!(mex(
                                                        "IndexOutOfBounds",
                                                        "Index out of bounds"
                                                    ));
                                                }
                                                lin_indices.push(i as usize);
                                            }
                                        }
                                    }
                                    _ => vm_bail!(mex(
                                        "UnsupportedIndexType",
                                        "Unsupported index type"
                                    )),
                                }
                            }
                            // Scatter RHS
                            match rhs {
                                // Scalar broadcasts to every selected position.
                                Value::Num(v) => {
                                    for &li in &lin_indices {
                                        t.data[li - 1] = v;
                                    }
                                }
                                Value::Tensor(rt) => {
                                    if rt.data.len() == 1 {
                                        // A 1-element tensor broadcasts like a scalar.
                                        let v = rt.data[0];
                                        for &li in &lin_indices {
                                            t.data[li - 1] = v;
                                        }
                                    } else if rt.data.len() == lin_indices.len() {
                                        for (k, &li) in lin_indices.iter().enumerate() {
                                            t.data[li - 1] = rt.data[k];
                                        }
                                    } else {
                                        vm_bail!(
                                            "shape mismatch for linear slice assign".to_string()
                                        );
                                    }
                                }
                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
                            }
                            // Push the mutated host copy back to the device and replace the
                            // stack value with the fresh handle.
                            let view = runmat_accelerate_api::HostTensorView {
                                data: &t.data,
                                shape: &t.shape,
                            };
                            let new_h = provider
                                .upload(&view)
                                .map_err(|e| format!("reupload after slice assign: {e}"))?;
                            stack.push(Value::GpuTensor(new_h));
                        } else {
                            // N-D subscript path on the gathered host copy: rebuild selectors
                            // from the colon/end masks and numeric operands (mirrors the host
                            // Tensor path's selector construction).
                            let rank = t.shape.len();
                            #[derive(Clone)]
                            enum Sel {
                                Colon,
                                Scalar(usize),
                                Indices(Vec<usize>),
                            }
                            let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
                            // Numeric operands are consumed in order only for dims that are
                            // neither colon nor 'end'.
                            let mut num_iter = 0usize;
                            for d in 0..dims {
                                let is_colon = (colon_mask & (1u32 << d)) != 0;
                                let is_end = (end_mask & (1u32 << d)) != 0;
                                if is_colon {
                                    selectors.push(Sel::Colon);
                                } else if is_end {
                                    // 'end' resolves against the gathered shape; missing dims
                                    // are treated as singleton.
                                    selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
                                } else {
                                    let v = numeric.get(num_iter).ok_or(mex(
                                        "MissingNumericIndex",
                                        "missing numeric index",
                                    ))?;
                                    num_iter += 1;
                                    match v {
                                        Value::Num(n) => {
                                            let idx = *n as isize;
                                            if idx < 1 {
                                                vm_bail!(mex(
                                                    "IndexOutOfBounds",
                                                    "Index out of bounds"
                                                ));
                                            }
                                            selectors.push(Sel::Scalar(idx as usize));
                                        }
                                        Value::Tensor(idx_t) => {
                                            let dim_len = *t.shape.get(d).unwrap_or(&1);
                                            let len = idx_t.shape.iter().product::<usize>();
                                            if len == dim_len {
                                                // Heuristic: an index tensor whose length equals
                                                // the dimension is treated as a logical mask.
                                                let mut v = Vec::new();
                                                for (i, &val) in idx_t.data.iter().enumerate() {
                                                    if val != 0.0 {
                                                        v.push(i + 1);
                                                    }
                                                }
                                                selectors.push(Sel::Indices(v));
                                            } else {
                                                let mut v = Vec::with_capacity(len);
                                                for &val in &idx_t.data {
                                                    let idx = val as isize;
                                                    if idx < 1 {
                                                        vm_bail!(mex(
                                                            "IndexOutOfBounds",
                                                            "Index out of bounds"
                                                        ));
                                                    }
                                                    v.push(idx as usize);
                                                }
                                                selectors.push(Sel::Indices(v));
                                            }
                                        }
                                        _ => vm_bail!(mex(
                                            "UnsupportedIndexType",
                                            "Unsupported index type"
7482                                        )),
7483                                    }
7484                                }
7485                            }
                            // 2-D write fast paths (full column/row) with strict broadcast checks
                            if dims == 2 {
                                let rows = if rank >= 1 { t.shape[0] } else { 1 };
                                let cols = if rank >= 2 { t.shape[1] } else { 1 };
                                match (&selectors[0], &selectors[1]) {
                                    // A(:, j) = rhs — full-column store fast path.
                                    (Sel::Colon, Sel::Scalar(j)) => {
                                        let j0 = *j - 1;
                                        // Size growth semantics: extend columns if needed.
                                        // Column growth keeps the column stride (`rows`)
                                        // unchanged, so existing columns block-copy over.
                                        if j0 >= cols {
                                            let new_cols = j0 + 1;
                                            let new_rows = rows;
                                            let mut new_data = vec![0.0f64; new_rows * new_cols];
                                            for c in 0..cols {
                                                let src_off = c * rows;
                                                let dst_off = c * new_rows;
                                                new_data[dst_off..dst_off + rows].copy_from_slice(
                                                    &t.data[src_off..src_off + rows],
                                                );
                                            }
                                            t.data = new_data;
                                            t.shape = vec![new_rows, new_cols];
                                        }
                                        // Column j0 is a contiguous run in column-major storage.
                                        let start = j0 * rows;
                                        // F5: try provider-side contig column scatter to avoid host round-trip on next writes (future optimization)
                                        match rhs {
                                            Value::Num(v) => {
                                                // Scalar broadcasts down the whole column.
                                                for r in 0..rows {
                                                    t.data[start + r] = v;
                                                }
                                            }
                                            Value::Tensor(rt) => {
                                                let len = rt.data.len();
                                                if len == rows {
                                                    for r in 0..rows {
                                                        t.data[start + r] = rt.data[r];
                                                    }
                                                } else if len == 1 {
                                                    // 1-element tensor broadcasts like a scalar.
                                                    for r in 0..rows {
                                                        t.data[start + r] = rt.data[0];
                                                    }
                                                } else {
                                                    vm_bail!("shape mismatch for slice assign"
                                                        .to_string());
                                                }
                                            }
                                            _ => {
                                                vm_bail!("rhs must be numeric or tensor".to_string())
                                            }
                                        }
                                        // Re-upload the mutated host copy so the stack keeps
                                        // holding a GPU-resident value.
                                        let view = runmat_accelerate_api::HostTensorView {
                                            data: &t.data,
                                            shape: &t.shape,
                                        };
                                        let new_h = provider.upload(&view).map_err(|e| {
                                            format!("reupload after slice assign: {e}")
                                        })?;
                                        stack.push(Value::GpuTensor(new_h));
                                        bench_end("StoreSlice2D.fast_col", __b);
                                        pc += 1;
                                        continue;
                                    }
7548                                    // A(i, :) = rhs
7549                                    (Sel::Scalar(i), Sel::Colon) => {
7550                                        let i0 = *i - 1;
7551                                        // Size growth semantics: extend rows if needed
7552                                        if i0 >= rows {
7553                                            let new_rows = i0 + 1;
7554                                            let new_cols = cols;
7555                                            let mut new_data = vec![0.0f64; new_rows * new_cols];
7556                                            for c in 0..cols {
7557                                                for r in 0..rows {
7558                                                    new_data[r + c * new_rows] =
7559                                                        t.data[r + c * rows];
7560                                                }
7561                                            }
7562                                            t.data = new_data;
7563                                            t.shape = vec![new_rows, new_cols];
7564                                        }
7565                                        // F5: try provider-side contig row scatter (future optimization)
7566                                        match rhs {
7567                                            Value::Num(v) => {
7568                                                for c in 0..cols {
7569                                                    t.data[i0 + c * rows] = v;
7570                                                }
7571                                            }
7572                                            Value::Tensor(rt) => {
7573                                                let len = rt.data.len();
7574                                                if len == cols {
7575                                                    for c in 0..cols {
7576                                                        t.data[i0 + c * rows] = rt.data[c];
7577                                                    }
7578                                                } else if len == 1 {
7579                                                    for c in 0..cols {
7580                                                        t.data[i0 + c * rows] = rt.data[0];
7581                                                    }
7582                                                } else {
7583                                                    vm_bail!("shape mismatch for slice assign"
7584                                                        .to_string());
7585                                                }
7586                                            }
7587                                            _ => {
7588                                                vm_bail!("rhs must be numeric or tensor".to_string())
7589                                            }
7590                                        }
7591                                        let view = runmat_accelerate_api::HostTensorView {
7592                                            data: &t.data,
7593                                            shape: &t.shape,
7594                                        };
7595                                        let new_h = provider.upload(&view).map_err(|e| {
7596                                            format!("reupload after slice assign: {e}")
7597                                        })?;
7598                                        stack.push(Value::GpuTensor(new_h));
7599                                        bench_end("StoreSlice2D.fast_row", __b);
7600                                        pc += 1;
7601                                        continue;
7602                                    }
                                    _ => {}
                                }
                            }
                            // Generic N-D writer path (GPU gather-mutate-reupload)
                            // Build per-dim index lists and strides
                            // NOTE(review): unlike the 2-D fast paths above, this path does
                            // not grow the tensor — out-of-range indices bail out; confirm
                            // the asymmetry is intended.
                            let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
                            let full_shape: Vec<usize> = if rank < dims {
                                // Pad trailing dims with 1 so every selector has a length.
                                let mut s = t.shape.clone();
                                s.resize(dims, 1);
                                s
                            } else {
                                t.shape.clone()
                            };
                            for d in 0..dims {
                                let dim_len = full_shape[d];
                                let idxs = match &selectors[d] {
                                    Sel::Colon => (1..=dim_len).collect(),
                                    Sel::Scalar(i) => vec![*i],
                                    Sel::Indices(v) => v.clone(),
                                };
                                // Indices are 1-based; 0 or > dim_len is out of bounds.
                                if idxs.iter().any(|&i| i == 0 || i > dim_len) {
                                    vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
                                }
                                per_dim_indices.push(idxs);
                            }
                            // Column-major strides (first dimension fastest)
                            let mut strides: Vec<usize> = vec![0; dims];
                            let mut acc = 1usize;
                            for d in 0..dims {
                                strides[d] = acc;
                                acc *= full_shape[d];
                            }
                            let total_out: usize =
                                per_dim_indices.iter().map(|v| v.len()).product();
                            // Prepare RHS values
                            // RHS is either a broadcast scalar, or a tensor viewed with
                            // column-major strides over a dims-length shape.
                            enum RhsView {
                                Scalar(f64),
                                Tensor {
                                    data: Vec<f64>,
                                    shape: Vec<usize>,
                                    strides: Vec<usize>,
                                },
                            }
                            let rhs_view = match rhs {
                                Value::Num(n) => RhsView::Scalar(n),
                                Value::Tensor(rt) => {
                                    // Allow exact match or N-D broadcasting where rhs_dim is 1 or equals out_dim
                                    let mut shape = rt.shape.clone();
                                    if shape.len() < dims {
                                        shape.resize(dims, 1);
                                    }
                                    if shape.len() > dims {
                                        // Extra trailing dims are only allowed if singleton.
                                        if shape.iter().skip(dims).any(|&s| s != 1) {
                                            vm_bail!("shape mismatch for slice assign".to_string());
                                        }
                                        shape.truncate(dims);
                                    }
                                    // Each rhs dim must be 1 (broadcast) or equal the
                                    // selected output length for that dim.
                                    let mut ok = true;
                                    for d in 0..dims {
                                        let out_len = per_dim_indices[d].len();
                                        let rhs_len = shape[d];
                                        if !(rhs_len == 1 || rhs_len == out_len) {
                                            ok = false;
                                            break;
                                        }
                                    }
                                    if !ok {
                                        vm_bail!("shape mismatch for slice assign".to_string());
                                    }
                                    let mut rstrides = vec![0usize; dims];
                                    let mut racc = 1usize;
                                    for d in 0..dims {
                                        rstrides[d] = racc;
                                        racc *= shape[d];
                                    }
                                    RhsView::Tensor {
                                        data: rt.data,
                                        shape,
                                        strides: rstrides,
                                    }
                                }
                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
                            };
                            // Iterate and scatter
                            let mut _k = 0usize; // write counter; kept but not otherwise read
                            let mut idx = vec![0usize; dims];
                            if total_out == 0 {
                                // Empty selection: nothing to write, but still re-upload so
                                // the stack ends up holding a GPU-resident value.
                                let view = runmat_accelerate_api::HostTensorView {
                                    data: &t.data,
                                    shape: &t.shape,
                                };
                                let new_h = provider
                                    .upload(&view)
                                    .map_err(|e| format!("reupload after slice assign: {e}"))?;
                                stack.push(Value::GpuTensor(new_h));
                            } else {
                                // Odometer iteration over the selected N-D region.
                                loop {
                                    // Linear destination offset for the current position.
                                    let mut lin = 0usize;
                                    for d in 0..dims {
                                        let i0 = per_dim_indices[d][idx[d]] - 1;
                                        lin += i0 * strides[d];
                                    }
                                    match &rhs_view {
                                        RhsView::Scalar(val) => t.data[lin] = *val,
                                        RhsView::Tensor {
                                            data,
                                            shape,
                                            strides,
                                        } => {
                                            // Broadcast singleton rhs dims by pinning pos to 0.
                                            let mut rlin = 0usize;
                                            for d in 0..dims {
                                                let rhs_len = shape[d];
                                                let pos = if rhs_len == 1 { 0 } else { idx[d] };
                                                rlin += pos * strides[d];
                                            }
                                            t.data[lin] = data[rlin];
                                        }
                                    }
                                    _k += 1;
                                    // Increment first dim fastest
                                    let mut d = 0usize;
                                    while d < dims {
                                        idx[d] += 1;
                                        if idx[d] < per_dim_indices[d].len() {
                                            break;
                                        }
                                        idx[d] = 0;
                                        d += 1;
                                    }
                                    // All dims rolled over: iteration complete.
                                    if d == dims {
                                        break;
                                    }
                                }
                                // Re-upload the mutated host copy to the device.
                                let view = runmat_accelerate_api::HostTensorView {
                                    data: &t.data,
                                    shape: &t.shape,
                                };
                                let new_h = provider
                                    .upload(&view)
                                    .map_err(|e| format!("reupload after slice assign: {e}"))?;
                                stack.push(Value::GpuTensor(new_h));
                            }
                        }
                    }
                    Value::StringArray(mut sa) => {
                        // String arrays reuse the shared selector/plan helpers rather
                        // than the inline numeric path above.
                        let selectors =
                            build_slice_selectors(dims, colon_mask, end_mask, &numeric, &sa.shape)
                                .map_err(|e| format!("slice assign: {e}"))?;
                        let plan = build_slice_plan(&selectors, dims, &sa.shape).map_err(|e| {
                            // Preserve structured out-of-bounds identifiers verbatim.
                            if e.contains("IndexOutOfBounds") {
                                e.clone()
                            } else {
                                format!("slice assign: {e}")
                            }
                        })?;
                        if plan.indices.is_empty() {
                            // Empty selection is a no-op: push the array back unchanged.
                            stack.push(Value::StringArray(sa));
                            bench_end("StoreSlice", __b);
                            pc += 1;
                            continue;
                        }
                        let rhs_view = build_string_rhs_view(&rhs, &plan.selection_lengths)
                            .map_err(|e| format!("slice assign: {e}"))?;
                        scatter_string_with_plan(&mut sa, &plan, &rhs_view)
                            .map_err(|e| format!("slice assign: {e}"))?;
                        stack.push(Value::StringArray(sa));
                        bench_end("StoreSlice", __b);
                        pc += 1;
                        continue;
                        // legacy path removed in favor of scatter_string_with_plan
                    }
                    _ => vm_bail!(
                        "Slicing assignment only supported on tensors or string arrays".to_string()
                    ),
                }
                bench_end("StoreSlice", __b);
            }
            // StoreSliceEx: like StoreSlice but supports `end-k` offset arithmetic
            // (e.g. A(end-1, :) = rhs). `end_offsets` pairs a numeric-operand
            // position with its offset from `end`.
            Instr::StoreSliceEx(dims, numeric_count, colon_mask, end_mask, end_offsets) => {
                let rhs = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // Numeric index operands were pushed in order; pop then reverse.
                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
                for _ in 0..numeric_count {
                    numeric.push(
                        stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
                    );
                }
                numeric.reverse();
                let mut base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                if let Value::GpuTensor(handle) = &base {
                    // GPU path: resolve `end-k` against the device shape, then try a
                    // provider-side linear scatter to avoid a host round-trip.
                    let adjusted = apply_end_offsets_to_numeric(
                        &numeric,
                        dims,
                        colon_mask,
                        end_mask,
                        &end_offsets,
                        &handle.shape,
                    );
                    if let Some(provider) = runmat_accelerate_api::provider() {
                        if let Ok(selectors) = build_slice_selectors(
                            dims,
                            colon_mask,
                            end_mask,
                            &adjusted,
                            &handle.shape,
                        ) {
                            if let Ok(plan) = build_slice_plan(&selectors, dims, &handle.shape) {
                                // Materialize the RHS to a flat value list matching the
                                // planned selection.
                                let values = if plan.dims == 1 {
                                    let count =
                                        plan.selection_lengths.first().copied().unwrap_or(0);
                                    materialize_rhs_linear(&rhs, count)
                                } else {
                                    materialize_rhs_nd(&rhs, &plan.selection_lengths)
                                }
                                .map_err(|e| format!("slice assign: {e}"))?;
                                if values.len() == plan.indices.len() {
                                    let value_shape = vec![values.len().max(1), 1];
                                    let upload_result = if values.is_empty() {
                                        provider.zeros(&[0, 1])
                                    } else {
                                        provider.upload(&runmat_accelerate_api::HostTensorView {
                                            data: &values,
                                            shape: &value_shape,
                                        })
                                    };
                                    if let Ok(values_handle) = upload_result {
                                        if provider
                                            .scatter_linear(handle, &plan.indices, &values_handle)
                                            .is_ok()
                                        {
                                            // In-place device scatter succeeded; keep the
                                            // same handle on the stack.
                                            stack.push(Value::GpuTensor(handle.clone()));
                                            pc += 1;
                                            continue;
                                        }
                                    }
                                }
                            }
                        }
                        // Scatter fast path unavailable: download and fall back to the
                        // host tensor path below.
                        let host = provider
                            .download(handle)
                            .map_err(|e| format!("slice assign: {e}"))?;
                        let tensor = runmat_builtins::Tensor::new(host.data, host.shape)
                            .map_err(|e| format!("slice assign: {e}"))?;
                        base = Value::Tensor(tensor);
                    } else {
                        return Err("No acceleration provider registered".to_string());
                    }
                }
                match base {
                    Value::Tensor(t) => {
                        // Adjust numeric indices for end offsets, mapping numeric position to actual dimension
                        let mut adjusted = numeric.clone();
                        for (pos, off) in end_offsets {
                            if let Some(v) = adjusted.get_mut(pos) {
                                // Map numeric index position to dimension index by skipping colon and plain end dims
                                // NOTE(review): if `pos` exceeds the count of numeric
                                // dims, `dim_for_pos` stays 0 — confirm unreachable.
                                let mut seen_numeric = 0usize;
                                let mut dim_for_pos = 0usize;
                                for d in 0..dims {
                                    let is_colon = (colon_mask & (1u32 << d)) != 0;
                                    let is_end = (end_mask & (1u32 << d)) != 0;
                                    if is_colon || is_end {
                                        continue;
                                    }
                                    if seen_numeric == pos {
                                        dim_for_pos = d;
                                        break;
                                    }
                                    seen_numeric += 1;
                                }
                                // Resolve `end - off` against the dimension length.
                                let dim_len = *t.shape.get(dim_for_pos).unwrap_or(&1);
                                let idx_val = (dim_len as isize) - (off as isize);
                                *v = Value::Num(idx_val as f64);
                            }
                        }
                        // Reuse StoreSlice by pushing base back along with adjusted numerics and rhs
                        stack.push(Value::Tensor(t));
                        for v in adjusted {
                            stack.push(v);
                        }
                        stack.push(rhs);
                        // Fallthrough emulation: replicate logic of StoreSlice with broadcasting
                        // NOTE(review): this push/pop round-trip through the VM stack is
                        // redundant (the values are popped straight back); kept as-is.
                        let rhs = stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
                        let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
                        for _ in 0..numeric_count {
                            numeric.push(
                                stack
                                    .pop()
                                    .ok_or(mex("StackUnderflow", "stack underflow"))?,
                            );
                        }
                        numeric.reverse();
                        let base = stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?;
                        match base {
                            Value::Tensor(mut t) => {
                                // Inline copy of the StoreSlice selector build (see that
                                // arm); duplicated rather than factored into a helper.
                                #[derive(Clone)]
                                enum Sel {
                                    Colon,
                                    Scalar(usize),
                                    Indices(Vec<usize>),
                                }
                                let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
                                let mut num_iter = 0usize;
                                for d in 0..dims {
                                    let is_colon = (colon_mask & (1u32 << d)) != 0;
                                    let is_end = (end_mask & (1u32 << d)) != 0;
                                    if is_colon {
                                        selectors.push(Sel::Colon);
                                    } else if is_end {
                                        // Bare `end` resolves to the dimension length.
                                        selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
                                    } else {
                                        let v = numeric.get(num_iter).ok_or(mex(
                                            "MissingNumericIndex",
                                            "missing numeric index",
                                        ))?;
                                        num_iter += 1;
                                        match v {
                                            Value::Num(n) => {
                                                let idx = *n as isize;
                                                if idx < 1 {
                                                    vm_bail!(mex(
                                                        "IndexOutOfBounds",
                                                        "Index out of bounds"
                                                    ));
                                                }
                                                selectors.push(Sel::Scalar(idx as usize));
                                            }
                                            Value::Tensor(idx_t) => {
                                                let dim_len = *t.shape.get(d).unwrap_or(&1);
                                                let len = idx_t.shape.iter().product::<usize>();
                                                if len == dim_len {
                                                    // Length matches the dim: logical mask.
                                                    let mut vi = Vec::new();
                                                    for (i, &val) in idx_t.data.iter().enumerate() {
                                                        if val != 0.0 {
                                                            vi.push(i + 1);
                                                        }
                                                    }
                                                    selectors.push(Sel::Indices(vi));
                                                } else {
                                                    // Explicit 1-based index list.
                                                    let mut vi = Vec::with_capacity(len);
                                                    for &val in &idx_t.data {
                                                        let idx = val as isize;
                                                        if idx < 1 {
                                                            vm_bail!(mex(
                                                                "IndexOutOfBounds",
                                                                "Index out of bounds"
                                                            ));
                                                        }
                                                        vi.push(idx as usize);
                                                    }
                                                    selectors.push(Sel::Indices(vi));
                                                }
                                            }
                                            _ => vm_bail!(mex(
                                                "UnsupportedIndexType",
                                                "Unsupported index type"
                                            )),
                                        }
                                    }
                                }
                                // Compute per-dim indices and strides
                                let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
                                for (d, sel) in selectors.iter().enumerate().take(dims) {
                                    let dim_len = *t.shape.get(d).unwrap_or(&1);
                                    let idxs = match sel {
                                        Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
                                        Sel::Scalar(i) => vec![*i],
                                        Sel::Indices(v) => v.clone(),
                                    };
                                    per_dim_indices.push(idxs);
                                }
                                // Column-major strides (first dimension fastest).
                                let mut strides: Vec<usize> = vec![0; dims];
                                let mut acc = 1usize;
                                for (d, stride) in strides.iter_mut().enumerate().take(dims) {
                                    *stride = acc;
                                    acc *= *t.shape.get(d).unwrap_or(&1);
                                }
7987                                // Build RHS view with broadcasting like StoreSlice
                                // Broadcast-aware view of the RHS of a slice
                                // assignment. `Scalar` splats a single value over
                                // every selected element; `Tensor` carries the RHS
                                // data together with its shape normalized to the
                                // base's dimensionality and the matching
                                // column-major strides for per-dimension lookup.
                                enum RhsView {
                                    Scalar(f64),
                                    Tensor {
                                        data: Vec<f64>,
                                        shape: Vec<usize>,
                                        strides: Vec<usize>,
                                    },
                                }
7996                                let rhs_view =
7997                                    match rhs {
7998                                        Value::Num(n) => RhsView::Scalar(n),
7999                                        Value::Tensor(rt) => {
8000                                            let mut rshape = rt.shape.clone();
8001                                            if rshape.len() < dims {
8002                                                rshape.resize(dims, 1);
8003                                            }
8004                                            if rshape.len() > dims {
8005                                                if rshape.iter().skip(dims).any(|&s| s != 1) {
8006                                                    vm_bail!("shape mismatch for slice assign"
8007                                                        .to_string());
8008                                                }
8009                                                rshape.truncate(dims);
8010                                            }
8011                                            for d in 0..dims {
8012                                                let out_len = per_dim_indices[d].len();
8013                                                let rhs_len = rshape[d];
8014                                                if !(rhs_len == 1 || rhs_len == out_len) {
8015                                                    vm_bail!("shape mismatch for slice assign"
8016                                                        .to_string());
8017                                                }
8018                                            }
8019                                            let mut rstrides = vec![0usize; dims];
8020                                            let mut racc = 1usize;
8021                                            for d in 0..dims {
8022                                                rstrides[d] = racc;
8023                                                racc *= rshape[d];
8024                                            }
8025                                            RhsView::Tensor {
8026                                                data: rt.data,
8027                                                shape: rshape,
8028                                                strides: rstrides,
8029                                            }
8030                                        }
8031                                        _ => vm_bail!("rhs must be numeric or tensor".to_string()),
8032                                    };
8033                                // Map absolute indices to selection positions per dimension
8034                                use std::collections::HashMap;
8035                                let mut pos_maps: Vec<HashMap<usize, usize>> =
8036                                    Vec::with_capacity(dims);
8037                                for (_d, dim_idxs) in per_dim_indices.iter().enumerate().take(dims)
8038                                {
8039                                    let mut m = HashMap::new();
8040                                    for (p, &idx) in dim_idxs.iter().enumerate() {
8041                                        m.insert(idx, p);
8042                                    }
8043                                    pos_maps.push(m);
8044                                }
8045                                fn cartesian2<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
8046                                    let dims = lists.len();
8047                                    let mut idx = vec![0usize; dims];
8048                                    loop {
8049                                        let cur: Vec<usize> =
8050                                            (0..dims).map(|d| lists[d][idx[d]]).collect();
8051                                        f(&cur);
8052                                        let mut d = 0usize;
8053                                        while d < dims {
8054                                            idx[d] += 1;
8055                                            if idx[d] < lists[d].len() {
8056                                                break;
8057                                            }
8058                                            idx[d] = 0;
8059                                            d += 1;
8060                                        }
8061                                        if d == dims {
8062                                            break;
8063                                        }
8064                                    }
8065                                }
8066                                cartesian2(&per_dim_indices, |multi| {
8067                                    let mut lin = 0usize;
8068                                    for d in 0..dims {
8069                                        let i0 = multi[d] - 1;
8070                                        lin += i0 * strides[d];
8071                                    }
8072                                    match &rhs_view {
8073                                        RhsView::Scalar(v) => {
8074                                            t.data[lin] = *v;
8075                                        }
8076                                        RhsView::Tensor {
8077                                            data,
8078                                            shape,
8079                                            strides: rstrides,
8080                                        } => {
8081                                            let mut rlin = 0usize;
8082                                            for d in 0..dims {
8083                                                let rhs_len = shape[d];
8084                                                let pos_in_dim = if rhs_len == 1 {
8085                                                    0
8086                                                } else {
8087                                                    *pos_maps[d].get(&multi[d]).unwrap_or(&0)
8088                                                };
8089                                                rlin += pos_in_dim * rstrides[d];
8090                                            }
8091                                            t.data[lin] = data[rlin];
8092                                        }
8093                                    }
8094                                });
8095                                stack.push(Value::Tensor(t));
8096                            }
8097                            Value::StringArray(mut sa) => {
8098                                let selectors = build_slice_selectors(
8099                                    dims, colon_mask, end_mask, &numeric, &sa.shape,
8100                                )
8101                                .map_err(|e| format!("slice assign: {e}"))?;
8102                                let plan =
8103                                    build_slice_plan(&selectors, dims, &sa.shape).map_err(|e| {
8104                                        if e.contains("IndexOutOfBounds") {
8105                                            e.clone()
8106                                        } else {
8107                                            format!("slice assign: {e}")
8108                                        }
8109                                    })?;
8110                                if plan.indices.is_empty() {
8111                                    stack.push(Value::StringArray(sa));
8112                                    pc += 1;
8113                                    continue;
8114                                }
8115                                let rhs_view = build_string_rhs_view(&rhs, &plan.selection_lengths)
8116                                    .map_err(|e| format!("slice assign: {e}"))?;
8117                                scatter_string_with_plan(&mut sa, &plan, &rhs_view)
8118                                    .map_err(|e| format!("slice assign: {e}"))?;
8119                                stack.push(Value::StringArray(sa));
8120                                pc += 1;
8121                                continue;
8122                            }
8123                            other => vm_bail!(format!("StoreSliceEx unsupported base: {other:?}")),
8124                        }
8125                    }
8126                    other => vm_bail!(format!(
8127                        "StoreSliceEx only supports tensors currently, got {other:?}"
8128                    )),
8129                }
8130            }
8131            Instr::StoreRangeEnd {
8132                dims,
8133                numeric_count,
8134                colon_mask,
8135                end_mask,
8136                range_dims,
8137                range_has_step,
8138                end_offsets,
8139            } => {
8140                // RHS, range params (per range dim), then base with numeric scalar indices interleaved
8141                let mut rhs = stack
8142                    .pop()
8143                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
8144                // Pop per-range params in reverse order
8145                let mut range_params: Vec<(f64, f64)> = Vec::with_capacity(range_dims.len());
8146                for i in (0..range_dims.len()).rev() {
8147                    let has = range_has_step[i];
8148                    let step = if has {
8149                        let v: f64 = (&stack
8150                            .pop()
8151                            .ok_or(mex("StackUnderflow", "stack underflow"))?)
8152                            .try_into()?;
8153                        v
8154                    } else {
8155                        1.0
8156                    };
8157                    let st: f64 = (&stack
8158                        .pop()
8159                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
8160                        .try_into()?;
8161                    range_params.push((st, step));
8162                }
8163                range_params.reverse();
8164                let mut numeric: Vec<Value> = Vec::with_capacity(numeric_count);
8165                for _ in 0..numeric_count {
8166                    numeric.push(
8167                        stack
8168                            .pop()
8169                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
8170                    );
8171                }
8172                numeric.reverse();
8173                let mut base = stack
8174                    .pop()
8175                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
8176                #[cfg(feature = "native-accel")]
8177                clear_residency(&base);
8178                // If base is not assignable but rhs is, swap them to handle reversed emission order
8179                let base_assignable = matches!(
8180                    base,
8181                    Value::Object(_) | Value::Tensor(_) | Value::GpuTensor(_)
8182                );
8183                if !base_assignable
8184                    && matches!(
8185                        rhs,
8186                        Value::Object(_) | Value::Tensor(_) | Value::GpuTensor(_)
8187                    )
8188                {
8189                    std::mem::swap(&mut base, &mut rhs);
8190                }
8191                match base {
8192                    Value::Tensor(mut t) => {
                        // Per-dimension selector for StoreRangeEnd indexing.
                        #[derive(Clone)]
                        enum Sel {
                            // `:` — select every index along the dimension.
                            Colon,
                            // A single 1-based index.
                            Scalar(usize),
                            // Explicit list of 1-based indices (from a numeric index
                            // tensor, or from a logical mask whose length matches the
                            // dimension).
                            Indices(Vec<usize>),
                            // `start:step:(end - end_off)` where `end` is resolved
                            // from the dimension length at scatter time.
                            Range { start: i64, step: i64, end_off: i64 },
                        }
8200                        let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
8201                        let mut num_iter = 0usize;
8202                        let mut rp_iter = 0usize;
8203                        for d in 0..dims {
8204                            if let Some(pos) = range_dims.iter().position(|&rd| rd == d) {
8205                                let (st, sp) = range_params[rp_iter];
8206                                rp_iter += 1;
8207                                let step_i = if sp >= 0.0 {
8208                                    sp as i64
8209                                } else {
8210                                    -(sp.abs() as i64)
8211                                };
8212                                selectors.push(Sel::Range {
8213                                    start: st as i64,
8214                                    step: step_i,
8215                                    end_off: end_offsets[pos],
8216                                });
8217                                continue;
8218                            }
8219                            let is_colon = (colon_mask & (1u32 << d)) != 0;
8220                            let is_end = (end_mask & (1u32 << d)) != 0;
8221                            if is_colon {
8222                                selectors.push(Sel::Colon);
8223                                continue;
8224                            }
8225                            if is_end {
8226                                selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
8227                                continue;
8228                            }
8229                            let v = numeric
8230                                .get(num_iter)
8231                                .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
8232                            num_iter += 1;
8233                            match v {
8234                                Value::Num(n) => {
8235                                    let idx = *n as isize;
8236                                    if idx < 1 {
8237                                        vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
8238                                    }
8239                                    selectors.push(Sel::Scalar(idx as usize));
8240                                }
8241                                Value::Tensor(idx_t) => {
8242                                    let dim_len = *t.shape.get(d).unwrap_or(&1);
8243                                    let len = idx_t.shape.iter().product::<usize>();
8244                                    if len == dim_len {
8245                                        let mut vi = Vec::new();
8246                                        for (i, &val) in idx_t.data.iter().enumerate() {
8247                                            if val != 0.0 {
8248                                                vi.push(i + 1);
8249                                            }
8250                                        }
8251                                        selectors.push(Sel::Indices(vi));
8252                                    } else {
8253                                        let mut vi = Vec::with_capacity(len);
8254                                        for &val in &idx_t.data {
8255                                            let idx = val as isize;
8256                                            if idx < 1 {
8257                                                vm_bail!(mex(
8258                                                    "IndexOutOfBounds",
8259                                                    "Index out of bounds"
8260                                                ));
8261                                            }
8262                                            vi.push(idx as usize);
8263                                        }
8264                                        selectors.push(Sel::Indices(vi));
8265                                    }
8266                                }
8267                                _ => {
8268                                    vm_bail!(mex("UnsupportedIndexType", "Unsupported index type"))
8269                                }
8270                            }
8271                        }
8272                        // Build index lists and scatter rhs with broadcasting
8273                        // debug removed
8274                        let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
8275                        for (d, sel) in selectors.iter().enumerate().take(dims) {
8276                            let dim_len = *t.shape.get(d).unwrap_or(&1);
8277                            let idxs = match sel {
8278                                Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
8279                                Sel::Scalar(i) => vec![*i],
8280                                Sel::Indices(v) => v.clone(),
8281                                Sel::Range {
8282                                    start,
8283                                    step,
8284                                    end_off,
8285                                } => {
8286                                    let mut v = Vec::new();
8287                                    let mut cur = *start;
8288                                    let end_i = (dim_len as i64) - *end_off;
8289                                    let stp = *step;
8290                                    if stp == 0 {
8291                                        vm_bail!(mex("IndexStepZero", "Index step cannot be zero"));
8292                                    }
8293                                    if stp > 0 {
8294                                        while cur <= end_i {
8295                                            if cur < 1 || cur > dim_len as i64 {
8296                                                break;
8297                                            }
8298                                            v.push(cur as usize);
8299                                            cur += stp;
8300                                        }
8301                                    } else {
8302                                        while cur >= end_i {
8303                                            if cur < 1 || cur > dim_len as i64 {
8304                                                break;
8305                                            }
8306                                            v.push(cur as usize);
8307                                            cur += stp;
8308                                        }
8309                                    }
8310                                    v
8311                                }
8312                            };
8313                            if idxs.iter().any(|&i| i == 0 || i > dim_len) {
8314                                vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
8315                            }
8316                            per_dim_indices.push(idxs);
8317                        }
8318                        let mut strides: Vec<usize> = vec![0; dims];
8319                        let mut acc = 1usize;
8320                        for (d, stride) in strides.iter_mut().enumerate().take(dims) {
8321                            *stride = acc;
8322                            acc *= *t.shape.get(d).unwrap_or(&1);
8323                        }
8324                        let selection_empty = per_dim_indices.iter().any(|v| v.is_empty());
8325                        if selection_empty {
8326                            stack.push(Value::Tensor(t));
8327                        } else {
8328                            // Build broadcasting view for RHS with per-dimension shape
                            // View of the assignment RHS with broadcasting support:
                            // `Scalar` writes one value to every selected element,
                            // while `Tensor` holds the RHS data plus its shape
                            // (padded/truncated to the base's dims) and column-major
                            // strides used to map selection coordinates to RHS
                            // elements.
                            enum RhsView {
                                Scalar(f64),
                                Tensor {
                                    data: Vec<f64>,
                                    shape: Vec<usize>,
                                    strides: Vec<usize>,
                                },
                            }
8337                            let rhs_view = match rhs {
8338                                Value::Num(n) => RhsView::Scalar(n),
8339                                Value::Tensor(rt) => {
8340                                    if rt.data.is_empty() {
8341                                        vm_bail!("shape mismatch for slice assign".to_string());
8342                                    }
8343                                    // Normalize RHS shape to dims by padding with ones or validating extra dims are ones
8344                                    let mut rshape = rt.shape.clone();
8345                                    if rshape.len() < dims {
8346                                        rshape.resize(dims, 1);
8347                                    }
8348                                    if rshape.len() > dims {
8349                                        if rshape.iter().skip(dims).any(|&s| s != 1) {
8350                                            vm_bail!("shape mismatch for slice assign".to_string());
8351                                        }
8352                                        rshape.truncate(dims);
8353                                    }
8354                                    // Validate broadcasting compatibility
8355                                    for d in 0..dims {
8356                                        let out_len = per_dim_indices[d].len();
8357                                        let rhs_len = rshape[d];
8358                                        if !(rhs_len == 1 || rhs_len == out_len) {
8359                                            vm_bail!("shape mismatch for slice assign".to_string());
8360                                        }
8361                                    }
8362                                    // Build column-major strides for RHS
8363                                    let mut rstrides = vec![0usize; dims];
8364                                    let mut racc = 1usize;
8365                                    for d in 0..dims {
8366                                        rstrides[d] = racc;
8367                                        racc *= rshape[d];
8368                                    }
8369                                    if racc != rt.data.len() {
8370                                        vm_bail!("shape mismatch for slice assign".to_string());
8371                                    }
8372                                    RhsView::Tensor {
8373                                        data: rt.data,
8374                                        shape: rshape,
8375                                        strides: rstrides,
8376                                    }
8377                                }
8378                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
8379                            };
8380                            // Precompute mapping from absolute index to position-in-selection per dimension to ensure column-major consistent mapping
8381                            use std::collections::HashMap;
8382                            let mut pos_maps: Vec<HashMap<usize, usize>> = Vec::with_capacity(dims);
8383                            for dim_idxs in per_dim_indices.iter().take(dims) {
8384                                let mut m: HashMap<usize, usize> = HashMap::new();
8385                                for (p, &idx) in dim_idxs.iter().enumerate() {
8386                                    m.insert(idx, p);
8387                                }
8388                                pos_maps.push(m);
8389                            }
8390                            fn cartesian2<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
8391                                let dims = lists.len();
8392                                let mut idx = vec![0usize; dims];
8393                                loop {
8394                                    let cur: Vec<usize> =
8395                                        (0..dims).map(|d| lists[d][idx[d]]).collect();
8396                                    f(&cur);
8397                                    let mut d = 0usize;
8398                                    while d < dims {
8399                                        idx[d] += 1;
8400                                        if idx[d] < lists[d].len() {
8401                                            break;
8402                                        }
8403                                        idx[d] = 0;
8404                                        d += 1;
8405                                    }
8406                                    if d == dims {
8407                                        break;
8408                                    }
8409                                }
8410                            }
8411                            // debug removed
8412                            let mut err_opt: Option<String> = None;
8413                            let mut _debug_count = 0usize;
8414                            cartesian2(&per_dim_indices, |multi| {
8415                                if err_opt.is_some() {
8416                                    return;
8417                                }
8418                                let mut lin = 0usize;
8419                                for d in 0..dims {
8420                                    let i0 = multi[d] - 1;
8421                                    lin += i0 * strides[d];
8422                                }
8423                                match &rhs_view {
8424                                    RhsView::Scalar(val) => t.data[lin] = *val,
8425                                    RhsView::Tensor {
8426                                        data,
8427                                        shape,
8428                                        strides: rstrides,
8429                                    } => {
8430                                        // Map selection coordinate to RHS coordinate with broadcasting
8431                                        let mut rlin = 0usize;
8432                                        for d in 0..dims {
8433                                            let rhs_len = shape[d];
8434                                            let pos_in_dim = if rhs_len == 1 {
8435                                                0
8436                                            } else {
8437                                                *pos_maps[d].get(&multi[d]).unwrap_or(&0)
8438                                            };
8439                                            rlin += pos_in_dim * rstrides[d];
8440                                        }
8441                                        if rlin >= data.len() {
8442                                            err_opt =
8443                                                Some("shape mismatch for slice assign".to_string());
8444                                            return;
8445                                        }
8446                                        t.data[lin] = data[rlin];
8447                                    }
8448                                }
8449                            });
8450                            let _ = (t.data.first(), t.data.len());
8451                            if let Some(e) = err_opt {
8452                                vm_bail!(e);
8453                            }
8454                            stack.push(Value::Tensor(t));
8455                        }
8456                    }
8457                    Value::GpuTensor(h) => {
8458                        let provider = runmat_accelerate_api::provider()
8459                            .ok_or_else(|| "No acceleration provider registered".to_string())?;
8460                        let host = provider
8461                            .download(&h)
8462                            .map_err(|e| format!("gather for range-end assign: {e}"))?;
8463                        let mut t = runmat_builtins::Tensor::new(host.data, host.shape)
8464                            .map_err(|e| format!("range-end assign: {e}"))?;
                        // Per-dimension selector for StoreRangeEnd on a GPU tensor
                        // (applied to the host copy after download).
                        #[derive(Clone)]
                        enum Sel {
                            // `:` — select every index along the dimension.
                            Colon,
                            // A single 1-based index.
                            Scalar(usize),
                            // Explicit list of 1-based indices.
                            Indices(Vec<usize>),
                            // `start:step:(end - end_off)` where `end` is resolved
                            // from the dimension length at scatter time.
                            Range { start: i64, step: i64, end_off: i64 },
                        }
8472                        let mut selectors: Vec<Sel> = Vec::with_capacity(dims);
8473                        let mut num_iter = 0usize;
8474                        let mut rp_iter = 0usize;
8475                        for d in 0..dims {
8476                            if let Some(pos) = range_dims.iter().position(|&rd| rd == d) {
8477                                let (st, sp) = range_params[rp_iter];
8478                                rp_iter += 1;
8479                                let step_i = if sp >= 0.0 {
8480                                    sp as i64
8481                                } else {
8482                                    -(sp.abs() as i64)
8483                                };
8484                                selectors.push(Sel::Range {
8485                                    start: st as i64,
8486                                    step: step_i,
8487                                    end_off: end_offsets[pos],
8488                                });
8489                                continue;
8490                            }
8491                            let is_colon = (colon_mask & (1u32 << d)) != 0;
8492                            let is_end = (end_mask & (1u32 << d)) != 0;
8493                            if is_colon {
8494                                selectors.push(Sel::Colon);
8495                                continue;
8496                            }
8497                            if is_end {
8498                                selectors.push(Sel::Scalar(*t.shape.get(d).unwrap_or(&1)));
8499                                continue;
8500                            }
8501                            let v = numeric
8502                                .get(num_iter)
8503                                .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
8504                            num_iter += 1;
8505                            match v {
8506                                Value::Num(n) => {
8507                                    let idx = *n as isize;
8508                                    if idx < 1 {
8509                                        vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
8510                                    }
8511                                    selectors.push(Sel::Scalar(idx as usize));
8512                                }
8513                                Value::Tensor(idx_t) => {
8514                                    let dim_len = *t.shape.get(d).unwrap_or(&1);
8515                                    let len = idx_t.shape.iter().product::<usize>();
8516                                    if len == dim_len {
8517                                        let mut vi = Vec::new();
8518                                        for (i, &val) in idx_t.data.iter().enumerate() {
8519                                            if val != 0.0 {
8520                                                vi.push(i + 1);
8521                                            }
8522                                        }
8523                                        selectors.push(Sel::Indices(vi));
8524                                    } else {
8525                                        let mut vi = Vec::with_capacity(len);
8526                                        for &val in &idx_t.data {
8527                                            let idx = val as isize;
8528                                            if idx < 1 {
8529                                                vm_bail!(mex(
8530                                                    "IndexOutOfBounds",
8531                                                    "Index out of bounds"
8532                                                ));
8533                                            }
8534                                            vi.push(idx as usize);
8535                                        }
8536                                        selectors.push(Sel::Indices(vi));
8537                                    }
8538                                }
8539                                _ => {
8540                                    vm_bail!(mex("UnsupportedIndexType", "Unsupported index type"))
8541                                }
8542                            }
8543                        }
8544                        // Build index lists and scatter rhs with broadcasting
8545                        // debug removed
8546                        let mut per_dim_indices: Vec<Vec<usize>> = Vec::with_capacity(dims);
8547                        for (d, sel) in selectors.iter().enumerate().take(dims) {
8548                            let dim_len = *t.shape.get(d).unwrap_or(&1);
8549                            let idxs = match sel {
8550                                Sel::Colon => (1..=dim_len).collect::<Vec<usize>>(),
8551                                Sel::Scalar(i) => vec![*i],
8552                                Sel::Indices(v) => v.clone(),
8553                                Sel::Range {
8554                                    start,
8555                                    step,
8556                                    end_off,
8557                                } => {
8558                                    let mut v = Vec::new();
8559                                    let mut cur = *start;
8560                                    let end_i = (dim_len as i64) - *end_off;
8561                                    let stp = *step;
8562                                    if stp == 0 {
8563                                        vm_bail!(mex("IndexStepZero", "Index step cannot be zero"));
8564                                    }
8565                                    if stp > 0 {
8566                                        while cur <= end_i {
8567                                            if cur < 1 || cur > dim_len as i64 {
8568                                                break;
8569                                            }
8570                                            v.push(cur as usize);
8571                                            cur += stp;
8572                                        }
8573                                    } else {
8574                                        while cur >= end_i {
8575                                            if cur < 1 || cur > dim_len as i64 {
8576                                                break;
8577                                            }
8578                                            v.push(cur as usize);
8579                                            cur += stp;
8580                                        }
8581                                    }
8582                                    v
8583                                }
8584                            };
8585                            if idxs.iter().any(|&i| i == 0 || i > dim_len) {
8586                                vm_bail!(mex("IndexOutOfBounds", "Index out of bounds"));
8587                            }
8588                            per_dim_indices.push(idxs);
8589                        }
8590                        let mut strides: Vec<usize> = vec![0; dims];
8591                        let mut acc = 1usize;
8592                        for (d, stride) in strides.iter_mut().enumerate().take(dims) {
8593                            *stride = acc;
8594                            acc *= *t.shape.get(d).unwrap_or(&1);
8595                        }
8596                        let selection_empty = per_dim_indices.iter().any(|v| v.is_empty());
8597                        if selection_empty {
8598                            let view = runmat_accelerate_api::HostTensorView {
8599                                data: &t.data,
8600                                shape: &t.shape,
8601                            };
8602                            let new_h = provider
8603                                .upload(&view)
8604                                .map_err(|e| format!("reupload after range-end assign: {e}"))?;
8605                            stack.push(Value::GpuTensor(new_h));
8606                        } else {
8607                            // Build broadcasting view for RHS with per-dimension shape
8608                            enum RhsView {
8609                                Scalar(f64),
8610                                Tensor {
8611                                    data: Vec<f64>,
8612                                    shape: Vec<usize>,
8613                                    strides: Vec<usize>,
8614                                },
8615                            }
8616                            let rhs_view = match rhs {
8617                                Value::Num(n) => RhsView::Scalar(n),
8618                                Value::Tensor(rt) => {
8619                                    if rt.data.is_empty() {
8620                                        vm_bail!("shape mismatch for slice assign".to_string());
8621                                    }
8622                                    // Normalize RHS shape to dims by padding with ones or validating extra dims are ones
8623                                    let mut rshape = rt.shape.clone();
8624                                    if rshape.len() < dims {
8625                                        rshape.resize(dims, 1);
8626                                    }
8627                                    if rshape.len() > dims {
8628                                        if rshape.iter().skip(dims).any(|&s| s != 1) {
8629                                            vm_bail!("shape mismatch for slice assign".to_string());
8630                                        }
8631                                        rshape.truncate(dims);
8632                                    }
8633                                    // Validate broadcasting compatibility
8634                                    for d in 0..dims {
8635                                        let out_len = per_dim_indices[d].len();
8636                                        let rhs_len = rshape[d];
8637                                        if !(rhs_len == 1 || rhs_len == out_len) {
8638                                            vm_bail!("shape mismatch for slice assign".to_string());
8639                                        }
8640                                    }
8641                                    // Build column-major strides for RHS
8642                                    let mut rstrides = vec![0usize; dims];
8643                                    let mut racc = 1usize;
8644                                    for d in 0..dims {
8645                                        rstrides[d] = racc;
8646                                        racc *= rshape[d];
8647                                    }
8648                                    if racc != rt.data.len() {
8649                                        vm_bail!("shape mismatch for slice assign".to_string());
8650                                    }
8651                                    RhsView::Tensor {
8652                                        data: rt.data,
8653                                        shape: rshape,
8654                                        strides: rstrides,
8655                                    }
8656                                }
8657                                _ => vm_bail!("rhs must be numeric or tensor".to_string()),
8658                            };
8659                            // Precompute mapping from absolute index to position-in-selection per dimension to ensure column-major consistent mapping
8660                            use std::collections::HashMap;
8661                            let mut pos_maps: Vec<HashMap<usize, usize>> = Vec::with_capacity(dims);
8662                            for dim_idxs in per_dim_indices.iter().take(dims) {
8663                                let mut m: HashMap<usize, usize> = HashMap::new();
8664                                for (p, &idx) in dim_idxs.iter().enumerate() {
8665                                    m.insert(idx, p);
8666                                }
8667                                pos_maps.push(m);
8668                            }
8669                            // Iterate selection cartesian and scatter
8670                            let mut err_opt: Option<String> = None;
8671                            // Local cartesian iterator
8672                            fn cartesian2<F: FnMut(&[usize])>(lists: &[Vec<usize>], mut f: F) {
8673                                let dims = lists.len();
8674                                let mut idx = vec![0usize; dims];
8675                                loop {
8676                                    let cur: Vec<usize> =
8677                                        (0..dims).map(|d| lists[d][idx[d]]).collect();
8678                                    f(&cur);
8679                                    let mut d = 0usize;
8680                                    while d < dims {
8681                                        idx[d] += 1;
8682                                        if idx[d] < lists[d].len() {
8683                                            break;
8684                                        }
8685                                        idx[d] = 0;
8686                                        d += 1;
8687                                    }
8688                                    if d == dims {
8689                                        break;
8690                                    }
8691                                }
8692                            }
                            cartesian2(&per_dim_indices, |multi| {
                                // Once an error is recorded, skip remaining positions.
                                if err_opt.is_some() {
                                    return;
                                }
                                // Linear (column-major) offset into the destination.
                                let mut lin = 0usize;
                                for d in 0..dims {
                                    let i0 = multi[d] - 1;
                                    lin += i0 * strides[d];
                                }
                                match &rhs_view {
                                    RhsView::Scalar(val) => t.data[lin] = *val,
                                    RhsView::Tensor {
                                        data,
                                        shape,
                                        strides: rstrides,
                                    } => {
                                        // Map the selection position to an RHS element,
                                        // broadcasting length-1 RHS dimensions.
                                        let mut rlin = 0usize;
                                        for d in 0..dims {
                                            let rhs_len = shape[d];
                                            let pos_in_dim = if rhs_len == 1 {
                                                0
                                            } else {
                                                *pos_maps[d].get(&multi[d]).unwrap_or(&0)
                                            };
                                            rlin += pos_in_dim * rstrides[d];
                                        }
                                        if rlin >= data.len() {
                                            err_opt =
                                                Some("shape mismatch for slice assign".to_string());
                                            return;
                                        }
                                        t.data[lin] = data[rlin];
                                    }
                                }
                            });
                            if let Some(e) = err_opt {
                                vm_bail!(e);
                            }
                            // Push the mutated host tensor back to the device and
                            // leave the fresh handle on the stack.
                            let view = runmat_accelerate_api::HostTensorView {
                                data: &t.data,
                                shape: &t.shape,
                            };
                            let new_h = provider
                                .upload(&view)
                                .map_err(|e| format!("reupload after range-end assign: {e}"))?;
                            stack.push(Value::GpuTensor(new_h));
                        }
                    }
                    Value::Object(obj) => {
                        // Object base: build a cell of per-dimension index descriptors
                        // and dispatch to the class's subsasgn via `call_method`.
                        let mut idx_values: Vec<Value> = Vec::with_capacity(dims);
                        let mut num_iter = 0usize; // cursor into `numeric`
                        let mut rp_iter = 0usize; // cursor into `range_params`
                        for d in 0..dims {
                            let is_colon = (colon_mask & (1u32 << d)) != 0;
                            let is_end = (end_mask & (1u32 << d)) != 0;
                            if is_colon {
                                // Colon is passed through symbolically as ":".
                                idx_values.push(Value::String(":".to_string()));
                                continue;
                            }
                            if is_end {
                                // Bare `end` is passed through symbolically.
                                idx_values.push(Value::String("end".to_string()));
                                continue;
                            }
                            // End-relative ranges are encoded as a 1x4 cell:
                            // {start, step, "end", offset}.
                            if let Some(pos) = range_dims.iter().position(|&rd| rd == d) {
                                let (st, sp) = range_params[rp_iter];
                                rp_iter += 1;
                                let off = end_offsets[pos];
                                let cell = runmat_builtins::CellArray::new(
                                    vec![
                                        Value::Num(st),
                                        Value::Num(sp),
                                        Value::String("end".to_string()),
                                        Value::Num(off as f64),
                                    ],
                                    1,
                                    4,
                                )
                                .map_err(|e| format!("obj range: {e}"))?;
                                idx_values.push(Value::Cell(cell));
                            } else {
                                // Plain numeric index: normalize ints to doubles,
                                // forward tensors as-is.
                                let v = numeric
                                    .get(num_iter)
                                    .ok_or(mex("MissingNumericIndex", "missing numeric index"))?;
                                num_iter += 1;
                                match v {
                                    Value::Num(n) => idx_values.push(Value::Num(*n)),
                                    Value::Int(i) => idx_values.push(Value::Num(i.to_f64())),
                                    Value::Tensor(t) => idx_values.push(Value::Tensor(t.clone())),
                                    other => {
                                        return Err(format!(
                                            "Unsupported index type for object: {other:?}"
                                        ))
                                    }
                                }
                            }
                        }
                        // subsasgn(obj, '()', {indices...}, rhs); the returned object
                        // becomes the assignment result on the stack.
                        let cell = runmat_builtins::CellArray::new(idx_values, 1, dims)
                            .map_err(|e| format!("subsasgn build error: {e}"))?;
                        match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsasgn".to_string()),
                                Value::String("()".to_string()),
                                Value::Cell(cell),
                                rhs,
                            ],
                        ) {
                            Ok(v) => stack.push(v),
                            Err(e) => vm_bail!(e),
                        }
                    }
                    _ => vm_bail!("StoreRangeEnd only supports tensors currently".to_string()),
                }
            }
8809            Instr::StoreSlice1DRangeEnd { has_step, offset } => {
8810                // RHS, then start[, step], then base
8811                let rhs = stack
8812                    .pop()
8813                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
8814                let step_val: f64 = if has_step {
8815                    let v: f64 = (&stack
8816                        .pop()
8817                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
8818                        .try_into()?;
8819                    v
8820                } else {
8821                    1.0
8822                };
8823                let start_val: f64 = (&stack
8824                    .pop()
8825                    .ok_or(mex("StackUnderflow", "stack underflow"))?)
8826                    .try_into()?;
8827                let base = stack
8828                    .pop()
8829                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
8830                #[cfg(feature = "native-accel")]
8831                clear_residency(&base);
8832                match base {
8833                    Value::Tensor(mut t) => {
8834                        let total = t.data.len();
8835                        let end_idx = (total as i64) - offset;
8836                        let mut cur = start_val as i64;
8837                        let step_i = if step_val >= 0.0 {
8838                            step_val as i64
8839                        } else {
8840                            -(step_val.abs() as i64)
8841                        };
8842                        if step_i == 0 {
8843                            return Err(mex("IndexStepZero", "Index step cannot be zero"));
8844                        }
8845                        // Broadcast rhs if scalar
8846                        let rhs_vals: Vec<f64> = match rhs {
8847                            Value::Num(n) => vec![n],
8848                            Value::Tensor(rt) => rt.data.clone(),
8849                            _ => vec![0.0],
8850                        };
8851                        let mut rpos = 0usize;
8852                        if step_i > 0 {
8853                            while cur as i64 <= end_idx {
8854                                let idx0 = cur as usize;
8855                                if idx0 == 0 || idx0 > total {
8856                                    break;
8857                                }
8858                                let v = rhs_vals
8859                                    .get(rpos)
8860                                    .cloned()
8861                                    .unwrap_or(*rhs_vals.last().unwrap_or(&0.0));
8862                                t.data[idx0 - 1] = v;
8863                                rpos += 1;
8864                                cur += step_i;
8865                            }
8866                        } else {
8867                            while (cur as i64) >= end_idx {
8868                                let idx0 = cur as usize;
8869                                if idx0 == 0 || idx0 > total {
8870                                    break;
8871                                }
8872                                let v = rhs_vals
8873                                    .get(rpos)
8874                                    .cloned()
8875                                    .unwrap_or(*rhs_vals.last().unwrap_or(&0.0));
8876                                t.data[idx0 - 1] = v;
8877                                rpos += 1;
8878                                cur += step_i;
8879                            }
8880                        }
8881                        stack.push(Value::Tensor(t));
8882                    }
8883                    _ => vm_bail!("Store range with end only supported on tensors".to_string()),
8884                }
8885            }
8886            Instr::CreateCell2D(rows, cols) => {
8887                let mut elems = Vec::with_capacity(rows * cols);
8888                for _ in 0..rows * cols {
8889                    elems.push(
8890                        stack
8891                            .pop()
8892                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
8893                    );
8894                }
8895                elems.reverse();
8896                let ca = runmat_builtins::CellArray::new(elems, rows, cols)
8897                    .map_err(|e| format!("Cell creation error: {e}"))?;
8898                stack.push(Value::Cell(ca));
8899            }
            Instr::IndexCell(num_indices) => {
                // Brace indexing `c{...}` producing a single value.
                // Pop indices first (in reverse), then base.
                let mut indices = Vec::with_capacity(num_indices);
                for _ in 0..num_indices {
                    let v: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    indices.push(v as usize);
                }
                indices.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match base {
                    Value::Object(obj) => {
                        // Route to subsref(obj, '{}', {indices})
                        let cell = runmat_runtime::call_builtin(
                            "__make_cell",
                            &indices
                                .iter()
                                .map(|n| Value::Num(*n as f64))
                                .collect::<Vec<_>>(),
                        )?;
                        match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsref".to_string()),
                                Value::String("{}".to_string()),
                                cell,
                            ],
                        ) {
                            Ok(v) => stack.push(v),
                            Err(e) => vm_bail!(e.to_string()),
                        }
                    }
                    Value::Cell(ca) => match indices.len() {
                        1 => {
                            // Single 1-based linear index into the cell's data.
                            let i = indices[0];
                            if i == 0 || i > ca.data.len() {
                                return Err(mex(
                                    "CellIndexOutOfBounds",
                                    "Cell index out of bounds",
                                ));
                            }
                            stack.push((*ca.data[i - 1]).clone());
                        }
                        2 => {
                            // (row, col) subscripts, both 1-based.
                            // NOTE(review): addressing is (r-1)*cols + (c-1), i.e.
                            // row-major storage — confirm against the column-major
                            // expansion comment in IndexCellExpand.
                            let r = indices[0];
                            let c = indices[1];
                            if r == 0 || r > ca.rows || c == 0 || c > ca.cols {
                                return Err(mex(
                                    "CellSubscriptOutOfBounds",
                                    "Cell subscript out of bounds",
                                ));
                            }
                            stack.push((*ca.data[(r - 1) * ca.cols + (c - 1)]).clone());
                        }
                        _ => return Err("Unsupported number of cell indices".to_string()),
                    },
                    _ => return Err("Cell indexing on non-cell".to_string()),
                }
            }
            Instr::IndexCellExpand(num_indices, out_count) => {
                // Same as IndexCell but flattens cell contents into `out_count`
                // stack outputs (comma-list expansion).
                let mut indices = Vec::with_capacity(num_indices);
                if num_indices > 0 {
                    for _ in 0..num_indices {
                        let v: f64 = (&stack
                            .pop()
                            .ok_or(mex("StackUnderflow", "stack underflow"))?)
                            .try_into()?;
                        indices.push(v as usize);
                    }
                    indices.reverse();
                }
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match base {
                    Value::Cell(ca) => {
                        // Collect the selected elements, then pad/truncate below.
                        let mut values: Vec<Value> = Vec::new();
                        if indices.is_empty() {
                            // `c{:}`: expand every element in storage order.
                            values.extend(ca.data.iter().map(|p| (*(*p)).clone()));
                        } else {
                            match indices.len() {
                                1 => {
                                    // 1-based linear index.
                                    let i = indices[0];
                                    if i == 0 || i > ca.data.len() {
                                        return Err(mex(
                                            "CellIndexOutOfBounds",
                                            "Cell index out of bounds",
                                        ));
                                    }
                                    values.push((*ca.data[i - 1]).clone());
                                }
                                2 => {
                                    // (row, col) subscripts; addressing is
                                    // (r-1)*cols + (c-1), matching IndexCell.
                                    let r = indices[0];
                                    let c = indices[1];
                                    if r == 0 || r > ca.rows || c == 0 || c > ca.cols {
                                        return Err(mex(
                                            "CellSubscriptOutOfBounds",
                                            "Cell subscript out of bounds",
                                        ));
                                    }
                                    values.push((*ca.data[(r - 1) * ca.cols + (c - 1)]).clone());
                                }
                                _ => return Err("Unsupported number of cell indices".to_string()),
                            }
                        }
                        // Pad or truncate to exactly out_count outputs.
                        if values.len() >= out_count {
                            for v in values.iter().take(out_count) {
                                stack.push(v.clone());
                            }
                        } else {
                            for v in &values {
                                stack.push(v.clone());
                            }
                            // Missing outputs are padded with numeric zero.
                            for _ in values.len()..out_count {
                                stack.push(Value::Num(0.0));
                            }
                        }
                    }
                    Value::Object(obj) => {
                        // Defer to subsref(obj, '{}', {indices}); a single value is
                        // returned and the remaining outputs are zero-padded.
                        let cell = runmat_runtime::call_builtin(
                            "__make_cell",
                            &indices
                                .iter()
                                .map(|n| Value::Num(*n as f64))
                                .collect::<Vec<_>>(),
                        )?;
                        let v = match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsref".to_string()),
                                Value::String("{}".to_string()),
                                cell,
                            ],
                        ) {
                            Ok(v) => v,
                            Err(e) => vm_bail!(e.to_string()),
                        };
                        // Push returned value and pad to out_count.
                        stack.push(v);
                        for _ in 1..out_count {
                            stack.push(Value::Num(0.0));
                        }
                    }
                    _ => return Err("Cell expansion on non-cell".to_string()),
                }
            }
            // Discard the top-of-stack value. An empty stack is tolerated:
            // the Option returned by pop() is intentionally ignored.
            Instr::Pop => {
                stack.pop();
            }
            // Stop interpreting and leave the function's result on the stack.
            // The pop-then-push round trip is a deliberate underflow check: it
            // raises StackUnderflow if there is no value to return, and
            // otherwise puts the same value straight back.
            Instr::ReturnValue => {
                let return_value = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                stack.push(return_value);
                interpreter_timing.flush_host_span("return_value", None);
                break;
            }
            // Stop interpreting with no result value; only flushes the
            // host-side timing span before exiting the dispatch loop.
            Instr::Return => {
                interpreter_timing.flush_host_span("return", None);
                break;
            }
            // Scalar indexed assignment: A(i) = rhs (linear) or A(i, j) = rhs.
            // Supported bases: Tensor (mutated in place), GpuTensor (gather,
            // mutate on host, re-upload) and Object (dispatched to subsasgn).
            // The updated container is pushed back onto the stack.
            Instr::StoreIndex(num_indices) => {
                // RHS to assign, then indices, then base
                // Debug snapshot of top-of-stack types before mutation
                // (enabled via RUNMAT_DEBUG_INDEX=1; inspects at most 6 values).
                #[allow(unused)]
                if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
                    let snap = stack
                        .iter()
                        .rev()
                        .take(6)
                        .map(|v| match v {
                            Value::Object(_) => "Object",
                            Value::Tensor(t) => {
                                eprintln!("StoreIndex pre-snap Tensor shape={:?}", t.shape);
                                "Tensor"
                            }
                            Value::GpuTensor(h) => {
                                eprintln!("StoreIndex pre-snap GpuTensor shape={:?}", h.shape);
                                "GpuTensor"
                            }
                            Value::Num(_) => "Num",
                            Value::Int(_) => "Int",
                            Value::String(_) => "String",
                            Value::Cell(_) => "Cell",
                            _ => "Other",
                        })
                        .collect::<Vec<_>>();
                    eprintln!("StoreIndex pre-snap pc={} stack_top_types={:?}", pc, snap);
                }
                let rhs = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // We will determine indices relative to the base location to avoid RHS temporaries interfering
                // Select the correct base: scan from top for the first assignable container (Object/Tensor/GpuTensor)
                let assignable = |v: &Value| {
                    matches!(v, Value::Object(_) | Value::Tensor(_) | Value::GpuTensor(_))
                };
                let base_idx_opt = (0..stack.len()).rev().find(|&j| assignable(&stack[j]));
                let base_pos = if let Some(j) = base_idx_opt {
                    j
                } else {
                    return Err("Index assignment only for tensors".to_string());
                };
                let base = stack.remove(base_pos);
                // Invalidate any cached GPU residency for the container we are
                // about to mutate so fused plans do not read stale data.
                #[cfg(feature = "native-accel")]
                clear_residency(&base);
                // Deterministically extract indices: take exactly `num_indices` numeric values
                // that were immediately above the base position.
                let mut indices: Vec<usize> = Vec::new();
                if num_indices > 0 {
                    let mut contiguous_ok = true;
                    if base_pos + num_indices > stack.len() {
                        contiguous_ok = false;
                    } else {
                        for k in 0..num_indices {
                            let idx_pos = base_pos + k;
                            match (&stack[idx_pos]).try_into() as Result<f64, _> {
                                Ok(v) => indices.push(v as usize),
                                Err(_) => {
                                    // Non-numeric slot where an index was expected:
                                    // abandon the deterministic path entirely.
                                    contiguous_ok = false;
                                    indices.clear();
                                    break;
                                }
                            }
                        }
                    }
                    if contiguous_ok {
                        // Remove the consumed index values from the stack (highest index first)
                        for k in (0..num_indices).rev() {
                            stack.remove(base_pos + k);
                        }
                    } else {
                        indices.clear();
                    }
                }
                // Determine expected bounds for fast validation
                // (GpuTensor bounds come from its shape vector; missing dims default to 1).
                let (rows_opt, cols_opt) = match &base {
                    Value::Tensor(t) => (Some(t.rows()), Some(t.cols())),
                    Value::GpuTensor(h) => (
                        Some(h.shape.first().copied().unwrap_or(1).max(1)),
                        Some(h.shape.get(1).copied().unwrap_or(1).max(1)),
                    ),
                    _ => (None, None),
                };
                // If deterministic path failed (unexpected stack form), fall back to nearest-fit heuristic
                if indices.is_empty() {
                    // Scan down from the top of the stack (at most 12 slots, stopping
                    // at the next assignable container) collecting numeric candidates.
                    // numeric_above[0] is the TOPMOST numeric value.
                    let mut numeric_above: Vec<(usize, usize)> = Vec::new(); // (stack_index, value)
                    let mut scan_limit = 12usize;
                    let mut kk = stack.len();
                    while kk > 0 && scan_limit > 0 {
                        let idx = kk - 1;
                        if assignable(&stack[idx]) {
                            break;
                        }
                        if let Ok(v) = (&stack[idx]).try_into() as Result<f64, _> {
                            numeric_above.push((idx, v as usize));
                        }
                        kk -= 1;
                        scan_limit -= 1;
                    }
                    if numeric_above.len() >= 2 {
                        // NOTE(review): numeric_above[w] is DEEPER in the stack than
                        // numeric_above[w - 1], yet the deeper value is bound to j (the
                        // column subscript). If the emitter pushes the row index before
                        // the column index, the deeper value would be the row — confirm
                        // this pairing against the bytecode emitter; the `fits` bounds
                        // check may be masking a swapped (i, j) here.
                        let mut picked: Option<((usize, usize), (usize, usize))> = None;
                        for w in (1..numeric_above.len()).rev() {
                            let (j_idx, j_val) = numeric_above[w];
                            let (i_idx, i_val) = numeric_above[w - 1];
                            let fits = match (rows_opt, cols_opt) {
                                (Some(r), Some(c)) => {
                                    i_val >= 1 && i_val <= r && j_val >= 1 && j_val <= c
                                }
                                _ => true,
                            };
                            if fits {
                                picked = Some(((i_idx, i_val), (j_idx, j_val)));
                                break;
                            }
                        }
                        if let Some(((i_idx, i_val), (j_idx, j_val))) = picked {
                            // Remove the two consumed slots highest-index first so the
                            // second removal's position is still valid.
                            let mut to_remove = [i_idx, j_idx];
                            to_remove.sort_unstable();
                            stack.remove(to_remove[1]);
                            stack.remove(to_remove[0]);
                            indices = vec![i_val, j_val];
                        }
                    } else if numeric_above.len() == 1 {
                        let (k_idx, k_val) = numeric_above[0];
                        stack.remove(k_idx);
                        indices = vec![k_val];
                    }
                }
                if indices.is_empty() {
                    return Err("Index assignment only for tensors".to_string());
                }
                // TODO(GC): write barrier hook if base is in older generation and rhs/indices reference younger objects
                match base {
                    Value::Object(obj) => {
                        // subsasgn(obj, '()', {indices...}, rhs)
                        let cell = runmat_runtime::call_builtin(
                            "__make_cell",
                            &indices
                                .iter()
                                .map(|n| Value::Num(*n as f64))
                                .collect::<Vec<_>>(),
                        )?;
                        match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsasgn".to_string()),
                                Value::String("()".to_string()),
                                cell,
                                rhs,
                            ],
                        ) {
                            Ok(v) => stack.push(v),
                            Err(e) => vm_bail!(e.to_string()),
                        }
                    }
                    Value::Tensor(mut t) => {
                        // Helper to coerce RHS to scalar f64, supporting 1x1 tensors and gpu tensors
                        // (a 1x1 GpuTensor is downloaded through the registered provider).
                        let rhs_to_scalar = |rhs: &Value| -> Result<f64, String> {
                            match rhs {
                                Value::Num(x) => Ok(*x),
                                Value::Tensor(t2) => {
                                    if t2.data.len() == 1 {
                                        Ok(t2.data[0])
                                    } else {
                                        Err("RHS must be scalar".to_string())
                                    }
                                }
                                Value::GpuTensor(h2) => {
                                    let total = h2.shape.iter().copied().product::<usize>();
                                    if total != 1 {
                                        return Err("RHS must be scalar".to_string());
                                    }
                                    if let Some(p) = runmat_accelerate_api::provider() {
                                        let host = p
                                            .download(h2)
                                            .map_err(|e| format!("gather rhs: {e}"))?;
                                        Ok(host.data[0])
                                    } else {
                                        Err("No acceleration provider registered".to_string())
                                    }
                                }
                                _ => rhs
                                    .try_into()
                                    .map_err(|_| "RHS must be numeric".to_string()),
                            }
                        };
                        // 1D linear or 2D scalar assignment only for now
                        if indices.len() == 1 {
                            // 1-based linear index into column-major storage.
                            let total = t.rows() * t.cols();
                            let idx = indices[0];
                            if idx == 0 || idx > total {
                                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                            }
                            let val: f64 = rhs_to_scalar(&rhs)?;
                            t.data[idx - 1] = val;
                            stack.push(Value::Tensor(t));
                        } else if indices.len() == 2 {
                            let i = indices[0];
                            let mut j = indices[1];
                            let rows = t.rows();
                            let cols = t.cols();
                            // Clamp column index within [1..cols] to accommodate end-offset semantics
                            // (the row index, by contrast, is strictly bounds-checked below).
                            if j == 0 {
                                j = 1;
                            }
                            if j > cols {
                                j = cols;
                            }
                            if i == 0 || i > rows {
                                if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
                                    eprintln!(
                                        "StoreIndex Tensor OOB: i={} j(clamped)={} rows={} cols={} shape={:?}",
                                        i, j, rows, cols, t.shape
                                    );
                                }
                                return Err(mex("SubscriptOutOfBounds", "Subscript out of bounds"));
                            }
                            let val: f64 = rhs_to_scalar(&rhs)?;
                            // Column-major linearization: element (i, j) lives at
                            // (i-1) + (j-1)*rows.
                            let idx = (i - 1) + (j - 1) * rows;
                            t.data[idx] = val;
                            stack.push(Value::Tensor(t));
                        } else {
                            return Err("Only 1D/2D scalar assignment supported".to_string());
                        }
                    }
                    Value::GpuTensor(h) => {
                        // Stage F1: gather–mutate–reupload for simple 1D/2D scalar assignments
                        let provider = runmat_accelerate_api::provider()
                            .ok_or_else(|| "No acceleration provider registered".to_string())?;
                        let host = provider
                            .download(&h)
                            .map_err(|e| format!("gather for assignment: {e}"))?;
                        let mut t = runmat_builtins::Tensor::new(host.data, host.shape)
                            .map_err(|e| format!("assignment: {e}"))?;
                        // Reuse same scalar coercion
                        let rhs_to_scalar = |rhs: &Value| -> Result<f64, String> {
                            match rhs {
                                Value::Num(x) => Ok(*x),
                                Value::Tensor(t2) => {
                                    if t2.data.len() == 1 {
                                        Ok(t2.data[0])
                                    } else {
                                        Err("RHS must be scalar".to_string())
                                    }
                                }
                                Value::GpuTensor(h2) => {
                                    let total = h2.shape.iter().copied().product::<usize>();
                                    if total != 1 {
                                        return Err("RHS must be scalar".to_string());
                                    }
                                    let host2 = provider
                                        .download(h2)
                                        .map_err(|e| format!("gather rhs: {e}"))?;
                                    Ok(host2.data[0])
                                }
                                _ => rhs
                                    .try_into()
                                    .map_err(|_| "RHS must be numeric".to_string()),
                            }
                        };
                        if indices.len() == 1 {
                            let total = t.rows() * t.cols();
                            let idx = indices[0];
                            if idx == 0 || idx > total {
                                return Err(mex("IndexOutOfBounds", "Index out of bounds"));
                            }
                            let val: f64 = rhs_to_scalar(&rhs)?;
                            t.data[idx - 1] = val;
                        } else if indices.len() == 2 {
                            let i = indices[0];
                            let mut j = indices[1];
                            let rows = t.rows();
                            let cols = t.cols();
                            // Clamp column index within [1..cols] to accommodate end-offset semantics
                            if j == 0 {
                                j = 1;
                            }
                            if j > cols {
                                j = cols;
                            }
                            if i == 0 || i > rows {
                                if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
                                    eprintln!(
                                        "StoreIndex GpuTensor OOB: i={} j(clamped)={} rows={} cols={} shape={:?}",
                                        i, j, rows, cols, t.shape
                                    );
                                }
                                return Err(mex("SubscriptOutOfBounds", "Subscript out of bounds"));
                            }
                            let val: f64 = rhs_to_scalar(&rhs)?;
                            let idx = (i - 1) + (j - 1) * rows;
                            t.data[idx] = val;
                        } else if indices.is_empty() {
                            // Trivial colon slice cases from parser may encode as zero indices; handle full-row/col scalar broadcast
                            // NOTE(review): this branch appears unreachable — empty
                            // `indices` is rejected with an error before this match on
                            // `base`. Retained defensively; confirm and consider removing.
                            let val: f64 = rhs_to_scalar(&rhs)?;
                            for k in 0..t.data.len() {
                                t.data[k] = val;
                            }
                        } else {
                            return Err("Only 1D/2D scalar assignment supported".to_string());
                        }
                        // Re-upload the mutated host copy; the new handle replaces the
                        // original GPU tensor on the stack.
                        let view = runmat_accelerate_api::HostTensorView {
                            data: &t.data,
                            shape: &t.shape,
                        };
                        let new_h = provider
                            .upload(&view)
                            .map_err(|e| format!("reupload after assignment: {e}"))?;
                        stack.push(Value::GpuTensor(new_h));
                    }
                    _ => {
                        // Unassignable base: emit a debug trace (opt-in) then error.
                        if std::env::var("RUNMAT_DEBUG_INDEX").as_deref() == Ok("1") {
                            let kind = |v: &Value| match v {
                                Value::Object(_) => "Object",
                                Value::Tensor(_) => "Tensor",
                                Value::GpuTensor(_) => "GpuTensor",
                                Value::Num(_) => "Num",
                                Value::Int(_) => "Int",
                                _ => "Other",
                            };
                            eprintln!(
                                "StoreIndex default-branch pc={} base_kind={} rhs_kind={} indices={:?}",
                                pc,
                                kind(&base),
                                kind(&rhs),
                                indices
                            );
                        }
                        return Err("Index assignment only for tensors".to_string());
                    }
                }
            }
            // Cell-content assignment: C{i} = rhs or C{i, j} = rhs, plus
            // subsasgn('{}') dispatch when the base is a class object.
            // Pushes the updated cell (or the subsasgn result) back.
            Instr::StoreIndexCell(num_indices) => {
                // RHS, then indices, then base cell
                let rhs = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // Indices were pushed left-to-right, so popping yields them in
                // reverse; collect then reverse to restore source order.
                let mut indices = Vec::new();
                for _ in 0..num_indices {
                    let v: f64 = (&stack
                        .pop()
                        .ok_or(mex("StackUnderflow", "stack underflow"))?)
                        .try_into()?;
                    indices.push(v as usize);
                }
                indices.reverse();
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // Drop any cached GPU residency for the mutated container.
                #[cfg(feature = "native-accel")]
                clear_residency(&base);
                // TODO(GC): write barrier hook for cell element updates
                match base {
                    Value::Object(obj) => {
                        // subsasgn(obj, '{}', {indices}, rhs)
                        // Indices are packed into a 1xN cell of Nums for the call.
                        let cell = runmat_builtins::CellArray::new(
                            indices.iter().map(|n| Value::Num(*n as f64)).collect(),
                            1,
                            indices.len(),
                        )
                        .map_err(|e| format!("subsasgn build error: {e}"))?;
                        match runmat_runtime::call_builtin(
                            "call_method",
                            &[
                                Value::Object(obj),
                                Value::String("subsasgn".to_string()),
                                Value::String("{}".to_string()),
                                Value::Cell(cell),
                                rhs,
                            ],
                        ) {
                            Ok(v) => stack.push(v),
                            Err(e) => vm_bail!(e.to_string()),
                        }
                    }
                    Value::Cell(mut ca) => match indices.len() {
                        1 => {
                            // 1-based linear index into the cell's backing vector.
                            let i = indices[0];
                            if i == 0 || i > ca.data.len() {
                                return Err(mex(
                                    "CellIndexOutOfBounds",
                                    "Cell index out of bounds",
                                ));
                            }
                            // Generational-GC write barrier: record the old->new
                            // pointer update before overwriting the slot.
                            if let Some(oldv) = ca.data.get(i - 1) {
                                runmat_gc::gc_record_write(oldv, &rhs);
                            }
                            *ca.data[i - 1] = rhs;
                            stack.push(Value::Cell(ca));
                        }
                        2 => {
                            let i = indices[0];
                            let j = indices[1];
                            if i == 0 || i > ca.rows || j == 0 || j > ca.cols {
                                return Err(mex(
                                    "CellSubscriptOutOfBounds",
                                    "Cell subscript out of bounds",
                                ));
                            }
                            // NOTE(review): row-major linearization here, whereas the
                            // tensor paths in StoreIndex are column-major — this assumes
                            // CellArray stores rows contiguously; confirm against
                            // CellArray's layout.
                            let lin = (i - 1) * ca.cols + (j - 1);
                            if let Some(oldv) = ca.data.get(lin) {
                                runmat_gc::gc_record_write(oldv, &rhs);
                            }
                            *ca.data[lin] = rhs;
                            stack.push(Value::Cell(ca));
                        }
                        _ => return Err("Unsupported number of cell indices".to_string()),
                    },
                    _ => return Err("Cell assignment on non-cell".to_string()),
                }
            }
            // Static member read: base.field where `field` is known at compile
            // time. Handles class objects (property access-control, dependent
            // getters, subsref fallback), structs, and cells of structs.
            Instr::LoadMember(field) => {
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                match base {
                    Value::Object(obj) => {
                        // Resolution order for objects:
                        //   1) declared-property gates: static -> error,
                        //      private get-access -> error, dependent -> getter;
                        //   2) instance property map;
                        //   3) dependent property's "<field>_backing" slot;
                        //   4) class-level subsref('.') overload;
                        //   5) error.
                        if let Some((p, _owner)) =
                            runmat_builtins::lookup_property(&obj.class_name, &field)
                        {
                            if p.is_static {
                                vm_bail!(format!(
                                    "Property '{}' is static; use classref('{}').{}",
                                    field, obj.class_name, field
                                ));
                            }
                            if p.get_access == runmat_builtins::Access::Private {
                                vm_bail!(format!("Property '{}' is private", field))
                            }
                            if p.is_dependent {
                                // Call get.<field>(obj)
                                let getter = format!("get.{field}");
                                match runmat_runtime::call_builtin(
                                    &getter,
                                    &[Value::Object(obj.clone())],
                                ) {
                                    Ok(v) => {
                                        stack.push(v);
                                        continue;
                                    }
                                    // Best-effort: a failing getter falls through to
                                    // the instance/backing lookups below.
                                    Err(_e) => {}
                                }
                            }
                        }
                        if let Some(v) = obj.properties.get(&field) {
                            stack.push(v.clone());
                        } else if let Some((p2, _)) =
                            runmat_builtins::lookup_property(&obj.class_name, &field)
                        {
                            // NOTE(review): if the property is declared but absent from
                            // the instance map and is not dependent (or has no backing
                            // value), this branch pushes nothing and raises no error —
                            // confirm whether a default value or an error was intended.
                            if p2.is_dependent {
                                let backing = format!("{field}_backing");
                                if let Some(vb) = obj.properties.get(&backing) {
                                    stack.push(vb.clone());
                                    continue;
                                }
                            }
                        } else if let Some(cls) = runmat_builtins::get_class(&obj.class_name) {
                            if cls.methods.contains_key("subsref") {
                                match runmat_runtime::call_builtin(
                                    "call_method",
                                    &[
                                        Value::Object(obj),
                                        Value::String("subsref".to_string()),
                                        Value::String(".".to_string()),
                                        Value::String(field),
                                    ],
                                ) {
                                    Ok(v) => stack.push(v),
                                    Err(e) => vm_bail!(e.to_string()),
                                }
                            } else {
                                vm_bail!(format!(
                                    "Undefined property '{}' for class {}",
                                    field, obj.class_name
                                ));
                            }
                        } else {
                            vm_bail!(format!("Unknown class {}", obj.class_name));
                        }
                    }
                    Value::Struct(st) => {
                        // Plain struct field lookup; missing field is an error.
                        if let Some(v) = st.fields.get(&field) {
                            stack.push(v.clone());
                        } else {
                            vm_bail!(format!("Undefined field '{}'", field));
                        }
                    }
                    Value::Cell(ca) => {
                        // Extract field from each struct element; build a cell with same shape
                        // Non-struct elements pass through unchanged; structs missing
                        // the field contribute 0.0 placeholders.
                        let mut out: Vec<Value> = Vec::with_capacity(ca.data.len());
                        for v in &ca.data {
                            match &**v {
                                Value::Struct(st) => {
                                    if let Some(fv) = st.fields.get(&field) {
                                        out.push(fv.clone());
                                    } else {
                                        out.push(Value::Num(0.0));
                                    }
                                }
                                other => {
                                    out.push(other.clone());
                                }
                            }
                        }
                        let new_cell = runmat_builtins::CellArray::new(out, ca.rows, ca.cols)
                            .map_err(|e| format!("cell field gather: {e}"))?;
                        stack.push(Value::Cell(new_cell));
                    }
                    _ => vm_bail!("LoadMember on non-object".to_string()),
                }
            }
            // Dynamic member read: base.(expr) — the field name is computed at
            // runtime and sits on the stack above the base value.
            // NOTE(review): unlike the static LoadMember path, this arm does not
            // try dependent-property getters or the "<name>_backing" fallback —
            // confirm whether that asymmetry is intentional.
            Instr::LoadMemberDynamic => {
                let name_val = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                let base = stack
                    .pop()
                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
                // Coerce the computed name to a String (errors if not string-like).
                let name: String = (&name_val).try_into()?;
                match base {
                    Value::Object(obj) => {
                        // Enforce access control from the declared property, if any.
                        if let Some((p, _owner)) =
                            runmat_builtins::lookup_property(&obj.class_name, &name)
                        {
                            if p.is_static {
                                vm_bail!(format!(
                                    "Property '{}' is static; use classref('{}').{}",
                                    name, obj.class_name, name
                                ));
                            }
                            if p.get_access == runmat_builtins::Access::Private {
                                vm_bail!(format!("Property '{}' is private", name))
                            }
                        }
                        // Instance property map first, then a subsref('.') overload.
                        if let Some(v) = obj.properties.get(&name) {
                            stack.push(v.clone());
                        } else if let Some(cls) = runmat_builtins::get_class(&obj.class_name) {
                            if cls.methods.contains_key("subsref") {
                                match runmat_runtime::call_builtin(
                                    "call_method",
                                    &[
                                        Value::Object(obj),
                                        Value::String("subsref".to_string()),
                                        Value::String(".".to_string()),
                                        Value::String(name),
                                    ],
                                ) {
                                    Ok(v) => stack.push(v),
                                    Err(e) => vm_bail!(e.to_string()),
                                }
                            } else {
                                vm_bail!(format!(
                                    "Undefined property '{}' for class {}",
                                    name, obj.class_name
                                ));
                            }
                        } else {
                            vm_bail!(format!("Unknown class {}", obj.class_name));
                        }
                    }
                    Value::Struct(st) => {
                        // Plain struct dynamic-field lookup; missing field errors.
                        if let Some(v) = st.fields.get(&name) {
                            stack.push(v.clone());
                        } else {
                            vm_bail!(format!("Undefined field '{}'", name));
                        }
                    }
                    _ => vm_bail!("LoadMemberDynamic on non-struct/object".to_string()),
                }
            }
9643            Instr::StoreMember(field) => {
9644                let rhs = stack
9645                    .pop()
9646                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9647                let base = stack
9648                    .pop()
9649                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9650                // TODO(GC): write barrier hook for object/struct field write
9651                match base {
9652                    Value::Object(mut obj) => {
9653                        if let Some((p, _owner)) =
9654                            runmat_builtins::lookup_property(&obj.class_name, &field)
9655                        {
9656                            if p.is_static {
9657                                vm_bail!(format!(
9658                                    "Property '{}' is static; use classref('{}').{}",
9659                                    field, obj.class_name, field
9660                                ));
9661                            }
9662                            if p.set_access == runmat_builtins::Access::Private {
9663                                vm_bail!(format!("Property '{}' is private", field))
9664                            }
9665                            if p.is_dependent {
9666                                // Call set.<field>(obj, rhs)
9667                                let setter = format!("set.{field}");
9668                                match runmat_runtime::call_builtin(
9669                                    &setter,
9670                                    &[Value::Object(obj.clone()), rhs.clone()],
9671                                ) {
9672                                    Ok(v) => {
9673                                        stack.push(v);
9674                                        continue;
9675                                    }
9676                                    Err(_e) => {}
9677                                }
9678                            }
9679                            if let Some(oldv) = obj.properties.get(&field) {
9680                                runmat_gc::gc_record_write(oldv, &rhs);
9681                            }
9682                            obj.properties.insert(field, rhs);
9683                            stack.push(Value::Object(obj));
9684                        } else if let Some(cls) = runmat_builtins::get_class(&obj.class_name) {
9685                            if cls.methods.contains_key("subsasgn") {
9686                                match runmat_runtime::call_builtin(
9687                                    "call_method",
9688                                    &[
9689                                        Value::Object(obj),
9690                                        Value::String("subsasgn".to_string()),
9691                                        Value::String(".".to_string()),
9692                                        Value::String(field),
9693                                        rhs,
9694                                    ],
9695                                ) {
9696                                    Ok(v) => stack.push(v),
9697                                    Err(e) => vm_bail!(e),
9698                                }
9699                            } else {
9700                                vm_bail!(format!(
9701                                    "Undefined property '{}' for class {}",
9702                                    field, obj.class_name
9703                                ));
9704                            }
9705                        } else {
9706                            vm_bail!(format!("Unknown class {}", obj.class_name));
9707                        }
9708                    }
9709                    Value::ClassRef(cls) => {
9710                        if let Some((p, owner)) = runmat_builtins::lookup_property(&cls, &field) {
9711                            if !p.is_static {
9712                                vm_bail!(format!("Property '{}' is not static", field));
9713                            }
9714                            if p.set_access == runmat_builtins::Access::Private {
9715                                vm_bail!(format!("Property '{}' is private", field))
9716                            }
9717                            runmat_builtins::set_static_property_value_in_owner(
9718                                &owner, &field, rhs,
9719                            )?;
9720                            stack.push(Value::ClassRef(cls));
9721                        } else {
9722                            vm_bail!(format!("Unknown property '{}' on class {}", field, cls));
9723                        }
9724                    }
9725                    Value::Struct(mut st) => {
9726                        if let Some(oldv) = st.fields.get(&field) {
9727                            runmat_gc::gc_record_write(oldv, &rhs);
9728                        }
9729                        st.fields.insert(field, rhs);
9730                        stack.push(Value::Struct(st));
9731                    }
9732                    Value::Cell(mut ca) => {
9733                        // Assign field across each element; support scalar rhs or cell rhs of same shape
9734                        let is_cell_rhs = matches!(rhs, Value::Cell(_));
9735                        let rhs_cell = if let Value::Cell(rc) = &rhs {
9736                            Some(rc)
9737                        } else {
9738                            None
9739                        };
9740                        if is_cell_rhs {
9741                            if let Some(rc) = rhs_cell {
9742                                if rc.rows != ca.rows || rc.cols != ca.cols {
9743                                    vm_bail!(
9744                                        "Field assignment: cell rhs shape mismatch".to_string()
9745                                    );
9746                                }
9747                            }
9748                        }
9749                        for i in 0..ca.data.len() {
9750                            let rv = if let Some(rc) = rhs_cell {
9751                                (*rc.data[i]).clone()
9752                            } else {
9753                                rhs.clone()
9754                            };
9755                            match &mut *ca.data[i] {
9756                                Value::Struct(st) => {
9757                                    if let Some(oldv) = st.fields.get(&field) {
9758                                        runmat_gc::gc_record_write(oldv, &rv);
9759                                    }
9760                                    st.fields.insert(field.clone(), rv);
9761                                }
9762                                other => {
9763                                    // If not struct, convert to struct with this single field
9764                                    let mut st = runmat_builtins::StructValue::new();
9765                                    st.fields.insert(field.clone(), rv);
9766                                    *other = Value::Struct(st);
9767                                }
9768                            }
9769                        }
9770                        stack.push(Value::Cell(ca));
9771                    }
9772                    _ => vm_bail!("StoreMember on non-object".to_string()),
9773                }
9774            }
9775            Instr::StoreMemberDynamic => {
9776                let rhs = stack
9777                    .pop()
9778                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9779                let name_val = stack
9780                    .pop()
9781                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9782                let base = stack
9783                    .pop()
9784                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9785                let name: String = (&name_val).try_into()?;
9786                // TODO(GC): write barrier hook for dynamic field write
9787                match base {
9788                    Value::Object(mut obj) => {
9789                        if let Some((p, _owner)) =
9790                            runmat_builtins::lookup_property(&obj.class_name, &name)
9791                        {
9792                            if p.is_static {
9793                                vm_bail!(format!(
9794                                    "Property '{}' is static; use classref('{}').{}",
9795                                    name, obj.class_name, name
9796                                ));
9797                            }
9798                            if p.set_access == runmat_builtins::Access::Private {
9799                                vm_bail!(format!("Property '{}' is private", name))
9800                            }
9801                        }
9802                        if let Some(oldv) = obj.properties.get(&name) {
9803                            runmat_gc::gc_record_write(oldv, &rhs);
9804                        }
9805                        obj.properties.insert(name, rhs);
9806                        stack.push(Value::Object(obj));
9807                    }
9808                    Value::Struct(mut st) => {
9809                        if let Some(oldv) = st.fields.get(&name) {
9810                            runmat_gc::gc_record_write(oldv, &rhs);
9811                        }
9812                        st.fields.insert(name, rhs);
9813                        stack.push(Value::Struct(st));
9814                    }
9815                    Value::Cell(mut ca) => {
9816                        let is_cell_rhs = matches!(rhs, Value::Cell(_));
9817                        let rhs_cell = if let Value::Cell(rc) = &rhs {
9818                            Some(rc)
9819                        } else {
9820                            None
9821                        };
9822                        if is_cell_rhs {
9823                            if let Some(rc) = rhs_cell {
9824                                if rc.rows != ca.rows || rc.cols != ca.cols {
9825                                    vm_bail!(
9826                                        "Field assignment: cell rhs shape mismatch".to_string()
9827                                    );
9828                                }
9829                            }
9830                        }
9831                        for i in 0..ca.data.len() {
9832                            let rv = if let Some(rc) = rhs_cell {
9833                                (*rc.data[i]).clone()
9834                            } else {
9835                                rhs.clone()
9836                            };
9837                            match &mut *ca.data[i] {
9838                                Value::Struct(st) => {
9839                                    if let Some(oldv) = st.fields.get(&name) {
9840                                        runmat_gc::gc_record_write(oldv, &rv);
9841                                    }
9842                                    st.fields.insert(name.clone(), rv);
9843                                }
9844                                other => {
9845                                    let mut st = runmat_builtins::StructValue::new();
9846                                    st.fields.insert(name.clone(), rv);
9847                                    *other = Value::Struct(st);
9848                                }
9849                            }
9850                        }
9851                        stack.push(Value::Cell(ca));
9852                    }
9853                    _ => vm_bail!("StoreMemberDynamic on non-struct/object".to_string()),
9854                }
9855            }
9856            Instr::CallMethod(name, arg_count) => {
9857                // base, then args are on stack in order: [..., base, a1, a2, ...]
9858                let mut args = Vec::with_capacity(arg_count);
9859                for _ in 0..arg_count {
9860                    args.push(
9861                        stack
9862                            .pop()
9863                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
9864                    );
9865                }
9866                args.reverse();
9867                let base = stack
9868                    .pop()
9869                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9870                match base {
9871                    Value::Object(obj) => {
9872                        // Compose qualified and try runtime builtin dispatch, passing receiver first
9873                        if let Some((m, _owner)) =
9874                            runmat_builtins::lookup_method(&obj.class_name, &name)
9875                        {
9876                            if m.is_static {
9877                                vm_bail!(format!(
9878                                    "Method '{}' is static; use classref({}).{}",
9879                                    name, obj.class_name, name
9880                                ));
9881                            }
9882                            if m.access == runmat_builtins::Access::Private {
9883                                vm_bail!(format!("Method '{}' is private", name))
9884                            }
9885                            let mut full_args = Vec::with_capacity(1 + args.len());
9886                            full_args.push(Value::Object(obj));
9887                            full_args.extend(args.into_iter());
9888                            let v = runmat_runtime::call_builtin(&m.function_name, &full_args)?;
9889                            stack.push(v);
9890                            continue;
9891                        }
9892                        let qualified = format!("{}.{}", obj.class_name, name);
9893                        let mut full_args = Vec::with_capacity(1 + args.len());
9894                        full_args.push(Value::Object(obj));
9895                        full_args.extend(args.into_iter());
9896                        if let Ok(v) = runmat_runtime::call_builtin(&qualified, &full_args) {
9897                            stack.push(v);
9898                        } else {
9899                            match runmat_runtime::call_builtin(&name, &full_args) {
9900                                Ok(v) => {
9901                                    stack.push(v);
9902                                }
9903                                Err(e) => {
9904                                    vm_bail!(e);
9905                                }
9906                            }
9907                        }
9908                    }
9909                    _ => vm_bail!("CallMethod on non-object".to_string()),
9910                }
9911            }
9912            Instr::LoadMethod(name) => {
9913                // Base object on stack; return a closure that calls the method with receiver as first captured arg
9914                let base = stack
9915                    .pop()
9916                    .ok_or(mex("StackUnderflow", "stack underflow"))?;
9917                match base {
9918                    Value::Object(obj) => {
9919                        let func_qual = format!("{}.{}", obj.class_name, name);
9920                        stack.push(Value::Closure(runmat_builtins::Closure {
9921                            function_name: func_qual,
9922                            captures: vec![Value::Object(obj)],
9923                        }));
9924                    }
9925                    Value::ClassRef(cls) => {
9926                        // Bound static method handle (no receiver capture), resolve via inheritance
9927                        if let Some((m, _owner)) = runmat_builtins::lookup_method(&cls, &name) {
9928                            if !m.is_static {
9929                                vm_bail!(format!("Method '{}' is not static", name));
9930                            }
9931                            stack.push(Value::Closure(runmat_builtins::Closure {
9932                                function_name: m.function_name,
9933                                captures: vec![],
9934                            }));
9935                        } else {
9936                            vm_bail!(format!("Unknown static method '{}' on class {}", name, cls));
9937                        }
9938                    }
9939                    _ => vm_bail!("LoadMethod requires object or classref".to_string()),
9940                }
9941            }
9942            Instr::CreateClosure(func_name, capture_count) => {
9943                let mut captures = Vec::with_capacity(capture_count);
9944                for _ in 0..capture_count {
9945                    captures.push(
9946                        stack
9947                            .pop()
9948                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
9949                    );
9950                }
9951                captures.reverse();
9952                stack.push(Value::Closure(runmat_builtins::Closure {
9953                    function_name: func_name,
9954                    captures,
9955                }));
9956            }
9957            Instr::LoadStaticProperty(class_name, prop) => {
9958                // Enforce access and static-ness via registry (with inheritance)
9959                if let Some((p, owner)) = runmat_builtins::lookup_property(&class_name, &prop) {
9960                    if !p.is_static {
9961                        vm_bail!(format!("Property '{}' is not static", prop));
9962                    }
9963                    if p.get_access == runmat_builtins::Access::Private {
9964                        vm_bail!(format!("Property '{}' is private", prop))
9965                    }
9966                    if let Some(v) = runmat_builtins::get_static_property_value(&owner, &prop) {
9967                        stack.push(v);
9968                    } else if let Some(v) = &p.default_value {
9969                        stack.push(v.clone());
9970                    } else {
9971                        stack.push(Value::Num(0.0));
9972                    }
9973                } else {
9974                    vm_bail!(format!(
9975                        "Unknown property '{}' on class {}",
9976                        prop, class_name
9977                    ));
9978                }
9979            }
9980            Instr::CallStaticMethod(class_name, method, arg_count) => {
9981                let mut args = Vec::with_capacity(arg_count);
9982                for _ in 0..arg_count {
9983                    args.push(
9984                        stack
9985                            .pop()
9986                            .ok_or(mex("StackUnderflow", "stack underflow"))?,
9987                    );
9988                }
9989                args.reverse();
9990                if let Some((m, _owner)) = runmat_builtins::lookup_method(&class_name, &method) {
9991                    if !m.is_static {
9992                        vm_bail!(format!("Method '{}' is not static", method));
9993                    }
9994                    if m.access == runmat_builtins::Access::Private {
9995                        vm_bail!(format!("Method '{}' is private", method))
9996                    }
9997                    let v = match runmat_runtime::call_builtin(&m.function_name, &args) {
9998                        Ok(v) => v,
9999                        Err(e) => vm_bail!(e),
10000                    };
10001                    stack.push(v);
10002                } else {
10003                    vm_bail!(format!(
10004                        "Unknown static method '{}' on class {}",
10005                        method, class_name
10006                    ));
10007                }
10008            }
10009            Instr::RegisterClass {
10010                name,
10011                super_class,
10012                properties,
10013                methods,
10014            } => {
10015                // Build a minimal ClassDef and register it in runtime builtins registry
10016                let mut prop_map = std::collections::HashMap::new();
10017                for (p, is_static, get_access, set_access) in properties {
10018                    let gacc = if get_access.eq_ignore_ascii_case("private") {
10019                        runmat_builtins::Access::Private
10020                    } else {
10021                        runmat_builtins::Access::Public
10022                    };
10023                    let sacc = if set_access.eq_ignore_ascii_case("private") {
10024                        runmat_builtins::Access::Private
10025                    } else {
10026                        runmat_builtins::Access::Public
10027                    };
10028                    let (is_dep, clean_name) = if let Some(stripped) = p.strip_prefix("@dep:") {
10029                        (true, stripped.to_string())
10030                    } else {
10031                        (false, p.clone())
10032                    };
10033                    prop_map.insert(
10034                        clean_name.clone(),
10035                        runmat_builtins::PropertyDef {
10036                            name: clean_name,
10037                            is_static,
10038                            is_dependent: is_dep,
10039                            get_access: gacc,
10040                            set_access: sacc,
10041                            default_value: None,
10042                        },
10043                    );
10044                }
10045                let mut method_map = std::collections::HashMap::new();
10046                for (mname, fname, is_static, access) in methods {
10047                    let access = if access.eq_ignore_ascii_case("private") {
10048                        runmat_builtins::Access::Private
10049                    } else {
10050                        runmat_builtins::Access::Public
10051                    };
10052                    method_map.insert(
10053                        mname.clone(),
10054                        runmat_builtins::MethodDef {
10055                            name: mname,
10056                            is_static,
10057                            access,
10058                            function_name: fname,
10059                        },
10060                    );
10061                }
10062                let def = runmat_builtins::ClassDef {
10063                    name: name.clone(),
10064                    parent: super_class.clone(),
10065                    properties: prop_map,
10066                    methods: method_map,
10067                };
10068                runmat_builtins::register_class(def);
10069            }
10070        }
10071        if debug_stack {
10072            eprintln!("After exec pc={} stack_len={}", pc, stack.len());
10073        }
10074        pc += 1;
10075    }
10076    interpreter_timing.flush_host_span("loop_complete", None);
10077    for (i, var) in vars.iter().enumerate() {
10078        if i < initial_vars.len() {
10079            initial_vars[i] = var.clone();
10080        }
10081    }
10082    Ok(vars)
10083}
10084
10085fn stochastic_evolution_dispatch(
10086    state: Value,
10087    drift: Value,
10088    scale: Value,
10089    steps: Value,
10090) -> Result<Value, String> {
10091    let steps_u32 = parse_steps_value(&steps)?;
10092    if steps_u32 == 0 {
10093        return Ok(state);
10094    }
10095
10096    #[cfg(feature = "native-accel")]
10097    {
10098        if let Some(provider) = runmat_accelerate_api::provider() {
10099            let (state_handle, state_owned) = ensure_gpu_tensor_for_stochastic(provider, &state)?;
10100            let drift_scalar = scalar_from_value_scalar(&drift, "stochastic_evolution drift")?;
10101            let scale_scalar = scalar_from_value_scalar(&scale, "stochastic_evolution scale")?;
10102            let output = provider
10103                .stochastic_evolution(&state_handle, drift_scalar, scale_scalar, steps_u32)
10104                .map_err(|e| format!("stochastic_evolution: {e}"))?;
10105            if let Some(temp) = state_owned {
10106                let _ = provider.free(&temp);
10107            }
10108            fusion_residency::mark(&output);
10109            return Ok(Value::GpuTensor(output));
10110        }
10111    }
10112
10113    let gathered_state =
10114        gather_if_needed(&state).map_err(|e| format!("stochastic_evolution: {e}"))?;
10115    let mut tensor_value = match gathered_state {
10116        Value::Tensor(t) => t,
10117        other => tensor::value_into_tensor_for("stochastic_evolution", other)?,
10118    };
10119    let drift_scalar = scalar_from_value_scalar(&drift, "stochastic_evolution drift")?;
10120    let scale_scalar = scalar_from_value_scalar(&scale, "stochastic_evolution scale")?;
10121    stochastic_evolution_host(&mut tensor_value, drift_scalar, scale_scalar, steps_u32)?;
10122    Ok(Value::Tensor(tensor_value))
10123}
10124
10125fn scalar_from_value_scalar(value: &Value, label: &str) -> Result<f64, String> {
10126    match value {
10127        Value::Num(n) => Ok(*n),
10128        Value::Int(i) => Ok(i.to_f64()),
10129        Value::Tensor(t) if t.data.len() == 1 => Ok(t.data[0]),
10130        Value::Tensor(t) => Err(format!(
10131            "{label}: expected scalar tensor, got {} elements",
10132            t.data.len()
10133        )),
10134        Value::GpuTensor(_) => {
10135            let gathered = gather_if_needed(value).map_err(|e| format!("{label}: {e}"))?;
10136            scalar_from_value_scalar(&gathered, label)
10137        }
10138        other => Err(format!("{label}: expected numeric scalar, got {:?}", other)),
10139    }
10140}
10141
10142fn parse_steps_value(value: &Value) -> Result<u32, String> {
10143    let raw = scalar_from_value_scalar(value, "stochastic_evolution steps")?;
10144    if !raw.is_finite() || raw < 0.0 {
10145        return Err("stochastic_evolution: steps must be a non-negative scalar".to_string());
10146    }
10147    Ok(raw.round() as u32)
10148}
10149
/// Obtain a device handle for `value`, uploading it to the GPU if necessary.
///
/// Returns `(handle, owned)`: `owned` is `Some` when this call uploaded a
/// temporary tensor that the caller must free after use, and `None` when the
/// value was already resident on the device.
#[cfg(feature = "native-accel")]
fn ensure_gpu_tensor_for_stochastic(
    provider: &dyn runmat_accelerate_api::AccelProvider,
    value: &Value,
) -> Result<
    (
        runmat_accelerate_api::GpuTensorHandle,
        Option<runmat_accelerate_api::GpuTensorHandle>,
    ),
    String,
> {
    // Already on the device: nothing to upload, nothing for the caller to free.
    if let Value::GpuTensor(handle) = value {
        return Ok((handle.clone(), None));
    }
    let uploaded = match value {
        Value::Tensor(tensor) => upload_tensor_view(provider, tensor)?,
        _ => {
            // Gather to the host first, then coerce to a tensor if needed.
            let host =
                gather_if_needed(value).map_err(|e| format!("stochastic_evolution: {e}"))?;
            match host {
                Value::Tensor(t) => upload_tensor_view(provider, &t)?,
                other => {
                    let tensor = tensor::value_into_tensor_for("stochastic_evolution", other)?;
                    upload_tensor_view(provider, &tensor)?
                }
            }
        }
    };
    // Every non-resident path produced a temporary upload the caller owns.
    Ok((uploaded.clone(), Some(uploaded)))
}
10184
/// Upload a host tensor to the device via a borrowed view (no copy on the
/// host side) and return the resulting device handle.
#[cfg(feature = "native-accel")]
fn upload_tensor_view(
    provider: &dyn runmat_accelerate_api::AccelProvider,
    tensor: &runmat_builtins::Tensor,
) -> Result<runmat_accelerate_api::GpuTensorHandle, String> {
    provider
        .upload(&runmat_accelerate_api::HostTensorView {
            data: tensor.data.as_slice(),
            shape: tensor.shape.as_slice(),
        })
        .map_err(|e| e.to_string())
}
10196
/// Short static tag naming a `Value`'s variant, used in fusion debug output.
#[cfg(feature = "native-accel")]
#[inline]
fn value_kind(v: &Value) -> &'static str {
    match v {
        // Scalars.
        Value::Int(_) => "Int",
        Value::Num(_) => "Num",
        Value::Complex(_, _) => "Complex",
        Value::Bool(_) => "Bool",
        // Text.
        Value::String(_) => "String",
        Value::StringArray(_) => "StringArray",
        Value::CharArray(_) => "CharArray",
        // Arrays.
        Value::LogicalArray(_) => "LogicalArray",
        Value::Tensor(_) => "Tensor",
        Value::ComplexTensor(_) => "ComplexTensor",
        Value::GpuTensor(_) => "GpuTensor",
        // Containers.
        Value::Cell(_) => "Cell",
        Value::Struct(_) => "Struct",
        // Objects and handles.
        Value::Object(_) => "Object",
        Value::HandleObject(_) => "HandleObject",
        Value::Listener(_) => "Listener",
        Value::FunctionHandle(_) => "FunctionHandle",
        Value::Closure(_) => "Closure",
        Value::ClassRef(_) => "ClassRef",
        Value::MException(_) => "MException",
    }
}
/// One-line summary of fusion input `i` for debug logs: tensors report their
/// shape, strings their contents, everything else just its variant tag.
#[cfg(feature = "native-accel")]
#[inline]
fn summarize_value(i: usize, v: &Value) -> String {
    match v {
        Value::GpuTensor(handle) => format!("in#{i}:GpuTensor shape={:?}", handle.shape),
        Value::Tensor(tensor) => format!("in#{i}:Tensor shape={:?}", tensor.shape),
        Value::String(text) => format!("in#{i}:String({text})"),
        other => format!("in#{i}:{}", value_kind(other)),
    }
}
/// RAII guard that temporarily splits everything at or above `slice_start`
/// off the VM value stack so a fusion group can consume those values.
///
/// On `Drop` the detached values are pushed back (so a failed fusion attempt
/// leaves the interpreter stack exactly as it found it) unless `commit` was
/// called, which makes the split permanent.
///
/// The previous implementation stored a raw `*mut Vec<Value>` plus a
/// `PhantomData` borrow and dereferenced the pointer in an `unsafe` block in
/// `Drop`. A plain `&'a mut Vec<Value>` field expresses the same exclusive
/// borrow and lets `Drop` restore the stack with no `unsafe` at all.
#[cfg(feature = "native-accel")]
struct StackSliceGuard<'a> {
    // Exclusive borrow of the interpreter stack the slice was split from.
    stack: &'a mut Vec<Value>,
    // The detached tail; `None` once committed (nothing left to restore).
    slice: Option<Vec<Value>>,
}

#[cfg(feature = "native-accel")]
impl<'a> StackSliceGuard<'a> {
    /// Detach `stack[slice_start..]` from the stack and hold it in the guard.
    fn new(stack: &'a mut Vec<Value>, slice_start: usize) -> Self {
        let slice = stack.split_off(slice_start);
        Self {
            stack,
            slice: Some(slice),
        }
    }

    /// Borrow the detached values.
    fn slice(&self) -> &[Value] {
        self.slice.as_ref().expect("stack slice missing").as_slice()
    }

    /// Consume the guard without restoring the detached values; the caller
    /// has taken ownership of the consumed inputs.
    fn commit(mut self) {
        self.slice = None;
    }
}

#[cfg(feature = "native-accel")]
impl Drop for StackSliceGuard<'_> {
    fn drop(&mut self) {
        // Push the detached values back (in their original order) if the
        // guard was not committed, e.g. when fusion execution bailed out.
        if let Some(slice) = self.slice.take() {
            self.stack.extend(slice);
        }
    }
}
10270
10271#[cfg(feature = "native-accel")]
10272fn try_execute_fusion_group(
10273    plan: &runmat_accelerate::FusionGroupPlan,
10274    graph: &runmat_accelerate::AccelGraph,
10275    stack: &mut Vec<Value>,
10276    vars: &mut [Value],
10277    context: &ExecutionContext,
10278) -> Result<Value, String> {
10279    let mut inputs: Vec<Option<Value>> = vec![None; plan.inputs.len()];
10280
10281    for (idx, value) in &plan.constants {
10282        if let Some(slot) = inputs.get_mut(*idx) {
10283            if slot.is_none() {
10284                *slot = Some(value.clone());
10285            }
10286        }
10287    }
10288
10289    for (idx, value_id) in plan.inputs.iter().enumerate() {
10290        let info = graph
10291            .value(*value_id)
10292            .ok_or_else(|| format!("fusion: missing value metadata for id {value_id}"))?;
10293        match &info.origin {
10294            ValueOrigin::Variable { kind, index } => {
10295                let value =
10296                    match kind {
10297                        VarKind::Global => vars
10298                            .get(*index)
10299                            .cloned()
10300                            .ok_or_else(|| format!("fusion: global var {index} out of range"))?,
10301                        VarKind::Local => {
10302                            if let Some(frame) = context.call_stack.last() {
10303                                let absolute = frame.locals_start + index;
10304                                context.locals.get(absolute).cloned().ok_or_else(|| {
10305                                    format!("fusion: local var {index} unavailable")
10306                                })?
10307                            } else {
10308                                vars.get(*index).cloned().ok_or_else(|| {
10309                                    format!("fusion: local var {index} unavailable")
10310                                })?
10311                            }
10312                        }
10313                    };
10314                debug_assert!(
10315                    inputs[idx].is_none(),
10316                    "fusion: duplicate input slot {} for plan {}",
10317                    idx,
10318                    plan.index
10319                );
10320                inputs[idx] = Some(value);
10321            }
10322            ValueOrigin::Constant | ValueOrigin::NodeOutput { .. } | ValueOrigin::Unknown => {}
10323        }
10324    }
10325
10326    if log::log_enabled!(log::Level::Debug) && fusion_debug_enabled() {
10327        let stack_needed_preview = plan.stack_pattern.len();
10328        let stack_snapshot: Vec<&Value> = stack.iter().rev().take(stack_needed_preview).collect();
10329        let stack_kinds: Vec<&'static str> =
10330            stack_snapshot.iter().rev().map(|v| value_kind(v)).collect();
10331        let input_meta: Vec<String> = plan
10332            .inputs
10333            .iter()
10334            .enumerate()
10335            .map(|(i, value_id)| {
10336                if let Some(info) = graph.value(*value_id) {
10337                    format!("#{i}:id={} origin={:?}", value_id, info.origin)
10338                } else {
10339                    format!("#{i}:id={} origin=<missing>", value_id)
10340                }
10341            })
10342            .collect();
10343        log::debug!(
10344            "fusion group {} gather: stack_depth={} stack_needed={} stack_kinds={:?} pattern={:?} inputs={:?}",
10345            plan.index,
10346            stack.len(),
10347            stack_needed_preview,
10348            stack_kinds,
10349            &plan.stack_pattern,
10350            input_meta
10351        );
10352    }
10353
10354    let pattern_len = plan.stack_pattern.len();
10355    if stack.len() < pattern_len {
10356        if fusion_debug_enabled() {
10357            log::debug!(
10358                "fusion stack underflow: plan={} needed={} available={} pattern={:?}",
10359                plan.index,
10360                pattern_len,
10361                stack.len(),
10362                plan.stack_pattern
10363            );
10364        }
10365        return Err("fusion: stack underflow gathering inputs".to_string());
10366    }
10367    let available = pattern_len;
10368    let slice_start = stack.len() - available;
10369    let stack_guard = StackSliceGuard::new(stack, slice_start);
10370    let slice = stack_guard.slice().to_vec();
10371    let mut consumed: Vec<Option<Value>> = vec![None; pattern_len];
10372    let skip = 0;
10373
10374    for (offset, input_idx) in plan.stack_pattern.iter().enumerate() {
10375        if offset < skip {
10376            continue;
10377        }
10378        let slice_idx = offset - skip;
10379        let Some(val) = slice.get(slice_idx).cloned() else {
10380            continue;
10381        };
10382        consumed[offset] = Some(val.clone());
10383        if inputs[*input_idx].is_none() {
10384            // For reductions, only populate from stack if the value is a numeric tensor.
10385            // This avoids accidentally binding non-tensor metadata (e.g., dim strings) into the fused kernel inputs.
10386            let allow_stack_value = if plan.group.kind.is_reduction() {
10387                matches!(val, Value::GpuTensor(_) | Value::Tensor(_))
10388            } else {
10389                true
10390            };
10391            if allow_stack_value {
10392                inputs[*input_idx] = Some(val);
10393            }
10394        }
10395    }
10396
10397    for (idx, slot) in inputs.iter_mut().enumerate() {
10398        if slot.is_some() {
10399            continue;
10400        }
10401        let vid = plan.inputs[idx];
10402        let info = graph.value(vid);
10403        if let Some(info) = info {
10404            match &info.origin {
10405                ValueOrigin::Variable { kind, index } => {
10406                    let value_opt = match kind {
10407                        VarKind::Global => vars.get(*index).cloned(),
10408                        VarKind::Local => {
10409                            if let Some(frame) = context.call_stack.last() {
10410                                let absolute = frame.locals_start + index;
10411                                context.locals.get(absolute).cloned()
10412                            } else {
10413                                vars.get(*index).cloned()
10414                            }
10415                        }
10416                    };
10417                    if let Some(value) = value_opt {
10418                        *slot = Some(value);
10419                        continue;
10420                    }
10421                }
10422                ValueOrigin::Constant => {
10423                    if let Some(value) = plan.const_values.get(&vid) {
10424                        *slot = Some(value.clone());
10425                        continue;
10426                    }
10427                }
10428                _ => {}
10429            }
10430        }
10431        if slot.is_none() {
10432            if let Some(binding) = graph.var_binding(vid) {
10433                let value_opt = match binding.kind {
10434                    VarKind::Global => vars.get(binding.index).cloned(),
10435                    VarKind::Local => {
10436                        if let Some(frame) = context.call_stack.last() {
10437                            let absolute = frame.locals_start + binding.index;
10438                            context.locals.get(absolute).cloned()
10439                        } else {
10440                            vars.get(binding.index).cloned()
10441                        }
10442                    }
10443                };
10444                if let Some(value) = value_opt {
10445                    *slot = Some(value);
10446                    continue;
10447                }
10448            }
10449        }
10450        if slot.is_none() {
10451            if let Some(info) = info {
10452                if let ValueOrigin::NodeOutput { node, .. } = info.origin {
10453                    if let Some(binding) = graph.node_binding(node) {
10454                        let value_opt = match binding.kind {
10455                            VarKind::Global => vars.get(binding.index).cloned(),
10456                            VarKind::Local => {
10457                                if let Some(frame) = context.call_stack.last() {
10458                                    let absolute = frame.locals_start + binding.index;
10459                                    context.locals.get(absolute).cloned()
10460                                } else {
10461                                    vars.get(binding.index).cloned()
10462                                }
10463                            }
10464                        };
10465                        if let Some(value) = value_opt {
10466                            *slot = Some(value);
10467                            continue;
10468                        }
10469                    }
10470                }
10471            }
10472        }
10473        if slot.is_none() {
10474            if let Some(value) = plan.const_values.get(&vid) {
10475                *slot = Some(value.clone());
10476            }
10477        }
10478    }
10479
10480    let inputs: Vec<Value> = inputs
10481        .into_iter()
10482        .map(|opt| opt.ok_or_else(|| "fusion: missing input value".to_string()))
10483        .collect::<Result<_, _>>()?;
10484
10485    // Debug: summarize runtime input kinds/shapes
10486    if log::log_enabled!(log::Level::Debug) {
10487        let summaries: Vec<String> = inputs
10488            .iter()
10489            .enumerate()
10490            .map(|(i, v)| summarize_value(i, v))
10491            .collect();
10492        log::debug!("fusion inputs runtime: [{}]", summaries.join(", "));
10493    }
10494
10495    let request = FusionExecutionRequest { plan, inputs };
10496    log::debug!(
10497        "dispatch fusion kind {:?}, supported {}",
10498        plan.group.kind,
10499        plan.kernel.supported
10500    );
10501    if plan.group.kind.is_elementwise() {
10502        match execute_elementwise(request) {
10503            Ok(result) => {
10504                stack_guard.commit();
10505                Ok(result)
10506            }
10507            Err(err) => Err(err.to_string()),
10508        }
10509    } else if plan.group.kind.is_reduction() {
10510        // Determine reduction axis or 'all'. Prefer the builtin reduction op's dim argument (inputs[1]).
10511        // MATLAB dim is 1-based: dim=1 reduces rows (axis 0), dim=2 reduces cols (axis 1), 'all' reduces all elements.
10512        let mut axis = 0usize;
10513        let mut reduce_all = matches!(plan.reduction_axes, Some(ReductionAxes::All));
10514        if let Some(ReductionAxes::Explicit(dims)) = &plan.reduction_axes {
10515            if let Some(first) = dims.first().copied() {
10516                axis = first.saturating_sub(1);
10517            }
10518        }
10519        // Debug: show input origins for reduction
10520        if log::log_enabled!(log::Level::Debug) {
10521            let meta: Vec<String> = plan
10522                .inputs
10523                .iter()
10524                .map(|vid| {
10525                    if let Some(info) = graph.value(*vid) {
10526                        format!(
10527                            "vid={} origin={:?} shape={:?}",
10528                            vid, info.origin, info.shape
10529                        )
10530                    } else {
10531                        format!("vid={} origin=<missing>", vid)
10532                    }
10533                })
10534                .collect();
10535            log::debug!("reduction gather meta: [{}]", meta.join(", "));
10536        }
10537        // Detect 'all' in constants or const_values
10538        let has_all = reduce_all
10539            || plan.constants.values().any(value_is_all_keyword)
10540            || plan.const_values.values().any(value_is_all_keyword);
10541        if has_all {
10542            reduce_all = true;
10543        }
10544        if reduce_all && fusion_debug_enabled() {
10545            log::debug!(
10546                "fusion reduction (all) meta: data_vid={:?} inputs={:?} stack_pattern={:?}",
10547                plan.reduction_data,
10548                plan.inputs,
10549                plan.stack_pattern
10550            );
10551        }
10552        if !reduce_all {
10553            for node_id in &plan.group.nodes {
10554                if let Some(node) = graph.node(*node_id) {
10555                    if let runmat_accelerate::graph::AccelNodeLabel::Builtin { name } = &node.label
10556                    {
10557                        if name.eq_ignore_ascii_case("mean") {
10558                            for input_vid in &node.inputs {
10559                                if let Some(info) = graph.value(*input_vid) {
10560                                    if let Some(constant) = &info.constant {
10561                                        if value_is_all_keyword(constant) {
10562                                            reduce_all = true;
10563                                            break;
10564                                        }
10565                                    }
10566                                }
10567                            }
10568                        }
10569                    }
10570                }
10571                if reduce_all {
10572                    break;
10573                }
10574            }
10575        }
10576        // Prefer plan.reduction_dim if available
10577        if !reduce_all {
10578            if let Some(dim_vid) = plan.reduction_dim {
10579                if let Some(cv) = plan.const_values.get(&dim_vid) {
10580                    axis = match cv {
10581                        Value::Num(n) if *n >= 1.0 => (*n as usize).saturating_sub(1),
10582                        Value::Int(i) => (i.to_f64() as usize).saturating_sub(1),
10583                        _ => axis,
10584                    };
10585                } else if let Some(input_idx) = plan.inputs.iter().position(|v| *v == dim_vid) {
10586                    if let Some(cv) = plan.constants.get(&input_idx) {
10587                        axis = match cv {
10588                            Value::Num(n) if *n >= 1.0 => (*n as usize).saturating_sub(1),
10589                            Value::Int(i) => (i.to_f64() as usize).saturating_sub(1),
10590                            _ => axis,
10591                        };
10592                    }
10593                }
10594            } else {
10595                // Legacy fallback: inspect any constant mapped to the second logical position
10596                if let Some(dim_const) = plan.constants.get(&1) {
10597                    axis = match dim_const {
10598                        Value::Num(n) if *n >= 1.0 => (*n as usize).saturating_sub(1),
10599                        Value::Int(i) => (i.to_f64() as usize).saturating_sub(1),
10600                        _ => axis,
10601                    };
10602                }
10603            }
10604        }
10605        let (reduce_len, num_slices) = {
10606            // Try to get the data tensor's shape via the reduction builtin op input id
10607            let mut rows_cols: Option<(usize, usize)> = None;
10608            // Prefer shape from fusion plan reduction_data if fully known in graph
10609            if let Some(shape) = plan.reduction_data_shape(graph) {
10610                if shape.len() >= 2 {
10611                    rows_cols = Some((shape[0].max(1), shape[1].max(1)));
10612                } else if shape.len() == 1 {
10613                    rows_cols = Some((shape[0].max(1), 1));
10614                }
10615            }
10616            // Early fallback: inspect runtime variable values for declared plan inputs
10617            if rows_cols.is_none() {
10618                for &vid in &plan.inputs {
10619                    if let Some(binding) = graph.var_binding(vid) {
10620                        let value_opt = match binding.kind {
10621                            VarKind::Global => vars.get(binding.index).cloned(),
10622                            VarKind::Local => {
10623                                if let Some(frame) = context.call_stack.last() {
10624                                    let absolute = frame.locals_start + binding.index;
10625                                    context.locals.get(absolute).cloned()
10626                                } else {
10627                                    vars.get(binding.index).cloned()
10628                                }
10629                            }
10630                        };
10631                        if let Some(value) = value_opt {
10632                            match value {
10633                                Value::GpuTensor(h) => {
10634                                    rows_cols = Some((
10635                                        h.shape.first().copied().unwrap_or(1).max(1),
10636                                        h.shape.get(1).copied().unwrap_or(1).max(1),
10637                                    ));
10638                                    break;
10639                                }
10640                                Value::Tensor(t) => {
10641                                    rows_cols = Some((
10642                                        t.shape.first().copied().unwrap_or(1).max(1),
10643                                        t.shape.get(1).copied().unwrap_or(1).max(1),
10644                                    ));
10645                                    break;
10646                                }
10647                                _ => {}
10648                            }
10649                        }
10650                    }
10651                }
10652            }
10653            // Prefer immediately-consumed stack values (common for reductions over producer results)
10654            for v in consumed.iter().filter_map(|v| v.as_ref()) {
10655                match v {
10656                    Value::GpuTensor(h) => {
10657                        rows_cols = Some((
10658                            h.shape.first().copied().unwrap_or(1).max(1),
10659                            h.shape.get(1).copied().unwrap_or(1).max(1),
10660                        ));
10661                        break;
10662                    }
10663                    Value::Tensor(t) => {
10664                        rows_cols = Some((
10665                            t.shape.first().copied().unwrap_or(1).max(1),
10666                            t.shape.get(1).copied().unwrap_or(1).max(1),
10667                        ));
10668                        break;
10669                    }
10670                    _ => {}
10671                }
10672            }
10673            let data_value_id: Option<runmat_accelerate::graph::ValueId> = plan.reduction_data;
10674
10675            if let Some(data_id) = data_value_id {
10676                // Map data_id to plan input index if external
10677                if let Some(input_index) = plan.inputs.iter().position(|vid| *vid == data_id) {
10678                    // If this input came from the stack, it will have an entry in stack_pattern; use consumed value
10679                    if let Some(stack_offset) = plan
10680                        .stack_pattern
10681                        .iter()
10682                        .position(|&idx| idx == input_index)
10683                    {
10684                        if let Some(val) = consumed.get(stack_offset).and_then(|v| v.as_ref()) {
10685                            match val {
10686                                Value::GpuTensor(h) => {
10687                                    let r = h.shape.first().copied().unwrap_or(1).max(1);
10688                                    let c = h.shape.get(1).copied().unwrap_or(1).max(1);
10689                                    rows_cols = Some((r, c));
10690                                }
10691                                Value::Tensor(t) => {
10692                                    let r = t.shape.first().copied().unwrap_or(1).max(1);
10693                                    let c = t.shape.get(1).copied().unwrap_or(1).max(1);
10694                                    rows_cols = Some((r, c));
10695                                }
10696                                _ => {}
10697                            }
10698                        }
10699                    }
10700                    // Otherwise, it was a variable/constant; use request.inputs
10701                    if rows_cols.is_none() {
10702                        if let Some(val) = request.inputs.get(input_index) {
10703                            match val {
10704                                Value::GpuTensor(h) => {
10705                                    let r = h.shape.first().copied().unwrap_or(1).max(1);
10706                                    let c = h.shape.get(1).copied().unwrap_or(1).max(1);
10707                                    rows_cols = Some((r, c));
10708                                }
10709                                Value::Tensor(t) => {
10710                                    let r = t.shape.first().copied().unwrap_or(1).max(1);
10711                                    let c = t.shape.get(1).copied().unwrap_or(1).max(1);
10712                                    rows_cols = Some((r, c));
10713                                }
10714                                _ => {}
10715                            }
10716                        }
10717                    }
10718                }
10719                if rows_cols.is_none() {
10720                    if let Some(info) = graph.value(data_id) {
10721                        // Try direct variable lookup to get runtime value shape
10722                        if let ValueOrigin::Variable { kind, index } = &info.origin {
10723                            let val = match kind {
10724                                VarKind::Global => vars.get(*index).cloned(),
10725                                VarKind::Local => {
10726                                    if let Some(frame) = context.call_stack.last() {
10727                                        let absolute = frame.locals_start + index;
10728                                        context.locals.get(absolute).cloned()
10729                                    } else {
10730                                        vars.get(*index).cloned()
10731                                    }
10732                                }
10733                            };
10734                            if let Some(v) = val {
10735                                match v {
10736                                    Value::GpuTensor(h) => {
10737                                        rows_cols = Some((
10738                                            h.shape.first().copied().unwrap_or(1).max(1),
10739                                            h.shape.get(1).copied().unwrap_or(1).max(1),
10740                                        ));
10741                                    }
10742                                    Value::Tensor(t) => {
10743                                        rows_cols = Some((
10744                                            t.shape.first().copied().unwrap_or(1).max(1),
10745                                            t.shape.get(1).copied().unwrap_or(1).max(1),
10746                                        ));
10747                                    }
10748                                    _ => {}
10749                                }
10750                            }
10751                        }
10752                        if rows_cols.is_none() {
10753                            if let ShapeInfo::Tensor(dims) = &info.shape {
10754                                if !dims.is_empty() {
10755                                    let r = dims.first().and_then(|d| *d).unwrap_or(1);
10756                                    let c = dims.get(1).and_then(|d| *d).unwrap_or(1);
10757                                    rows_cols = Some((r.max(1), c.max(1)));
10758                                }
10759                            }
10760                        }
10761                    }
10762                }
10763            }
10764
10765            // Fallback: any tensor input
10766            if rows_cols.is_none() {
10767                for v in consumed.iter().filter_map(|v| v.as_ref()) {
10768                    match v {
10769                        Value::GpuTensor(h) => {
10770                            rows_cols = Some((
10771                                h.shape.first().copied().unwrap_or(1).max(1),
10772                                h.shape.get(1).copied().unwrap_or(1).max(1),
10773                            ));
10774                            break;
10775                        }
10776                        Value::Tensor(t) => {
10777                            rows_cols = Some((
10778                                t.shape.first().copied().unwrap_or(1).max(1),
10779                                t.shape.get(1).copied().unwrap_or(1).max(1),
10780                            ));
10781                            break;
10782                        }
10783                        _ => {}
10784                    }
10785                }
10786                if rows_cols.is_none() {
10787                    for v in &request.inputs {
10788                        match v {
10789                            Value::GpuTensor(h) => {
10790                                rows_cols = Some((
10791                                    h.shape.first().copied().unwrap_or(1).max(1),
10792                                    h.shape.get(1).copied().unwrap_or(1).max(1),
10793                                ));
10794                                break;
10795                            }
10796                            Value::Tensor(t) => {
10797                                rows_cols = Some((
10798                                    t.shape.first().copied().unwrap_or(1).max(1),
10799                                    t.shape.get(1).copied().unwrap_or(1).max(1),
10800                                ));
10801                                break;
10802                            }
10803                            _ => {}
10804                        }
10805                    }
10806                }
10807            }
10808            // Final fallback: group-level static shape if available
10809            if rows_cols.is_none() {
10810                if let ShapeInfo::Tensor(dims) = &plan.group.shape {
10811                    if !dims.is_empty() {
10812                        let r = dims.first().and_then(|d| *d).unwrap_or(1);
10813                        let c = dims.get(1).and_then(|d| *d).unwrap_or(1);
10814                        rows_cols = Some((r.max(1), c.max(1)));
10815                    }
10816                }
10817            }
10818
10819            let (r, c) = rows_cols.unwrap_or((1, 1));
10820            if reduce_all {
10821                let mut total_elems: Option<usize> = None;
10822                let mut total_from_operand = false;
10823                // Prefer fully-known graph shape for the reduction operand
10824                if let Some(shape) = plan.reduction_data_shape(graph) {
10825                    let prod = shape.into_iter().fold(1usize, |acc, dim| {
10826                        let d = dim.max(1);
10827                        acc.saturating_mul(d)
10828                    });
10829                    total_from_operand = true;
10830                    total_elems = Some(prod.max(1));
10831                }
10832                // Fall back to runtime tensor shapes (consumed stack values first, then inputs)
10833                if total_elems.is_none() {
10834                    let inspect_value = |value: &Value| -> Option<usize> {
10835                        match value {
10836                            Value::GpuTensor(handle) => {
10837                                if handle.shape.is_empty() {
10838                                    Some(1)
10839                                } else {
10840                                    Some(
10841                                        handle
10842                                            .shape
10843                                            .iter()
10844                                            .copied()
10845                                            .map(|d| d.max(1))
10846                                            .fold(1usize, |acc, dim| acc.saturating_mul(dim)),
10847                                    )
10848                                }
10849                            }
10850                            Value::Tensor(tensor) => {
10851                                if tensor.shape.is_empty() {
10852                                    Some(1)
10853                                } else {
10854                                    Some(
10855                                        tensor
10856                                            .shape
10857                                            .iter()
10858                                            .copied()
10859                                            .map(|d| d.max(1))
10860                                            .fold(1usize, |acc, dim| acc.saturating_mul(dim)),
10861                                    )
10862                                }
10863                            }
10864                            _ => None,
10865                        }
10866                    };
10867                    for value in consumed.iter().filter_map(|v| v.as_ref()) {
10868                        if let Some(prod) = inspect_value(value) {
10869                            total_from_operand = true;
10870                            total_elems = Some(prod.max(1));
10871                            break;
10872                        }
10873                    }
10874                    if total_elems.is_none() {
10875                        for value in &request.inputs {
10876                            if let Some(prod) = inspect_value(value) {
10877                                total_from_operand = true;
10878                                total_elems = Some(prod.max(1));
10879                                break;
10880                            }
10881                        }
10882                    }
10883                }
10884                // Final fallback: use group-level element count or the 2-D heuristic
10885                if total_elems.is_none() {
10886                    if let Some(ec) = plan.element_count() {
10887                        total_elems = Some(ec.max(1));
10888                    }
10889                }
10890                if total_elems.is_none() || !total_from_operand {
10891                    if fusion_debug_enabled() {
10892                        log::debug!(
10893                            "fusion reduction (all): operand extent unknown (source: {:?}); falling back to provider path",
10894                            if total_from_operand { "runtime" } else { "output_shape" }
10895                        );
10896                    }
10897                    return Err("fusion: reduction all extent unknown".to_string());
10898                }
10899                let total = total_elems.unwrap();
10900                if fusion_debug_enabled() {
10901                    log::debug!(
10902                        "fusion reduction (all): total_elems={} fallback_rows={} fallback_cols={}",
10903                        total,
10904                        r,
10905                        c
10906                    );
10907                }
10908                (total, 1usize)
10909            } else {
10910                if fusion_debug_enabled() {
10911                    if r == 1 && c == 1 {
10912                        log::debug!(
10913                    "fusion reduction: unresolved shape (defaulted to 1x1); axis={}, constants={:?}",
10914                    axis, plan.constants
10915                );
10916                    } else {
10917                        log::debug!(
10918                    "fusion reduction: resolved shape rows={} cols={} axis={} constants={:?}",
10919                    r,
10920                    c,
10921                    axis,
10922                    plan.constants
10923                );
10924                    }
10925                }
10926                if axis == 0 {
10927                    (r, c)
10928                } else {
10929                    (c, r)
10930                }
10931            }
10932        };
10933        if fusion_debug_enabled() {
10934            log::debug!(
10935                "fusion reduction: axis={} reduce_len={} num_slices={} constants={:?}",
10936                axis,
10937                reduce_len,
10938                num_slices,
10939                plan.constants
10940            );
10941        }
10942        if log::log_enabled!(log::Level::Debug) && fusion_debug_enabled() {
10943            let _rt_inputs: Vec<String> = request
10944                .inputs
10945                .iter()
10946                .enumerate()
10947                .map(|(i, v)| summarize_value(i, v))
10948                .collect();
10949            let _plan_inputs: Vec<String> = plan
10950                .inputs
10951                .iter()
10952                .map(|vid| {
10953                    if let Some(info) = graph.value(*vid) {
10954                        format!(
10955                            "vid={} origin={:?} shape={:?}",
10956                            vid, info.origin, info.shape
10957                        )
10958                    } else {
10959                        format!("vid={} origin=<missing>", vid)
10960                    }
10961                })
10962                .collect();
10963            // Summarize inputs once before execution (omit plan inputs to reduce noise)
10964            log::debug!("reduction inputs: [{}]", _rt_inputs.join(", "));
10965        }
10966        // If shape derivation failed (1x1) but inputs/consumed suggest a larger tensor, skip fusion
10967        let looks_wrong = reduce_len == 1 && num_slices == 1 && {
10968            let mut big = false;
10969            let mut check_val = |v: &Value| match v {
10970                Value::GpuTensor(h) => {
10971                    let prod = h.shape.iter().copied().product::<usize>();
10972                    if prod > 1 {
10973                        big = true;
10974                    }
10975                }
10976                Value::Tensor(t) => {
10977                    let prod = t.shape.iter().copied().product::<usize>();
10978                    if prod > 1 {
10979                        big = true;
10980                    }
10981                }
10982                _ => {}
10983            };
10984            for v in consumed.iter().filter_map(|v| v.as_ref()) {
10985                check_val(v);
10986            }
10987            for v in &request.inputs {
10988                check_val(v);
10989            }
10990            big
10991        };
10992        if looks_wrong {
10993            log::debug!(
10994                "fusion reduction: skipping fusion due to unresolved shape; falling back to provider path"
10995            );
10996            return Err("fusion: reduction shape unresolved".to_string());
10997        }
10998
10999        // Optional escape hatch: disable fused reductions to force provider path
11000        if std::env::var("RUNMAT_DISABLE_FUSED_REDUCTION")
11001            .ok()
11002            .as_deref()
11003            == Some("1")
11004        {
11005            return Err("fusion: fused reductions disabled".to_string());
11006        }
11007        let workgroup_size = 256u32;
11008        if log::log_enabled!(log::Level::Debug) && fusion_debug_enabled() {
11009            let _rt_inputs: Vec<String> = request
11010                .inputs
11011                .iter()
11012                .enumerate()
11013                .map(|(i, v)| summarize_value(i, v))
11014                .collect();
11015            let _plan_inputs: Vec<String> = plan
11016                .inputs
11017                .iter()
11018                .map(|vid| {
11019                    if let Some(info) = graph.value(*vid) {
11020                        format!(
11021                            "vid={} origin={:?} shape={:?}",
11022                            vid, info.origin, info.shape
11023                        )
11024                    } else {
11025                        format!("vid={} origin=<missing>", vid)
11026                    }
11027                })
11028                .collect();
11029            log::debug!(
11030                "reduction axis={} reduce_len={} num_slices={}",
11031                axis,
11032                reduce_len,
11033                num_slices
11034            );
11035        }
11036        match execute_reduction(request, reduce_len, num_slices, workgroup_size) {
11037            Ok(result) => {
11038                stack_guard.commit();
11039                Ok(result)
11040            }
11041            Err(err) => Err(err.to_string()),
11042        }
11043    } else if plan.group.kind == FusionKind::CenteredGram {
11044        match execute_centered_gram(request) {
11045            Ok(result) => {
11046                stack_guard.commit();
11047                Ok(result)
11048            }
11049            Err(err) => Err(err.to_string()),
11050        }
11051    } else if plan.group.kind == FusionKind::PowerStepNormalize {
11052        match execute_power_step_normalize(request) {
11053            Ok(result) => {
11054                stack_guard.commit();
11055                Ok(result)
11056            }
11057            Err(err) => Err(err.to_string()),
11058        }
11059    } else if plan.group.kind == FusionKind::ExplainedVariance {
11060        log::debug!("explained variance plan inputs {:?}", plan.inputs);
11061        match execute_explained_variance(request) {
11062            Ok(result) => {
11063                stack_guard.commit();
11064                Ok(result)
11065            }
11066            Err(err) => {
11067                log::debug!("explained variance fusion fallback: {}", err);
11068                Err(err.to_string())
11069            }
11070        }
11071    } else if plan.group.kind == FusionKind::MatmulEpilogue {
11072        match execute_matmul_epilogue(request) {
11073            Ok(result) => {
11074                stack_guard.commit();
11075                Ok(result)
11076            }
11077            Err(err) => Err(err.to_string()),
11078        }
11079    } else if plan.group.kind == FusionKind::ImageNormalize {
11080        match execute_image_normalize(request) {
11081            Ok(result) => {
11082                stack_guard.commit();
11083                Ok(result)
11084            }
11085            Err(err) => Err(err.to_string()),
11086        }
11087    } else {
11088        // Unknown fusion kind; restore stack and report
11089        Err("fusion: unsupported fusion kind".to_string())
11090    }
11091}
11092
#[cfg(feature = "native-accel")]
fn clear_residency(value: &Value) {
    // Drop GPU residency tracking for device-resident tensors; every other
    // value kind carries no residency state and is left untouched.
    match value {
        Value::GpuTensor(handle) => fusion_residency::clear(handle),
        _ => {}
    }
}
11099
11100fn parse_exception(err: &str) -> runmat_builtins::MException {
11101    // Prefer the last occurrence of ": " to split IDENT: message, preserving nested identifiers
11102    if let Some(idx) = err.rfind(": ") {
11103        let (id, msg) = err.split_at(idx);
11104        let message = msg.trim_start_matches(':').trim().to_string();
11105        let ident = if id.trim().is_empty() {
11106            format!("{ERROR_NAMESPACE}:error")
11107        } else {
11108            id.trim().to_string()
11109        };
11110        return runmat_builtins::MException::new(ident, message);
11111    }
11112    // Fallback: if any ':' present, use the last as separator
11113    if let Some(idx) = err.rfind(':') {
11114        let (id, msg) = err.split_at(idx);
11115        let message = msg.trim_start_matches(':').trim().to_string();
11116        let ident = if id.trim().is_empty() {
11117            format!("{ERROR_NAMESPACE}:error")
11118        } else {
11119            id.trim().to_string()
11120        };
11121        runmat_builtins::MException::new(ident, message)
11122    } else {
11123        runmat_builtins::MException::new(format!("{ERROR_NAMESPACE}:error"), err.to_string())
11124    }
11125}
11126
11127/// Interpret bytecode with default variable initialization
11128pub fn interpret(bytecode: &Bytecode) -> Result<Vec<Value>, String> {
11129    let mut vars = vec![Value::Num(0.0); bytecode.var_count];
11130    interpret_with_vars(bytecode, &mut vars, Some("<main>"))
11131}
11132
11133pub fn interpret_function(bytecode: &Bytecode, vars: Vec<Value>) -> Result<Vec<Value>, String> {
11134    // Delegate to the counted variant with anonymous name and zero counts
11135    interpret_function_with_counts(bytecode, vars, "<anonymous>", 0, 0)
11136}
11137
11138fn interpret_function_with_counts(
11139    bytecode: &Bytecode,
11140    mut vars: Vec<Value>,
11141    name: &str,
11142    out_count: usize,
11143    in_count: usize,
11144) -> Result<Vec<Value>, String> {
11145    // Push (nargin, nargout), run, then pop
11146    let res = CALL_COUNTS.with(|cc| {
11147        cc.borrow_mut().push((in_count, out_count));
11148        let r = interpret_with_vars(bytecode, &mut vars, Some(name));
11149        cc.borrow_mut().pop();
11150        r
11151    });
11152    // Persist any variables declared persistent in this bytecode under the given function name
11153    let func_name = name.to_string();
11154    for instr in &bytecode.instructions {
11155        match instr {
11156            crate::instr::Instr::DeclarePersistent(indices) => {
11157                for &i in indices {
11158                    if i < vars.len() {
11159                        let key = (func_name.clone(), i);
11160                        PERSISTENTS.with(|p| {
11161                            p.borrow_mut().insert(key, vars[i].clone());
11162                        });
11163                    }
11164                }
11165            }
11166            crate::instr::Instr::DeclarePersistentNamed(indices, names) => {
11167                for (pos, &i) in indices.iter().enumerate() {
11168                    if i < vars.len() {
11169                        let key = (func_name.clone(), i);
11170                        let name_key = (
11171                            func_name.clone(),
11172                            names
11173                                .get(pos)
11174                                .cloned()
11175                                .unwrap_or_else(|| format!("var_{i}")),
11176                        );
11177                        let val = vars[i].clone();
11178                        PERSISTENTS.with(|p| {
11179                            p.borrow_mut().insert(key, val.clone());
11180                        });
11181                        PERSISTENTS_BY_NAME.with(|p| {
11182                            p.borrow_mut().insert(name_key, val);
11183                        });
11184                    }
11185                }
11186            }
11187            _ => {}
11188        }
11189    }
11190    res
11191}