runmat_accelerate/simple_provider.rs

use crate::host_lu::{lu_factor_host, LuHostFactors};
use crate::sortrows_host::{sort_rows_host, SortRowsHostOutputs};
use anyhow::{anyhow, ensure, Result};
use once_cell::sync::OnceCell;
use runmat_accelerate_api::{
    AccelProvider, CorrcoefOptions, CovarianceOptions, FindDirection, FspecialRequest,
    GpuTensorHandle, HostTensorOwned, HostTensorView, ImfilterOptions, PagefunRequest,
    ProviderBandwidth, ProviderCholResult, ProviderCondNorm, ProviderConv1dOptions,
    ProviderConvMode, ProviderConvOrientation, ProviderEigResult, ProviderFindResult,
    ProviderHermitianKind, ProviderIirFilterOptions, ProviderIirFilterResult, ProviderInvOptions,
    ProviderLinsolveOptions, ProviderLinsolveResult, ProviderLuResult, ProviderNanMode,
    ProviderNormOrder, ProviderPinvOptions, ProviderPolyderQuotient, ProviderPrecision,
    ProviderQrOptions, ProviderQrPivot, ProviderQrResult, ProviderScanDirection,
    ProviderSymmetryKind, SetdiffOptions, SetdiffResult, SortComparison, SortResult,
    SortRowsColumnSpec, UniqueOptions, UniqueResult,
};
use runmat_builtins::{Tensor, Value};
use runmat_runtime::builtins::array::sorting_sets::unique;
use runmat_runtime::builtins::common::broadcast::{
    broadcast_index as runtime_broadcast_index, broadcast_shapes as runtime_broadcast_shapes,
    compute_strides as runtime_compute_strides,
};
use runmat_runtime::builtins::stats::summary::{
    corrcoef_from_tensors as runtime_corrcoef_from_tensors,
    cov_from_tensors as runtime_cov_from_tensors, CovWeightSpec,
};

use runmat_runtime::builtins::math::linalg::ops::{
    dot_host_real_for_provider, mldivide_host_real_for_provider, mrdivide_host_real_for_provider,
};
use runmat_runtime::builtins::math::linalg::solve::cond::cond_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::inv::inv_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::linsolve::linsolve_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::norm::norm_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::pinv::pinv_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::rank_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::solve::rcond::rcond_host_real_for_provider;
use runmat_runtime::builtins::math::linalg::structure::bandwidth::bandwidth_host_real_data;
use runmat_runtime::builtins::math::linalg::structure::ishermitian::ishermitian_host_real_data;
use runmat_runtime::builtins::math::linalg::structure::issymmetric::issymmetric_host_real_data;
use runmat_runtime::builtins::math::linalg::structure::symrcm::symrcm_host_real_data;
use runmat_runtime::builtins::math::reduction::compute_median_inplace;
use runmat_runtime::builtins::math::reduction::diff_tensor_host;
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;

const PROVIDER_DEFAULT_SEED: u64 = 0x9e3779b97f4a7c15;

static REGISTRY: OnceCell<Mutex<HashMap<u64, Vec<f64>>>> = OnceCell::new();

fn registry() -> &'static Mutex<HashMap<u64, Vec<f64>>> {
    REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
}

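// Host-side numeric guards: POLYDER_EPS is the tolerance below which leading
// polynomial coefficients are trimmed as zeros, FACTORIAL_MAX_HOST is the
// largest n for which n! is still finite in f64 (171! overflows to infinity),
// and FACTORIAL_INT_TOL bounds how far a factorial argument may deviate from
// an integer before it is rejected with NaN.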
const POLYDER_EPS: f64 = 1.0e-12;
const FACTORIAL_MAX_HOST: usize = 170;
const FACTORIAL_INT_TOL: f64 = 1.0e-10;

#[derive(Clone, Copy)]
enum PolyOrientation {
    Scalar,
    Row,
    Column,
}

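/// Classifies a coefficient tensor as a scalar, row vector, or column vector,
/// rejecting inputs with more than one non-singleton dimension.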
fn poly_orientation_from_shape(shape: &[usize]) -> Result<PolyOrientation> {
    let mut non_unit = 0usize;
    let mut orientation = PolyOrientation::Scalar;
    for (idx, &dim) in shape.iter().enumerate() {
        if dim > 1 {
            non_unit += 1;
            orientation = if idx == 0 {
                PolyOrientation::Column
            } else {
                PolyOrientation::Row
            };
        }
    }
    if non_unit > 1 {
        Err(anyhow!("polyder: coefficient inputs must be vectors"))
    } else {
        Ok(orientation)
    }
}

fn poly_shape_for_len(orientation: PolyOrientation, len: usize) -> Vec<usize> {
    if len <= 1 {
        return vec![1, 1];
    }
    match orientation {
        PolyOrientation::Scalar | PolyOrientation::Row => vec![1, len],
        PolyOrientation::Column => vec![len, 1],
    }
}

fn poly_trim_slice(coeffs: &[f64]) -> Vec<f64> {
    if coeffs.is_empty() {
        return vec![0.0];
    }
    if let Some(idx) = coeffs.iter().position(|c| c.abs() > POLYDER_EPS) {
        coeffs[idx..].to_vec()
    } else {
        vec![0.0]
    }
}

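/// Differentiates a coefficient vector ordered highest degree first (the
/// MATLAB convention), without trimming the result.
///
/// For example, `poly_raw_derivative(&[3.0, 2.0, 1.0])` (3x^2 + 2x + 1)
/// yields `[6.0, 2.0]` (6x + 2).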
fn poly_raw_derivative(coeffs: &[f64]) -> Vec<f64> {
    if coeffs.len() <= 1 {
        return vec![0.0];
    }
    let mut out = Vec::with_capacity(coeffs.len() - 1);
    let mut power = coeffs.len() - 1;
    for coeff in coeffs.iter().take(coeffs.len() - 1) {
        out.push(*coeff * power as f64);
        power -= 1;
    }
    out
}

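/// Integrates a coefficient vector ordered highest degree first, appending
/// `constant` as the new constant term.
///
/// For example, `poly_integral_real(&[2.0, 0.0], 5.0)` (2x) yields
/// `[1.0, 0.0, 5.0]` (x^2 + 5).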
fn poly_integral_real(coeffs: &[f64], constant: f64) -> Vec<f64> {
    if coeffs.is_empty() {
        return vec![constant];
    }
    let mut out = Vec::with_capacity(coeffs.len() + 1);
    for (idx, &coeff) in coeffs.iter().enumerate() {
        let power = (coeffs.len() - idx) as f64;
        if power == 0.0 {
            out.push(0.0);
        } else {
            out.push(coeff / power);
        }
    }
    out.push(constant);
    out
}

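/// Multiplies two polynomials by discrete convolution of their coefficient
/// vectors.
///
/// For example, `poly_convolve_real(&[1.0, 1.0], &[1.0, -1.0])` computes
/// (x + 1)(x - 1) and yields `[1.0, 0.0, -1.0]` (x^2 - 1).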
fn poly_convolve_real(a: &[f64], b: &[f64]) -> Vec<f64> {
    if a.is_empty() || b.is_empty() {
        return Vec::new();
    }
    let mut result = vec![0.0; a.len() + b.len() - 1];
    for (i, &ai) in a.iter().enumerate() {
        for (j, &bj) in b.iter().enumerate() {
            result[i + j] += ai * bj;
        }
    }
    result
}

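/// Adds two coefficient vectors, right-aligning them so constant terms line
/// up; `poly_sub_real` below mirrors this with subtraction.
///
/// For example, `poly_add_real(&[1.0, 0.0], &[1.0])` computes x + 1 and
/// yields `[1.0, 1.0]`.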
fn poly_add_real(a: &[f64], b: &[f64]) -> Vec<f64> {
    let len = a.len().max(b.len());
    let mut result = vec![0.0; len];
    for (idx, &value) in a.iter().enumerate() {
        result[len - a.len() + idx] += value;
    }
    for (idx, &value) in b.iter().enumerate() {
        result[len - b.len() + idx] += value;
    }
    result
}

fn poly_sub_real(a: &[f64], b: &[f64]) -> Vec<f64> {
    let len = a.len().max(b.len());
    let mut result = vec![0.0; len];
    for (idx, &value) in a.iter().enumerate() {
        result[len - a.len() + idx] += value;
    }
    for (idx, &value) in b.iter().enumerate() {
        result[len - b.len() + idx] -= value;
    }
    result
}

fn rng_state() -> &'static Mutex<u64> {
    static RNG: OnceCell<Mutex<u64>> = OnceCell::new();
    RNG.get_or_init(|| Mutex::new(PROVIDER_DEFAULT_SEED))
}

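/// Factorial with MATLAB-style edge cases: NaN for negative, non-integer, or
/// NaN inputs; +Inf for +Inf and for any n above FACTORIAL_MAX_HOST, where
/// n! is no longer representable as a finite f64.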
fn factorial_scalar_host(value: f64) -> f64 {
    if value.is_nan() {
        return f64::NAN;
    }
    if value == 0.0 {
        return 1.0;
    }
    if value.is_infinite() {
        return if value.is_sign_positive() {
            f64::INFINITY
        } else {
            f64::NAN
        };
    }
    if value < 0.0 {
        return f64::NAN;
    }
    let rounded = value.round();
    if (value - rounded).abs() > FACTORIAL_INT_TOL {
        return f64::NAN;
    }
    if rounded < 0.0 {
        return f64::NAN;
    }
    let n = rounded as usize;
    if n > FACTORIAL_MAX_HOST {
        return f64::INFINITY;
    }
    if n == 0 {
        return 1.0;
    }
    let mut acc = 1.0f64;
    for k in 2..=n {
        acc *= k as f64;
    }
    acc
}

fn tensor_to_weight_vector(tensor: &Tensor) -> Result<Vec<f64>> {
    if tensor.shape.len() > 2 {
        return Err(anyhow!("covariance: weight vector must be 1-D"));
    }
    let rows = tensor.rows();
    let cols = tensor.cols();
    if rows != 1 && cols != 1 {
        return Err(anyhow!(
            "covariance: weight vector must be 1-D (received shape {}x{})",
            rows,
            cols
        ));
    }
    Ok(tensor.data.clone())
}

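/// Advances a 64-bit LCG (Knuth's MMIX multiplier) and maps the top 53 bits
/// of the state to a uniform double in [0, 1).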
fn next_uniform(state: &mut u64) -> f64 {
    const MULTIPLIER: u64 = 6364136223846793005;
    const INCREMENT: u64 = 1;
    const SHIFT: u32 = 11;
    const SCALE: f64 = 1.0 / ((1u64 << 53) as f64);

    *state = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
    let bits = *state >> SHIFT;
    (bits as f64) * SCALE
}

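/// Draws two independent standard normal samples via the Box-Muller
/// transform, clamping u1 away from zero so that ln(u1) stays finite.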
fn next_normal_pair(state: &mut u64) -> (f64, f64) {
    let mut u1 = next_uniform(state);
    if u1 <= 0.0 {
        u1 = f64::MIN_POSITIVE;
    }
    let u2 = next_uniform(state);
    let radius = (-2.0 * u1.ln()).sqrt();
    let angle = 2.0 * std::f64::consts::PI * u2;
    (radius * angle.cos(), radius * angle.sin())
}

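/// Reference `AccelProvider` that runs entirely on the host. "Device"
/// buffers are plain `Vec<f64>` entries in the global registry, keyed by ids
/// drawn from `next_id`.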
pub struct InProcessProvider {
    next_id: AtomicU64,
}

impl InProcessProvider {
    pub const fn new() -> Self {
        Self {
            next_id: AtomicU64::new(1),
        }
    }

    fn allocate_tensor(&self, data: Vec<f64>, shape: Vec<usize>) -> GpuTensorHandle {
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry()
            .lock()
            .unwrap_or_else(|e| e.into_inner())
            .insert(id, data);
        GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        }
    }

    fn load_polynomial(&self, handle: &GpuTensorHandle) -> Result<(Vec<f64>, PolyOrientation)> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("polyder: unknown tensor handle {}", handle.buffer_id))?
        };
        let orientation = poly_orientation_from_shape(&handle.shape)?;
        let coeffs = if data.is_empty() { vec![0.0] } else { data };
        Ok((coeffs, orientation))
    }

    fn allocate_polynomial(
        &self,
        coeffs: Vec<f64>,
        orientation: PolyOrientation,
    ) -> GpuTensorHandle {
        let shape = poly_shape_for_len(orientation, coeffs.len());
        self.allocate_tensor(coeffs, shape)
    }
}

impl Default for InProcessProvider {
    fn default() -> Self {
        Self::new()
    }
}

fn normalize_shape(shape: &[usize]) -> Vec<usize> {
    match shape.len() {
        0 => vec![1, 1],
        1 => {
            let n = shape[0];
            vec![n, n]
        }
        _ => shape.to_vec(),
    }
}

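/// Column-major strides for a shape: the first dimension varies fastest.
///
/// For example, `compute_strides(&[2, 3, 4])` yields `[1, 2, 6]`.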
fn compute_strides(shape: &[usize]) -> Vec<usize> {
    let mut strides = Vec::with_capacity(shape.len());
    let mut stride = 1usize;
    for &dim in shape {
        strides.push(stride);
        stride = stride.saturating_mul(dim);
    }
    strides
}

fn product(shape: &[usize]) -> usize {
    shape.iter().copied().product()
}

fn decode_indices(mut index: usize, dims: &[usize]) -> Vec<usize> {
    if dims.is_empty() {
        return Vec::new();
    }
    let mut coords = Vec::with_capacity(dims.len());
    for &dim in dims {
        if dim == 0 {
            coords.push(0);
        } else {
            let coord = index % dim;
            coords.push(coord);
            index /= dim.max(1);
        }
    }
    coords
}

fn shapes_compatible(expected: &[usize], actual: &[usize]) -> bool {
    let max_len = expected.len().max(actual.len());
    for i in 0..max_len {
        let e = expected.get(i).copied().unwrap_or(1);
        let a = actual.get(i).copied().unwrap_or(1);
        if e != a {
            return false;
        }
    }
    true
}

fn filter_state_shape(mut base: Vec<usize>, dim_idx: usize, state_len: usize) -> Vec<usize> {
    if base.len() <= dim_idx {
        base.extend(std::iter::repeat_n(1, dim_idx + 1 - base.len()));
    }
    if !base.is_empty() {
        base[dim_idx] = state_len;
    }
    base
}

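/// Gathers per-channel filter delay states from a column-major buffer whose
/// `dim_idx` axis holds `state_len` entries: every combination of the
/// remaining axes forms one channel, and that channel's states are packed
/// contiguously in the output. `states_to_column_major` below performs the
/// inverse scatter.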
fn states_from_column_major(
    data: &[f64],
    state_len: usize,
    dim_idx: usize,
    shape_ext: &[usize],
) -> Vec<f64> {
    if state_len == 0 {
        return Vec::new();
    }
    let dims_before = &shape_ext[..dim_idx];
    let dims_after = if dim_idx + 1 < shape_ext.len() {
        &shape_ext[dim_idx + 1..]
    } else {
        &[]
    };
    let leading = if dims_before.is_empty() {
        1
    } else {
        dims_before.iter().copied().product()
    };
    let trailing = if dims_after.is_empty() {
        1
    } else {
        dims_after.iter().copied().product()
    };
    let channel_count = leading * trailing;
    let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
    let mut states = vec![0.0; state_len * channel_count];
    for channel in 0..channel_count {
        let before_idx = if dims_before.is_empty() {
            0
        } else {
            channel % leading
        };
        let after_idx = if dims_after.is_empty() {
            0
        } else {
            channel / leading
        };
        let before_coords = decode_indices(before_idx, dims_before);
        let after_coords = decode_indices(after_idx, dims_after);
        for s in 0..state_len {
            let mut offset = 0usize;
            let mut stride = 1usize;
            for (d, size) in shape.iter().copied().enumerate() {
                let coord = if d < dim_idx {
                    before_coords.get(d).copied().unwrap_or(0)
                } else if d == dim_idx {
                    s
                } else {
                    let idx = d - dim_idx - 1;
                    after_coords.get(idx).copied().unwrap_or(0)
                };
                offset += coord * stride;
                stride *= size;
            }
            states[channel * state_len + s] = data[offset];
        }
    }
    states
}

fn states_to_column_major(
    states: &[f64],
    state_len: usize,
    dim_idx: usize,
    shape_ext: &[usize],
) -> Vec<f64> {
    if state_len == 0 {
        return Vec::new();
    }
    let dims_before = &shape_ext[..dim_idx];
    let dims_after = if dim_idx + 1 < shape_ext.len() {
        &shape_ext[dim_idx + 1..]
    } else {
        &[]
    };
    let leading = if dims_before.is_empty() {
        1
    } else {
        dims_before.iter().copied().product()
    };
    let trailing = if dims_after.is_empty() {
        1
    } else {
        dims_after.iter().copied().product()
    };
    let channel_count = leading * trailing;
    let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
    let mut out = vec![0.0; states.len()];
    for channel in 0..channel_count {
        let before_idx = if dims_before.is_empty() {
            0
        } else {
            channel % leading
        };
        let after_idx = if dims_after.is_empty() {
            0
        } else {
            channel / leading
        };
        let before_coords = decode_indices(before_idx, dims_before);
        let after_coords = decode_indices(after_idx, dims_after);
        for s in 0..state_len {
            let mut offset = 0usize;
            let mut stride = 1usize;
            for (d, size) in shape.iter().copied().enumerate() {
                let coord = if d < dim_idx {
                    before_coords.get(d).copied().unwrap_or(0)
                } else if d == dim_idx {
                    s
                } else {
                    let idx = d - dim_idx - 1;
                    after_coords.get(idx).copied().unwrap_or(0)
                };
                offset += coord * stride;
                stride *= size;
            }
            out[offset] = states[channel * state_len + s];
        }
    }
    out
}

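/// Generalized N-D transpose: rearranges column-major data so that output
/// dimension `d` is input dimension `order[d]`. `order` must be a permutation
/// covering every dimension exactly once.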
fn permute_data(data: &[f64], shape: &[usize], order: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
    ensure!(!order.is_empty(), "permute: order must not be empty");
    let rank = order.len();
    ensure!(
        shape.len() <= rank,
        "permute: order length must be at least the number of dimensions"
    );
    let mut seen = vec![false; rank];
    for &dim in order {
        ensure!(dim < rank, "permute: invalid dimension index {}", dim + 1);
        ensure!(
            !seen[dim],
            "permute: duplicate dimension index {} encountered",
            dim + 1
        );
        seen[dim] = true;
    }
    ensure!(
        seen.iter().all(|v| *v),
        "permute: order must include every dimension exactly once"
    );

    let mut src_shape = shape.to_vec();
    if src_shape.len() < rank {
        src_shape.extend(std::iter::repeat_n(1, rank - src_shape.len()));
    }

    let total = product(&src_shape);
    ensure!(
        total == data.len(),
        "permute: shape/product mismatch ({} vs {})",
        total,
        data.len()
    );

    let mut dst_shape = vec![0usize; rank];
    for (dst_dim, &src_dim) in order.iter().enumerate() {
        dst_shape[dst_dim] = src_shape[src_dim];
    }

    let src_strides = compute_strides(&src_shape);
    let dst_total = product(&dst_shape);
    let mut out = vec![0.0f64; dst_total];
    let mut dst_coords = vec![0usize; rank];
    let mut src_coords = vec![0usize; rank];

    for (dst_index, out_value) in out.iter_mut().enumerate() {
        let mut rem = dst_index;
        for (dim, &size) in dst_shape.iter().enumerate() {
            if size == 0 {
                dst_coords[dim] = 0;
            } else {
                dst_coords[dim] = rem % size;
                rem /= size;
            }
        }
        for (dst_dim, &src_dim) in order.iter().enumerate() {
            src_coords[src_dim] = dst_coords[dst_dim];
        }
        let mut src_index = 0usize;
        for (dim, &coord) in src_coords.iter().enumerate() {
            src_index += coord * src_strides[dim];
        }
        *out_value = data[src_index];
    }

    Ok((out, dst_shape))
}

fn flip_data(data: &[f64], shape: &[usize], axes: &[usize]) -> Result<Vec<f64>> {
    if axes.is_empty() || data.is_empty() {
        return Ok(data.to_vec());
    }
    let mut ext_shape = shape.to_vec();
    if let Some(max_dim) = axes.iter().copied().max() {
        let needed = max_dim + 1;
        if needed > ext_shape.len() {
            ext_shape.extend(std::iter::repeat_n(1, needed - ext_shape.len()));
        }
    }
    let total = product(&ext_shape);
    ensure!(
        total == data.len(),
        "flip: shape/product mismatch ({} vs {})",
        total,
        data.len()
    );
    let mut flip_flags = vec![false; ext_shape.len()];
    for &axis in axes {
        if axis < flip_flags.len() {
            flip_flags[axis] = !flip_flags[axis];
        }
    }
    if !flip_flags.iter().any(|&flag| flag) {
        return Ok(data.to_vec());
    }
    let mut out = Vec::with_capacity(total);
    for idx in 0..total {
        let mut coords = unravel_index(idx, &ext_shape);
        for (axis, flag) in flip_flags.iter().enumerate() {
            if *flag && ext_shape[axis] > 1 {
                coords[axis] = ext_shape[axis] - 1 - coords[axis];
            }
        }
        let src_idx = ravel_index(&coords, &ext_shape);
        out.push(data[src_idx]);
    }
    Ok(out)
}

fn conv1d_output_shape(len: usize, orientation: ProviderConvOrientation) -> Vec<usize> {
    match (orientation, len) {
        (ProviderConvOrientation::Row, 0) => vec![1, 0],
        (ProviderConvOrientation::Row, _) => vec![1, len],
        (ProviderConvOrientation::Column, 0) => vec![0, 1],
        (ProviderConvOrientation::Column, _) => vec![len, 1],
    }
}

fn convolve_real(signal: &[f64], kernel: &[f64]) -> Result<Vec<f64>> {
    if signal.is_empty() || kernel.is_empty() {
        return Ok(Vec::new());
    }
    let full_len = signal
        .len()
        .checked_add(kernel.len())
        .and_then(|v| v.checked_sub(1))
        .ok_or_else(|| anyhow!("conv1d: length overflow"))?;
    let mut out = vec![0.0; full_len];
    for (i, &ai) in signal.iter().enumerate() {
        for (j, &bj) in kernel.iter().enumerate() {
            out[i + j] += ai * bj;
        }
    }
    Ok(out)
}

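/// Extracts the requested window from a full-length convolution: `Full`
/// returns everything, `Same` keeps `len_a` elements starting at
/// `(len_b - 1) / 2`, and `Valid` keeps the `len_a - len_b + 1` elements that
/// required no zero padding (empty when the kernel is longer than the signal).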
fn apply_conv_mode_real(
    full: &[f64],
    mode: ProviderConvMode,
    len_a: usize,
    len_b: usize,
) -> Vec<f64> {
    match mode {
        ProviderConvMode::Full => full.to_vec(),
        ProviderConvMode::Same => {
            if len_a == 0 {
                return Vec::new();
            }
            let start = if len_b == 0 { 0 } else { (len_b - 1) / 2 };
            let end = (start + len_a).min(full.len());
            if start >= end {
                Vec::new()
            } else {
                full[start..end].to_vec()
            }
        }
        ProviderConvMode::Valid => {
            if len_a < len_b {
                Vec::new()
            } else {
                let start = len_b - 1;
                let valid_len = len_a - len_b + 1;
                let end = (start + valid_len).min(full.len());
                if start >= end {
                    Vec::new()
                } else {
                    full[start..end].to_vec()
                }
            }
        }
    }
}

#[allow(clippy::too_many_arguments)]
fn conv2d_full_real(
    signal: &[f64],
    signal_rows: usize,
    signal_cols: usize,
    kernel: &[f64],
    kernel_rows: usize,
    kernel_cols: usize,
    full_rows: usize,
    full_cols: usize,
) -> Vec<f64> {
    let mut out = vec![0.0; full_rows * full_cols];
    for sc in 0..signal_cols {
        for sr in 0..signal_rows {
            let aval = signal[sc * signal_rows + sr];
            if aval == 0.0 {
                continue;
            }
            for kc in 0..kernel_cols {
                let out_c = sc + kc;
                for kr in 0..kernel_rows {
                    let out_r = sr + kr;
                    let kval = kernel[kc * kernel_rows + kr];
                    out[out_c * full_rows + out_r] += aval * kval;
                }
            }
        }
    }
    out
}

fn slice_matrix_real(
    full: &[f64],
    full_rows: usize,
    full_cols: usize,
    row_start: usize,
    row_end: usize,
    col_start: usize,
    col_end: usize,
) -> (Vec<f64>, usize, usize) {
    let row_end = row_end.min(full_rows);
    let col_end = col_end.min(full_cols);
    if row_start >= row_end || col_start >= col_end {
        let rows = row_end.saturating_sub(row_start);
        let cols = col_end.saturating_sub(col_start);
        return (vec![0.0; rows * cols], rows, cols);
    }
    let rows = row_end - row_start;
    let cols = col_end - col_start;
    let mut out = vec![0.0; rows * cols];
    for c in 0..cols {
        for r in 0..rows {
            let src = (col_start + c) * full_rows + (row_start + r);
            let dst = c * rows + r;
            out[dst] = full[src];
        }
    }
    (out, rows, cols)
}

#[allow(clippy::too_many_arguments)]
fn apply_conv2_mode_real_2d(
    full: &[f64],
    full_rows: usize,
    full_cols: usize,
    mode: ProviderConvMode,
    a_rows: usize,
    a_cols: usize,
    b_rows: usize,
    b_cols: usize,
) -> (Vec<f64>, usize, usize) {
    match mode {
        ProviderConvMode::Full => (full.to_vec(), full_rows, full_cols),
        ProviderConvMode::Same => {
            if a_rows == 0 || a_cols == 0 {
                return (Vec::new(), a_rows, a_cols);
            }
            let row_start = if b_rows == 0 { 0 } else { (b_rows - 1) / 2 };
            let col_start = if b_cols == 0 { 0 } else { (b_cols - 1) / 2 };
            slice_matrix_real(
                full,
                full_rows,
                full_cols,
                row_start,
                row_start + a_rows,
                col_start,
                col_start + a_cols,
            )
        }
        ProviderConvMode::Valid => {
            if a_rows < b_rows || a_cols < b_cols {
                (Vec::new(), 0, 0)
            } else {
                let rows = a_rows - b_rows + 1;
                let cols = a_cols - b_cols + 1;
                let row_start = b_rows.saturating_sub(1);
                let col_start = b_cols.saturating_sub(1);
                slice_matrix_real(
                    full,
                    full_rows,
                    full_cols,
                    row_start,
                    row_start + rows,
                    col_start,
                    col_start + cols,
                )
            }
        }
    }
}

fn tensor_from_value(label: &str, value: Value) -> Result<Tensor> {
    match value {
        Value::Tensor(tensor) => Ok(tensor),
        Value::Num(n) => Tensor::new(vec![n], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}")),
        Value::Int(i) => {
            Tensor::new(vec![i.to_f64()], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}"))
        }
        Value::Bool(b) => Tensor::new(vec![if b { 1.0 } else { 0.0 }], vec![1, 1])
            .map_err(|e| anyhow!("{label}: {e}")),
        Value::ComplexTensor(_) => Err(anyhow!(
            "{label}: complex outputs are not supported by the in-process provider"
        )),
        other => Err(anyhow!("{label}: unexpected value {other:?}")),
    }
}

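/// Zeroes every element above the `offset` diagonal of each 2-D page,
/// mirroring MATLAB `tril(X, k)`; `triu_data` below zeroes the elements below
/// the diagonal instead.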
fn tril_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
    if data.is_empty() {
        return Ok(Vec::new());
    }
    let rows = shape.first().copied().unwrap_or(1);
    let cols = shape.get(1).copied().unwrap_or(1);
    let plane = rows.saturating_mul(cols);
    if plane == 0 {
        ensure!(
            data.is_empty(),
            "tril: shape/product mismatch ({} vs {})",
            0,
            data.len()
        );
        return Ok(Vec::new());
    }
    let pages = if shape.len() <= 2 {
        1usize
    } else {
        shape[2..].iter().product::<usize>()
    };
    if pages == 0 {
        ensure!(
            data.is_empty(),
            "tril: shape/product mismatch ({} vs {})",
            0,
            data.len()
        );
        return Ok(Vec::new());
    }
    let expected = plane
        .checked_mul(pages)
        .ok_or_else(|| anyhow!("tril: dimension product overflow"))?;
    ensure!(
        expected == data.len(),
        "tril: shape/product mismatch ({} vs {})",
        expected,
        data.len()
    );
    let mut out = data.to_vec();
    for page in 0..pages {
        let base = page * plane;
        for col in 0..cols {
            let col_base = base + col * rows;
            for row in 0..rows {
                if (row as isize) - (col as isize) < -offset {
                    out[col_base + row] = 0.0;
                }
            }
        }
    }
    Ok(out)
}

fn triu_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
    if data.is_empty() {
        return Ok(Vec::new());
    }
    let rows = shape.first().copied().unwrap_or(1);
    let cols = shape.get(1).copied().unwrap_or(1);
    let plane = rows.saturating_mul(cols);
    if plane == 0 {
        ensure!(
            data.is_empty(),
            "triu: shape/product mismatch ({} vs {})",
            0,
            data.len()
        );
        return Ok(Vec::new());
    }
    let pages = if shape.len() <= 2 {
        1usize
    } else {
        shape[2..].iter().product::<usize>()
    };
    if pages == 0 {
        ensure!(
            data.is_empty(),
            "triu: shape/product mismatch ({} vs {})",
            0,
            data.len()
        );
        return Ok(Vec::new());
    }
    let expected = plane
        .checked_mul(pages)
        .ok_or_else(|| anyhow!("triu: dimension product overflow"))?;
    ensure!(
        expected == data.len(),
        "triu: shape/product mismatch ({} vs {})",
        expected,
        data.len()
    );
    let mut out = data.to_vec();
    for page in 0..pages {
        let base = page * plane;
        for col in 0..cols {
            let col_base = base + col * rows;
            for row in 0..rows {
                let diff = (col as isize) - (row as isize);
                if diff < offset {
                    out[col_base + row] = 0.0;
                }
            }
        }
    }
    Ok(out)
}

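/// Circularly shifts each axis by the corresponding (possibly negative)
/// amount, wrapping elements around, as in MATLAB `circshift`.
///
/// For example, shifting `[1, 2, 3]` (shape `[1, 3]`) by `[0, 1]` yields
/// `[3, 1, 2]`.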
fn circshift_data(data: &[f64], shape: &[usize], shifts: &[isize]) -> Result<Vec<f64>> {
    ensure!(
        shape.len() == shifts.len(),
        "circshift: shift vector length must match tensor rank"
    );
    let mut total = 1usize;
    for &dim in shape {
        total = total
            .checked_mul(dim)
            .ok_or_else(|| anyhow!("circshift: requested output exceeds maximum size"))?;
    }
    ensure!(
        total == data.len(),
        "circshift: shape/product mismatch ({} vs {})",
        total,
        data.len()
    );
    if data.is_empty() {
        return Ok(Vec::new());
    }

    let mut normalized = Vec::with_capacity(shape.len());
    for (len, &shift) in shape.iter().zip(shifts.iter()) {
        if *len <= 1 {
            normalized.push(0usize);
            continue;
        }
        let len_isize = *len as isize;
        let mut value = shift % len_isize;
        if value < 0 {
            value += len_isize;
        }
        normalized.push(value as usize);
    }
    if normalized.iter().all(|&s| s == 0) {
        return Ok(data.to_vec());
    }

    let strides = compute_strides(shape);
    let mut out = vec![0.0f64; data.len()];
    for (idx, out_value) in out.iter_mut().enumerate() {
        let coords = unravel_index(idx, shape);
        let mut src_idx = 0usize;
        for (axis, &coord) in coords.iter().enumerate() {
            let len = shape[axis];
            let stride = strides[axis];
            if len <= 1 || normalized[axis] == 0 {
                src_idx += coord * stride;
            } else {
                let shift = normalized[axis] % len;
                let src_coord = (coord + len - shift) % len;
                src_idx += src_coord * stride;
            }
        }
        *out_value = data[src_idx];
    }
    Ok(out)
}

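/// Converts a column-major linear index into per-axis coordinates;
/// `ravel_index` below is its inverse.
///
/// For example, `unravel_index(5, &[2, 3])` yields `[1, 2]`.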
fn unravel_index(mut index: usize, shape: &[usize]) -> Vec<usize> {
    let mut coords = Vec::with_capacity(shape.len());
    for &extent in shape {
        if extent == 0 {
            coords.push(0);
        } else {
            coords.push(index % extent);
            index /= extent;
        }
    }
    coords
}

fn ravel_index(coords: &[usize], shape: &[usize]) -> usize {
    let mut index = 0usize;
    let mut stride = 1usize;
    for (coord, extent) in coords.iter().zip(shape.iter()) {
        if *extent > 0 {
            index += coord * stride;
            stride *= extent;
        }
    }
    index
}

fn checked_total(shape: &[usize]) -> Result<usize> {
    shape.iter().try_fold(1usize, |acc, dim| {
        acc.checked_mul(*dim)
            .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))
    })
}

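/// Tiles column-major data along every axis by the given replication factors,
/// as in MATLAB `repmat`; a single factor applies to all dimensions (at least
/// two, so a scalar factor n tiles a scalar into an n-by-n matrix).
///
/// For example, tiling `[1.0, 2.0]` of shape `[1, 2]` with `reps = [2, 2]`
/// yields the 2x4 matrix `[1 2 1 2; 1 2 1 2]`.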
fn repmat_numeric(data: &[f64], shape: &[usize], reps: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
    ensure!(
        !reps.is_empty(),
        "repmat: replication factors must be specified"
    );
    let orig_rank = if shape.is_empty() { 1 } else { shape.len() };
    let rank = if reps.len() == 1 {
        orig_rank.max(2)
    } else {
        orig_rank.max(reps.len())
    };

    let mut base_shape = vec![1usize; rank];
    for (idx, &dim) in shape.iter().enumerate() {
        if idx < rank {
            base_shape[idx] = dim;
        }
    }

    let mut factors = vec![1usize; rank];
    if reps.len() == 1 {
        factors.fill(reps[0]);
    } else {
        for (idx, &factor) in reps.iter().enumerate() {
            if idx < rank {
                factors[idx] = factor;
            }
        }
    }

    let mut new_shape = Vec::with_capacity(rank);
    for i in 0..rank {
        let scaled = base_shape[i]
            .checked_mul(factors[i])
            .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))?;
        new_shape.push(scaled);
    }

    let orig_total = checked_total(&base_shape)?;
    ensure!(
        orig_total == data.len() || (orig_total == 0 && data.is_empty()),
        "repmat: internal shape mismatch (expected {} elements, found {})",
        orig_total,
        data.len()
    );

    let new_total = checked_total(&new_shape)?;
    if new_total == 0 {
        return Ok((Vec::new(), new_shape));
    }

    let strides = compute_strides(&base_shape);
    let mut out = Vec::with_capacity(new_total);
    for idx in 0..new_total {
        let mut rem = idx;
        let mut src_index = 0usize;
        for dim in 0..rank {
            let dim_size = new_shape[dim];
            let coord = rem % dim_size;
            rem /= dim_size;
            let base = base_shape[dim];
            let orig_coord = if base == 0 { 0 } else { coord % base };
            src_index += orig_coord * strides[dim];
        }
        out.push(data[src_index]);
    }
    Ok((out, new_shape))
}

fn coerce_sub2ind_value(value: f64, dim_number: usize, dim_size: usize) -> Result<usize> {
    if !value.is_finite() {
        return Err(anyhow!(
            "sub2ind: subscript in dimension {} must be finite",
            dim_number
        ));
    }
    let rounded = value.round();
    if (rounded - value).abs() > f64::EPSILON {
        return Err(anyhow!(
            "sub2ind: subscript in dimension {} must be an integer",
            dim_number
        ));
    }
    if rounded < 1.0 || rounded > dim_size as f64 {
        return Err(anyhow!(
            "sub2ind: subscript {} exceeds dimension {} (size {})",
            rounded as isize,
            dim_number,
            dim_size
        ));
    }
    Ok(rounded as usize)
}

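/// Builds an identity pattern for an arbitrary-rank shape: ones wherever the
/// first two coordinates agree, in every higher-dimensional page, zeros
/// elsewhere.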
fn identity_data(shape: &[usize]) -> Vec<f64> {
    let shape = normalize_shape(shape);
    let total: usize = shape.iter().copied().product();
    let mut data = vec![0.0; total];
    if shape.is_empty() {
        return data;
    }
    let rows = shape[0];
    let cols = shape[1];
    let diag_len = rows.min(cols);
    if diag_len == 0 {
        return data;
    }
    let strides = compute_strides(&shape);
    let extra_dims = &shape[2..];
    let extra_count = if extra_dims.is_empty() {
        1
    } else {
        extra_dims.iter().copied().product()
    };
    let mut coords = vec![0usize; shape.len()];
    for mut extra_idx in 0..extra_count {
        for (offset, size) in extra_dims.iter().copied().enumerate() {
            let dim = offset + 2;
            if size == 0 {
                coords[dim] = 0;
                continue;
            }
            coords[dim] = extra_idx % size;
            extra_idx /= size;
        }
        for diag in 0..diag_len {
            coords[0] = diag;
            coords[1] = diag;
            let mut linear = 0usize;
            for (dim, &coord) in coords.iter().enumerate() {
                linear += coord * strides[dim];
            }
            data[linear] = 1.0;
        }
    }
    data
}

fn offset_abs(offset: isize) -> usize {
    if offset >= 0 {
        offset as usize
    } else {
        let magnitude = -(offset as i128);
        magnitude as usize
    }
}

fn diag_matrix_size(len: usize, offset: isize) -> Result<(usize, usize)> {
    let shift = offset_abs(offset);
    let size = len
        .checked_add(shift)
        .ok_or_else(|| anyhow!("diag: result dimension exceeds limits"))?;
    let total = size
        .checked_mul(size)
        .ok_or_else(|| anyhow!("diag: result size exceeds limits"))?;
    Ok((size, total))
}

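/// Number of elements on the `offset` diagonal of a rows-by-cols matrix
/// (positive offsets are superdiagonals, negative are subdiagonals).
///
/// For example, `diagonal_length(3, 4, 1)` is 3: elements (0,1), (1,2), (2,3).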
fn diagonal_length(rows: usize, cols: usize, offset: isize) -> usize {
    if rows == 0 || cols == 0 {
        return 0;
    }
    if offset >= 0 {
        let shift = offset as usize;
        if shift >= cols {
            0
        } else {
            rows.min(cols - shift)
        }
    } else {
        let shift = offset_abs(offset);
        if shift >= rows {
            0
        } else {
            (rows - shift).min(cols)
        }
    }
}

fn diagonal_target_index(idx: usize, offset: isize) -> (usize, usize) {
    if offset >= 0 {
        (idx, idx + offset as usize)
    } else {
        (idx + offset_abs(offset), idx)
    }
}

fn diagonal_source_index(idx: usize, offset: isize) -> (usize, usize) {
    if offset >= 0 {
        (idx, idx + offset as usize)
    } else {
        (idx + offset_abs(offset), idx)
    }
}

fn ensure_diag_shape(label: &str, shape: &[usize]) -> Result<()> {
    if shape.len() > 2 && shape.iter().skip(2).any(|&d| d != 1) {
        Err(anyhow!("{label}: input must be 2-D"))
    } else {
        Ok(())
    }
}

fn rows_cols(shape: &[usize]) -> (usize, usize) {
    match shape.len() {
        0 => (1, 1),
        1 => (shape[0], 1),
        _ => (shape[0], shape[1]),
    }
}

fn is_vector_like(rows: usize, cols: usize, dims: usize) -> bool {
    rows == 1 || cols == 1 || dims <= 1
}

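// Host implementation of the provider trait: each operation clones its
// operand buffers out of the registry, computes on the CPU, and stores the
// result as a fresh registry entry, so no real device is ever touched.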
impl AccelProvider for InProcessProvider {
    fn device_id(&self) -> u32 {
        0
    }
    fn gather_linear(
        &self,
        source: &GpuTensorHandle,
        indices: &[u32],
        output_shape: &[usize],
    ) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
            guard
                .get(&source.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("gather_linear: unknown buffer {}", source.buffer_id))?
        };
        let mut out = Vec::with_capacity(indices.len());
        for (pos, &idx) in indices.iter().enumerate() {
            let lin = idx as usize;
            ensure!(
                lin < data.len(),
                "gather_linear: index {} (position {}) out of bounds for buffer {} (len={})",
                lin,
                pos,
                source.buffer_id,
                data.len()
            );
            out.push(data[lin]);
        }
        Ok(self.allocate_tensor(out, output_shape.to_vec()))
    }

    fn scatter_linear(
        &self,
        target: &GpuTensorHandle,
        indices: &[u32],
        values: &GpuTensorHandle,
    ) -> Result<()> {
        let values_data = {
            let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
            guard.get(&values.buffer_id).cloned().ok_or_else(|| {
                anyhow!("scatter_linear: unknown values buffer {}", values.buffer_id)
            })?
        };
        ensure!(
            values_data.len() == indices.len(),
            "scatter_linear: values length {} does not match indices length {}",
            values_data.len(),
            indices.len()
        );
        let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
        let target_buf = guard
            .get_mut(&target.buffer_id)
            .ok_or_else(|| anyhow!("scatter_linear: unknown target buffer {}", target.buffer_id))?;
        for (pos, &idx) in indices.iter().enumerate() {
            let lin = idx as usize;
            ensure!(
                lin < target_buf.len(),
                "scatter_linear: index {} (position {}) out of bounds for target len {}",
                lin,
                pos,
                target_buf.len()
            );
            target_buf[lin] = values_data[pos];
        }
        Ok(())
    }

    fn precision(&self) -> ProviderPrecision {
        ProviderPrecision::F64
    }

    fn upload(&self, host: &HostTensorView) -> Result<GpuTensorHandle> {
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
        guard.insert(id, host.data.to_vec());
        let handle = GpuTensorHandle {
            shape: host.shape.to_vec(),
            device_id: 0,
            buffer_id: id,
        };
        runmat_accelerate_api::set_handle_logical(&handle, false);
        Ok(handle)
    }

    fn download(&self, h: &GpuTensorHandle) -> Result<HostTensorOwned> {
        let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
        if let Some(buf) = guard.get(&h.buffer_id) {
            Ok(HostTensorOwned {
                data: buf.clone(),
                shape: h.shape.clone(),
            })
        } else {
            Err(anyhow!("buffer not found: {}", h.buffer_id))
        }
    }

    fn free(&self, h: &GpuTensorHandle) -> Result<()> {
        let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
        guard.remove(&h.buffer_id);
        runmat_accelerate_api::clear_handle_logical(h);
        Ok(())
    }

    fn device_info(&self) -> String {
        "in-process provider (host registry)".to_string()
    }

    fn device_info_struct(&self) -> runmat_accelerate_api::ApiDeviceInfo {
        runmat_accelerate_api::ApiDeviceInfo {
            device_id: 0,
            name: "InProcess".to_string(),
            vendor: "RunMat".to_string(),
            memory_bytes: None,
            backend: Some("inprocess".to_string()),
        }
    }

    fn telemetry_snapshot(&self) -> runmat_accelerate_api::ProviderTelemetry {
        runmat_accelerate_api::ProviderTelemetry::default()
    }

    fn reset_telemetry(&self) {}

    fn sort_rows(
        &self,
        handle: &GpuTensorHandle,
        columns: &[SortRowsColumnSpec],
        comparison: SortComparison,
    ) -> Result<SortResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("sortrows: unknown buffer {}", handle.buffer_id))?
        };
        let SortRowsHostOutputs {
            values,
            indices,
            indices_shape,
        } = sort_rows_host(&data, &handle.shape, columns, comparison)?;
        Ok(SortResult {
            values: HostTensorOwned {
                data: values,
                shape: handle.shape.clone(),
            },
            indices: HostTensorOwned {
                data: indices,
                shape: indices_shape,
            },
        })
    }

    fn polyder_single(&self, polynomial: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (coeffs, orientation) = self.load_polynomial(polynomial)?;
        let raw = poly_raw_derivative(&coeffs);
        let trimmed = poly_trim_slice(&raw);
        Ok(self.allocate_polynomial(trimmed, orientation))
    }

    fn polyder_product(&self, p: &GpuTensorHandle, q: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (p_coeffs, orientation) = self.load_polynomial(p)?;
        let (q_coeffs, _) = self.load_polynomial(q)?;
        let dp = poly_raw_derivative(&p_coeffs);
        let dq = poly_raw_derivative(&q_coeffs);
        let term1 = poly_convolve_real(&dp, &q_coeffs);
        let term2 = poly_convolve_real(&p_coeffs, &dq);
        let sum = poly_add_real(&term1, &term2);
        let trimmed = poly_trim_slice(&sum);
        Ok(self.allocate_polynomial(trimmed, orientation))
    }

    fn polyder_quotient(
        &self,
        u: &GpuTensorHandle,
        v: &GpuTensorHandle,
    ) -> Result<ProviderPolyderQuotient> {
        let (u_coeffs, orientation_u) = self.load_polynomial(u)?;
        let (v_coeffs, orientation_v) = self.load_polynomial(v)?;
        let du = poly_raw_derivative(&u_coeffs);
        let dv = poly_raw_derivative(&v_coeffs);
        let term1 = poly_convolve_real(&du, &v_coeffs);
        let term2 = poly_convolve_real(&u_coeffs, &dv);
        let numerator_vec = poly_trim_slice(&poly_sub_real(&term1, &term2));
        let denominator_vec = poly_trim_slice(&poly_convolve_real(&v_coeffs, &v_coeffs));
        let numerator = self.allocate_polynomial(numerator_vec, orientation_u);
        let denominator = self.allocate_polynomial(denominator_vec, orientation_v);
        Ok(ProviderPolyderQuotient {
            numerator,
            denominator,
        })
    }

    fn polyint(&self, polynomial: &GpuTensorHandle, constant: f64) -> Result<GpuTensorHandle> {
        let orientation = poly_orientation_from_shape(&polynomial.shape)?;
        let coeffs = {
            let guard = registry().lock().unwrap();
            guard
                .get(&polynomial.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("polyint: unknown tensor handle {}", polynomial.buffer_id))?
        };
        let integrated = poly_integral_real(&coeffs, constant);
        Ok(self.allocate_polynomial(integrated, orientation))
    }

    fn diag_from_vector(&self, vector: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
        ensure_diag_shape("diag", &vector.shape)?;
        let (rows, cols) = rows_cols(&vector.shape);
        ensure!(
            is_vector_like(rows, cols, vector.shape.len()),
            "diag: input must be a vector"
        );

        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&vector.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("diag: unknown buffer {}", vector.buffer_id))?
        };
        let len = data.len();
        let (size, total) = diag_matrix_size(len, offset)?;
        let mut out = vec![0.0; total];
        for (idx, &value) in data.iter().enumerate() {
            let (row, col) = diagonal_target_index(idx, offset);
            if row < size && col < size {
                out[row + col * size] = value;
            }
        }
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, out);
        Ok(GpuTensorHandle {
            shape: vec![size, size],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn diag_extract(&self, matrix: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
        ensure_diag_shape("diag", &matrix.shape)?;
        let (rows, cols) = rows_cols(&matrix.shape);
        ensure!(
            !is_vector_like(rows, cols, matrix.shape.len()),
            "diag: matrix input required"
        );
        let diag_len = diagonal_length(rows, cols, offset);
        if diag_len == 0 {
            return self.zeros(&[0, 1]);
        }
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("diag: unknown buffer {}", matrix.buffer_id))?
        };
        let mut out = Vec::with_capacity(diag_len);
        for idx in 0..diag_len {
            let (row, col) = diagonal_source_index(idx, offset);
            let linear = row + col * rows;
            out.push(*data.get(linear).unwrap_or(&0.0));
        }
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, out);
        Ok(GpuTensorHandle {
            shape: vec![diag_len, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn tril(&self, handle: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("tril: unknown tensor handle {}", handle.buffer_id))?
        };
        let masked = tril_data(&data, &handle.shape, offset)?;
        Ok(self.allocate_tensor(masked, handle.shape.clone()))
    }

    fn triu(&self, handle: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("triu: unknown tensor handle {}", handle.buffer_id))?
        };
        let masked = triu_data(&data, &handle.shape, offset)?;
        Ok(self.allocate_tensor(masked, handle.shape.clone()))
    }

    fn issymmetric(
        &self,
        matrix: &GpuTensorHandle,
        kind: ProviderSymmetryKind,
        tolerance: f64,
    ) -> Result<bool> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("issymmetric: unknown tensor handle {}", matrix.buffer_id))?
        };
        let skew = matches!(kind, ProviderSymmetryKind::Skew);
        issymmetric_host_real_data(&matrix.shape, &data, skew, tolerance).map_err(|e| anyhow!(e))
    }

    fn ishermitian(
        &self,
        matrix: &GpuTensorHandle,
        kind: ProviderHermitianKind,
        tolerance: f64,
    ) -> Result<bool> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("ishermitian: unknown tensor handle {}", matrix.buffer_id))?
        };
        let skew = matches!(kind, ProviderHermitianKind::Skew);
        ishermitian_host_real_data(&matrix.shape, &data, skew, tolerance).map_err(|e| anyhow!(e))
    }

    fn bandwidth(&self, matrix: &GpuTensorHandle) -> Result<ProviderBandwidth> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("bandwidth: unknown tensor handle {}", matrix.buffer_id))?
        };
        let (lower, upper) =
            bandwidth_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))?;
        Ok(ProviderBandwidth {
            lower: lower as u32,
            upper: upper as u32,
        })
    }

    fn sym_rcm(&self, matrix: &GpuTensorHandle) -> Result<Vec<usize>> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("symrcm: unknown tensor handle {}", matrix.buffer_id))?
        };
        symrcm_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))
    }

    fn read_scalar(&self, h: &GpuTensorHandle, linear_index: usize) -> Result<f64> {
        let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
        let buf = guard
            .get(&h.buffer_id)
            .ok_or_else(|| anyhow!("read_scalar: unknown buffer {}", h.buffer_id))?;
        if linear_index >= buf.len() {
            return Err(anyhow!(
                "read_scalar: index {} out of bounds (len {})",
                linear_index + 1,
                buf.len()
            ));
        }
        Ok(buf[linear_index])
    }

    fn zeros(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
        let len: usize = shape.iter().copied().product();
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard = registry().lock().unwrap();
        guard.insert(id, vec![0.0; len]);
        Ok(GpuTensorHandle {
            shape: shape.to_vec(),
            device_id: 0,
            buffer_id: id,
        })
    }

    fn zeros_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        self.zeros(&prototype.shape)
    }

    fn ones(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
        let len: usize = shape.iter().copied().product();
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard = registry().lock().unwrap();
        guard.insert(id, vec![1.0; len]);
        Ok(GpuTensorHandle {
            shape: shape.to_vec(),
            device_id: 0,
            buffer_id: id,
        })
    }

    fn ones_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        self.ones(&prototype.shape)
    }

    fn eye(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
        let shape = normalize_shape(shape);
        let data = identity_data(&shape);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard = registry().lock().unwrap();
        guard.insert(id, data);
        Ok(GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        })
    }

    fn eye_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        self.eye(&prototype.shape)
    }

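    /// Evenly spaced values from `start` to `stop` inclusive. The final
    /// element is written as exactly `stop` to cancel accumulated
    /// floating-point drift, and `count == 1` yields `[stop]`, matching
    /// MATLAB's `linspace`.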
1636    fn linspace(&self, start: f64, stop: f64, count: usize) -> Result<GpuTensorHandle> {
1637        let data = if count == 0 {
1638            Vec::new()
1639        } else if count == 1 {
1640            vec![stop]
1641        } else {
1642            let step = (stop - start) / ((count - 1) as f64);
1643            let mut seq = Vec::with_capacity(count);
1644            for idx in 0..count {
1645                seq.push(start + (idx as f64) * step);
1646            }
1647            if let Some(last) = seq.last_mut() {
1648                *last = stop;
1649            }
1650            seq
1651        };
1652
1653        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1654        registry().lock().unwrap().insert(id, data);
1655        Ok(GpuTensorHandle {
1656            shape: vec![1, count],
1657            device_id: 0,
1658            buffer_id: id,
1659        })
1660    }
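    // Example: linspace(0.0, 1.0, 5) produces [0.0, 0.25, 0.5, 0.75, 1.0].
    // The final element is overwritten with `stop` so rounding in
    // start + idx * step can never leave the endpoint off by an ulp, and
    // count == 1 returns [stop], matching MATLAB's linspace.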
1661
1662    fn random_uniform(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1663        let len: usize = shape.iter().copied().product();
1664        let mut data = vec![0.0; len];
1665        {
1666            let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1667            for slot in &mut data {
1668                *slot = next_uniform(&mut guard);
1669            }
1670        }
1671
1672        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1673        let mut buf_guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1674        buf_guard.insert(id, data);
1675        Ok(GpuTensorHandle {
1676            shape: shape.to_vec(),
1677            device_id: 0,
1678            buffer_id: id,
1679        })
1680    }
1681
1682    fn random_normal(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1683        let len: usize = shape.iter().copied().product();
1684        let mut data = Vec::with_capacity(len);
1685        if len > 0 {
1686            let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1687            while data.len() < len {
1688                let (z0, z1) = next_normal_pair(&mut guard);
1689                data.push(z0);
1690                if data.len() < len {
1691                    data.push(z1);
1692                }
1693            }
1694        }
1695
1696        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1697        registry()
1698            .lock()
1699            .unwrap_or_else(|e| e.into_inner())
1700            .insert(id, data);
1701        Ok(GpuTensorHandle {
1702            shape: shape.to_vec(),
1703            device_id: 0,
1704            buffer_id: id,
1705        })
1706    }
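    // `next_normal_pair` yields two variates per draw (presumably a
    // Box-Muller-style transform; the generator lives elsewhere in this file).
    // For odd lengths the loop above simply discards the second variate of the
    // final pair.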
1707
1708    fn set_rng_state(&self, state: u64) -> Result<()> {
1709        let mut guard = rng_state()
1710            .lock()
1711            .map_err(|_| anyhow::anyhow!("set_rng_state: RNG mutex poisoned"))?;
1712        *guard = if state == 0 {
1713            PROVIDER_DEFAULT_SEED
1714        } else {
1715            state
1716        };
1717        Ok(())
1718    }
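    // A zero state is remapped to PROVIDER_DEFAULT_SEED (the splitmix64
    // golden-gamma constant), presumably because the xorshift-family generator
    // behind `next_uniform` has the all-zero state as a fixed point and would
    // otherwise emit a constant stream.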
1719
1720    fn fspecial(&self, request: &FspecialRequest) -> Result<GpuTensorHandle> {
1721        let spec =
1722            runmat_runtime::builtins::image::filters::fspecial::spec_from_request(&request.filter)
1723                .map_err(|e: String| anyhow!(e))?;
1724        let tensor = spec.generate_tensor().map_err(|e| anyhow!(e))?;
1725        Ok(self.allocate_tensor(tensor.data, tensor.shape))
1726    }
1727
1728    fn imfilter(
1729        &self,
1730        image: &GpuTensorHandle,
1731        kernel: &GpuTensorHandle,
1732        options: &ImfilterOptions,
1733    ) -> Result<GpuTensorHandle> {
1734        let (image_vec, kernel_vec) = {
1735            let guard = registry().lock().unwrap();
1736            let image_buf = guard
1737                .get(&image.buffer_id)
1738                .cloned()
1739                .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", image.buffer_id))?;
1740            let kernel_buf = guard
1741                .get(&kernel.buffer_id)
1742                .cloned()
1743                .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", kernel.buffer_id))?;
1744            (image_buf, kernel_buf)
1745        };
1746        let image_tensor =
1747            Tensor::new(image_vec, image.shape.clone()).map_err(|e| anyhow!("imfilter: {e}"))?;
1748        let kernel_tensor =
1749            Tensor::new(kernel_vec, kernel.shape.clone()).map_err(|e| anyhow!("imfilter: {e}"))?;
1750        let result = runmat_runtime::builtins::image::filters::imfilter::apply_imfilter_tensor(
1751            &image_tensor,
1752            &kernel_tensor,
1753            options,
1754        )
1755        .map_err(|e| anyhow!(e))?;
1756        let Tensor { data, shape, .. } = result;
1757        Ok(self.allocate_tensor(data, shape))
1758    }
1759
1760    fn random_integer_range(
1761        &self,
1762        lower: i64,
1763        upper: i64,
1764        shape: &[usize],
1765    ) -> Result<GpuTensorHandle> {
1766        ensure!(lower <= upper, "lower bound must be <= upper bound");
1767        let span_i128 = (upper as i128)
1768            .checked_sub(lower as i128)
1769            .and_then(|delta| delta.checked_add(1))
1770            .ok_or_else(|| anyhow!("integer range overflow"))?;
1771        ensure!(span_i128 > 0, "integer range must be non-empty");
1772        ensure!(
1773            span_i128 <= (1i128 << 53),
1774            "integer range exceeds 2^53 and cannot be represented exactly"
1775        );
1776        let span = span_i128 as u64;
1777
1778        let len: usize = shape.iter().copied().product();
1779        let mut data = Vec::with_capacity(len);
1780        if span == 1 {
1781            data.resize(len, lower as f64);
1782        } else if len > 0 {
1783            let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1784            let span_f64 = span as f64;
1785            for _ in 0..len {
1786                let mut offset = (next_uniform(&mut guard) * span_f64).floor() as u64;
1787                if offset >= span {
1788                    offset = span - 1;
1789                }
1790                let value = (lower as i128 + offset as i128) as f64;
1791                data.push(value);
1792            }
1793        }
1794
1795        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1796        registry().lock().unwrap().insert(id, data);
1797        Ok(GpuTensorHandle {
1798            shape: shape.to_vec(),
1799            device_id: 0,
1800            buffer_id: id,
1801        })
1802    }
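    // Example: lower = -3, upper = 3 gives span = 7, and uniform offsets 0..=6
    // map back to the integers -3..=3. The 2^53 cap exists because f64 only
    // represents integers exactly up to 2^53; a wider span could silently skip
    // or duplicate values.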
1803
1804    fn random_permutation(&self, n: usize, k: usize) -> Result<GpuTensorHandle> {
1805        ensure!(k <= n, "randperm: K must satisfy 0 <= K <= N");
1806        // `ensure!` already guarantees k <= n, and (1..=0) is an empty range,
1807        // so neither clamping nor an n == 0 special case is needed.
1808        let mut values: Vec<f64> = (1..=n).map(|v| v as f64).collect();
1812
1813        if k > 0 {
1814            let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1815            for i in 0..k {
1816                let span = n - i;
1817                if span == 0 {
1818                    break;
1819                }
1820                let mut u = next_uniform(&mut guard);
1821                if u >= 1.0 {
1822                    u = 0.9999999999999999;
1823                }
1824                let mut offset = (u * span as f64).floor() as usize;
1825                if offset >= span {
1826                    offset = span - 1;
1827                }
1828                let j = i + offset;
1829                values.swap(i, j);
1830            }
1831        }
1832
1833        values.truncate(k); // no-op when values.len() <= k
1836
1837        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1838        registry().lock().unwrap().insert(id, values);
1839        Ok(GpuTensorHandle {
1840            shape: vec![1, k],
1841            device_id: 0,
1842            buffer_id: id,
1843        })
1844    }
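    // This is a partial Fisher-Yates shuffle: only the first k slots are
    // settled before truncation, so the result is k values drawn without
    // replacement from 1..=n. Clamping u below 1.0 guards against a generator
    // that could return exactly 1.0, which would index one past the remaining
    // span.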
1845
1846    fn random_permutation_like(
1847        &self,
1848        _prototype: &GpuTensorHandle,
1849        n: usize,
1850        k: usize,
1851    ) -> Result<GpuTensorHandle> {
1852        self.random_permutation(n, k)
1853    }
1854
1855    fn covariance(
1856        &self,
1857        matrix: &GpuTensorHandle,
1858        second: Option<&GpuTensorHandle>,
1859        weights: Option<&GpuTensorHandle>,
1860        options: &CovarianceOptions,
1861    ) -> Result<GpuTensorHandle> {
1862        let host_matrix = self.download(matrix)?;
1863        let left = Tensor::new(host_matrix.data.clone(), host_matrix.shape.clone())
1864            .map_err(|e| anyhow!("covariance: {e}"))?;
1865
1866        let right = if let Some(handle) = second {
1867            let host = self.download(handle)?;
1868            Some(
1869                Tensor::new(host.data.clone(), host.shape.clone())
1870                    .map_err(|e| anyhow!("covariance: {e}"))?,
1871            )
1872        } else {
1873            None
1874        };
1875
1876        let weight_spec = if let Some(handle) = weights {
1877            let host = self.download(handle)?;
1878            let tensor = Tensor::new(host.data.clone(), host.shape.clone())
1879                .map_err(|e| anyhow!("covariance: {e}"))?;
1880            let vec = tensor_to_weight_vector(&tensor).map_err(|e| anyhow!("covariance: {e}"))?;
1881            CovWeightSpec::Vector(vec)
1882        } else {
1883            CovWeightSpec::Scalar(options.normalization)
1884        };
1885
1886        let result = runtime_cov_from_tensors(left, right, options.rows, weight_spec)
1887            .map_err(|e| anyhow!("covariance: {e}"))?;
1888
1889        let view = HostTensorView {
1890            data: &result.data,
1891            shape: &result.shape,
1892        };
1893        self.upload(&view)
1894    }
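    // Precedence note: an explicit weight tensor becomes CovWeightSpec::Vector
    // and takes priority; options.normalization is only consulted in the
    // unweighted CovWeightSpec::Scalar case.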
1895
1896    fn corrcoef(
1897        &self,
1898        matrix: &GpuTensorHandle,
1899        options: &CorrcoefOptions,
1900    ) -> Result<GpuTensorHandle> {
1901        let host = self.download(matrix)?;
1902        let tensor = Tensor::new(host.data.clone(), host.shape.clone())
1903            .map_err(|e| anyhow!("corrcoef: {e}"))?;
1904        let result =
1905            runtime_corrcoef_from_tensors(tensor, None, options.normalization, options.rows)
1906                .map_err(|e| anyhow!("corrcoef: {e}"))?;
1907        let view = HostTensorView {
1908            data: &result.data,
1909            shape: &result.shape,
1910        };
1911        self.upload(&view)
1912    }
1913
1914    fn elem_add(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1915        let guard = registry().lock().unwrap();
1916        let abuf = guard
1917            .get(&a.buffer_id)
1918            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1919        let bbuf = guard
1920            .get(&b.buffer_id)
1921            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1922        if a.shape != b.shape {
1923            return Err(anyhow::anyhow!("shape mismatch"));
1924        }
1925        let mut out = vec![0.0; abuf.len()];
1926        for i in 0..abuf.len() {
1927            out[i] = abuf[i] + bbuf[i];
1928        }
1929        drop(guard);
1930        // Upload new buffer to registry
1931        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1932        let mut guard2 = registry().lock().unwrap();
1933        guard2.insert(id, out);
1934        Ok(GpuTensorHandle {
1935            shape: a.shape.clone(),
1936            device_id: 0,
1937            buffer_id: id,
1938        })
1939    }
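    // Note: the arithmetic ops here (elem_add/sub/mul/div/pow) require
    // identical shapes; implicit NumPy/MATLAB-style expansion is only
    // implemented for the comparison and logical ops further below via the
    // runtime broadcast helpers.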
1940
1941    fn elem_mul(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1942        let guard = registry().lock().unwrap();
1943        let abuf = guard
1944            .get(&a.buffer_id)
1945            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1946        let bbuf = guard
1947            .get(&b.buffer_id)
1948            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1949        if a.shape != b.shape {
1950            return Err(anyhow::anyhow!("shape mismatch"));
1951        }
1952        let mut out = vec![0.0; abuf.len()];
1953        for i in 0..abuf.len() {
1954            out[i] = abuf[i] * bbuf[i];
1955        }
1956        drop(guard);
1957        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1958        let mut guard2 = registry().lock().unwrap();
1959        guard2.insert(id, out);
1960        Ok(GpuTensorHandle {
1961            shape: a.shape.clone(),
1962            device_id: 0,
1963            buffer_id: id,
1964        })
1965    }
1966
1967    fn elem_sub(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1968        let guard = registry().lock().unwrap();
1969        let abuf = guard
1970            .get(&a.buffer_id)
1971            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1972        let bbuf = guard
1973            .get(&b.buffer_id)
1974            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1975        if a.shape != b.shape {
1976            return Err(anyhow::anyhow!("shape mismatch"));
1977        }
1978        let mut out = vec![0.0; abuf.len()];
1979        for i in 0..abuf.len() {
1980            out[i] = abuf[i] - bbuf[i];
1981        }
1982        drop(guard);
1983        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1984        let mut guard2 = registry().lock().unwrap();
1985        guard2.insert(id, out);
1986        Ok(GpuTensorHandle {
1987            shape: a.shape.clone(),
1988            device_id: 0,
1989            buffer_id: id,
1990        })
1991    }
1992
1993    fn elem_div(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1994        let guard = registry().lock().unwrap();
1995        let abuf = guard
1996            .get(&a.buffer_id)
1997            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1998        let bbuf = guard
1999            .get(&b.buffer_id)
2000            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2001        if a.shape != b.shape {
2002            return Err(anyhow::anyhow!("shape mismatch"));
2003        }
2004        let mut out = vec![0.0; abuf.len()];
2005        for i in 0..abuf.len() {
2006            // IEEE-754 division already yields MATLAB's semantics: x/0 is
2007            // +/-Inf with the proper sign and 0/0 is NaN, so no zero-divisor
2008            // special case (which would turn 0/0 into +Inf) is needed.
2009            out[i] = abuf[i] / bbuf[i];
2010        }
2012        drop(guard);
2013        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2014        let mut guard2 = registry().lock().unwrap();
2015        guard2.insert(id, out);
2016        Ok(GpuTensorHandle {
2017            shape: a.shape.clone(),
2018            device_id: 0,
2019            buffer_id: id,
2020        })
2021    }
2022
2023    fn elem_pow(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2024        let guard = registry().lock().unwrap();
2025        let abuf = guard
2026            .get(&a.buffer_id)
2027            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2028        let bbuf = guard
2029            .get(&b.buffer_id)
2030            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2031        if a.shape != b.shape {
2032            return Err(anyhow::anyhow!("shape mismatch"));
2033        }
2034        let mut out = vec![0.0; abuf.len()];
2035        for i in 0..abuf.len() {
2036            out[i] = abuf[i].powf(bbuf[i]);
2037        }
2038        drop(guard);
2039        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2040        let mut guard2 = registry().lock().unwrap();
2041        guard2.insert(id, out);
2042        Ok(GpuTensorHandle {
2043            shape: a.shape.clone(),
2044            device_id: 0,
2045            buffer_id: id,
2046        })
2047    }
2048
2049    fn elem_ne(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2050        let (adata, bdata) = {
2051            let guard = registry().lock().unwrap();
2052            let a_vec = guard
2053                .get(&a.buffer_id)
2054                .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", a.buffer_id))?
2055                .clone();
2056            let b_vec = guard
2057                .get(&b.buffer_id)
2058                .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", b.buffer_id))?
2059                .clone();
2060            (a_vec, b_vec)
2061        };
2062
2063        let shape = runtime_broadcast_shapes("ne", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2064        let total: usize = shape.iter().copied().product();
2065        if total == 0 {
2066            return Ok(self.allocate_tensor(Vec::new(), shape));
2067        }
2068
2069        let strides_a = runtime_compute_strides(&a.shape);
2070        let strides_b = runtime_compute_strides(&b.shape);
2071        let len_a = adata.len();
2072        let len_b = bdata.len();
2073
2074        let mut out = Vec::with_capacity(total);
2075        for idx in 0..total {
2076            let lhs = if len_a == 0 {
2077                0.0
2078            } else {
2079                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2080                *adata.get(offset).unwrap_or(&0.0)
2081            };
2082            let rhs = if len_b == 0 {
2083                0.0
2084            } else {
2085                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2086                *bdata.get(offset).unwrap_or(&0.0)
2087            };
2088            out.push(if lhs != rhs { 1.0 } else { 0.0 });
2089        }
2090
2091        Ok(self.allocate_tensor(out, shape))
2092    }
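    // Broadcast pattern shared by the comparison/logical ops: the output shape
    // comes from runtime_broadcast_shapes, and runtime_broadcast_index maps
    // each output linear index back into each operand, replicating length-1
    // axes (so [2, 1] vs [1, 3] would combine to [2, 3] under the usual
    // implicit-expansion rules). Operands with empty data are read as 0.0,
    // per the len == 0 branches above.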
2093
2094    fn elem_ge(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2095        let (adata, bdata) = {
2096            let guard = registry().lock().unwrap();
2097            let a_vec = guard
2098                .get(&a.buffer_id)
2099                .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", a.buffer_id))?
2100                .clone();
2101            let b_vec = guard
2102                .get(&b.buffer_id)
2103                .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", b.buffer_id))?
2104                .clone();
2105            (a_vec, b_vec)
2106        };
2107
2108        let shape = runtime_broadcast_shapes("ge", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2109        let total: usize = shape.iter().copied().product();
2110        if total == 0 {
2111            return Ok(self.allocate_tensor(Vec::new(), shape));
2112        }
2113
2114        let strides_a = runtime_compute_strides(&a.shape);
2115        let strides_b = runtime_compute_strides(&b.shape);
2116        let len_a = adata.len();
2117        let len_b = bdata.len();
2118
2119        let mut out = Vec::with_capacity(total);
2120        for idx in 0..total {
2121            let lhs = if len_a == 0 {
2122                0.0
2123            } else {
2124                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2125                *adata.get(offset).unwrap_or(&0.0)
2126            };
2127            let rhs = if len_b == 0 {
2128                0.0
2129            } else {
2130                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2131                *bdata.get(offset).unwrap_or(&0.0)
2132            };
2133            out.push(if lhs >= rhs { 1.0 } else { 0.0 });
2134        }
2135
2136        Ok(self.allocate_tensor(out, shape))
2137    }
2138
2139    fn elem_le(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2140        let (adata, bdata) = {
2141            let guard = registry().lock().unwrap();
2142            let a_vec = guard
2143                .get(&a.buffer_id)
2144                .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", a.buffer_id))?
2145                .clone();
2146            let b_vec = guard
2147                .get(&b.buffer_id)
2148                .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", b.buffer_id))?
2149                .clone();
2150            (a_vec, b_vec)
2151        };
2152
2153        let shape = runtime_broadcast_shapes("le", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2154        let total: usize = shape.iter().copied().product();
2155        if total == 0 {
2156            return Ok(self.allocate_tensor(Vec::new(), shape));
2157        }
2158
2159        let strides_a = runtime_compute_strides(&a.shape);
2160        let strides_b = runtime_compute_strides(&b.shape);
2161        let len_a = adata.len();
2162        let len_b = bdata.len();
2163
2164        let mut out = Vec::with_capacity(total);
2165        for idx in 0..total {
2166            let lhs = if len_a == 0 {
2167                0.0
2168            } else {
2169                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2170                *adata.get(offset).unwrap_or(&0.0)
2171            };
2172            let rhs = if len_b == 0 {
2173                0.0
2174            } else {
2175                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2176                *bdata.get(offset).unwrap_or(&0.0)
2177            };
2178            out.push(if lhs <= rhs { 1.0 } else { 0.0 });
2179        }
2180
2181        Ok(self.allocate_tensor(out, shape))
2182    }
2183
2184    fn elem_lt(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2185        let (adata, bdata) = {
2186            let guard = registry().lock().unwrap();
2187            let a_vec = guard
2188                .get(&a.buffer_id)
2189                .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", a.buffer_id))?
2190                .clone();
2191            let b_vec = guard
2192                .get(&b.buffer_id)
2193                .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", b.buffer_id))?
2194                .clone();
2195            (a_vec, b_vec)
2196        };
2197
2198        let shape = runtime_broadcast_shapes("lt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2199        let total: usize = shape.iter().copied().product();
2200        if total == 0 {
2201            return Ok(self.allocate_tensor(Vec::new(), shape));
2202        }
2203
2204        let strides_a = runtime_compute_strides(&a.shape);
2205        let strides_b = runtime_compute_strides(&b.shape);
2206        let len_a = adata.len();
2207        let len_b = bdata.len();
2208
2209        let mut out = Vec::with_capacity(total);
2210        for idx in 0..total {
2211            let lhs = if len_a == 0 {
2212                0.0
2213            } else {
2214                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2215                *adata.get(offset).unwrap_or(&0.0)
2216            };
2217            let rhs = if len_b == 0 {
2218                0.0
2219            } else {
2220                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2221                *bdata.get(offset).unwrap_or(&0.0)
2222            };
2223            out.push(if lhs < rhs { 1.0 } else { 0.0 });
2224        }
2225
2226        Ok(self.allocate_tensor(out, shape))
2227    }
2228
2229    fn elem_gt(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2230        let (adata, bdata) = {
2231            let guard = registry().lock().unwrap();
2232            let a_vec = guard
2233                .get(&a.buffer_id)
2234                .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", a.buffer_id))?
2235                .clone();
2236            let b_vec = guard
2237                .get(&b.buffer_id)
2238                .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", b.buffer_id))?
2239                .clone();
2240            (a_vec, b_vec)
2241        };
2242
2243        let shape = runtime_broadcast_shapes("gt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2244        let total: usize = shape.iter().copied().product();
2245        if total == 0 {
2246            return Ok(self.allocate_tensor(Vec::new(), shape));
2247        }
2248
2249        let strides_a = runtime_compute_strides(&a.shape);
2250        let strides_b = runtime_compute_strides(&b.shape);
2251        let len_a = adata.len();
2252        let len_b = bdata.len();
2253
2254        let mut out = Vec::with_capacity(total);
2255        for idx in 0..total {
2256            let lhs = if len_a == 0 {
2257                0.0
2258            } else {
2259                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2260                *adata.get(offset).unwrap_or(&0.0)
2261            };
2262            let rhs = if len_b == 0 {
2263                0.0
2264            } else {
2265                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2266                *bdata.get(offset).unwrap_or(&0.0)
2267            };
2268            out.push(if lhs > rhs { 1.0 } else { 0.0 });
2269        }
2270
2271        Ok(self.allocate_tensor(out, shape))
2272    }
2273
2274    fn elem_eq(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2275        let (adata, bdata) = {
2276            let guard = registry().lock().unwrap();
2277            let a_vec = guard
2278                .get(&a.buffer_id)
2279                .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", a.buffer_id))?
2280                .clone();
2281            let b_vec = guard
2282                .get(&b.buffer_id)
2283                .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", b.buffer_id))?
2284                .clone();
2285            (a_vec, b_vec)
2286        };
2287
2288        let shape = runtime_broadcast_shapes("eq", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2289        let total: usize = shape.iter().copied().product();
2290        if total == 0 {
2291            return Ok(self.allocate_tensor(Vec::new(), shape));
2292        }
2293
2294        let strides_a = runtime_compute_strides(&a.shape);
2295        let strides_b = runtime_compute_strides(&b.shape);
2296        let len_a = adata.len();
2297        let len_b = bdata.len();
2298
2299        let mut out = Vec::with_capacity(total);
2300        for idx in 0..total {
2301            let lhs = if len_a == 0 {
2302                0.0
2303            } else {
2304                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2305                *adata.get(offset).unwrap_or(&0.0)
2306            };
2307            let rhs = if len_b == 0 {
2308                0.0
2309            } else {
2310                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2311                *bdata.get(offset).unwrap_or(&0.0)
2312            };
2313            out.push(if lhs == rhs { 1.0 } else { 0.0 });
2314        }
2315
2316        Ok(self.allocate_tensor(out, shape))
2317    }
2318
2319    fn logical_and(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2320        let (adata, bdata) = {
2321            let guard = registry().lock().unwrap();
2322            let a_vec = guard
2323                .get(&a.buffer_id)
2324                .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", a.buffer_id))?
2325                .clone();
2326            let b_vec = guard
2327                .get(&b.buffer_id)
2328                .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", b.buffer_id))?
2329                .clone();
2330            (a_vec, b_vec)
2331        };
2332
2333        let shape =
2334            runtime_broadcast_shapes("logical_and", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2335        let total: usize = shape.iter().copied().product();
2336        if total == 0 {
2337            return Ok(self.allocate_tensor(Vec::new(), shape));
2338        }
2339
2340        let strides_a = runtime_compute_strides(&a.shape);
2341        let strides_b = runtime_compute_strides(&b.shape);
2342        let len_a = adata.len();
2343        let len_b = bdata.len();
2344
2345        let mut out = Vec::with_capacity(total);
2346        for idx in 0..total {
2347            let lhs = if len_a == 0 {
2348                0.0
2349            } else {
2350                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2351                *adata.get(offset).unwrap_or(&0.0)
2352            };
2353            let rhs = if len_b == 0 {
2354                0.0
2355            } else {
2356                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2357                *bdata.get(offset).unwrap_or(&0.0)
2358            };
2359            out.push(if lhs != 0.0 && rhs != 0.0 { 1.0 } else { 0.0 });
2360        }
2361
2362        Ok(self.allocate_tensor(out, shape))
2363    }
2364
2365    fn logical_or(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2366        let (adata, bdata) = {
2367            let guard = registry().lock().unwrap();
2368            let a_vec = guard
2369                .get(&a.buffer_id)
2370                .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", a.buffer_id))?
2371                .clone();
2372            let b_vec = guard
2373                .get(&b.buffer_id)
2374                .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", b.buffer_id))?
2375                .clone();
2376            (a_vec, b_vec)
2377        };
2378
2379        let shape =
2380            runtime_broadcast_shapes("logical_or", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2381        let total: usize = shape.iter().copied().product();
2382        if total == 0 {
2383            return Ok(self.allocate_tensor(Vec::new(), shape));
2384        }
2385
2386        let strides_a = runtime_compute_strides(&a.shape);
2387        let strides_b = runtime_compute_strides(&b.shape);
2388        let len_a = adata.len();
2389        let len_b = bdata.len();
2390
2391        let mut out = Vec::with_capacity(total);
2392        for idx in 0..total {
2393            let lhs = if len_a == 0 {
2394                0.0
2395            } else {
2396                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2397                *adata.get(offset).unwrap_or(&0.0)
2398            };
2399            let rhs = if len_b == 0 {
2400                0.0
2401            } else {
2402                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2403                *bdata.get(offset).unwrap_or(&0.0)
2404            };
2405            out.push(if lhs != 0.0 || rhs != 0.0 { 1.0 } else { 0.0 });
2406        }
2407
2408        Ok(self.allocate_tensor(out, shape))
2409    }
2410
2411    fn logical_xor(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2412        let (adata, bdata) = {
2413            let guard = registry().lock().unwrap();
2414            let a_vec = guard
2415                .get(&a.buffer_id)
2416                .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", a.buffer_id))?
2417                .clone();
2418            let b_vec = guard
2419                .get(&b.buffer_id)
2420                .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", b.buffer_id))?
2421                .clone();
2422            (a_vec, b_vec)
2423        };
2424
2425        let shape =
2426            runtime_broadcast_shapes("logical_xor", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2427        let total: usize = shape.iter().copied().product();
2428        if total == 0 {
2429            return Ok(self.allocate_tensor(Vec::new(), shape));
2430        }
2431
2432        let strides_a = runtime_compute_strides(&a.shape);
2433        let strides_b = runtime_compute_strides(&b.shape);
2434        let len_a = adata.len();
2435        let len_b = bdata.len();
2436
2437        let mut out = Vec::with_capacity(total);
2438        for idx in 0..total {
2439            let lhs = if len_a == 0 {
2440                0.0
2441            } else {
2442                let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2443                *adata.get(offset).unwrap_or(&0.0)
2444            };
2445            let rhs = if len_b == 0 {
2446                0.0
2447            } else {
2448                let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2449                *bdata.get(offset).unwrap_or(&0.0)
2450            };
2451            let lhs_true = lhs != 0.0;
2452            let rhs_true = rhs != 0.0;
2453            out.push(if lhs_true ^ rhs_true { 1.0 } else { 0.0 });
2454        }
2455
2456        Ok(self.allocate_tensor(out, shape))
2457    }
2458
2459    fn logical_not(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2460        let data = {
2461            let guard = registry().lock().unwrap();
2462            guard
2463                .get(&a.buffer_id)
2464                .ok_or_else(|| anyhow!("logical_not: unknown buffer {}", a.buffer_id))?
2465                .clone()
2466        };
2467
2468        let shape = a.shape.clone();
2469        if data.is_empty() {
2470            return Ok(self.allocate_tensor(Vec::new(), shape));
2471        }
2472
2473        let mut out = Vec::with_capacity(data.len());
2474        for value in data {
2475            out.push(if value == 0.0 { 1.0 } else { 0.0 });
2476        }
2477
2478        Ok(self.allocate_tensor(out, shape))
2479    }
2480
2481    fn logical_isreal(&self, a: &GpuTensorHandle) -> Result<bool> {
2482        {
2483            let guard = registry().lock().unwrap();
2484            guard
2485                .get(&a.buffer_id)
2486                .ok_or_else(|| anyhow!("logical_isreal: unknown buffer {}", a.buffer_id))?;
2487        }
2488        Ok(true)
2489    }
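    // Buffers in this provider are plain Vec<f64>, so every stored value is
    // real by construction; the lookup above only validates that the handle
    // refers to a live buffer before answering true.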
2490
2491    fn logical_isfinite(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2492        let data = {
2493            let guard = registry().lock().unwrap();
2494            guard
2495                .get(&a.buffer_id)
2496                .ok_or_else(|| anyhow!("logical_isfinite: unknown buffer {}", a.buffer_id))?
2497                .clone()
2498        };
2499
2500        let shape = a.shape.clone();
2501        if data.is_empty() {
2502            return Ok(self.allocate_tensor(Vec::new(), shape));
2503        }
2504
2505        let mut out = Vec::with_capacity(data.len());
2506        for value in data {
2507            out.push(if value.is_finite() { 1.0 } else { 0.0 });
2508        }
2509
2510        Ok(self.allocate_tensor(out, shape))
2511    }
2512
2513    fn logical_isnan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2514        let data = {
2515            let guard = registry().lock().unwrap();
2516            guard
2517                .get(&a.buffer_id)
2518                .ok_or_else(|| anyhow!("logical_isnan: unknown buffer {}", a.buffer_id))?
2519                .clone()
2520        };
2521
2522        let shape = a.shape.clone();
2523        if data.is_empty() {
2524            return Ok(self.allocate_tensor(Vec::new(), shape));
2525        }
2526
2527        let mut out = Vec::with_capacity(data.len());
2528        for value in data {
2529            out.push(if value.is_nan() { 1.0 } else { 0.0 });
2530        }
2531
2532        Ok(self.allocate_tensor(out, shape))
2533    }
2534
2535    fn logical_isinf(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2536        let data = {
2537            let guard = registry().lock().unwrap();
2538            guard
2539                .get(&a.buffer_id)
2540                .ok_or_else(|| anyhow!("logical_isinf: unknown buffer {}", a.buffer_id))?
2541                .clone()
2542        };
2543
2544        let shape = a.shape.clone();
2545        if data.is_empty() {
2546            return Ok(self.allocate_tensor(Vec::new(), shape));
2547        }
2548
2549        let mut out = Vec::with_capacity(data.len());
2550        for value in data {
2551            out.push(if value.is_infinite() { 1.0 } else { 0.0 });
2552        }
2553
2554        Ok(self.allocate_tensor(out, shape))
2555    }
2556
2557    fn elem_hypot(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2558        let guard = registry().lock().unwrap();
2559        let abuf = guard
2560            .get(&a.buffer_id)
2561            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2562        let bbuf = guard
2563            .get(&b.buffer_id)
2564            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2565        if a.shape != b.shape {
2566            return Err(anyhow::anyhow!("shape mismatch"));
2567        }
2568        let mut out = vec![0.0; abuf.len()];
2569        for i in 0..abuf.len() {
2570            out[i] = abuf[i].hypot(bbuf[i]);
2571        }
2572        drop(guard);
2573        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2574        let mut guard2 = registry().lock().unwrap();
2575        guard2.insert(id, out);
2576        Ok(GpuTensorHandle {
2577            shape: a.shape.clone(),
2578            device_id: 0,
2579            buffer_id: id,
2580        })
2581    }
2582    fn elem_atan2(&self, y: &GpuTensorHandle, x: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2583        let guard = registry().lock().unwrap();
2584        let ybuf = guard
2585            .get(&y.buffer_id)
2586            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", y.buffer_id))?;
2587        let xbuf = guard
2588            .get(&x.buffer_id)
2589            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", x.buffer_id))?;
2590        if y.shape != x.shape {
2591            return Err(anyhow::anyhow!("shape mismatch"));
2592        }
2593        let mut out = vec![0.0; ybuf.len()];
2594        for idx in 0..ybuf.len() {
2595            out[idx] = ybuf[idx].atan2(xbuf[idx]);
2596        }
2597        drop(guard);
2598        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2599        let mut guard2 = registry().lock().unwrap();
2600        guard2.insert(id, out);
2601        Ok(GpuTensorHandle {
2602            shape: y.shape.clone(),
2603            device_id: 0,
2604            buffer_id: id,
2605        })
2606    }
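    // Argument order is (y, x), matching MATLAB's atan2 and Rust's f64::atan2;
    // e.g. atan2(1.0, 1.0) = pi/4 and atan2(-1.0, -1.0) = -3*pi/4.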
2607
2608    fn unary_sin(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2609        let guard = registry().lock().unwrap();
2610        let abuf = guard
2611            .get(&a.buffer_id)
2612            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2613        let out: Vec<f64> = abuf.iter().map(|&x| x.sin()).collect();
2614        drop(guard);
2615        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2616        let mut guard2 = registry().lock().unwrap();
2617        guard2.insert(id, out);
2618        Ok(GpuTensorHandle {
2619            shape: a.shape.clone(),
2620            device_id: 0,
2621            buffer_id: id,
2622        })
2623    }
2624    fn unary_gamma(&self, _a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2625        Err(anyhow::anyhow!("unary_gamma not supported by provider"))
2626    }
2627    fn unary_factorial(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2628        let guard = registry().lock().unwrap();
2629        let abuf = guard
2630            .get(&a.buffer_id)
2631            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2632        let out: Vec<f64> = abuf.iter().copied().map(factorial_scalar_host).collect();
2633        drop(guard);
2634        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2635        let mut guard2 = registry().lock().unwrap();
2636        guard2.insert(id, out);
2637        Ok(GpuTensorHandle {
2638            shape: a.shape.clone(),
2639            device_id: 0,
2640            buffer_id: id,
2641        })
2642    }
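    // unary_factorial applies factorial_scalar_host element-wise. The
    // FACTORIAL_MAX_HOST = 170 constant at the top of this file is presumably
    // its cutoff: 170! (~7.26e306) is the largest factorial finite in f64,
    // while 171! overflows to infinity.
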
2643    fn unary_asinh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2644        let guard = registry().lock().unwrap();
2645        let abuf = guard
2646            .get(&a.buffer_id)
2647            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2648        let out: Vec<f64> = abuf.iter().map(|&x| x.asinh()).collect();
2649        drop(guard);
2650        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2651        let mut guard2 = registry().lock().unwrap();
2652        guard2.insert(id, out);
2653        Ok(GpuTensorHandle {
2654            shape: a.shape.clone(),
2655            device_id: 0,
2656            buffer_id: id,
2657        })
2658    }
2659    fn unary_sinh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2660        let guard = registry().lock().unwrap();
2661        let abuf = guard
2662            .get(&a.buffer_id)
2663            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2664        let out: Vec<f64> = abuf.iter().map(|&x| x.sinh()).collect();
2665        drop(guard);
2666        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2667        let mut guard2 = registry().lock().unwrap();
2668        guard2.insert(id, out);
2669        Ok(GpuTensorHandle {
2670            shape: a.shape.clone(),
2671            device_id: 0,
2672            buffer_id: id,
2673        })
2674    }
2675    fn unary_cosh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2676        let guard = registry().lock().unwrap();
2677        let abuf = guard
2678            .get(&a.buffer_id)
2679            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2680        let out: Vec<f64> = abuf.iter().map(|&x| x.cosh()).collect();
2681        drop(guard);
2682        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2683        let mut guard2 = registry().lock().unwrap();
2684        guard2.insert(id, out);
2685        Ok(GpuTensorHandle {
2686            shape: a.shape.clone(),
2687            device_id: 0,
2688            buffer_id: id,
2689        })
2690    }
2691
2692    fn unary_asin(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2693        let guard = registry().lock().unwrap();
2694        let abuf = guard
2695            .get(&a.buffer_id)
2696            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2697        let out: Vec<f64> = abuf.iter().map(|&x| x.asin()).collect();
2698        drop(guard);
2699        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2700        let mut guard2 = registry().lock().unwrap();
2701        guard2.insert(id, out);
2702        Ok(GpuTensorHandle {
2703            shape: a.shape.clone(),
2704            device_id: 0,
2705            buffer_id: id,
2706        })
2707    }
2708    fn unary_acos(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2709        let guard = registry().lock().unwrap();
2710        let abuf = guard
2711            .get(&a.buffer_id)
2712            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2713        let out: Vec<f64> = abuf.iter().map(|&x| x.acos()).collect();
2714        drop(guard);
2715        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2716        let mut guard2 = registry().lock().unwrap();
2717        guard2.insert(id, out);
2718        Ok(GpuTensorHandle {
2719            shape: a.shape.clone(),
2720            device_id: 0,
2721            buffer_id: id,
2722        })
2723    }
2724    fn unary_acosh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2725        let guard = registry().lock().unwrap();
2726        let abuf = guard
2727            .get(&a.buffer_id)
2728            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2729        let out: Vec<f64> = abuf.iter().map(|&x| x.acosh()).collect();
2730        drop(guard);
2731        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2732        let mut guard2 = registry().lock().unwrap();
2733        guard2.insert(id, out);
2734        Ok(GpuTensorHandle {
2735            shape: a.shape.clone(),
2736            device_id: 0,
2737            buffer_id: id,
2738        })
2739    }
2740
2741    fn unary_tan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2742        let guard = registry().lock().unwrap();
2743        let abuf = guard
2744            .get(&a.buffer_id)
2745            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2746        let out: Vec<f64> = abuf.iter().map(|&x| x.tan()).collect();
2747        drop(guard);
2748        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2749        let mut guard2 = registry().lock().unwrap();
2750        guard2.insert(id, out);
2751        Ok(GpuTensorHandle {
2752            shape: a.shape.clone(),
2753            device_id: 0,
2754            buffer_id: id,
2755        })
2756    }
2757    fn unary_tanh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2758        let guard = registry().lock().unwrap();
2759        let abuf = guard
2760            .get(&a.buffer_id)
2761            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2762        let out: Vec<f64> = abuf.iter().map(|&x| x.tanh()).collect();
2763        drop(guard);
2764        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2765        let mut guard2 = registry().lock().unwrap();
2766        guard2.insert(id, out);
2767        Ok(GpuTensorHandle {
2768            shape: a.shape.clone(),
2769            device_id: 0,
2770            buffer_id: id,
2771        })
2772    }
2773
2774    fn unary_atan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2775        let guard = registry().lock().unwrap();
2776        let abuf = guard
2777            .get(&a.buffer_id)
2778            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2779        let out: Vec<f64> = abuf.iter().map(|&x| x.atan()).collect();
2780        drop(guard);
2781        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2782        let mut guard2 = registry().lock().unwrap();
2783        guard2.insert(id, out);
2784        Ok(GpuTensorHandle {
2785            shape: a.shape.clone(),
2786            device_id: 0,
2787            buffer_id: id,
2788        })
2789    }
2790    fn unary_atanh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2791        let guard = registry().lock().unwrap();
2792        let abuf = guard
2793            .get(&a.buffer_id)
2794            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2795        let out: Vec<f64> = abuf.iter().map(|&x| x.atanh()).collect();
2796        drop(guard);
2797        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2798        let mut guard2 = registry().lock().unwrap();
2799        guard2.insert(id, out);
2800        Ok(GpuTensorHandle {
2801            shape: a.shape.clone(),
2802            device_id: 0,
2803            buffer_id: id,
2804        })
2805    }
2806
2807    fn unary_ceil(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2808        let guard = registry().lock().unwrap();
2809        let abuf = guard
2810            .get(&a.buffer_id)
2811            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2812        let out: Vec<f64> = abuf.iter().map(|&x| x.ceil()).collect();
2813        drop(guard);
2814        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2815        let mut guard2 = registry().lock().unwrap();
2816        guard2.insert(id, out);
2817        Ok(GpuTensorHandle {
2818            shape: a.shape.clone(),
2819            device_id: 0,
2820            buffer_id: id,
2821        })
2822    }
2823
2824    fn unary_floor(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2825        let guard = registry().lock().unwrap();
2826        let abuf = guard
2827            .get(&a.buffer_id)
2828            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2829        let out: Vec<f64> = abuf.iter().map(|&x| x.floor()).collect();
2830        drop(guard);
2831        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2832        let mut guard2 = registry().lock().unwrap();
2833        guard2.insert(id, out);
2834        Ok(GpuTensorHandle {
2835            shape: a.shape.clone(),
2836            device_id: 0,
2837            buffer_id: id,
2838        })
2839    }
2840
2841    fn unary_round(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2842        let guard = registry().lock().unwrap();
2843        let abuf = guard
2844            .get(&a.buffer_id)
2845            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2846        let out: Vec<f64> = abuf.iter().map(|&x| x.round()).collect();
2847        drop(guard);
2848        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2849        let mut guard2 = registry().lock().unwrap();
2850        guard2.insert(id, out);
2851        Ok(GpuTensorHandle {
2852            shape: a.shape.clone(),
2853            device_id: 0,
2854            buffer_id: id,
2855        })
2856    }
2857
2858    fn unary_fix(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2859        let guard = registry().lock().unwrap();
2860        let abuf = guard
2861            .get(&a.buffer_id)
2862            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2863        let out: Vec<f64> = abuf
2864            .iter()
2865            .map(|&x| {
2866                if !x.is_finite() {
2867                    x
2868                } else {
2869                    let truncated = x.trunc();
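                    // Canonicalize a -0.0 result (e.g. from (-0.3).trunc()) to +0.0.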
2870                    if truncated == 0.0 {
2871                        0.0
2872                    } else {
2873                        truncated
2874                    }
2875                }
2876            })
2877            .collect();
2878        drop(guard);
2879        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2880        let mut guard2 = registry().lock().unwrap();
2881        guard2.insert(id, out);
2882        Ok(GpuTensorHandle {
2883            shape: a.shape.clone(),
2884            device_id: 0,
2885            buffer_id: id,
2886        })
2887    }
2888
2889    fn unary_cos(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2890        let guard = registry().lock().unwrap();
2891        let abuf = guard
2892            .get(&a.buffer_id)
2893            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2894        let out: Vec<f64> = abuf.iter().map(|&x| x.cos()).collect();
2895        drop(guard);
2896        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2897        let mut guard2 = registry().lock().unwrap();
2898        guard2.insert(id, out);
2899        Ok(GpuTensorHandle {
2900            shape: a.shape.clone(),
2901            device_id: 0,
2902            buffer_id: id,
2903        })
2904    }
2905
2906    fn unary_abs(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2907        let guard = registry().lock().unwrap();
2908        let abuf = guard
2909            .get(&a.buffer_id)
2910            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2911        let out: Vec<f64> = abuf.iter().map(|&x| x.abs()).collect();
2912        drop(guard);
2913        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2914        let mut guard2 = registry().lock().unwrap();
2915        guard2.insert(id, out);
2916        Ok(GpuTensorHandle {
2917            shape: a.shape.clone(),
2918            device_id: 0,
2919            buffer_id: id,
2920        })
2921    }
2922
2923    fn unary_exp(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2924        let guard = registry().lock().unwrap();
2925        let abuf = guard
2926            .get(&a.buffer_id)
2927            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2928        let out: Vec<f64> = abuf.iter().map(|&x| x.exp()).collect();
2929        drop(guard);
2930        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2931        let mut guard2 = registry().lock().unwrap();
2932        guard2.insert(id, out);
2933        Ok(GpuTensorHandle {
2934            shape: a.shape.clone(),
2935            device_id: 0,
2936            buffer_id: id,
2937        })
2938    }
2939
2940    fn unary_log(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2941        let guard = registry().lock().unwrap();
2942        let abuf = guard
2943            .get(&a.buffer_id)
2944            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2945        let out: Vec<f64> = abuf.iter().map(|&x| x.ln()).collect();
2946        drop(guard);
2947        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2948        let mut guard2 = registry().lock().unwrap();
2949        guard2.insert(id, out);
2950        Ok(GpuTensorHandle {
2951            shape: a.shape.clone(),
2952            device_id: 0,
2953            buffer_id: id,
2954        })
2955    }
2956
2957    fn unary_sqrt(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2958        let guard = registry().lock().unwrap();
2959        let abuf = guard
2960            .get(&a.buffer_id)
2961            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2962        let out: Vec<f64> = abuf.iter().map(|&x| x.sqrt()).collect();
2963        drop(guard);
2964        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2965        let mut guard2 = registry().lock().unwrap();
2966        guard2.insert(id, out);
2967        Ok(GpuTensorHandle {
2968            shape: a.shape.clone(),
2969            device_id: 0,
2970            buffer_id: id,
2971        })
2972    }
2973
2974    fn unary_double(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2975        let guard = registry().lock().unwrap();
2976        let abuf = guard
2977            .get(&a.buffer_id)
2978            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
2979            .clone();
2980        drop(guard);
2981        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2982        let mut guard2 = registry().lock().unwrap();
2983        guard2.insert(id, abuf);
2984        Ok(GpuTensorHandle {
2985            shape: a.shape.clone(),
2986            device_id: 0,
2987            buffer_id: id,
2988        })
2989    }
2990
2991    fn unary_single(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2992        let guard = registry().lock().unwrap();
2993        let abuf = guard
2994            .get(&a.buffer_id)
2995            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2996        let out: Vec<f64> = abuf.iter().map(|&x| (x as f32) as f64).collect();
2997        drop(guard);
2998        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2999        let mut guard2 = registry().lock().unwrap();
3000        guard2.insert(id, out);
3001        Ok(GpuTensorHandle {
3002            shape: a.shape.clone(),
3003            device_id: 0,
3004            buffer_id: id,
3005        })
3006    }
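    // The f64 -> f32 -> f64 round trip quantizes each value to single
    // precision while the buffer stays f64, e.g. 0.1 becomes
    // 0.10000000149011612.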
3007
3008    fn unary_pow2(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3009        let guard = registry().lock().unwrap();
3010        let abuf = guard
3011            .get(&a.buffer_id)
3012            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3013        let out: Vec<f64> = abuf.iter().map(|&x| x.exp2()).collect();
3014        drop(guard);
3015        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3016        let mut guard2 = registry().lock().unwrap();
3017        guard2.insert(id, out);
3018        Ok(GpuTensorHandle {
3019            shape: a.shape.clone(),
3020            device_id: 0,
3021            buffer_id: id,
3022        })
3023    }
3024
3025    fn pow2_scale(
3026        &self,
3027        mantissa: &GpuTensorHandle,
3028        exponent: &GpuTensorHandle,
3029    ) -> Result<GpuTensorHandle> {
3030        let guard = registry().lock().unwrap();
3031        let mbuf = guard
3032            .get(&mantissa.buffer_id)
3033            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", mantissa.buffer_id))?;
3034        let ebuf = guard
3035            .get(&exponent.buffer_id)
3036            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", exponent.buffer_id))?;
3037        if mantissa.shape != exponent.shape {
3038            return Err(anyhow::anyhow!("shape mismatch"));
3039        }
3040        let mut out = vec![0.0f64; mbuf.len()];
3041        for i in 0..mbuf.len() {
3042            out[i] = mbuf[i] * ebuf[i].exp2();
3043        }
3044        drop(guard);
3045        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3046        let mut guard2 = registry().lock().unwrap();
3047        guard2.insert(id, out);
3048        Ok(GpuTensorHandle {
3049            shape: mantissa.shape.clone(),
3050            device_id: 0,
3051            buffer_id: id,
3052        })
3053    }
3054
3055    fn scalar_add(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3056        let guard = registry().lock().unwrap();
3057        let abuf = guard
3058            .get(&a.buffer_id)
3059            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3060        let out: Vec<f64> = abuf.iter().map(|&x| x + scalar).collect();
3061        drop(guard);
3062        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3063        let mut guard2 = registry().lock().unwrap();
3064        guard2.insert(id, out);
3065        Ok(GpuTensorHandle {
3066            shape: a.shape.clone(),
3067            device_id: 0,
3068            buffer_id: id,
3069        })
3070    }
3071
3072    fn scalar_sub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3073        let guard = registry().lock().unwrap();
3074        let abuf = guard
3075            .get(&a.buffer_id)
3076            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3077        let out: Vec<f64> = abuf.iter().map(|&x| x - scalar).collect();
3078        drop(guard);
3079        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3080        let mut guard2 = registry().lock().unwrap();
3081        guard2.insert(id, out);
3082        Ok(GpuTensorHandle {
3083            shape: a.shape.clone(),
3084            device_id: 0,
3085            buffer_id: id,
3086        })
3087    }
3088
3089    fn scalar_mul(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3090        let guard = registry().lock().unwrap();
3091        let abuf = guard
3092            .get(&a.buffer_id)
3093            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3094        let out: Vec<f64> = abuf.iter().map(|&x| x * scalar).collect();
3095        drop(guard);
3096        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3097        let mut guard2 = registry().lock().unwrap();
3098        guard2.insert(id, out);
3099        Ok(GpuTensorHandle {
3100            shape: a.shape.clone(),
3101            device_id: 0,
3102            buffer_id: id,
3103        })
3104    }
3105
3106    fn scalar_div(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3107        let guard = registry().lock().unwrap();
3108        let abuf = guard
3109            .get(&a.buffer_id)
3110            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        // IEEE-754 division already matches MATLAB for a zero divisor:
        // x/0 -> +/-Inf for nonzero x, and 0/0 -> NaN.
        let out: Vec<f64> = abuf.iter().map(|&x| x / scalar).collect();
3116        drop(guard);
3117        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3118        let mut guard2 = registry().lock().unwrap();
3119        guard2.insert(id, out);
3120        Ok(GpuTensorHandle {
3121            shape: a.shape.clone(),
3122            device_id: 0,
3123            buffer_id: id,
3124        })
3125    }
3126
3127    fn scalar_rsub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3128        // compute scalar - a
3129        let guard = registry().lock().unwrap();
3130        let abuf = guard
3131            .get(&a.buffer_id)
3132            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3133        let out: Vec<f64> = abuf.iter().map(|&x| scalar - x).collect();
3134        drop(guard);
3135        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3136        let mut guard2 = registry().lock().unwrap();
3137        guard2.insert(id, out);
3138        Ok(GpuTensorHandle {
3139            shape: a.shape.clone(),
3140            device_id: 0,
3141            buffer_id: id,
3142        })
3143    }
3144
3145    fn scalar_rdiv(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3146        // compute scalar ./ a
3147        let guard = registry().lock().unwrap();
3148        let abuf = guard
3149            .get(&a.buffer_id)
3150            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        // IEEE-754 handles zero denominators directly:
        // scalar/0 -> +/-Inf, and 0/0 -> NaN.
        let out: Vec<f64> = abuf.iter().map(|&x| scalar / x).collect();
3161        drop(guard);
3162        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3163        let mut guard2 = registry().lock().unwrap();
3164        guard2.insert(id, out);
3165        Ok(GpuTensorHandle {
3166            shape: a.shape.clone(),
3167            device_id: 0,
3168            buffer_id: id,
3169        })
3170    }
3171
3172    fn transpose(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3173        if a.shape.len() != 2 {
3174            return Err(anyhow::anyhow!("transpose: only 2D supported"));
3175        }
3176        let rows = a.shape[0];
3177        let cols = a.shape[1];
3178        let guard = registry().lock().unwrap();
3179        let abuf = guard
3180            .get(&a.buffer_id)
3181            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3182        let mut out = vec![0.0; abuf.len()];
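        // Column-major mapping: source element (i, j) lives at i + j*rows and
        // lands at j + i*cols in the transposed buffer.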
3183        for i in 0..rows {
3184            for j in 0..cols {
3185                let src = i + j * rows;
3186                let dst = j + i * cols;
3187                out[dst] = abuf[src];
3188            }
3189        }
3190        drop(guard);
3191        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3192        let mut guard2 = registry().lock().unwrap();
3193        guard2.insert(id, out);
3194        Ok(GpuTensorHandle {
3195            shape: vec![cols, rows],
3196            device_id: 0,
3197            buffer_id: id,
3198        })
3199    }
3200    fn conv1d(
3201        &self,
3202        signal: &GpuTensorHandle,
3203        kernel: &GpuTensorHandle,
3204        options: ProviderConv1dOptions,
3205    ) -> Result<GpuTensorHandle> {
3206        let signal_len: usize = signal.shape.iter().copied().product();
3207        let kernel_len: usize = kernel.shape.iter().copied().product();
3208
3209        if signal_len == 0 || kernel_len == 0 {
3210            let shape = conv1d_output_shape(0, options.orientation);
3211            return Ok(self.allocate_tensor(Vec::new(), shape));
3212        }
3213
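        // The 'valid' output has length signal_len - kernel_len + 1, so a
        // kernel longer than the signal produces an empty result.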
3214        if matches!(options.mode, ProviderConvMode::Valid) && signal_len < kernel_len {
3215            let shape = conv1d_output_shape(0, options.orientation);
3216            return Ok(self.allocate_tensor(Vec::new(), shape));
3217        }
3218
3219        let (signal_data, kernel_data) = {
3220            let guard = registry().lock().unwrap();
3221            let signal_buf = guard
3222                .get(&signal.buffer_id)
3223                .cloned()
3224                .ok_or_else(|| anyhow!("conv1d: unknown signal buffer {}", signal.buffer_id))?;
3225            let kernel_buf = guard
3226                .get(&kernel.buffer_id)
3227                .cloned()
3228                .ok_or_else(|| anyhow!("conv1d: unknown kernel buffer {}", kernel.buffer_id))?;
3229            (signal_buf, kernel_buf)
3230        };
3231
3232        ensure!(
3233            signal_data.len() == signal_len,
3234            "conv1d: signal length mismatch (shape implies {}, buffer has {})",
3235            signal_len,
3236            signal_data.len()
3237        );
3238        ensure!(
3239            kernel_data.len() == kernel_len,
3240            "conv1d: kernel length mismatch (shape implies {}, buffer has {})",
3241            kernel_len,
3242            kernel_data.len()
3243        );
3244
3245        let full = convolve_real(&signal_data, &kernel_data)?;
3246        let shaped = apply_conv_mode_real(&full, options.mode, signal_len, kernel_len);
3247        let shape = conv1d_output_shape(shaped.len(), options.orientation);
3248        Ok(self.allocate_tensor(shaped, shape))
3249    }
3250    fn conv2d(
3251        &self,
3252        signal: &GpuTensorHandle,
3253        kernel: &GpuTensorHandle,
3254        mode: ProviderConvMode,
3255    ) -> Result<GpuTensorHandle> {
3256        ensure_diag_shape("conv2d", &signal.shape)?;
3257        ensure_diag_shape("conv2d", &kernel.shape)?;
3258        let (signal_rows, signal_cols) = rows_cols(&signal.shape);
3259        let (kernel_rows, kernel_cols) = rows_cols(&kernel.shape);
3260
3261        let signal_len: usize = signal.shape.iter().copied().product();
3262        let kernel_len: usize = kernel.shape.iter().copied().product();
3263
3264        let empty_shape = match mode {
3265            ProviderConvMode::Full | ProviderConvMode::Valid => vec![0, 0],
3266            ProviderConvMode::Same => vec![signal_rows, signal_cols],
3267        };
3268
3269        if signal_len == 0
3270            || kernel_len == 0
3271            || signal_rows == 0
3272            || signal_cols == 0
3273            || kernel_rows == 0
3274            || kernel_cols == 0
3275        {
3276            let len = empty_shape.iter().copied().product::<usize>();
3277            return Ok(self.allocate_tensor(vec![0.0; len], empty_shape));
3278        }
3279
3280        let (signal_data, kernel_data) = {
3281            let guard = registry().lock().unwrap();
3282            let signal_buf = guard
3283                .get(&signal.buffer_id)
3284                .cloned()
3285                .ok_or_else(|| anyhow!("conv2d: unknown signal buffer {}", signal.buffer_id))?;
3286            let kernel_buf = guard
3287                .get(&kernel.buffer_id)
3288                .cloned()
3289                .ok_or_else(|| anyhow!("conv2d: unknown kernel buffer {}", kernel.buffer_id))?;
3290            (signal_buf, kernel_buf)
3291        };
3292
3293        ensure!(
3294            signal_data.len() == signal_len,
3295            "conv2d: signal length mismatch (shape implies {}, buffer has {})",
3296            signal_len,
3297            signal_data.len()
3298        );
3299        ensure!(
3300            kernel_data.len() == kernel_len,
3301            "conv2d: kernel length mismatch (shape implies {}, buffer has {})",
3302            kernel_len,
3303            kernel_data.len()
3304        );
3305
3306        let full_rows = signal_rows
3307            .checked_add(kernel_rows)
3308            .and_then(|v| v.checked_sub(1))
3309            .ok_or_else(|| anyhow!("conv2d: row size overflow"))?;
3310        let full_cols = signal_cols
3311            .checked_add(kernel_cols)
3312            .and_then(|v| v.checked_sub(1))
3313            .ok_or_else(|| anyhow!("conv2d: column size overflow"))?;
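        // Overflow guard only; the product itself is not used here.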
3314        full_rows
3315            .checked_mul(full_cols)
3316            .ok_or_else(|| anyhow!("conv2d: output size overflow"))?;
3317
3318        let full = conv2d_full_real(
3319            &signal_data,
3320            signal_rows,
3321            signal_cols,
3322            &kernel_data,
3323            kernel_rows,
3324            kernel_cols,
3325            full_rows,
3326            full_cols,
3327        );
3328
3329        let (shaped, out_rows, out_cols) = apply_conv2_mode_real_2d(
3330            &full,
3331            full_rows,
3332            full_cols,
3333            mode,
3334            signal_rows,
3335            signal_cols,
3336            kernel_rows,
3337            kernel_cols,
3338        );
3339        Ok(self.allocate_tensor(shaped, vec![out_rows, out_cols]))
3340    }
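    // IIR filtering along `dim` in Direct Form II Transposed form, the same
    // recurrence MATLAB documents for `filter`:
    //   y[n] = b[0]*x[n] + z[0]
    //   z[i] = b[i+1]*x[n] + z[i+1] - a[i+1]*y[n]
    // Coefficients are pre-normalized by a(1), and each channel (every
    // leading/trailing index pair orthogonal to `dim`) keeps its own state
    // slice. For example, b = [1], a = [1, -0.5] yields y[n] = x[n] + 0.5*y[n-1].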
3341    fn iir_filter(
3342        &self,
3343        b: &GpuTensorHandle,
3344        a: &GpuTensorHandle,
3345        x: &GpuTensorHandle,
3346        options: ProviderIirFilterOptions,
3347    ) -> Result<ProviderIirFilterResult> {
3348        let ProviderIirFilterOptions { dim, zi } = options;
3349
3350        let nb = product(&b.shape);
3351        let na = product(&a.shape);
3352        ensure!(
3353            nb > 0,
3354            "iir_filter: numerator coefficients must not be empty"
3355        );
3356        ensure!(
3357            na > 0,
3358            "iir_filter: denominator coefficients must not be empty"
3359        );
3360
3361        let signal_elems = product(&x.shape);
3362        let zi_shape = zi.as_ref().map(|handle| handle.shape.clone());
3363
3364        let (b_data, a_data, x_data, zi_data) =
3365            {
3366                let guard = registry().lock().unwrap();
3367                let b_buf = guard.get(&b.buffer_id).cloned().ok_or_else(|| {
3368                    anyhow!("iir_filter: unknown numerator buffer {}", b.buffer_id)
3369                })?;
3370                let a_buf = guard.get(&a.buffer_id).cloned().ok_or_else(|| {
3371                    anyhow!("iir_filter: unknown denominator buffer {}", a.buffer_id)
3372                })?;
3373                let x_buf = guard
3374                    .get(&x.buffer_id)
3375                    .cloned()
3376                    .ok_or_else(|| anyhow!("iir_filter: unknown signal buffer {}", x.buffer_id))?;
3377                ensure!(
3378                    b_buf.len() == nb,
3379                    "iir_filter: numerator length mismatch (shape implies {}, buffer has {})",
3380                    nb,
3381                    b_buf.len()
3382                );
3383                ensure!(
3384                    a_buf.len() == na,
3385                    "iir_filter: denominator length mismatch (shape implies {}, buffer has {})",
3386                    na,
3387                    a_buf.len()
3388                );
3389                ensure!(
3390                    x_buf.len() == signal_elems,
3391                    "iir_filter: signal length mismatch (shape implies {}, buffer has {})",
3392                    signal_elems,
3393                    x_buf.len()
3394                );
3395                let zi_buf = if let Some(ref zi_handle) = zi {
3396                    Some(guard.get(&zi_handle.buffer_id).cloned().ok_or_else(|| {
3397                        anyhow!(
3398                            "iir_filter: unknown initial state buffer {}",
3399                            zi_handle.buffer_id
3400                        )
3401                    })?)
3402                } else {
3403                    None
3404                };
3405                (b_buf, a_buf, x_buf, zi_buf)
3406            };
3407
3408        ensure!(
3409            !a_data.is_empty() && a_data[0] != 0.0,
3410            "iir_filter: denominator coefficient a(1) must be non-zero"
3411        );
3412
3413        let mut shape_ext = x.shape.clone();
3414        if dim >= shape_ext.len() {
3415            shape_ext.extend(std::iter::repeat_n(1, dim + 1 - shape_ext.len()));
3416        }
3417        let dim_idx = dim;
3418        let dim_len = shape_ext.get(dim_idx).copied().unwrap_or(1);
3419
3420        let leading = if dim_idx == 0 {
3421            1
3422        } else {
3423            shape_ext[..dim_idx].iter().copied().product()
3424        };
3425        let trailing = if dim_idx + 1 >= shape_ext.len() {
3426            1
3427        } else {
3428            shape_ext[dim_idx + 1..].iter().copied().product()
3429        };
3430        let channel_count = leading * trailing;
3431
3432        let order = nb.max(na);
3433        let state_len = order.saturating_sub(1);
3434        let state_shape = filter_state_shape(shape_ext.clone(), dim_idx, state_len);
3435        let expected_states = state_len.saturating_mul(channel_count);
3436
3437        if let Some(ref shape) = zi_shape {
3438            ensure!(
3439                shapes_compatible(&state_shape, shape),
3440                "iir_filter: initial conditions are not compatible with the signal shape"
3441            );
3442            let zi_dim = if dim_idx < shape.len() {
3443                shape[dim_idx]
3444            } else {
3445                1
3446            };
3447            ensure!(
3448                zi_dim == state_len,
3449                "iir_filter: initial conditions must have {} states along dimension {}",
3450                state_len,
3451                dim + 1
3452            );
3453        }
3454
3455        let mut states = if state_len == 0 {
3456            Vec::new()
3457        } else if let Some(ref zi_buf) = zi_data {
3458            ensure!(
3459                zi_buf.len() == expected_states,
3460                "iir_filter: initial state vector length mismatch (expected {}, found {})",
3461                expected_states,
3462                zi_buf.len()
3463            );
3464            states_from_column_major(zi_buf, state_len, dim_idx, &shape_ext)
3465        } else {
3466            vec![0.0; expected_states]
3467        };
3468
3469        let mut b_norm = vec![0.0f64; order];
3470        let mut a_norm = vec![0.0f64; order];
3471        let a0 = a_data[0];
3472        for i in 0..order {
3473            let b_coeff = if i < nb { b_data[i] } else { 0.0 };
3474            b_norm[i] = b_coeff / a0;
3475            if i == 0 {
3476                a_norm[0] = 1.0;
3477            } else {
3478                let a_coeff = if i < na { a_data[i] } else { 0.0 };
3479                a_norm[i] = a_coeff / a0;
3480            }
3481        }
3482
3483        let mut output = vec![0.0f64; x_data.len()];
3484
3485        if state_len == 0 {
3486            let gain = b_norm[0];
3487            for (dst, &src) in output.iter_mut().zip(x_data.iter()) {
3488                *dst = gain * src;
3489            }
3490        } else if dim_len == 0 || channel_count == 0 {
3491            // No samples to process; states remain unchanged.
3492        } else {
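            // Column-major layout: samples along `dim` are strided by
            // `leading`, and each (leading, trailing) pair is an independent
            // filter channel with its own state slice.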
3493            for t in 0..trailing {
3494                let base = t
3495                    .checked_mul(dim_len)
3496                    .and_then(|v| v.checked_mul(leading))
3497                    .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3498                for l in 0..leading {
3499                    let channel_idx = t
3500                        .checked_mul(leading)
3501                        .and_then(|v| v.checked_add(l))
3502                        .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3503                    if channel_idx >= channel_count {
3504                        continue;
3505                    }
3506                    let state_base = channel_idx
3507                        .checked_mul(state_len)
3508                        .ok_or_else(|| anyhow!("iir_filter: state index overflow"))?;
3509                    for step in 0..dim_len {
3510                        let idx = base
3511                            .checked_add(l)
3512                            .and_then(|v| v.checked_add(step.saturating_mul(leading)))
3513                            .ok_or_else(|| anyhow!("iir_filter: signal index overflow"))?;
3514                        if idx >= x_data.len() {
3515                            break;
3516                        }
3517                        let x_n = x_data[idx];
3518                        let y = b_norm[0] * x_n + states.get(state_base).copied().unwrap_or(0.0);
3519                        output[idx] = y;
3520                        for i in 1..order {
3521                            let next_state = if i < state_len {
3522                                states[state_base + i]
3523                            } else {
3524                                0.0
3525                            };
3526                            let new_state = b_norm[i] * x_n + next_state - a_norm[i] * y;
3527                            states[state_base + i - 1] = new_state;
3528                        }
3529                    }
3530                }
3531            }
3532        }
3533
3534        let final_state_data = if state_len == 0 {
3535            Vec::new()
3536        } else {
3537            states_to_column_major(&states, state_len, dim_idx, &shape_ext)
3538        };
3539
3540        let output_handle = self.allocate_tensor(output, x.shape.clone());
3541        let final_state_handle = self.allocate_tensor(final_state_data, state_shape);
3542
3543        Ok(ProviderIirFilterResult {
3544            output: output_handle,
3545            final_state: Some(final_state_handle),
3546        })
3547    }
3548    fn permute(&self, handle: &GpuTensorHandle, order: &[usize]) -> Result<GpuTensorHandle> {
3549        let data = {
3550            let guard = registry().lock().unwrap();
3551            guard
3552                .get(&handle.buffer_id)
3553                .ok_or_else(|| anyhow!("permute: unknown tensor handle {}", handle.buffer_id))?
3554                .clone()
3555        };
3556        let (permuted, new_shape) = permute_data(&data, &handle.shape, order)?;
3557        Ok(self.allocate_tensor(permuted, new_shape))
3558    }
3559
3560    fn flip(&self, handle: &GpuTensorHandle, axes: &[usize]) -> Result<GpuTensorHandle> {
3561        let data = {
3562            let guard = registry().lock().unwrap();
3563            guard
3564                .get(&handle.buffer_id)
3565                .ok_or_else(|| anyhow!("flip: unknown tensor handle {}", handle.buffer_id))?
3566                .clone()
3567        };
3568        let flipped = flip_data(&data, &handle.shape, axes)?;
3569        Ok(self.allocate_tensor(flipped, handle.shape.clone()))
3570    }
3571
3572    fn circshift(&self, handle: &GpuTensorHandle, shifts: &[isize]) -> Result<GpuTensorHandle> {
3573        let data = {
3574            let guard = registry().lock().unwrap();
3575            guard
3576                .get(&handle.buffer_id)
3577                .ok_or_else(|| anyhow!("circshift: unknown tensor handle {}", handle.buffer_id))?
3578                .clone()
3579        };
3580        let mut shape = handle.shape.clone();
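        // Extra shift entries address trailing singleton dimensions, where a
        // circular shift is a no-op, so padding the shape keeps indexing valid.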
3581        if shifts.len() > shape.len() {
3582            shape.extend(std::iter::repeat_n(1, shifts.len() - shape.len()));
3583        }
3584        let mut full_shifts = vec![0isize; shape.len()];
3585        for (idx, &shift) in shifts.iter().enumerate() {
3586            if idx < full_shifts.len() {
3587                full_shifts[idx] = shift;
3588            }
3589        }
3590        let rotated = circshift_data(&data, &shape, &full_shifts)?;
3591        Ok(self.allocate_tensor(rotated, shape))
3592    }
3593
3594    fn diff_dim(
3595        &self,
3596        handle: &GpuTensorHandle,
3597        order: usize,
3598        dim: usize,
3599    ) -> Result<GpuTensorHandle> {
3600        if order == 0 {
3601            return Ok(handle.clone());
3602        }
3603        let data = {
3604            let guard = registry().lock().unwrap();
3605            guard
3606                .get(&handle.buffer_id)
3607                .ok_or_else(|| anyhow!("diff_dim: unknown tensor handle {}", handle.buffer_id))?
3608                .clone()
3609        };
3610        let tensor =
3611            Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("diff_dim: {e}"))?;
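        // The runtime helper takes a 1-based dimension, hence `dim + 1`.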
3612        let diffed =
3613            diff_tensor_host(tensor, order, Some(dim + 1)).map_err(|e| anyhow!("diff_dim: {e}"))?;
3614        let Tensor { data, shape, .. } = diffed;
3615        Ok(self.allocate_tensor(data, shape))
3616    }
3617
3618    fn unique(&self, handle: &GpuTensorHandle, options: &UniqueOptions) -> Result<UniqueResult> {
3619        let data = {
3620            let guard = registry().lock().unwrap();
3621            guard
3622                .get(&handle.buffer_id)
3623                .ok_or_else(|| anyhow!("unique: unknown tensor handle {}", handle.buffer_id))?
3624                .clone()
3625        };
3626        let tensor = Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("unique: {e}"))?;
3627        unique::unique_numeric_from_tensor(tensor, options)
3628            .and_then(|eval| eval.into_numeric_unique_result())
3629            .map_err(|e| anyhow!("{e}"))
3630    }
3631
3632    fn setdiff(
3633        &self,
3634        a: &GpuTensorHandle,
3635        b: &GpuTensorHandle,
3636        options: &SetdiffOptions,
3637    ) -> Result<SetdiffResult> {
        let (data_a, data_b) = {
            let guard = registry().lock().unwrap();
            let data_a = guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", a.buffer_id))?;
            let data_b = guard
                .get(&b.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", b.buffer_id))?;
            (data_a, data_b)
        };
3652        let tensor_a = Tensor::new(data_a, a.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
3653        let tensor_b = Tensor::new(data_b, b.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
3654        runmat_runtime::builtins::array::sorting_sets::setdiff::setdiff_numeric_from_tensors(
3655            tensor_a, tensor_b, options,
3656        )
3657        .and_then(|eval| eval.into_numeric_setdiff_result())
3658        .map_err(|e| anyhow!("setdiff: {e}"))
3659    }
3660
3661    fn repmat(&self, handle: &GpuTensorHandle, reps: &[usize]) -> Result<GpuTensorHandle> {
3662        let data = {
3663            let guard = registry().lock().unwrap();
3664            guard
3665                .get(&handle.buffer_id)
3666                .ok_or_else(|| anyhow!("repmat: unknown tensor handle {}", handle.buffer_id))?
3667                .clone()
3668        };
3669        let (tiled, shape) = repmat_numeric(&data, &handle.shape, reps)?;
3670        Ok(self.allocate_tensor(tiled, shape))
3671    }
3672
3673    fn dot(
3674        &self,
3675        lhs: &GpuTensorHandle,
3676        rhs: &GpuTensorHandle,
3677        dim: Option<usize>,
3678    ) -> Result<GpuTensorHandle> {
3679        let (lhs_buf, rhs_buf) = {
3680            let guard = registry().lock().unwrap();
3681            let lhs_buf = guard
3682                .get(&lhs.buffer_id)
3683                .cloned()
3684                .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", lhs.buffer_id))?;
3685            let rhs_buf = guard
3686                .get(&rhs.buffer_id)
3687                .cloned()
3688                .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", rhs.buffer_id))?;
3689            (lhs_buf, rhs_buf)
3690        };
3691        let lhs_tensor =
3692            Tensor::new(lhs_buf, lhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
3693        let rhs_tensor =
3694            Tensor::new(rhs_buf, rhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
3695        let result =
3696            dot_host_real_for_provider(&lhs_tensor, &rhs_tensor, dim).map_err(|e| anyhow!(e))?;
3697        Ok(self.allocate_tensor(result.data.clone(), result.shape.clone()))
3698    }
3699
3700    fn reduce_sum(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3701        let guard = registry().lock().unwrap();
3702        let abuf = guard
3703            .get(&a.buffer_id)
3704            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3705        let s: f64 = abuf.iter().sum();
3706        drop(guard);
3707        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3708        let mut guard2 = registry().lock().unwrap();
3709        guard2.insert(id, vec![s]);
3710        Ok(GpuTensorHandle {
3711            shape: vec![1, 1],
3712            device_id: 0,
3713            buffer_id: id,
3714        })
3715    }
3716
3717    fn reduce_sum_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
3718        if a.shape.len() != 2 {
3719            return Err(anyhow::anyhow!("reduce_sum_dim: only 2D supported"));
3720        }
3721        let rows = a.shape[0];
3722        let cols = a.shape[1];
3723        let guard = registry().lock().unwrap();
3724        let abuf = guard
3725            .get(&a.buffer_id)
3726            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3727        let out = if dim <= 1 {
3728            // sum over rows -> 1 x cols
3729            let mut v = vec![0.0f64; cols];
3730            for c in 0..cols {
3731                let mut s = 0.0;
3732                for r in 0..rows {
3733                    s += abuf[r + c * rows];
3734                }
3735                v[c] = s;
3736            }
3737            v
3738        } else {
3739            // sum over cols -> rows x 1
3740            let mut v = vec![0.0f64; rows];
3741            for r in 0..rows {
3742                let mut s = 0.0;
3743                for c in 0..cols {
3744                    s += abuf[r + c * rows];
3745                }
3746                v[r] = s;
3747            }
3748            v
3749        };
3750        drop(guard);
3751        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3752        let mut guard2 = registry().lock().unwrap();
3753        guard2.insert(id, out);
3754        let shape = if dim <= 1 {
3755            vec![1, cols]
3756        } else {
3757            vec![rows, 1]
3758        };
3759        Ok(GpuTensorHandle {
3760            shape,
3761            device_id: 0,
3762            buffer_id: id,
3763        })
3764    }
3765
3766    fn reduce_prod(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3767        let guard = registry().lock().unwrap();
3768        let abuf = guard
3769            .get(&a.buffer_id)
3770            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3771        let p: f64 = abuf.iter().product();
3772        drop(guard);
3773        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3774        let mut guard2 = registry().lock().unwrap();
3775        guard2.insert(id, vec![p]);
3776        Ok(GpuTensorHandle {
3777            shape: vec![1, 1],
3778            device_id: 0,
3779            buffer_id: id,
3780        })
3781    }
3782
3783    fn reduce_prod_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
3784        if a.shape.len() != 2 {
3785            return Err(anyhow::anyhow!("reduce_prod_dim: only 2D supported"));
3786        }
3787        let rows = a.shape[0];
3788        let cols = a.shape[1];
3789        let guard = registry().lock().unwrap();
3790        let abuf = guard
3791            .get(&a.buffer_id)
3792            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3793        let out = if dim <= 1 {
3794            let mut v = vec![1.0f64; cols];
3795            for c in 0..cols {
3796                let mut prod = 1.0;
3797                for r in 0..rows {
3798                    prod *= abuf[r + c * rows];
3799                }
3800                v[c] = prod;
3801            }
3802            v
3803        } else {
3804            let mut v = vec![1.0f64; rows];
3805            for r in 0..rows {
3806                let mut prod = 1.0;
3807                for c in 0..cols {
3808                    prod *= abuf[r + c * rows];
3809                }
3810                v[r] = prod;
3811            }
3812            v
3813        };
3814        drop(guard);
3815        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3816        let mut guard2 = registry().lock().unwrap();
3817        guard2.insert(id, out);
3818        let shape = if dim <= 1 {
3819            vec![1, cols]
3820        } else {
3821            vec![rows, 1]
3822        };
3823        Ok(GpuTensorHandle {
3824            shape,
3825            device_id: 0,
3826            buffer_id: id,
3827        })
3828    }
3829
3830    fn reduce_mean(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3831        let guard = registry().lock().unwrap();
3832        let abuf = guard
3833            .get(&a.buffer_id)
3834            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
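        // Note: MATLAB's mean([]) is NaN; returning 0.0 here assumes the
        // runtime never dispatches empty tensors to this provider path.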
3835        let mean = if abuf.is_empty() {
3836            0.0
3837        } else {
3838            abuf.iter().sum::<f64>() / (abuf.len() as f64)
3839        };
3840        drop(guard);
3841        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3842        registry().lock().unwrap().insert(id, vec![mean]);
3843        Ok(GpuTensorHandle {
3844            shape: vec![1, 1],
3845            device_id: 0,
3846            buffer_id: id,
3847        })
3848    }
3849
3850    fn reduce_mean_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
3851        if a.shape.len() != 2 {
3852            return Err(anyhow::anyhow!("reduce_mean_dim: only 2D supported"));
3853        }
3854        let rows = a.shape[0];
3855        let cols = a.shape[1];
3856        let guard = registry().lock().unwrap();
3857        let abuf = guard
3858            .get(&a.buffer_id)
3859            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3860        let out = if dim <= 1 {
3861            let mut v = vec![0.0f64; cols];
3862            for c in 0..cols {
3863                let mut s = 0.0;
3864                for r in 0..rows {
3865                    s += abuf[r + c * rows];
3866                }
3867                v[c] = s / (rows as f64);
3868            }
3869            v
3870        } else {
3871            let mut v = vec![0.0f64; rows];
3872            for r in 0..rows {
3873                let mut s = 0.0;
3874                for c in 0..cols {
3875                    s += abuf[r + c * rows];
3876                }
3877                v[r] = s / (cols as f64);
3878            }
3879            v
3880        };
3881        drop(guard);
3882        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3883        registry().lock().unwrap().insert(id, out);
3884        let shape = if dim <= 1 {
3885            vec![1, cols]
3886        } else {
3887            vec![rows, 1]
3888        };
3889        Ok(GpuTensorHandle {
3890            shape,
3891            device_id: 0,
3892            buffer_id: id,
3893        })
3894    }
3895
3896    fn reduce_any(&self, a: &GpuTensorHandle, omit_nan: bool) -> Result<GpuTensorHandle> {
3897        let data = {
3898            let guard = registry().lock().unwrap();
3899            guard
3900                .get(&a.buffer_id)
3901                .cloned()
3902                .ok_or_else(|| anyhow!("reduce_any: unknown tensor handle {}", a.buffer_id))?
3903        };
3904        let mut truthy = false;
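        // MATLAB treats NaN as nonzero: a kept NaN makes `any` true
        // immediately, while 'omitnan' skips it.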
3905        for v in &data {
3906            if v.is_nan() {
3907                if !omit_nan {
3908                    truthy = true;
3909                    break;
3910                }
3911            } else if *v != 0.0 {
3912                truthy = true;
3913                break;
3914            }
3915        }
3916        let result = if truthy { 1.0 } else { 0.0 };
3917        Ok(self.allocate_tensor(vec![result], vec![1, 1]))
3918    }
3919
3920    fn reduce_any_dim(
3921        &self,
3922        a: &GpuTensorHandle,
3923        dim: usize,
3924        omit_nan: bool,
3925    ) -> Result<GpuTensorHandle> {
3926        if a.shape.len() != 2 {
3927            return Err(anyhow!("reduce_any_dim: only 2D supported"));
3928        }
3929        let rows = a.shape[0];
3930        let cols = a.shape[1];
3931        let data = {
3932            let guard = registry().lock().unwrap();
3933            guard
3934                .get(&a.buffer_id)
3935                .cloned()
3936                .ok_or_else(|| anyhow!("reduce_any_dim: unknown tensor handle {}", a.buffer_id))?
3937        };
3938        let (out, shape) = if dim == 0 {
3939            let mut v = vec![0.0f64; cols];
3940            for c in 0..cols {
3941                let mut truth = false;
3942                for r in 0..rows {
3943                    let val = data[r + c * rows];
3944                    if val.is_nan() {
3945                        if !omit_nan {
3946                            truth = true;
3947                            break;
3948                        }
3949                    } else if val != 0.0 {
3950                        truth = true;
3951                        break;
3952                    }
3953                }
3954                v[c] = if truth { 1.0 } else { 0.0 };
3955            }
3956            (v, vec![1, cols])
3957        } else if dim == 1 {
3958            let mut v = vec![0.0f64; rows];
3959            for r in 0..rows {
3960                let mut truth = false;
3961                for c in 0..cols {
3962                    let val = data[r + c * rows];
3963                    if val.is_nan() {
3964                        if !omit_nan {
3965                            truth = true;
3966                            break;
3967                        }
3968                    } else if val != 0.0 {
3969                        truth = true;
3970                        break;
3971                    }
3972                }
3973                v[r] = if truth { 1.0 } else { 0.0 };
3974            }
3975            (v, vec![rows, 1])
3976        } else {
3977            return Err(anyhow!("reduce_any_dim: invalid dimension {}", dim));
3978        };
3979        Ok(self.allocate_tensor(out, shape))
3980    }
3981
3982    fn reduce_all(&self, a: &GpuTensorHandle, omit_nan: bool) -> Result<GpuTensorHandle> {
3983        let data = {
3984            let guard = registry().lock().unwrap();
3985            guard
3986                .get(&a.buffer_id)
3987                .cloned()
3988                .ok_or_else(|| anyhow!("reduce_all: unknown tensor handle {}", a.buffer_id))?
3989        };
        // Zeros falsify `all`; NaN is nonzero, so a kept NaN stays truthy and
        // an omitted one is skipped. An empty input yields true, as in MATLAB.
        let mut all_true = true;
        for v in &data {
            if v.is_nan() {
                if omit_nan {
                    continue;
                }
            } else if *v == 0.0 {
                all_true = false;
                break;
            }
        }
4009        let result = if all_true { 1.0 } else { 0.0 };
4010        Ok(self.allocate_tensor(vec![result], vec![1, 1]))
4011    }
4012
4013    fn reduce_all_dim(
4014        &self,
4015        a: &GpuTensorHandle,
4016        dim: usize,
4017        omit_nan: bool,
4018    ) -> Result<GpuTensorHandle> {
4019        if a.shape.len() != 2 {
4020            return Err(anyhow!("reduce_all_dim: only 2D supported"));
4021        }
4022        let rows = a.shape[0];
4023        let cols = a.shape[1];
4024        let data = {
4025            let guard = registry().lock().unwrap();
4026            guard
4027                .get(&a.buffer_id)
4028                .cloned()
4029                .ok_or_else(|| anyhow!("reduce_all_dim: unknown tensor handle {}", a.buffer_id))?
4030        };
4031        let (out, shape) = if dim == 0 {
4032            let mut v = vec![0.0f64; cols];
4033            for c in 0..cols {
                let mut all_true = true;
                for r in 0..rows {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if omit_nan {
                            continue;
                        }
                    } else if val == 0.0 {
                        all_true = false;
                        break;
                    }
                }
                v[c] = if all_true { 1.0 } else { 0.0 };
4055            }
4056            (v, vec![1, cols])
4057        } else if dim == 1 {
4058            let mut v = vec![0.0f64; rows];
4059            for r in 0..rows {
                let mut all_true = true;
                for c in 0..cols {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if omit_nan {
                            continue;
                        }
                    } else if val == 0.0 {
                        all_true = false;
                        break;
                    }
                }
                v[r] = if all_true { 1.0 } else { 0.0 };
4081            }
4082            (v, vec![rows, 1])
4083        } else {
4084            return Err(anyhow!("reduce_all_dim: invalid dimension {}", dim));
4085        };
4086        Ok(self.allocate_tensor(out, shape))
4087    }
4088
4089    fn reduce_median(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
4090        let data = {
4091            let guard = registry().lock().unwrap();
4092            guard
4093                .get(&a.buffer_id)
4094                .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
4095                .clone()
4096        };
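        // Default MATLAB median semantics: any NaN (or an empty input)
        // produces NaN.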
4097        let median = if data.is_empty() || data.iter().any(|v| v.is_nan()) {
4098            f64::NAN
4099        } else {
4100            let mut scratch = data.clone();
4101            compute_median_inplace(&mut scratch)
4102        };
4103        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4104        registry().lock().unwrap().insert(id, vec![median]);
4105        Ok(GpuTensorHandle {
4106            shape: vec![1, 1],
4107            device_id: 0,
4108            buffer_id: id,
4109        })
4110    }
4111
4112    fn reduce_median_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
4113        if a.shape.len() != 2 {
4114            return Err(anyhow::anyhow!("reduce_median_dim: only 2D supported"));
4115        }
4116        let rows = a.shape[0];
4117        let cols = a.shape[1];
4118        let guard = registry().lock().unwrap();
4119        let abuf = guard
4120            .get(&a.buffer_id)
4121            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4122        let mut scratch = Vec::<f64>::with_capacity(rows.max(cols));
4123        let out = if dim <= 1 {
4124            let mut v = vec![f64::NAN; cols];
4125            for c in 0..cols {
4126                scratch.clear();
4127                let mut saw_nan = false;
4128                for r in 0..rows {
4129                    let val = abuf[r + c * rows];
4130                    if val.is_nan() {
4131                        saw_nan = true;
4132                        scratch.clear();
4133                        break;
4134                    }
4135                    scratch.push(val);
4136                }
4137                v[c] = if saw_nan || scratch.is_empty() {
4138                    f64::NAN
4139                } else {
4140                    compute_median_inplace(&mut scratch)
4141                };
4142            }
4143            v
4144        } else {
4145            let mut v = vec![f64::NAN; rows];
4146            for r in 0..rows {
4147                scratch.clear();
4148                let mut saw_nan = false;
4149                for c in 0..cols {
4150                    let val = abuf[r + c * rows];
4151                    if val.is_nan() {
4152                        saw_nan = true;
4153                        scratch.clear();
4154                        break;
4155                    }
4156                    scratch.push(val);
4157                }
4158                v[r] = if saw_nan || scratch.is_empty() {
4159                    f64::NAN
4160                } else {
4161                    compute_median_inplace(&mut scratch)
4162                };
4163            }
4164            v
4165        };
4166        drop(guard);
4167        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4168        registry().lock().unwrap().insert(id, out);
4169        let shape = if dim <= 1 {
4170            vec![1, cols]
4171        } else {
4172            vec![rows, 1]
4173        };
4174        Ok(GpuTensorHandle {
4175            shape,
4176            device_id: 0,
4177            buffer_id: id,
4178        })
4179    }
4180
4181    fn reduce_min(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
4182        let guard = registry().lock().unwrap();
4183        let abuf = guard
4184            .get(&a.buffer_id)
4185            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
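        // f64::min returns the non-NaN operand when one side is NaN, so NaNs
        // are effectively omitted; an empty buffer folds to +Inf.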
4186        let m = abuf.iter().cloned().fold(f64::INFINITY, f64::min);
4187        drop(guard);
4188        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4189        registry().lock().unwrap().insert(id, vec![m]);
4190        Ok(GpuTensorHandle {
4191            shape: vec![1, 1],
4192            device_id: 0,
4193            buffer_id: id,
4194        })
4195    }
4196
4197    fn reduce_min_dim(
4198        &self,
4199        a: &GpuTensorHandle,
4200        dim: usize,
4201    ) -> Result<runmat_accelerate_api::ReduceDimResult> {
4202        if a.shape.len() != 2 {
4203            return Err(anyhow::anyhow!("reduce_min_dim: only 2D supported"));
4204        }
4205        let rows = a.shape[0];
4206        let cols = a.shape[1];
4207        let guard = registry().lock().unwrap();
4208        let abuf = guard
4209            .get(&a.buffer_id)
4210            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
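        // Strict `<` is false whenever either operand is NaN, so NaN entries
        // are skipped; ties keep the earliest 1-based index, as in MATLAB.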
4211        let (vals, inds, vshape) = if dim <= 1 {
4212            let mut m: Vec<f64> = vec![f64::INFINITY; cols];
4213            let mut idx: Vec<f64> = vec![1.0; cols];
4214            for c in 0..cols {
4215                for r in 0..rows {
4216                    let v = abuf[r + c * rows];
4217                    if v < m[c] {
4218                        m[c] = v;
4219                        idx[c] = (r + 1) as f64;
4220                    }
4221                }
4222            }
4223            (m, idx, vec![1, cols])
4224        } else {
4225            let mut m: Vec<f64> = vec![f64::INFINITY; rows];
4226            let mut idx: Vec<f64> = vec![1.0; rows];
4227            for r in 0..rows {
4228                for c in 0..cols {
4229                    let v = abuf[r + c * rows];
4230                    if v < m[r] {
4231                        m[r] = v;
4232                        idx[r] = (c + 1) as f64;
4233                    }
4234                }
4235            }
4236            (m, idx, vec![rows, 1])
4237        };
4238        drop(guard);
4239        let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
4240        let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
4241        let mut g = registry().lock().unwrap();
4242        g.insert(idv, vals);
4243        g.insert(idi, inds);
4244        let shape_vals = vshape.clone();
4245        let shape_inds = vshape;
4246        Ok(runmat_accelerate_api::ReduceDimResult {
4247            values: GpuTensorHandle {
4248                shape: shape_vals,
4249                device_id: 0,
4250                buffer_id: idv,
4251            },
4252            indices: GpuTensorHandle {
4253                shape: shape_inds,
4254                device_id: 0,
4255                buffer_id: idi,
4256            },
4257        })
4258    }
4259
4260    fn reduce_max(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
4261        let guard = registry().lock().unwrap();
4262        let abuf = guard
4263            .get(&a.buffer_id)
4264            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
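        // f64::max returns the non-NaN operand when one side is NaN, so NaNs
        // are effectively omitted; an empty buffer folds to -Inf.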
4265        let m = abuf.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
4266        drop(guard);
4267        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4268        registry().lock().unwrap().insert(id, vec![m]);
4269        Ok(GpuTensorHandle {
4270            shape: vec![1, 1],
4271            device_id: 0,
4272            buffer_id: id,
4273        })
4274    }
4275
4276    fn reduce_max_dim(
4277        &self,
4278        a: &GpuTensorHandle,
4279        dim: usize,
4280    ) -> Result<runmat_accelerate_api::ReduceDimResult> {
4281        if a.shape.len() != 2 {
4282            return Err(anyhow::anyhow!("reduce_max_dim: only 2D supported"));
4283        }
4284        let rows = a.shape[0];
4285        let cols = a.shape[1];
4286        let guard = registry().lock().unwrap();
4287        let abuf = guard
4288            .get(&a.buffer_id)
4289            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4290        let (vals, inds, vshape) = if dim <= 1 {
4291            let mut m: Vec<f64> = vec![f64::NEG_INFINITY; cols];
4292            let mut idx: Vec<f64> = vec![1.0; cols];
4293            for c in 0..cols {
4294                for r in 0..rows {
4295                    let v = abuf[r + c * rows];
4296                    if v > m[c] {
4297                        m[c] = v;
4298                        idx[c] = (r + 1) as f64;
4299                    }
4300                }
4301            }
4302            (m, idx, vec![1, cols])
4303        } else {
4304            let mut m: Vec<f64> = vec![f64::NEG_INFINITY; rows];
4305            let mut idx: Vec<f64> = vec![1.0; rows];
4306            for r in 0..rows {
4307                for c in 0..cols {
4308                    let v = abuf[r + c * rows];
4309                    if v > m[r] {
4310                        m[r] = v;
4311                        idx[r] = (c + 1) as f64;
4312                    }
4313                }
4314            }
4315            (m, idx, vec![rows, 1])
4316        };
4317        drop(guard);
4318        let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
4319        let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
4320        let mut g = registry().lock().unwrap();
4321        g.insert(idv, vals);
4322        g.insert(idi, inds);
4323        let shape_vals = vshape.clone();
4324        let shape_inds = vshape;
4325        Ok(runmat_accelerate_api::ReduceDimResult {
4326            values: GpuTensorHandle {
4327                shape: shape_vals,
4328                device_id: 0,
4329                buffer_id: idv,
4330            },
4331            indices: GpuTensorHandle {
4332                shape: shape_inds,
4333                device_id: 0,
4334                buffer_id: idi,
4335            },
4336        })
4337    }
4338
4339    fn cumsum_scan(
4340        &self,
4341        _input: &GpuTensorHandle,
4342        _dim: usize,
4343        _direction: ProviderScanDirection,
4344        _nan_mode: ProviderNanMode,
4345    ) -> Result<GpuTensorHandle> {
4346        Err(anyhow!("cumsum_scan not supported by provider"))
4347    }
4348
4349    fn cummin_scan(
4350        &self,
4351        _input: &GpuTensorHandle,
4352        _dim: usize,
4353        _direction: ProviderScanDirection,
4354        _nan_mode: ProviderNanMode,
4355    ) -> Result<runmat_accelerate_api::ProviderCumminResult> {
4356        Err(anyhow!("cummin_scan not supported by provider"))
4357    }
4358
4359    fn find(
4360        &self,
4361        a: &GpuTensorHandle,
4362        limit: Option<usize>,
4363        direction: FindDirection,
4364    ) -> Result<ProviderFindResult> {
4365        let shape = a.shape.clone();
4366        let row_extent = shape.first().copied().unwrap_or(1).max(1);
4367        let (indices, rows, cols, values) = {
4368            let guard = registry().lock().unwrap();
4369            let data = guard
4370                .get(&a.buffer_id)
4371                .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4372            let total = data.len();
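            // With no explicit limit, `first` returns every match while
            // `last` returns just the final match in this reference provider.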
4373            let cap = match direction {
4374                FindDirection::First => limit.unwrap_or(total),
4375                FindDirection::Last => limit.unwrap_or(1),
4376            }
4377            .min(total);
4378
4379            let mut indices = Vec::new();
4380            let mut rows_out = Vec::new();
4381            let mut cols_out = Vec::new();
4382            let mut values_out = Vec::new();
4383
4384            if cap == 0 || total == 0 {
4385                (indices, rows_out, cols_out, values_out)
4386            } else {
4387                match direction {
4388                    FindDirection::First => {
4389                        for (idx, &value) in data.iter().enumerate() {
4390                            if value != 0.0 {
4391                                indices.push((idx + 1) as f64);
4392                                rows_out.push(((idx % row_extent) + 1) as f64);
4393                                cols_out.push(((idx / row_extent) + 1) as f64);
4394                                values_out.push(value);
4395                                if indices.len() >= cap {
4396                                    break;
4397                                }
4398                            }
4399                        }
4400                    }
4401                    FindDirection::Last => {
4402                        for (idx, &value) in data.iter().enumerate().rev() {
4403                            if value != 0.0 {
4404                                indices.push((idx + 1) as f64);
4405                                rows_out.push(((idx % row_extent) + 1) as f64);
4406                                cols_out.push(((idx / row_extent) + 1) as f64);
4407                                values_out.push(value);
4408                                if indices.len() >= cap {
4409                                    break;
4410                                }
4411                            }
4412                        }
4413                    }
4414                }
4415                (indices, rows_out, cols_out, values_out)
4416            }
4417        };
4418
4419        let count = indices.len();
4420        let shape_out = vec![count, 1];
4421        let linear = self.allocate_tensor(indices, shape_out.clone());
4422        let rows_handle = self.allocate_tensor(rows, shape_out.clone());
4423        let cols_handle = self.allocate_tensor(cols, shape_out.clone());
4424        let values_handle = self.allocate_tensor(values, shape_out);
4425
4426        Ok(ProviderFindResult {
4427            linear,
4428            rows: rows_handle,
4429            cols: cols_handle,
4430            values: Some(values_handle),
4431        })
4432    }
4433
4434    fn lu(&self, a: &GpuTensorHandle) -> Result<ProviderLuResult> {
4435        let data = {
4436            let guard = registry().lock().unwrap();
4437            guard
4438                .get(&a.buffer_id)
4439                .cloned()
4440                .ok_or_else(|| anyhow!("lu: unknown buffer {}", a.buffer_id))?
4441        };
4442        let LuHostFactors {
4443            combined,
4444            lower,
4445            upper,
4446            perm_matrix,
4447            pivot_vector,
4448            combined_shape,
4449            lower_shape,
4450            upper_shape,
4451            perm_shape,
4452            pivot_shape,
4453        } = lu_factor_host(&data, &a.shape)?;
4454        let combined = self.allocate_tensor(combined, combined_shape);
4455        let lower = self.allocate_tensor(lower, lower_shape);
4456        let upper = self.allocate_tensor(upper, upper_shape);
4457        let perm_matrix = self.allocate_tensor(perm_matrix, perm_shape);
4458        let perm_vector = self.allocate_tensor(pivot_vector, pivot_shape);
4459        Ok(ProviderLuResult {
4460            combined,
4461            lower,
4462            upper,
4463            perm_matrix,
4464            perm_vector,
4465        })
4466    }
4467
    fn chol(&self, a: &GpuTensorHandle, lower: bool) -> Result<ProviderCholResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("chol: unknown buffer {}", a.buffer_id))?
        };
        let tensor = Tensor::new(data, a.shape.clone()).map_err(|e| anyhow!("chol: {e}"))?;
        let mut args = Vec::new();
        if lower {
            args.push(Value::from("lower"));
        }
        let eval = runmat_runtime::builtins::math::linalg::factor::chol::evaluate(
            Value::Tensor(tensor),
            &args,
        )
        .map_err(|e| anyhow!("chol: {e}"))?;
        let factor_tensor = tensor_from_value("chol", eval.factor())?;
        let factor = self.allocate_tensor(factor_tensor.data.clone(), factor_tensor.shape.clone());
        Ok(ProviderCholResult {
            factor,
            info: eval.flag_index() as u32,
        })
    }

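    /// QR factorization via the runtime's host `qr` builtin.
    ///
    /// `options.economy` maps to the `qr(A, 0)` economy-size form and
    /// `ProviderQrPivot::Vector` to the `'vector'` permutation output; the
    /// permutation is returned in both matrix and vector form.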
    fn qr(&self, handle: &GpuTensorHandle, options: ProviderQrOptions) -> Result<ProviderQrResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("qr: unknown buffer {}", handle.buffer_id))?
        };
        let tensor = Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("qr: {e}"))?;
        let mut args = Vec::new();
        if options.economy {
            args.push(Value::Num(0.0));
        }
        if matches!(options.pivot, ProviderQrPivot::Vector) {
            args.push(Value::from("vector"));
        }
        let eval = runmat_runtime::builtins::math::linalg::factor::qr::evaluate(
            Value::Tensor(tensor),
            &args,
        )
        .map_err(|e| anyhow!("qr: {e}"))?;

        let q_tensor = tensor_from_value("qr", eval.q())?;
        let r_tensor = tensor_from_value("qr", eval.r())?;
        let perm_matrix_tensor = tensor_from_value("qr", eval.permutation_matrix())?;
        let perm_vector_tensor = tensor_from_value("qr", eval.permutation_vector())?;

        let q_handle = self.allocate_tensor(q_tensor.data.clone(), q_tensor.shape.clone());
        let r_handle = self.allocate_tensor(r_tensor.data.clone(), r_tensor.shape.clone());
        let perm_matrix_handle = self.allocate_tensor(
            perm_matrix_tensor.data.clone(),
            perm_matrix_tensor.shape.clone(),
        );
        let perm_vector_handle = self.allocate_tensor(
            perm_vector_tensor.data.clone(),
            perm_vector_tensor.shape.clone(),
        );

        Ok(ProviderQrResult {
            q: q_handle,
            r: r_handle,
            perm_matrix: perm_matrix_handle,
            perm_vector: perm_vector_handle,
        })
    }

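    /// Naive triple-loop matrix multiply over the host mirrors; this is the
    /// reference path, not a tuned kernel.
    ///
    /// A minimal sketch of the column-major indexing it relies on
    /// (illustrative only):
    /// ```ignore
    /// // Element (i, j) of an r-by-c column-major buffer lives at i + j * r,
    /// // so for A (m x k) and B (k x n): C[i + j*m] = sum_k A[i + k*m] * B[k + j*k_dim].
    /// let c = provider.matmul(&a, &b)?; // shapes [m, k] x [k, n] -> [m, n]
    /// ```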
    fn matmul(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        // Only support 2D shapes for reference provider
        if a.shape.len() != 2 || b.shape.len() != 2 {
            return Err(anyhow::anyhow!("matmul: only 2D supported"));
        }
        let (ar, ac) = (a.shape[0], a.shape[1]);
        let (br, bc) = (b.shape[0], b.shape[1]);
        if ac != br {
            return Err(anyhow::anyhow!("matmul: inner dims must agree"));
        }
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let bbuf = guard
            .get(&b.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
        let mut out = vec![0.0; ar * bc];
        // Column-major multiplication
        for j in 0..bc {
            for i in 0..ar {
                let mut sum = 0.0;
                for k in 0..ac {
                    sum += abuf[i + k * ar] * bbuf[k + j * br];
                }
                out[i + j * ar] = sum;
            }
        }
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard2 = registry().lock().unwrap();
        guard2.insert(id, out);
        Ok(GpuTensorHandle {
            shape: vec![ar, bc],
            device_id: 0,
            buffer_id: id,
        })
    }

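    /// Matrix multiply with a fused epilogue, evaluated on the host.
    ///
    /// The per-element ordering mirrors the GPU path: `v = alpha * x + beta`,
    /// then the optional row/column scale ops, then clamping, then an
    /// optional power; the main diagonal can be captured into
    /// `ep.diag_output` along the way.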
    fn matmul_epilogue(
        &self,
        a: &GpuTensorHandle,
        b: &GpuTensorHandle,
        ep: &runmat_accelerate_api::MatmulEpilogue,
    ) -> Result<GpuTensorHandle> {
        // Compute plain matmul first
        let base = self.matmul(a, b)?;
        let (rows, cols) = (base.shape[0], base.shape[1]);
        let mut data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&base.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("matmul_epilogue: unknown buffer {}", base.buffer_id))?
        };

        // Load optional scales from registry
        let row_scale: Option<Vec<f64>> = if let Some(ref h) = ep.row_scale {
            let guard = registry().lock().unwrap();
            Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!("matmul_epilogue: unknown row scale buffer {}", h.buffer_id)
            })?)
        } else {
            None
        };
        let col_scale: Option<Vec<f64>> = if let Some(ref h) = ep.col_scale {
            let guard = registry().lock().unwrap();
            Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!("matmul_epilogue: unknown col scale buffer {}", h.buffer_id)
            })?)
        } else {
            None
        };
        let mut diag_output: Option<Vec<f64>> = if let Some(ref h) = ep.diag_output {
            let guard = registry().lock().unwrap();
            let vec = guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!(
                    "matmul_epilogue: unknown diag output buffer {}",
                    h.buffer_id
                )
            })?;
            Some(vec)
        } else {
            None
        };
        if let Some(ref diag) = diag_output {
            let expected = rows.min(cols);
            if diag.len() < expected {
                return Err(anyhow!(
                    "matmul_epilogue: diag_output length {} insufficient for diag size {}",
                    diag.len(),
                    expected
                ));
            }
        }

        // Apply epilogue: alpha/beta, then per-row/col scales (matches GPU epilogue ordering)
        for j in 0..cols {
            for i in 0..rows {
                let idx = i + j * rows;
                let mut v = data[idx] * ep.alpha + ep.beta;
                if let Some(ref rs) = row_scale {
                    let s = *rs.get(i).unwrap_or(&1.0);
                    v = match ep.row_op {
                        runmat_accelerate_api::ScaleOp::Multiply => v * s,
                        runmat_accelerate_api::ScaleOp::Divide => v / s,
                    };
                }
                if let Some(ref cs) = col_scale {
                    let s = *cs.get(j).unwrap_or(&1.0);
                    v = match ep.col_op {
                        runmat_accelerate_api::ScaleOp::Multiply => v * s,
                        runmat_accelerate_api::ScaleOp::Divide => v / s,
                    };
                }
                if let Some(min_v) = ep.clamp_min {
                    v = v.max(min_v);
                }
                if let Some(max_v) = ep.clamp_max {
                    v = v.min(max_v);
                }
                if let Some(pow_v) = ep.pow_exponent {
                    v = v.powf(pow_v);
                }
                if let Some(ref mut diag) = diag_output {
                    if i == j && i < diag.len() {
                        diag[i] = v;
                    }
                }
                data[idx] = v;
            }
        }

        // Replace buffer contents with epilogued data
        {
            let mut guard = registry().lock().unwrap();
            guard.insert(base.buffer_id, data);
            if let Some(vec) = diag_output {
                if let Some(ref h) = ep.diag_output {
                    guard.insert(h.buffer_id, vec);
                }
            }
        }
        Ok(base)
    }

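    /// Matrix multiply followed by per-column L2 normalization, the host
    /// analogue of a fused power-iteration step.
    ///
    /// Each column `x_j` of the product is rescaled to
    /// `x_j / sqrt(sum_i x_ij^2 + epsilon)`, with `ep.epsilon` guarding
    /// against division by zero for all-zero columns.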
    fn matmul_power_step(
        &self,
        lhs: &GpuTensorHandle,
        rhs: &GpuTensorHandle,
        ep: &runmat_accelerate_api::PowerStepEpilogue,
    ) -> Result<GpuTensorHandle> {
        let base = self.matmul(lhs, rhs)?;
        let rows = base.shape[0];
        let cols = base.shape[1];
        let mut data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&base.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("matmul_power_step: unknown buffer {}", base.buffer_id))?
        };
        let mut norms = vec![0.0f64; cols];
        for (col, norm) in norms.iter_mut().enumerate().take(cols) {
            let mut acc = 0.0f64;
            for row in 0..rows {
                let idx = row + col * rows;
                let val = data[idx];
                acc += val * val;
            }
            acc += ep.epsilon;
            *norm = acc.sqrt();
        }
        for (col, norm) in norms.iter().enumerate().take(cols) {
            for row in 0..rows {
                let idx = row + col * rows;
                data[idx] /= norm;
            }
        }
        {
            let mut guard = registry().lock().unwrap();
            guard.insert(base.buffer_id, data);
        }
        Ok(base)
    }

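    /// Per-image standardization over the height-by-width plane of a
    /// `[batch, height, width]` tensor: subtract the plane mean, divide by
    /// `sqrt(variance + epsilon)`, apply the optional gain/bias, clamp at
    /// zero (ReLU), and finally raise to the optional gamma exponent.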
    fn image_normalize(
        &self,
        input: &GpuTensorHandle,
        desc: &runmat_accelerate_api::ImageNormalizeDescriptor,
    ) -> Result<GpuTensorHandle> {
        ensure!(
            input.shape.len() == 3,
            "image_normalize: expected 3-D tensor, got {:?}",
            input.shape
        );
        ensure!(
            input.shape[0] == desc.batch
                && input.shape[1] == desc.height
                && input.shape[2] == desc.width,
            "image_normalize: descriptor dims {:?} do not match tensor shape {:?}",
            (desc.batch, desc.height, desc.width),
            input.shape
        );

        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&input.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("image_normalize: unknown buffer {}", input.buffer_id))?
        };

        let batch = desc.batch;
        let height = desc.height;
        let width = desc.width;
        let plane = height * width;
        if plane == 0 {
            return Ok(self.allocate_tensor(vec![], input.shape.clone()));
        }

        let stride_h = batch;
        let stride_w = batch * height;

        let gain = desc.gain.unwrap_or(1.0);
        let bias = desc.bias.unwrap_or(0.0);
        let gamma = desc.gamma;

        let mut output = data.clone();

        for b in 0..batch {
            let mut sum = 0.0;
            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    sum += data[idx];
                }
            }
            let mean = sum / plane as f64;

            let mut sq_sum = 0.0;
            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    let diff = data[idx] - mean;
                    sq_sum += diff * diff;
                }
            }
            let variance = sq_sum / plane as f64;
            let sigma = (variance + desc.epsilon).sqrt();
            let inv_sigma = if sigma > 0.0 { 1.0 / sigma } else { 0.0 };

            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    let mut value = (data[idx] - mean) * inv_sigma;
                    if desc.gain.is_some() {
                        value *= gain;
                    }
                    if desc.bias.is_some() {
                        value += bias;
                    }
                    value = value.max(0.0);
                    if let Some(gamma) = gamma {
                        value = value.powf(gamma);
                    }
                    output[idx] = value;
                }
            }
        }

        Ok(self.allocate_tensor(output, input.shape.clone()))
    }

    fn pagefun(&self, _request: &PagefunRequest) -> Result<GpuTensorHandle> {
        Err(anyhow::anyhow!(
            "pagefun: in-process provider does not implement GPU page operations"
        ))
    }

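    /// Solve `A * X = B` on the host by forwarding both operands and the
    /// caller's options to the runtime's `linsolve` kernel; the result pairs
    /// the solution with the solver's reciprocal-condition estimate.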
    fn linsolve(
        &self,
        lhs: &GpuTensorHandle,
        rhs: &GpuTensorHandle,
        options: &ProviderLinsolveOptions,
    ) -> Result<ProviderLinsolveResult> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;

        let (solution, rcond) = linsolve_host_real_for_provider(&lhs_tensor, &rhs_tensor, options)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = solution;
        let handle = self.allocate_tensor(data, shape);
        Ok(ProviderLinsolveResult {
            solution: handle,
            reciprocal_condition: rcond,
        })
    }

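    // The wrappers below (inv, pinv, cond, norm, rank, rcond, mldivide,
    // mrdivide) all follow the same recipe: clone the host mirror out of the
    // registry, rebuild a `Tensor`, call the runtime's host kernel, and
    // allocate a fresh buffer for the result.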
    fn inv(
        &self,
        matrix: &GpuTensorHandle,
        _options: ProviderInvOptions,
    ) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("inv: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("inv: {e}"))?;
        let result = inv_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn pinv(
        &self,
        matrix: &GpuTensorHandle,
        options: ProviderPinvOptions,
    ) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("pinv: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("pinv: {e}"))?;
        let result =
            pinv_host_real_for_provider(&tensor, options.tolerance).map_err(|e| anyhow!("{e}"))?;
        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn cond(&self, matrix: &GpuTensorHandle, norm: ProviderCondNorm) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("cond: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("cond: {e}"))?;
        let cond_value = cond_host_real_for_provider(&tensor, norm).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![cond_value], vec![1, 1]))
    }

    fn norm(&self, tensor: &GpuTensorHandle, order: ProviderNormOrder) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&tensor.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("norm: unknown buffer {}", tensor.buffer_id))?
        };
        let host_tensor =
            Tensor::new(data, tensor.shape.clone()).map_err(|e| anyhow!("norm: {e}"))?;
        let value = norm_host_real_for_provider(&host_tensor, order).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![value], vec![1, 1]))
    }

    fn rank(&self, matrix: &GpuTensorHandle, tolerance: Option<f64>) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("rank: unknown buffer {}", matrix.buffer_id))?
        };

        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rank: {e}"))?;
        let rank =
            rank_host_real_for_provider(&tensor, tolerance).map_err(|e| anyhow!("{e}"))? as f64;

        Ok(self.allocate_tensor(vec![rank], vec![1, 1]))
    }

    fn rcond(&self, matrix: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("rcond: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rcond: {e}"))?;
        let estimate = rcond_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![estimate], vec![1, 1]))
    }

    fn mldivide(&self, lhs: &GpuTensorHandle, rhs: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;

        let result = mldivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn mrdivide(&self, lhs: &GpuTensorHandle, rhs: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;

        let result = mrdivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn eig(&self, _a: &GpuTensorHandle, _compute_left: bool) -> Result<ProviderEigResult> {
        Err(anyhow!("eig: not supported by in-process provider"))
    }

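    /// Convert per-dimension subscripts into 1-based linear indices.
    ///
    /// For 1-based subscripts `s_k` and column-major strides `stride_k`, the
    /// linear index is `1 + sum_k (s_k - 1) * stride_k`; `scalar_mask` marks
    /// subscript inputs that are broadcast as scalars, and all arithmetic is
    /// checked so oversized indices fail loudly instead of wrapping. A worked
    /// instance (illustrative only):
    /// ```ignore
    /// // dims = [3, 4], strides = [1, 3]: subscript (2, 3) maps to
    /// // 1 + (2 - 1) * 1 + (3 - 1) * 3 = 8.
    /// ```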
    fn sub2ind(
        &self,
        dims: &[usize],
        strides: &[usize],
        inputs: &[&GpuTensorHandle],
        scalar_mask: &[bool],
        len: usize,
        output_shape: &[usize],
    ) -> Result<GpuTensorHandle> {
        if inputs.len() != dims.len() || inputs.len() != scalar_mask.len() {
            return Err(anyhow::anyhow!(
                "sub2ind: expected {} subscripts (one per dimension), got {}",
                dims.len(),
                inputs.len()
            ));
        }
        let expected_len: usize = output_shape.iter().copied().product();
        if expected_len != len {
            return Err(anyhow::anyhow!(
                "sub2ind: output shape does not match subscript sizes"
            ));
        }
        if len == 0 {
            let id = self.next_id.fetch_add(1, Ordering::Relaxed);
            registry().lock().unwrap().insert(id, Vec::new());
            return Ok(GpuTensorHandle {
                shape: output_shape.to_vec(),
                device_id: 0,
                buffer_id: id,
            });
        }

        let mut host_values: Vec<Vec<f64>> = Vec::with_capacity(inputs.len());
        {
            let guard = registry().lock().unwrap();
            for handle in inputs {
                let data = guard
                    .get(&handle.buffer_id)
                    .ok_or_else(|| anyhow::anyhow!("sub2ind: unknown buffer {}", handle.buffer_id))?
                    .clone();
                host_values.push(data);
            }
        }

        let mut output = Vec::with_capacity(len);
        for idx in 0..len {
            let mut offset: usize = 0;
            for (dim_index, ((&dim_size, &stride), data)) in dims
                .iter()
                .zip(strides.iter())
                .zip(host_values.iter())
                .enumerate()
            {
                let raw = if scalar_mask[dim_index] {
                    *data.first().unwrap_or(&0.0)
                } else {
                    data[idx]
                };
                let coerced = coerce_sub2ind_value(raw, dim_index + 1, dim_size)?;
                let term = coerced
                    .checked_sub(1)
                    .and_then(|base| base.checked_mul(stride))
                    .ok_or_else(|| {
                        anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
                    })?;
                offset = offset.checked_add(term).ok_or_else(|| {
                    anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
                })?;
            }
            output.push((offset + 1) as f64);
        }

        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, output);
        Ok(GpuTensorHandle {
            shape: output_shape.to_vec(),
            device_id: 0,
            buffer_id: id,
        })
    }

    fn fused_elementwise(
        &self,
        _shader: &str,
        _inputs: &[GpuTensorHandle],
        _output_shape: &[usize],
        _len: usize,
    ) -> Result<GpuTensorHandle> {
        Err(anyhow::anyhow!(
            "fused_elementwise not supported by in-process provider"
        ))
    }
}

static INSTANCE: OnceCell<InProcessProvider> = OnceCell::new();

/// Register the in-process provider as the global acceleration provider.
/// Safe to call multiple times; the provider instance is created once and is
/// simply re-registered on subsequent calls.
pub fn register_inprocess_provider() {
    let provider: &'static InProcessProvider = INSTANCE.get_or_init(InProcessProvider::new);
    // Safety: the provider is stored in a static `OnceCell`, so the `'static`
    // reference remains valid for the life of the process; we reassert the
    // registration on every call.
    unsafe { runmat_accelerate_api::register_provider(provider) };
}

/// Reset the in-process provider RNG to its default seed (test-only helper).
pub fn reset_inprocess_rng() {
    if let Ok(mut guard) = rng_state().lock() {
        *guard = PROVIDER_DEFAULT_SEED;
    }
}