1use crate::host_lu::{lu_factor_host, LuHostFactors};
2use crate::sortrows_host::{sort_rows_host, SortRowsHostOutputs};
3use anyhow::{anyhow, ensure, Result};
4use once_cell::sync::OnceCell;
5use runmat_accelerate_api::{
6 AccelProvider, CorrcoefOptions, CovarianceOptions, FindDirection, FspecialRequest,
7 GpuTensorHandle, HostTensorOwned, HostTensorView, ImfilterOptions, PagefunRequest,
8 ProviderBandwidth, ProviderCholResult, ProviderCondNorm, ProviderConv1dOptions,
9 ProviderConvMode, ProviderConvOrientation, ProviderEigResult, ProviderFindResult,
10 ProviderHermitianKind, ProviderIirFilterOptions, ProviderIirFilterResult, ProviderInvOptions,
11 ProviderLinsolveOptions, ProviderLinsolveResult, ProviderLuResult, ProviderNanMode,
12 ProviderNormOrder, ProviderPinvOptions, ProviderPolyderQuotient, ProviderPrecision,
13 ProviderQrOptions, ProviderQrPivot, ProviderQrResult, ProviderScanDirection,
14 ProviderSymmetryKind, SetdiffOptions, SetdiffResult, SortComparison, SortResult,
15 SortRowsColumnSpec, UniqueOptions, UniqueResult,
16};
17use runmat_builtins::{Tensor, Value};
18use runmat_runtime::builtins::array::sorting_sets::unique;
19use runmat_runtime::builtins::common::broadcast::{
20 broadcast_index as runtime_broadcast_index, broadcast_shapes as runtime_broadcast_shapes,
21 compute_strides as runtime_compute_strides,
22};
23use runmat_runtime::builtins::stats::summary::{
24 corrcoef_from_tensors as runtime_corrcoef_from_tensors,
25 cov_from_tensors as runtime_cov_from_tensors, CovWeightSpec,
26};
27
28use runmat_runtime::builtins::math::linalg::ops::{
29 dot_host_real_for_provider, mldivide_host_real_for_provider, mrdivide_host_real_for_provider,
30};
31use runmat_runtime::builtins::math::linalg::solve::cond::cond_host_real_for_provider;
32use runmat_runtime::builtins::math::linalg::solve::inv::inv_host_real_for_provider;
33use runmat_runtime::builtins::math::linalg::solve::linsolve::linsolve_host_real_for_provider;
34use runmat_runtime::builtins::math::linalg::solve::norm::norm_host_real_for_provider;
35use runmat_runtime::builtins::math::linalg::solve::pinv::pinv_host_real_for_provider;
36use runmat_runtime::builtins::math::linalg::solve::rank_host_real_for_provider;
37use runmat_runtime::builtins::math::linalg::solve::rcond::rcond_host_real_for_provider;
38use runmat_runtime::builtins::math::linalg::structure::bandwidth::bandwidth_host_real_data;
39use runmat_runtime::builtins::math::linalg::structure::ishermitian::ishermitian_host_real_data;
40use runmat_runtime::builtins::math::linalg::structure::issymmetric::issymmetric_host_real_data;
41use runmat_runtime::builtins::math::linalg::structure::symrcm::symrcm_host_real_data;
42use runmat_runtime::builtins::math::reduction::compute_median_inplace;
43use runmat_runtime::builtins::math::reduction::diff_tensor_host;
44use std::collections::HashMap;
45use std::sync::atomic::{AtomicU64, Ordering};
46use std::sync::Mutex;
47
48const PROVIDER_DEFAULT_SEED: u64 = 0x9e3779b97f4a7c15;
49
50static REGISTRY: OnceCell<Mutex<HashMap<u64, Vec<f64>>>> = OnceCell::new();
51
52fn registry() -> &'static Mutex<HashMap<u64, Vec<f64>>> {
53 REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
54}
55
56const POLYDER_EPS: f64 = 1.0e-12;
57const FACTORIAL_MAX_HOST: usize = 170;
58const FACTORIAL_INT_TOL: f64 = 1.0e-10;
59
60#[derive(Clone, Copy)]
61enum PolyOrientation {
62 Scalar,
63 Row,
64 Column,
65}
66
67fn poly_orientation_from_shape(shape: &[usize]) -> Result<PolyOrientation> {
68 let mut non_unit = 0usize;
69 let mut orientation = PolyOrientation::Scalar;
70 for (idx, &dim) in shape.iter().enumerate() {
71 if dim > 1 {
72 non_unit += 1;
73 orientation = if idx == 0 {
74 PolyOrientation::Column
75 } else {
76 PolyOrientation::Row
77 };
78 }
79 }
80 if non_unit > 1 {
81 Err(anyhow!("polyder: coefficient inputs must be vectors"))
82 } else {
83 Ok(orientation)
84 }
85}
86
87fn poly_shape_for_len(orientation: PolyOrientation, len: usize) -> Vec<usize> {
88 if len <= 1 {
89 return vec![1, 1];
90 }
91 match orientation {
92 PolyOrientation::Scalar | PolyOrientation::Row => vec![1, len],
93 PolyOrientation::Column => vec![len, 1],
94 }
95}
96
97fn poly_trim_slice(coeffs: &[f64]) -> Vec<f64> {
98 if coeffs.is_empty() {
99 return vec![0.0];
100 }
101 if let Some(idx) = coeffs.iter().position(|c| c.abs() > POLYDER_EPS) {
102 coeffs[idx..].to_vec()
103 } else {
104 vec![0.0]
105 }
106}
107
108fn poly_raw_derivative(coeffs: &[f64]) -> Vec<f64> {
109 if coeffs.len() <= 1 {
110 return vec![0.0];
111 }
112 let mut out = Vec::with_capacity(coeffs.len() - 1);
113 let mut power = coeffs.len() - 1;
114 for coeff in coeffs.iter().take(coeffs.len() - 1) {
115 out.push(*coeff * power as f64);
116 power -= 1;
117 }
118 out
119}
120
121fn poly_integral_real(coeffs: &[f64], constant: f64) -> Vec<f64> {
122 if coeffs.is_empty() {
123 return vec![constant];
124 }
125 let mut out = Vec::with_capacity(coeffs.len() + 1);
126 for (idx, &coeff) in coeffs.iter().enumerate() {
127 let power = (coeffs.len() - idx) as f64;
128 if power == 0.0 {
129 out.push(0.0);
130 } else {
131 out.push(coeff / power);
132 }
133 }
134 out.push(constant);
135 out
136}
137
138fn poly_convolve_real(a: &[f64], b: &[f64]) -> Vec<f64> {
139 if a.is_empty() || b.is_empty() {
140 return Vec::new();
141 }
142 let mut result = vec![0.0; a.len() + b.len() - 1];
143 for (i, &ai) in a.iter().enumerate() {
144 for (j, &bj) in b.iter().enumerate() {
145 result[i + j] += ai * bj;
146 }
147 }
148 result
149}
150
151fn poly_add_real(a: &[f64], b: &[f64]) -> Vec<f64> {
152 let len = a.len().max(b.len());
153 let mut result = vec![0.0; len];
154 for (idx, &value) in a.iter().enumerate() {
155 result[len - a.len() + idx] += value;
156 }
157 for (idx, &value) in b.iter().enumerate() {
158 result[len - b.len() + idx] += value;
159 }
160 result
161}
162
163fn poly_sub_real(a: &[f64], b: &[f64]) -> Vec<f64> {
164 let len = a.len().max(b.len());
165 let mut result = vec![0.0; len];
166 for (idx, &value) in a.iter().enumerate() {
167 result[len - a.len() + idx] += value;
168 }
169 for (idx, &value) in b.iter().enumerate() {
170 result[len - b.len() + idx] -= value;
171 }
172 result
173}
174
175fn rng_state() -> &'static Mutex<u64> {
176 static RNG: OnceCell<Mutex<u64>> = OnceCell::new();
177 RNG.get_or_init(|| Mutex::new(PROVIDER_DEFAULT_SEED))
178}
179
180fn factorial_scalar_host(value: f64) -> f64 {
181 if value.is_nan() {
182 return f64::NAN;
183 }
184 if value == 0.0 {
185 return 1.0;
186 }
187 if value.is_infinite() {
188 return if value.is_sign_positive() {
189 f64::INFINITY
190 } else {
191 f64::NAN
192 };
193 }
194 if value < 0.0 {
195 return f64::NAN;
196 }
197 let rounded = value.round();
198 if (value - rounded).abs() > FACTORIAL_INT_TOL {
199 return f64::NAN;
200 }
201 if rounded < 0.0 {
202 return f64::NAN;
203 }
204 let n = rounded as usize;
205 if n > FACTORIAL_MAX_HOST {
206 return f64::INFINITY;
207 }
208 if n == 0 {
209 return 1.0;
210 }
211 let mut acc = 1.0f64;
212 for k in 2..=n {
213 acc *= k as f64;
214 }
215 acc
216}
217
218fn tensor_to_weight_vector(tensor: &Tensor) -> Result<Vec<f64>> {
219 if tensor.shape.len() > 2 {
220 return Err(anyhow!("covariance: weight vector must be 1-D"));
221 }
222 let rows = tensor.rows();
223 let cols = tensor.cols();
224 if rows != 1 && cols != 1 {
225 return Err(anyhow!(
226 "covariance: weight vector must be 1-D (received shape {}x{})",
227 rows,
228 cols
229 ));
230 }
231 Ok(tensor.data.clone())
232}
233
234fn next_uniform(state: &mut u64) -> f64 {
235 const MULTIPLIER: u64 = 6364136223846793005;
236 const INCREMENT: u64 = 1;
237 const SHIFT: u32 = 11;
238 const SCALE: f64 = 1.0 / ((1u64 << 53) as f64);
239
240 *state = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
241 let bits = *state >> SHIFT;
242 (bits as f64) * SCALE
243}
244
245fn next_normal_pair(state: &mut u64) -> (f64, f64) {
246 let mut u1 = next_uniform(state);
247 if u1 <= 0.0 {
248 u1 = f64::MIN_POSITIVE;
249 }
250 let u2 = next_uniform(state);
251 let radius = (-2.0 * u1.ln()).sqrt();
252 let angle = 2.0 * std::f64::consts::PI * u2;
253 (radius * angle.cos(), radius * angle.sin())
254}
255
256pub struct InProcessProvider {
257 next_id: AtomicU64,
258}
259
260impl InProcessProvider {
261 pub const fn new() -> Self {
262 Self {
263 next_id: AtomicU64::new(1),
264 }
265 }
266
267 fn allocate_tensor(&self, data: Vec<f64>, shape: Vec<usize>) -> GpuTensorHandle {
268 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
269 registry()
270 .lock()
271 .unwrap_or_else(|e| e.into_inner())
272 .insert(id, data);
273 GpuTensorHandle {
274 shape,
275 device_id: 0,
276 buffer_id: id,
277 }
278 }
279
280 fn load_polynomial(&self, handle: &GpuTensorHandle) -> Result<(Vec<f64>, PolyOrientation)> {
281 let data = {
282 let guard = registry().lock().unwrap();
283 guard
284 .get(&handle.buffer_id)
285 .cloned()
286 .ok_or_else(|| anyhow!("polyder: unknown tensor handle {}", handle.buffer_id))?
287 };
288 let orientation = poly_orientation_from_shape(&handle.shape)?;
289 let coeffs = if data.is_empty() { vec![0.0] } else { data };
290 Ok((coeffs, orientation))
291 }
292
293 fn allocate_polynomial(
294 &self,
295 coeffs: Vec<f64>,
296 orientation: PolyOrientation,
297 ) -> GpuTensorHandle {
298 let shape = poly_shape_for_len(orientation, coeffs.len());
299 self.allocate_tensor(coeffs, shape)
300 }
301}
302
303impl Default for InProcessProvider {
304 fn default() -> Self {
305 Self::new()
306 }
307}
308
309fn normalize_shape(shape: &[usize]) -> Vec<usize> {
310 match shape.len() {
311 0 => vec![1, 1],
312 1 => {
313 let n = shape[0];
314 vec![n, n]
315 }
316 _ => shape.to_vec(),
317 }
318}
319
320fn compute_strides(shape: &[usize]) -> Vec<usize> {
321 let mut strides = Vec::with_capacity(shape.len());
322 let mut stride = 1usize;
323 for &dim in shape {
324 strides.push(stride);
325 stride = stride.saturating_mul(dim);
326 }
327 strides
328}
329
330fn product(shape: &[usize]) -> usize {
331 shape.iter().copied().product()
332}
333
334fn decode_indices(mut index: usize, dims: &[usize]) -> Vec<usize> {
335 if dims.is_empty() {
336 return Vec::new();
337 }
338 let mut coords = Vec::with_capacity(dims.len());
339 for &dim in dims {
340 if dim == 0 {
341 coords.push(0);
342 } else {
343 let coord = index % dim;
344 coords.push(coord);
345 index /= dim.max(1);
346 }
347 }
348 coords
349}
350
351fn shapes_compatible(expected: &[usize], actual: &[usize]) -> bool {
352 let max_len = expected.len().max(actual.len());
353 for i in 0..max_len {
354 let e = expected.get(i).copied().unwrap_or(1);
355 let a = actual.get(i).copied().unwrap_or(1);
356 if e != a {
357 return false;
358 }
359 }
360 true
361}
362
363fn filter_state_shape(mut base: Vec<usize>, dim_idx: usize, state_len: usize) -> Vec<usize> {
364 if base.len() <= dim_idx {
365 base.extend(std::iter::repeat_n(1, dim_idx + 1 - base.len()));
366 }
367 if !base.is_empty() {
368 base[dim_idx] = state_len;
369 }
370 base
371}
372
373fn states_from_column_major(
374 data: &[f64],
375 state_len: usize,
376 dim_idx: usize,
377 shape_ext: &[usize],
378) -> Vec<f64> {
379 if state_len == 0 {
380 return Vec::new();
381 }
382 let dims_before = &shape_ext[..dim_idx];
383 let dims_after = if dim_idx + 1 < shape_ext.len() {
384 &shape_ext[dim_idx + 1..]
385 } else {
386 &[]
387 };
388 let leading = if dims_before.is_empty() {
389 1
390 } else {
391 dims_before.iter().copied().product()
392 };
393 let trailing = if dims_after.is_empty() {
394 1
395 } else {
396 dims_after.iter().copied().product()
397 };
398 let channel_count = leading * trailing;
399 let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
400 let mut states = vec![0.0; state_len * channel_count];
401 for channel in 0..channel_count {
402 let before_idx = if dims_before.is_empty() {
403 0
404 } else {
405 channel % leading
406 };
407 let after_idx = if dims_after.is_empty() {
408 0
409 } else {
410 channel / leading
411 };
412 let before_coords = decode_indices(before_idx, dims_before);
413 let after_coords = decode_indices(after_idx, dims_after);
414 for s in 0..state_len {
415 let mut offset = 0usize;
416 let mut stride = 1usize;
417 for (d, size) in shape.iter().copied().enumerate() {
418 let coord = if d < dim_idx {
419 before_coords.get(d).copied().unwrap_or(0)
420 } else if d == dim_idx {
421 s
422 } else {
423 let idx = d - dim_idx - 1;
424 after_coords.get(idx).copied().unwrap_or(0)
425 };
426 offset += coord * stride;
427 stride *= size;
428 }
429 states[channel * state_len + s] = data[offset];
430 }
431 }
432 states
433}
434
435fn states_to_column_major(
436 states: &[f64],
437 state_len: usize,
438 dim_idx: usize,
439 shape_ext: &[usize],
440) -> Vec<f64> {
441 if state_len == 0 {
442 return Vec::new();
443 }
444 let dims_before = &shape_ext[..dim_idx];
445 let dims_after = if dim_idx + 1 < shape_ext.len() {
446 &shape_ext[dim_idx + 1..]
447 } else {
448 &[]
449 };
450 let leading = if dims_before.is_empty() {
451 1
452 } else {
453 dims_before.iter().copied().product()
454 };
455 let trailing = if dims_after.is_empty() {
456 1
457 } else {
458 dims_after.iter().copied().product()
459 };
460 let channel_count = leading * trailing;
461 let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
462 let mut out = vec![0.0; states.len()];
463 for channel in 0..channel_count {
464 let before_idx = if dims_before.is_empty() {
465 0
466 } else {
467 channel % leading
468 };
469 let after_idx = if dims_after.is_empty() {
470 0
471 } else {
472 channel / leading
473 };
474 let before_coords = decode_indices(before_idx, dims_before);
475 let after_coords = decode_indices(after_idx, dims_after);
476 for s in 0..state_len {
477 let mut offset = 0usize;
478 let mut stride = 1usize;
479 for (d, size) in shape.iter().copied().enumerate() {
480 let coord = if d < dim_idx {
481 before_coords.get(d).copied().unwrap_or(0)
482 } else if d == dim_idx {
483 s
484 } else {
485 let idx = d - dim_idx - 1;
486 after_coords.get(idx).copied().unwrap_or(0)
487 };
488 offset += coord * stride;
489 stride *= size;
490 }
491 out[offset] = states[channel * state_len + s];
492 }
493 }
494 out
495}
496
497fn permute_data(data: &[f64], shape: &[usize], order: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
498 ensure!(!order.is_empty(), "permute: order must not be empty");
499 let rank = order.len();
500 ensure!(
501 shape.len() <= rank,
502 "permute: order length must be at least the number of dimensions"
503 );
504 let mut seen = vec![false; rank];
505 for &dim in order {
506 ensure!(dim < rank, "permute: invalid dimension index {}", dim + 1);
507 ensure!(
508 !seen[dim],
509 "permute: duplicate dimension index {} encountered",
510 dim + 1
511 );
512 seen[dim] = true;
513 }
514 ensure!(
515 seen.iter().all(|v| *v),
516 "permute: order must include every dimension exactly once"
517 );
518
519 let mut src_shape = shape.to_vec();
520 if src_shape.len() < rank {
521 src_shape.extend(std::iter::repeat_n(1, rank - src_shape.len()));
522 }
523
524 let total = product(&src_shape);
525 ensure!(
526 total == data.len(),
527 "permute: shape/product mismatch ({} vs {})",
528 total,
529 data.len()
530 );
531
532 let mut dst_shape = vec![0usize; rank];
533 for (dst_dim, &src_dim) in order.iter().enumerate() {
534 dst_shape[dst_dim] = src_shape[src_dim];
535 }
536
537 let src_strides = compute_strides(&src_shape);
538 let dst_total = product(&dst_shape);
539 let mut out = vec![0.0f64; dst_total];
540 let mut dst_coords = vec![0usize; rank];
541 let mut src_coords = vec![0usize; rank];
542
543 for (dst_index, out_value) in out.iter_mut().enumerate() {
544 let mut rem = dst_index;
545 for (dim, &size) in dst_shape.iter().enumerate() {
546 if size == 0 {
547 dst_coords[dim] = 0;
548 } else {
549 dst_coords[dim] = rem % size;
550 rem /= size;
551 }
552 }
553 for (dst_dim, &src_dim) in order.iter().enumerate() {
554 src_coords[src_dim] = dst_coords[dst_dim];
555 }
556 let mut src_index = 0usize;
557 for (dim, &coord) in src_coords.iter().enumerate() {
558 src_index += coord * src_strides[dim];
559 }
560 *out_value = data[src_index];
561 }
562
563 Ok((out, dst_shape))
564}
565
566fn flip_data(data: &[f64], shape: &[usize], axes: &[usize]) -> Result<Vec<f64>> {
567 if axes.is_empty() || data.is_empty() {
568 return Ok(data.to_vec());
569 }
570 let mut ext_shape = shape.to_vec();
571 if let Some(max_dim) = axes.iter().copied().max() {
572 let needed = max_dim + 1;
573 if needed > ext_shape.len() {
574 ext_shape.extend(std::iter::repeat_n(1, needed - ext_shape.len()));
575 }
576 }
577 let total = product(&ext_shape);
578 ensure!(
579 total == data.len(),
580 "flip: shape/product mismatch ({} vs {})",
581 total,
582 data.len()
583 );
584 let mut flip_flags = vec![false; ext_shape.len()];
585 for &axis in axes {
586 if axis < flip_flags.len() {
587 flip_flags[axis] = !flip_flags[axis];
588 }
589 }
590 if !flip_flags.iter().any(|&flag| flag) {
591 return Ok(data.to_vec());
592 }
593 let mut out = Vec::with_capacity(total);
594 for idx in 0..total {
595 let mut coords = unravel_index(idx, &ext_shape);
596 for (axis, flag) in flip_flags.iter().enumerate() {
597 if *flag && ext_shape[axis] > 1 {
598 coords[axis] = ext_shape[axis] - 1 - coords[axis];
599 }
600 }
601 let src_idx = ravel_index(&coords, &ext_shape);
602 out.push(data[src_idx]);
603 }
604 Ok(out)
605}
606
607fn conv1d_output_shape(len: usize, orientation: ProviderConvOrientation) -> Vec<usize> {
608 match (orientation, len) {
609 (ProviderConvOrientation::Row, 0) => vec![1, 0],
610 (ProviderConvOrientation::Row, _) => vec![1, len],
611 (ProviderConvOrientation::Column, 0) => vec![0, 1],
612 (ProviderConvOrientation::Column, _) => vec![len, 1],
613 }
614}
615
616fn convolve_real(signal: &[f64], kernel: &[f64]) -> Result<Vec<f64>> {
617 if signal.is_empty() || kernel.is_empty() {
618 return Ok(Vec::new());
619 }
620 let full_len = signal
621 .len()
622 .checked_add(kernel.len())
623 .and_then(|v| v.checked_sub(1))
624 .ok_or_else(|| anyhow!("conv1d: length overflow"))?;
625 let mut out = vec![0.0; full_len];
626 for (i, &ai) in signal.iter().enumerate() {
627 for (j, &bj) in kernel.iter().enumerate() {
628 out[i + j] += ai * bj;
629 }
630 }
631 Ok(out)
632}
633
634fn apply_conv_mode_real(
635 full: &[f64],
636 mode: ProviderConvMode,
637 len_a: usize,
638 len_b: usize,
639) -> Vec<f64> {
640 match mode {
641 ProviderConvMode::Full => full.to_vec(),
642 ProviderConvMode::Same => {
643 if len_a == 0 {
644 return Vec::new();
645 }
646 let start = if len_b == 0 { 0 } else { (len_b - 1) / 2 };
647 let end = (start + len_a).min(full.len());
648 if start >= end {
649 Vec::new()
650 } else {
651 full[start..end].to_vec()
652 }
653 }
654 ProviderConvMode::Valid => {
655 if len_a < len_b {
656 Vec::new()
657 } else {
658 let start = len_b - 1;
659 let valid_len = len_a - len_b + 1;
660 let end = (start + valid_len).min(full.len());
661 if start >= end {
662 Vec::new()
663 } else {
664 full[start..end].to_vec()
665 }
666 }
667 }
668 }
669}
670
671#[allow(clippy::too_many_arguments)]
672fn conv2d_full_real(
673 signal: &[f64],
674 signal_rows: usize,
675 signal_cols: usize,
676 kernel: &[f64],
677 kernel_rows: usize,
678 kernel_cols: usize,
679 full_rows: usize,
680 full_cols: usize,
681) -> Vec<f64> {
682 let mut out = vec![0.0; full_rows * full_cols];
683 for sc in 0..signal_cols {
684 for sr in 0..signal_rows {
685 let aval = signal[sc * signal_rows + sr];
686 if aval == 0.0 {
687 continue;
688 }
689 for kc in 0..kernel_cols {
690 let out_c = sc + kc;
691 for kr in 0..kernel_rows {
692 let out_r = sr + kr;
693 let kval = kernel[kc * kernel_rows + kr];
694 out[out_c * full_rows + out_r] += aval * kval;
695 }
696 }
697 }
698 }
699 out
700}
701
702fn slice_matrix_real(
703 full: &[f64],
704 full_rows: usize,
705 full_cols: usize,
706 row_start: usize,
707 row_end: usize,
708 col_start: usize,
709 col_end: usize,
710) -> (Vec<f64>, usize, usize) {
711 let row_end = row_end.min(full_rows);
712 let col_end = col_end.min(full_cols);
713 if row_start >= row_end || col_start >= col_end {
714 let rows = row_end.saturating_sub(row_start);
715 let cols = col_end.saturating_sub(col_start);
716 return (vec![0.0; rows * cols], rows, cols);
717 }
718 let rows = row_end - row_start;
719 let cols = col_end - col_start;
720 let mut out = vec![0.0; rows * cols];
721 for c in 0..cols {
722 for r in 0..rows {
723 let src = (col_start + c) * full_rows + (row_start + r);
724 let dst = c * rows + r;
725 out[dst] = full[src];
726 }
727 }
728 (out, rows, cols)
729}
730
731#[allow(clippy::too_many_arguments)]
732fn apply_conv2_mode_real_2d(
733 full: &[f64],
734 full_rows: usize,
735 full_cols: usize,
736 mode: ProviderConvMode,
737 a_rows: usize,
738 a_cols: usize,
739 b_rows: usize,
740 b_cols: usize,
741) -> (Vec<f64>, usize, usize) {
742 match mode {
743 ProviderConvMode::Full => (full.to_vec(), full_rows, full_cols),
744 ProviderConvMode::Same => {
745 if a_rows == 0 || a_cols == 0 {
746 return (Vec::new(), a_rows, a_cols);
747 }
748 let row_start = if b_rows == 0 { 0 } else { (b_rows - 1) / 2 };
749 let col_start = if b_cols == 0 { 0 } else { (b_cols - 1) / 2 };
750 slice_matrix_real(
751 full,
752 full_rows,
753 full_cols,
754 row_start,
755 row_start + a_rows,
756 col_start,
757 col_start + a_cols,
758 )
759 }
760 ProviderConvMode::Valid => {
761 if a_rows < b_rows || a_cols < b_cols {
762 (Vec::new(), 0, 0)
763 } else {
764 let rows = a_rows - b_rows + 1;
765 let cols = a_cols - b_cols + 1;
766 let row_start = b_rows.saturating_sub(1);
767 let col_start = b_cols.saturating_sub(1);
768 slice_matrix_real(
769 full,
770 full_rows,
771 full_cols,
772 row_start,
773 row_start + rows,
774 col_start,
775 col_start + cols,
776 )
777 }
778 }
779 }
780}
781
782fn tensor_from_value(label: &str, value: Value) -> Result<Tensor> {
783 match value {
784 Value::Tensor(tensor) => Ok(tensor),
785 Value::Num(n) => Tensor::new(vec![n], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}")),
786 Value::Int(i) => {
787 Tensor::new(vec![i.to_f64()], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}"))
788 }
789 Value::Bool(b) => Tensor::new(vec![if b { 1.0 } else { 0.0 }], vec![1, 1])
790 .map_err(|e| anyhow!("{label}: {e}")),
791 Value::ComplexTensor(_) => Err(anyhow!(
792 "{label}: complex outputs are not supported by the in-process provider"
793 )),
794 other => Err(anyhow!("{label}: unexpected value {other:?}")),
795 }
796}
797
798fn tril_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
799 if data.is_empty() {
800 return Ok(Vec::new());
801 }
802 let rows = shape.first().copied().unwrap_or(1);
803 let cols = shape.get(1).copied().unwrap_or(1);
804 let plane = rows.saturating_mul(cols);
805 if plane == 0 {
806 ensure!(
807 data.is_empty(),
808 "tril: shape/product mismatch ({} vs {})",
809 0,
810 data.len()
811 );
812 return Ok(Vec::new());
813 }
814 let pages = if shape.len() <= 2 {
815 1usize
816 } else {
817 shape[2..].iter().product::<usize>()
818 };
819 if pages == 0 {
820 ensure!(
821 data.is_empty(),
822 "tril: shape/product mismatch ({} vs {})",
823 0,
824 data.len()
825 );
826 return Ok(Vec::new());
827 }
828 let expected = plane
829 .checked_mul(pages)
830 .ok_or_else(|| anyhow!("tril: dimension product overflow"))?;
831 ensure!(
832 expected == data.len(),
833 "tril: shape/product mismatch ({} vs {})",
834 expected,
835 data.len()
836 );
837 let mut out = data.to_vec();
838 for page in 0..pages {
839 let base = page * plane;
840 for col in 0..cols {
841 let col_base = base + col * rows;
842 for row in 0..rows {
843 if (row as isize) - (col as isize) < -offset {
844 out[col_base + row] = 0.0;
845 }
846 }
847 }
848 }
849 Ok(out)
850}
851
852fn triu_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
853 if data.is_empty() {
854 return Ok(Vec::new());
855 }
856 let rows = shape.first().copied().unwrap_or(1);
857 let cols = shape.get(1).copied().unwrap_or(1);
858 let plane = rows.saturating_mul(cols);
859 if plane == 0 {
860 ensure!(
861 data.is_empty(),
862 "triu: shape/product mismatch ({} vs {})",
863 0,
864 data.len()
865 );
866 return Ok(Vec::new());
867 }
868 let pages = if shape.len() <= 2 {
869 1usize
870 } else {
871 shape[2..].iter().product::<usize>()
872 };
873 if pages == 0 {
874 ensure!(
875 data.is_empty(),
876 "triu: shape/product mismatch ({} vs {})",
877 0,
878 data.len()
879 );
880 return Ok(Vec::new());
881 }
882 let expected = plane
883 .checked_mul(pages)
884 .ok_or_else(|| anyhow!("triu: dimension product overflow"))?;
885 ensure!(
886 expected == data.len(),
887 "triu: shape/product mismatch ({} vs {})",
888 expected,
889 data.len()
890 );
891 let mut out = data.to_vec();
892 for page in 0..pages {
893 let base = page * plane;
894 for col in 0..cols {
895 let col_base = base + col * rows;
896 for row in 0..rows {
897 let diff = (col as isize) - (row as isize);
898 if diff < offset {
899 out[col_base + row] = 0.0;
900 }
901 }
902 }
903 }
904 Ok(out)
905}
906
907fn circshift_data(data: &[f64], shape: &[usize], shifts: &[isize]) -> Result<Vec<f64>> {
908 ensure!(
909 shape.len() == shifts.len(),
910 "circshift: shift vector length must match tensor rank"
911 );
912 let mut total = 1usize;
913 for &dim in shape {
914 total = total
915 .checked_mul(dim)
916 .ok_or_else(|| anyhow!("circshift: requested output exceeds maximum size"))?;
917 }
918 ensure!(
919 total == data.len(),
920 "circshift: shape/product mismatch ({} vs {})",
921 total,
922 data.len()
923 );
924 if data.is_empty() {
925 return Ok(Vec::new());
926 }
927
928 let mut normalized = Vec::with_capacity(shape.len());
929 for (len, &shift) in shape.iter().zip(shifts.iter()) {
930 if *len <= 1 {
931 normalized.push(0usize);
932 continue;
933 }
934 let len_isize = *len as isize;
935 let mut value = shift % len_isize;
936 if value < 0 {
937 value += len_isize;
938 }
939 normalized.push(value as usize);
940 }
941 if normalized.iter().all(|&s| s == 0) {
942 return Ok(data.to_vec());
943 }
944
945 let strides = compute_strides(shape);
946 let mut out = vec![0.0f64; data.len()];
947 for (idx, out_value) in out.iter_mut().enumerate() {
948 let coords = unravel_index(idx, shape);
949 let mut src_idx = 0usize;
950 for (axis, &coord) in coords.iter().enumerate() {
951 let len = shape[axis];
952 let stride = strides[axis];
953 if len <= 1 || normalized[axis] == 0 {
954 src_idx += coord * stride;
955 } else {
956 let shift = normalized[axis] % len;
957 let src_coord = (coord + len - shift) % len;
958 src_idx += src_coord * stride;
959 }
960 }
961 *out_value = data[src_idx];
962 }
963 Ok(out)
964}
965
966fn unravel_index(mut index: usize, shape: &[usize]) -> Vec<usize> {
967 let mut coords = Vec::with_capacity(shape.len());
968 for &extent in shape {
969 if extent == 0 {
970 coords.push(0);
971 } else {
972 coords.push(index % extent);
973 index /= extent;
974 }
975 }
976 coords
977}
978
979fn ravel_index(coords: &[usize], shape: &[usize]) -> usize {
980 let mut index = 0usize;
981 let mut stride = 1usize;
982 for (coord, extent) in coords.iter().zip(shape.iter()) {
983 if *extent > 0 {
984 index += coord * stride;
985 stride *= extent;
986 }
987 }
988 index
989}
990
991fn checked_total(shape: &[usize]) -> Result<usize> {
992 shape.iter().try_fold(1usize, |acc, dim| {
993 acc.checked_mul(*dim)
994 .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))
995 })
996}
997
998fn repmat_numeric(data: &[f64], shape: &[usize], reps: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
999 ensure!(
1000 !reps.is_empty(),
1001 "repmat: replication factors must be specified"
1002 );
1003 let orig_rank = if shape.is_empty() { 1 } else { shape.len() };
1004 let rank = if reps.len() == 1 {
1005 orig_rank.max(2)
1006 } else {
1007 orig_rank.max(reps.len())
1008 };
1009
1010 let mut base_shape = vec![1usize; rank];
1011 for (idx, &dim) in shape.iter().enumerate() {
1012 if idx < rank {
1013 base_shape[idx] = dim;
1014 }
1015 }
1016
1017 let mut factors = vec![1usize; rank];
1018 if reps.len() == 1 {
1019 factors.fill(reps[0]);
1020 } else {
1021 for (idx, &factor) in reps.iter().enumerate() {
1022 if idx < rank {
1023 factors[idx] = factor;
1024 }
1025 }
1026 }
1027
1028 let mut new_shape = Vec::with_capacity(rank);
1029 for i in 0..rank {
1030 let scaled = base_shape[i]
1031 .checked_mul(factors[i])
1032 .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))?;
1033 new_shape.push(scaled);
1034 }
1035
1036 let orig_total = checked_total(&base_shape)?;
1037 ensure!(
1038 orig_total == data.len() || (orig_total == 0 && data.is_empty()),
1039 "repmat: internal shape mismatch (expected {} elements, found {})",
1040 orig_total,
1041 data.len()
1042 );
1043
1044 let new_total = checked_total(&new_shape)?;
1045 if new_total == 0 {
1046 return Ok((Vec::new(), new_shape));
1047 }
1048
1049 let strides = compute_strides(&base_shape);
1050 let mut out = Vec::with_capacity(new_total);
1051 for idx in 0..new_total {
1052 let mut rem = idx;
1053 let mut src_index = 0usize;
1054 for dim in 0..rank {
1055 let dim_size = new_shape[dim];
1056 let coord = rem % dim_size;
1057 rem /= dim_size;
1058 let base = base_shape[dim];
1059 let orig_coord = if base == 0 { 0 } else { coord % base };
1060 src_index += orig_coord * strides[dim];
1061 }
1062 out.push(data[src_index]);
1063 }
1064 Ok((out, new_shape))
1065}
1066
1067fn coerce_sub2ind_value(value: f64, dim_number: usize, dim_size: usize) -> Result<usize> {
1068 if !value.is_finite() {
1069 return Err(anyhow!(
1070 "sub2ind: subscript in dimension {} must be finite",
1071 dim_number
1072 ));
1073 }
1074 let rounded = value.round();
1075 if (rounded - value).abs() > f64::EPSILON {
1076 return Err(anyhow!(
1077 "sub2ind: subscript in dimension {} must be an integer",
1078 dim_number
1079 ));
1080 }
1081 if rounded < 1.0 || rounded > dim_size as f64 {
1082 return Err(anyhow!(
1083 "sub2ind: subscript {} exceeds dimension {} (size {})",
1084 rounded as isize,
1085 dim_number,
1086 dim_size
1087 ));
1088 }
1089 Ok(rounded as usize)
1090}
1091
1092fn identity_data(shape: &[usize]) -> Vec<f64> {
1093 let shape = normalize_shape(shape);
1094 let total: usize = shape.iter().copied().product();
1095 let mut data = vec![0.0; total];
1096 if shape.is_empty() {
1097 return data;
1098 }
1099 let rows = shape[0];
1100 let cols = shape[1];
1101 let diag_len = rows.min(cols);
1102 if diag_len == 0 {
1103 return data;
1104 }
1105 let strides = compute_strides(&shape);
1106 let extra_dims = &shape[2..];
1107 let extra_count = if extra_dims.is_empty() {
1108 1
1109 } else {
1110 extra_dims.iter().copied().product()
1111 };
1112 let mut coords = vec![0usize; shape.len()];
1113 for mut extra_idx in 0..extra_count {
1114 for (offset, size) in extra_dims.iter().copied().enumerate() {
1115 let dim = offset + 2;
1116 if size == 0 {
1117 coords[dim] = 0;
1118 continue;
1119 }
1120 coords[dim] = extra_idx % size;
1121 extra_idx /= size;
1122 }
1123 for diag in 0..diag_len {
1124 coords[0] = diag;
1125 coords[1] = diag;
1126 let mut linear = 0usize;
1127 for (dim, &coord) in coords.iter().enumerate() {
1128 linear += coord * strides[dim];
1129 }
1130 data[linear] = 1.0;
1131 }
1132 }
1133 data
1134}
1135
1136fn offset_abs(offset: isize) -> usize {
1137 if offset >= 0 {
1138 offset as usize
1139 } else {
1140 let magnitude = -(offset as i128);
1141 magnitude as usize
1142 }
1143}
1144
1145fn diag_matrix_size(len: usize, offset: isize) -> Result<(usize, usize)> {
1146 let shift = offset_abs(offset);
1147 let size = len
1148 .checked_add(shift)
1149 .ok_or_else(|| anyhow!("diag: result dimension exceeds limits"))?;
1150 let total = size
1151 .checked_mul(size)
1152 .ok_or_else(|| anyhow!("diag: result size exceeds limits"))?;
1153 Ok((size, total))
1154}
1155
1156fn diagonal_length(rows: usize, cols: usize, offset: isize) -> usize {
1157 if rows == 0 || cols == 0 {
1158 return 0;
1159 }
1160 if offset >= 0 {
1161 let shift = offset as usize;
1162 if shift >= cols {
1163 0
1164 } else {
1165 rows.min(cols - shift)
1166 }
1167 } else {
1168 let shift = offset_abs(offset);
1169 if shift >= rows {
1170 0
1171 } else {
1172 (rows - shift).min(cols)
1173 }
1174 }
1175}
1176
1177fn diagonal_target_index(idx: usize, offset: isize) -> (usize, usize) {
1178 if offset >= 0 {
1179 (idx, idx + offset as usize)
1180 } else {
1181 (idx + offset_abs(offset), idx)
1182 }
1183}
1184
1185fn diagonal_source_index(idx: usize, offset: isize) -> (usize, usize) {
1186 if offset >= 0 {
1187 (idx, idx + offset as usize)
1188 } else {
1189 (idx + offset_abs(offset), idx)
1190 }
1191}
1192
1193fn ensure_diag_shape(label: &str, shape: &[usize]) -> Result<()> {
1194 if shape.len() > 2 && shape.iter().skip(2).any(|&d| d != 1) {
1195 Err(anyhow!("{label}: input must be 2-D"))
1196 } else {
1197 Ok(())
1198 }
1199}
1200
1201fn rows_cols(shape: &[usize]) -> (usize, usize) {
1202 match shape.len() {
1203 0 => (1, 1),
1204 1 => (shape[0], 1),
1205 _ => (shape[0], shape[1]),
1206 }
1207}
1208
1209fn is_vector_like(rows: usize, cols: usize, dims: usize) -> bool {
1210 rows == 1 || cols == 1 || dims <= 1
1211}
1212
1213impl AccelProvider for InProcessProvider {
1214 fn device_id(&self) -> u32 {
1215 0
1216 }
1217 fn gather_linear(
1218 &self,
1219 source: &GpuTensorHandle,
1220 indices: &[u32],
1221 output_shape: &[usize],
1222 ) -> Result<GpuTensorHandle> {
1223 let data = {
1224 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1225 guard
1226 .get(&source.buffer_id)
1227 .cloned()
1228 .ok_or_else(|| anyhow!("gather_linear: unknown buffer {}", source.buffer_id))?
1229 };
1230 let mut out = Vec::with_capacity(indices.len());
1231 for (pos, &idx) in indices.iter().enumerate() {
1232 let lin = idx as usize;
1233 ensure!(
1234 lin < data.len(),
1235 "gather_linear: index {} (position {}) out of bounds for buffer {} (len={})",
1236 lin,
1237 pos,
1238 source.buffer_id,
1239 data.len()
1240 );
1241 out.push(data[lin]);
1242 }
1243 Ok(self.allocate_tensor(out, output_shape.to_vec()))
1244 }
1245
1246 fn scatter_linear(
1247 &self,
1248 target: &GpuTensorHandle,
1249 indices: &[u32],
1250 values: &GpuTensorHandle,
1251 ) -> Result<()> {
1252 let values_data = {
1253 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1254 guard.get(&values.buffer_id).cloned().ok_or_else(|| {
1255 anyhow!("scatter_linear: unknown values buffer {}", values.buffer_id)
1256 })?
1257 };
1258 ensure!(
1259 values_data.len() == indices.len(),
1260 "scatter_linear: values length {} does not match indices length {}",
1261 values_data.len(),
1262 indices.len()
1263 );
1264 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1265 let target_buf = guard
1266 .get_mut(&target.buffer_id)
1267 .ok_or_else(|| anyhow!("scatter_linear: unknown target buffer {}", target.buffer_id))?;
1268 for (pos, &idx) in indices.iter().enumerate() {
1269 let lin = idx as usize;
1270 ensure!(
1271 lin < target_buf.len(),
1272 "scatter_linear: index {} (position {}) out of bounds for target len {}",
1273 lin,
1274 pos,
1275 target_buf.len()
1276 );
1277 target_buf[lin] = values_data[pos];
1278 }
1279 Ok(())
1280 }
1281
1282 fn precision(&self) -> ProviderPrecision {
1283 ProviderPrecision::F64
1284 }
1285
1286 fn upload(&self, host: &HostTensorView) -> Result<GpuTensorHandle> {
1287 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1288 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1289 guard.insert(id, host.data.to_vec());
1290 let handle = GpuTensorHandle {
1291 shape: host.shape.to_vec(),
1292 device_id: 0,
1293 buffer_id: id,
1294 };
1295 runmat_accelerate_api::set_handle_logical(&handle, false);
1296 Ok(handle)
1297 }
1298
1299 fn download(&self, h: &GpuTensorHandle) -> Result<HostTensorOwned> {
1300 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1301 if let Some(buf) = guard.get(&h.buffer_id) {
1302 Ok(HostTensorOwned {
1303 data: buf.clone(),
1304 shape: h.shape.clone(),
1305 })
1306 } else {
1307 Err(anyhow::anyhow!("buffer not found: {}", h.buffer_id))
1308 }
1309 }
1310
1311 fn free(&self, h: &GpuTensorHandle) -> Result<()> {
1312 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1313 guard.remove(&h.buffer_id);
1314 runmat_accelerate_api::clear_handle_logical(h);
1315 Ok(())
1316 }
1317
1318 fn device_info(&self) -> String {
1319 "in-process provider (host registry)".to_string()
1320 }
1321
1322 fn device_info_struct(&self) -> runmat_accelerate_api::ApiDeviceInfo {
1323 runmat_accelerate_api::ApiDeviceInfo {
1324 device_id: 0,
1325 name: "InProcess".to_string(),
1326 vendor: "RunMat".to_string(),
1327 memory_bytes: None,
1328 backend: Some("inprocess".to_string()),
1329 }
1330 }
1331
1332 fn telemetry_snapshot(&self) -> runmat_accelerate_api::ProviderTelemetry {
1333 runmat_accelerate_api::ProviderTelemetry::default()
1334 }
1335
1336 fn reset_telemetry(&self) {}
1337
1338 fn sort_rows(
1339 &self,
1340 handle: &GpuTensorHandle,
1341 columns: &[SortRowsColumnSpec],
1342 comparison: SortComparison,
1343 ) -> Result<SortResult> {
1344 let data = {
1345 let guard = registry().lock().unwrap();
1346 guard
1347 .get(&handle.buffer_id)
1348 .cloned()
1349 .ok_or_else(|| anyhow!("sortrows: unknown buffer {}", handle.buffer_id))?
1350 };
1351 let SortRowsHostOutputs {
1352 values,
1353 indices,
1354 indices_shape,
1355 } = sort_rows_host(&data, &handle.shape, columns, comparison)?;
1356 Ok(SortResult {
1357 values: HostTensorOwned {
1358 data: values,
1359 shape: handle.shape.clone(),
1360 },
1361 indices: HostTensorOwned {
1362 data: indices,
1363 shape: indices_shape,
1364 },
1365 })
1366 }
1367
1368 fn polyder_single(&self, polynomial: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1369 let (coeffs, orientation) = self.load_polynomial(polynomial)?;
1370 let raw = poly_raw_derivative(&coeffs);
1371 let trimmed = poly_trim_slice(&raw);
1372 Ok(self.allocate_polynomial(trimmed, orientation))
1373 }
1374
1375 fn polyder_product(&self, p: &GpuTensorHandle, q: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1376 let (p_coeffs, orientation) = self.load_polynomial(p)?;
1377 let (q_coeffs, _) = self.load_polynomial(q)?;
1378 let dp = poly_raw_derivative(&p_coeffs);
1379 let dq = poly_raw_derivative(&q_coeffs);
1380 let term1 = poly_convolve_real(&dp, &q_coeffs);
1381 let term2 = poly_convolve_real(&p_coeffs, &dq);
1382 let sum = poly_add_real(&term1, &term2);
1383 let trimmed = poly_trim_slice(&sum);
1384 Ok(self.allocate_polynomial(trimmed, orientation))
1385 }
1386
1387 fn polyder_quotient(
1388 &self,
1389 u: &GpuTensorHandle,
1390 v: &GpuTensorHandle,
1391 ) -> Result<ProviderPolyderQuotient> {
1392 let (u_coeffs, orientation_u) = self.load_polynomial(u)?;
1393 let (v_coeffs, orientation_v) = self.load_polynomial(v)?;
1394 let du = poly_raw_derivative(&u_coeffs);
1395 let dv = poly_raw_derivative(&v_coeffs);
1396 let term1 = poly_convolve_real(&du, &v_coeffs);
1397 let term2 = poly_convolve_real(&u_coeffs, &dv);
1398 let numerator_vec = poly_trim_slice(&poly_sub_real(&term1, &term2));
1399 let denominator_vec = poly_trim_slice(&poly_convolve_real(&v_coeffs, &v_coeffs));
1400 let numerator = self.allocate_polynomial(numerator_vec, orientation_u);
1401 let denominator = self.allocate_polynomial(denominator_vec, orientation_v);
1402 Ok(ProviderPolyderQuotient {
1403 numerator,
1404 denominator,
1405 })
1406 }
1407
1408 fn polyint(&self, polynomial: &GpuTensorHandle, constant: f64) -> Result<GpuTensorHandle> {
1409 let orientation = poly_orientation_from_shape(&polynomial.shape)?;
1410 let coeffs = {
1411 let guard = registry().lock().unwrap();
1412 guard
1413 .get(&polynomial.buffer_id)
1414 .cloned()
1415 .ok_or_else(|| anyhow!("polyint: unknown tensor handle {}", polynomial.buffer_id))?
1416 };
1417 let integrated = poly_integral_real(&coeffs, constant);
1418 Ok(self.allocate_polynomial(integrated, orientation))
1419 }
1420
1421 fn diag_from_vector(&self, vector: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1422 ensure_diag_shape("diag", &vector.shape)?;
1423 let (rows, cols) = rows_cols(&vector.shape);
1424 ensure!(
1425 is_vector_like(rows, cols, vector.shape.len()),
1426 "diag: input must be a vector"
1427 );
1428
1429 let data = {
1430 let guard = registry().lock().unwrap();
1431 guard
1432 .get(&vector.buffer_id)
1433 .cloned()
1434 .ok_or_else(|| anyhow!("diag: unknown buffer {}", vector.buffer_id))?
1435 };
1436 let len = data.len();
1437 let (size, total) = diag_matrix_size(len, offset)?;
1438 let mut out = vec![0.0; total];
1439 for (idx, &value) in data.iter().enumerate() {
1440 let (row, col) = diagonal_target_index(idx, offset);
1441 if row < size && col < size {
1442 out[row + col * size] = value;
1443 }
1444 }
1445 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1446 registry().lock().unwrap().insert(id, out);
1447 Ok(GpuTensorHandle {
1448 shape: vec![size, size],
1449 device_id: 0,
1450 buffer_id: id,
1451 })
1452 }
1453
1454 fn diag_extract(&self, matrix: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1455 ensure_diag_shape("diag", &matrix.shape)?;
1456 let (rows, cols) = rows_cols(&matrix.shape);
1457 ensure!(
1458 !is_vector_like(rows, cols, matrix.shape.len()),
1459 "diag: matrix input required"
1460 );
1461 let diag_len = diagonal_length(rows, cols, offset);
1462 if diag_len == 0 {
1463 return self.zeros(&[0, 1]);
1464 }
1465 let data = {
1466 let guard = registry().lock().unwrap();
1467 guard
1468 .get(&matrix.buffer_id)
1469 .cloned()
1470 .ok_or_else(|| anyhow!("diag: unknown buffer {}", matrix.buffer_id))?
1471 };
1472 let mut out = Vec::with_capacity(diag_len);
1473 for idx in 0..diag_len {
1474 let (row, col) = diagonal_source_index(idx, offset);
1475 let linear = row + col * rows;
1476 out.push(*data.get(linear).unwrap_or(&0.0));
1477 }
1478 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1479 registry().lock().unwrap().insert(id, out);
1480 Ok(GpuTensorHandle {
1481 shape: vec![diag_len, 1],
1482 device_id: 0,
1483 buffer_id: id,
1484 })
1485 }
1486
1487 fn tril(&self, handle: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1488 let data = {
1489 let guard = registry().lock().unwrap();
1490 guard
1491 .get(&handle.buffer_id)
1492 .cloned()
1493 .ok_or_else(|| anyhow!("tril: unknown tensor handle {}", handle.buffer_id))?
1494 };
1495 let masked = tril_data(&data, &handle.shape, offset)?;
1496 Ok(self.allocate_tensor(masked, handle.shape.clone()))
1497 }
1498
1499 fn triu(&self, handle: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1500 let data = {
1501 let guard = registry().lock().unwrap();
1502 guard
1503 .get(&handle.buffer_id)
1504 .cloned()
1505 .ok_or_else(|| anyhow!("triu: unknown tensor handle {}", handle.buffer_id))?
1506 };
1507 let masked = triu_data(&data, &handle.shape, offset)?;
1508 Ok(self.allocate_tensor(masked, handle.shape.clone()))
1509 }
1510
1511 fn issymmetric(
1512 &self,
1513 matrix: &GpuTensorHandle,
1514 kind: ProviderSymmetryKind,
1515 tolerance: f64,
1516 ) -> Result<bool> {
1517 let data = {
1518 let guard = registry().lock().unwrap();
1519 guard
1520 .get(&matrix.buffer_id)
1521 .cloned()
1522 .ok_or_else(|| anyhow!("issymmetric: unknown tensor handle {}", matrix.buffer_id))?
1523 };
1524 let skew = matches!(kind, ProviderSymmetryKind::Skew);
1525 issymmetric_host_real_data(&matrix.shape, &data, skew, tolerance).map_err(|e| anyhow!(e))
1526 }
1527
1528 fn ishermitian(
1529 &self,
1530 matrix: &GpuTensorHandle,
1531 kind: ProviderHermitianKind,
1532 tolerance: f64,
1533 ) -> Result<bool> {
1534 let data = {
1535 let guard = registry().lock().unwrap();
1536 guard
1537 .get(&matrix.buffer_id)
1538 .cloned()
1539 .ok_or_else(|| anyhow!("ishermitian: unknown tensor handle {}", matrix.buffer_id))?
1540 };
1541 let skew = matches!(kind, ProviderHermitianKind::Skew);
1542 ishermitian_host_real_data(&matrix.shape, &data, skew, tolerance).map_err(|e| anyhow!(e))
1543 }
1544
1545 fn bandwidth(&self, matrix: &GpuTensorHandle) -> Result<ProviderBandwidth> {
1546 let data = {
1547 let guard = registry().lock().unwrap();
1548 guard
1549 .get(&matrix.buffer_id)
1550 .cloned()
1551 .ok_or_else(|| anyhow!("bandwidth: unknown tensor handle {}", matrix.buffer_id))?
1552 };
1553 let (lower, upper) =
1554 bandwidth_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))?;
1555 Ok(ProviderBandwidth {
1556 lower: lower as u32,
1557 upper: upper as u32,
1558 })
1559 }
1560
1561 fn sym_rcm(&self, matrix: &GpuTensorHandle) -> Result<Vec<usize>> {
1562 let data = {
1563 let guard = registry().lock().unwrap();
1564 guard
1565 .get(&matrix.buffer_id)
1566 .cloned()
1567 .ok_or_else(|| anyhow!("symrcm: unknown tensor handle {}", matrix.buffer_id))?
1568 };
1569 symrcm_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))
1570 }
1571
1572 fn read_scalar(&self, h: &GpuTensorHandle, linear_index: usize) -> Result<f64> {
1573 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1574 let buf = guard
1575 .get(&h.buffer_id)
1576 .ok_or_else(|| anyhow!("read_scalar: unknown buffer {}", h.buffer_id))?;
1577 if linear_index >= buf.len() {
1578 return Err(anyhow!(
1579 "read_scalar: index {} out of bounds (len {})",
1580 linear_index + 1,
1581 buf.len()
1582 ));
1583 }
1584 Ok(buf[linear_index])
1585 }
1586
1587 fn zeros(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1588 let len: usize = shape.iter().copied().product();
1589 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1590 let mut guard = registry().lock().unwrap();
1591 guard.insert(id, vec![0.0; len]);
1592 Ok(GpuTensorHandle {
1593 shape: shape.to_vec(),
1594 device_id: 0,
1595 buffer_id: id,
1596 })
1597 }
1598
1599 fn zeros_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1600 self.zeros(&prototype.shape)
1601 }
1602
1603 fn ones(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1604 let len: usize = shape.iter().copied().product();
1605 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1606 let mut guard = registry().lock().unwrap();
1607 guard.insert(id, vec![1.0; len]);
1608 Ok(GpuTensorHandle {
1609 shape: shape.to_vec(),
1610 device_id: 0,
1611 buffer_id: id,
1612 })
1613 }
1614
1615 fn ones_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1616 self.ones(&prototype.shape)
1617 }
1618
1619 fn eye(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1620 let shape = normalize_shape(shape);
1621 let data = identity_data(&shape);
1622 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1623 let mut guard = registry().lock().unwrap();
1624 guard.insert(id, data);
1625 Ok(GpuTensorHandle {
1626 shape,
1627 device_id: 0,
1628 buffer_id: id,
1629 })
1630 }
1631
1632 fn eye_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1633 self.eye(&prototype.shape)
1634 }
1635
1636 fn linspace(&self, start: f64, stop: f64, count: usize) -> Result<GpuTensorHandle> {
1637 let data = if count == 0 {
1638 Vec::new()
1639 } else if count == 1 {
1640 vec![stop]
1641 } else {
1642 let step = (stop - start) / ((count - 1) as f64);
1643 let mut seq = Vec::with_capacity(count);
1644 for idx in 0..count {
1645 seq.push(start + (idx as f64) * step);
1646 }
1647 if let Some(last) = seq.last_mut() {
1648 *last = stop;
1649 }
1650 seq
1651 };
1652
1653 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1654 registry().lock().unwrap().insert(id, data);
1655 Ok(GpuTensorHandle {
1656 shape: vec![1, count],
1657 device_id: 0,
1658 buffer_id: id,
1659 })
1660 }
1661
1662 fn random_uniform(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1663 let len: usize = shape.iter().copied().product();
1664 let mut data = vec![0.0; len];
1665 {
1666 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1667 for slot in &mut data {
1668 *slot = next_uniform(&mut guard);
1669 }
1670 }
1671
1672 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1673 let mut buf_guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1674 buf_guard.insert(id, data);
1675 Ok(GpuTensorHandle {
1676 shape: shape.to_vec(),
1677 device_id: 0,
1678 buffer_id: id,
1679 })
1680 }
1681
1682 fn random_normal(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1683 let len: usize = shape.iter().copied().product();
1684 let mut data = Vec::with_capacity(len);
1685 if len > 0 {
1686 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1687 while data.len() < len {
1688 let (z0, z1) = next_normal_pair(&mut guard);
1689 data.push(z0);
1690 if data.len() < len {
1691 data.push(z1);
1692 }
1693 }
1694 }
1695
1696 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1697 registry()
1698 .lock()
1699 .unwrap_or_else(|e| e.into_inner())
1700 .insert(id, data);
1701 Ok(GpuTensorHandle {
1702 shape: shape.to_vec(),
1703 device_id: 0,
1704 buffer_id: id,
1705 })
1706 }
1707
1708 fn set_rng_state(&self, state: u64) -> Result<()> {
1709 let mut guard = rng_state()
1710 .lock()
1711 .map_err(|_| anyhow::anyhow!("set_rng_state: RNG mutex poisoned"))?;
1712 *guard = if state == 0 {
1713 PROVIDER_DEFAULT_SEED
1714 } else {
1715 state
1716 };
1717 Ok(())
1718 }
1719
1720 fn fspecial(&self, request: &FspecialRequest) -> Result<GpuTensorHandle> {
1721 let spec =
1722 runmat_runtime::builtins::image::filters::fspecial::spec_from_request(&request.filter)
1723 .map_err(|e: String| anyhow!(e))?;
1724 let tensor = spec.generate_tensor().map_err(|e| anyhow!(e))?;
1725 Ok(self.allocate_tensor(tensor.data.clone(), tensor.shape.clone()))
1726 }
1727
1728 fn imfilter(
1729 &self,
1730 image: &GpuTensorHandle,
1731 kernel: &GpuTensorHandle,
1732 options: &ImfilterOptions,
1733 ) -> Result<GpuTensorHandle> {
1734 let (image_vec, kernel_vec) = {
1735 let guard = registry().lock().unwrap();
1736 let image_buf = guard
1737 .get(&image.buffer_id)
1738 .cloned()
1739 .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", image.buffer_id))?;
1740 let kernel_buf = guard
1741 .get(&kernel.buffer_id)
1742 .cloned()
1743 .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", kernel.buffer_id))?;
1744 (image_buf, kernel_buf)
1745 };
1746 let image_tensor =
1747 Tensor::new(image_vec, image.shape.clone()).map_err(|e| anyhow!("imfilter: {e}"))?;
1748 let kernel_tensor =
1749 Tensor::new(kernel_vec, kernel.shape.clone()).map_err(|e| anyhow!("imfilter: {e}"))?;
1750 let result = runmat_runtime::builtins::image::filters::imfilter::apply_imfilter_tensor(
1751 &image_tensor,
1752 &kernel_tensor,
1753 options,
1754 )
1755 .map_err(|e| anyhow!(e))?;
1756 let Tensor { data, shape, .. } = result;
1757 Ok(self.allocate_tensor(data, shape))
1758 }
1759
1760 fn random_integer_range(
1761 &self,
1762 lower: i64,
1763 upper: i64,
1764 shape: &[usize],
1765 ) -> Result<GpuTensorHandle> {
1766 ensure!(lower <= upper, "lower bound must be <= upper bound");
1767 let span_i128 = (upper as i128)
1768 .checked_sub(lower as i128)
1769 .and_then(|delta| delta.checked_add(1))
1770 .ok_or_else(|| anyhow!("integer range overflow"))?;
1771 ensure!(span_i128 > 0, "integer range must be non-empty");
1772 ensure!(
1773 span_i128 <= (1i128 << 53),
1774 "integer range exceeds 2^53 and cannot be represented exactly"
1775 );
1776 let span = span_i128 as u64;
1777
1778 let len: usize = shape.iter().copied().product();
1779 let mut data = Vec::with_capacity(len);
1780 if span == 1 {
1781 data.resize(len, lower as f64);
1782 } else if len > 0 {
1783 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1784 let span_f64 = span as f64;
1785 for _ in 0..len {
1786 let mut offset = (next_uniform(&mut guard) * span_f64).floor() as u64;
1787 if offset >= span {
1788 offset = span - 1;
1789 }
1790 let value = (lower as i128 + offset as i128) as f64;
1791 data.push(value);
1792 }
1793 }
1794
1795 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1796 registry().lock().unwrap().insert(id, data);
1797 Ok(GpuTensorHandle {
1798 shape: shape.to_vec(),
1799 device_id: 0,
1800 buffer_id: id,
1801 })
1802 }
1803
1804 fn random_permutation(&self, n: usize, k: usize) -> Result<GpuTensorHandle> {
1805 ensure!(k <= n, "randperm: K must satisfy 0 <= K <= N");
1806 let k = k.min(n);
1807 let mut values: Vec<f64> = if n == 0 {
1808 Vec::new()
1809 } else {
1810 (1..=n).map(|v| v as f64).collect()
1811 };
1812
1813 if k > 0 {
1814 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1815 for i in 0..k {
1816 let span = n - i;
1817 if span == 0 {
1818 break;
1819 }
1820 let mut u = next_uniform(&mut guard);
1821 if u >= 1.0 {
1822 u = 0.9999999999999999;
1823 }
1824 let mut offset = (u * span as f64).floor() as usize;
1825 if offset >= span {
1826 offset = span - 1;
1827 }
1828 let j = i + offset;
1829 values.swap(i, j);
1830 }
1831 }
1832
1833 if values.len() > k {
1834 values.truncate(k);
1835 }
1836
1837 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1838 registry().lock().unwrap().insert(id, values);
1839 Ok(GpuTensorHandle {
1840 shape: vec![1, k],
1841 device_id: 0,
1842 buffer_id: id,
1843 })
1844 }
1845
1846 fn random_permutation_like(
1847 &self,
1848 _prototype: &GpuTensorHandle,
1849 n: usize,
1850 k: usize,
1851 ) -> Result<GpuTensorHandle> {
1852 self.random_permutation(n, k)
1853 }
1854
1855 fn covariance(
1856 &self,
1857 matrix: &GpuTensorHandle,
1858 second: Option<&GpuTensorHandle>,
1859 weights: Option<&GpuTensorHandle>,
1860 options: &CovarianceOptions,
1861 ) -> Result<GpuTensorHandle> {
1862 let host_matrix = self.download(matrix)?;
1863 let left = Tensor::new(host_matrix.data.clone(), host_matrix.shape.clone())
1864 .map_err(|e| anyhow!("covariance: {e}"))?;
1865
1866 let right = if let Some(handle) = second {
1867 let host = self.download(handle)?;
1868 Some(
1869 Tensor::new(host.data.clone(), host.shape.clone())
1870 .map_err(|e| anyhow!("covariance: {e}"))?,
1871 )
1872 } else {
1873 None
1874 };
1875
1876 let weight_spec = if let Some(handle) = weights {
1877 let host = self.download(handle)?;
1878 let tensor = Tensor::new(host.data.clone(), host.shape.clone())
1879 .map_err(|e| anyhow!("covariance: {e}"))?;
1880 let vec = tensor_to_weight_vector(&tensor).map_err(|e| anyhow!("covariance: {e}"))?;
1881 CovWeightSpec::Vector(vec)
1882 } else {
1883 CovWeightSpec::Scalar(options.normalization)
1884 };
1885
1886 let result = runtime_cov_from_tensors(left, right, options.rows, weight_spec)
1887 .map_err(|e| anyhow!("covariance: {e}"))?;
1888
1889 let view = HostTensorView {
1890 data: &result.data,
1891 shape: &result.shape,
1892 };
1893 self.upload(&view)
1894 }
1895
1896 fn corrcoef(
1897 &self,
1898 matrix: &GpuTensorHandle,
1899 options: &CorrcoefOptions,
1900 ) -> Result<GpuTensorHandle> {
1901 let host = self.download(matrix)?;
1902 let tensor = Tensor::new(host.data.clone(), host.shape.clone())
1903 .map_err(|e| anyhow!("corrcoef: {e}"))?;
1904 let result =
1905 runtime_corrcoef_from_tensors(tensor, None, options.normalization, options.rows)
1906 .map_err(|e| anyhow!("corrcoef: {e}"))?;
1907 let view = HostTensorView {
1908 data: &result.data,
1909 shape: &result.shape,
1910 };
1911 self.upload(&view)
1912 }
1913
1914 fn elem_add(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1915 let guard = registry().lock().unwrap();
1916 let abuf = guard
1917 .get(&a.buffer_id)
1918 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1919 let bbuf = guard
1920 .get(&b.buffer_id)
1921 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1922 if a.shape != b.shape {
1923 return Err(anyhow::anyhow!("shape mismatch"));
1924 }
1925 let mut out = vec![0.0; abuf.len()];
1926 for i in 0..abuf.len() {
1927 out[i] = abuf[i] + bbuf[i];
1928 }
1929 drop(guard);
1930 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1932 let mut guard2 = registry().lock().unwrap();
1933 guard2.insert(id, out);
1934 Ok(GpuTensorHandle {
1935 shape: a.shape.clone(),
1936 device_id: 0,
1937 buffer_id: id,
1938 })
1939 }
1940
1941 fn elem_mul(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1942 let guard = registry().lock().unwrap();
1943 let abuf = guard
1944 .get(&a.buffer_id)
1945 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1946 let bbuf = guard
1947 .get(&b.buffer_id)
1948 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1949 if a.shape != b.shape {
1950 return Err(anyhow::anyhow!("shape mismatch"));
1951 }
1952 let mut out = vec![0.0; abuf.len()];
1953 for i in 0..abuf.len() {
1954 out[i] = abuf[i] * bbuf[i];
1955 }
1956 drop(guard);
1957 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1958 let mut guard2 = registry().lock().unwrap();
1959 guard2.insert(id, out);
1960 Ok(GpuTensorHandle {
1961 shape: a.shape.clone(),
1962 device_id: 0,
1963 buffer_id: id,
1964 })
1965 }
1966
1967 fn elem_sub(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1968 let guard = registry().lock().unwrap();
1969 let abuf = guard
1970 .get(&a.buffer_id)
1971 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1972 let bbuf = guard
1973 .get(&b.buffer_id)
1974 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
1975 if a.shape != b.shape {
1976 return Err(anyhow::anyhow!("shape mismatch"));
1977 }
1978 let mut out = vec![0.0; abuf.len()];
1979 for i in 0..abuf.len() {
1980 out[i] = abuf[i] - bbuf[i];
1981 }
1982 drop(guard);
1983 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1984 let mut guard2 = registry().lock().unwrap();
1985 guard2.insert(id, out);
1986 Ok(GpuTensorHandle {
1987 shape: a.shape.clone(),
1988 device_id: 0,
1989 buffer_id: id,
1990 })
1991 }
1992
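// Division note: plain IEEE-754 division already matches MATLAB for zero
// divisors (x/0 -> +/-Inf, 0/0 -> NaN, NaN propagates), so elem_div,
// scalar_div, and scalar_rdiv below divide directly instead of special-casing
// a zero divisor.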
1993 fn elem_div(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1994 let guard = registry().lock().unwrap();
1995 let abuf = guard
1996 .get(&a.buffer_id)
1997 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
1998 let bbuf = guard
1999 .get(&b.buffer_id)
2000 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2001 if a.shape != b.shape {
2002 return Err(anyhow::anyhow!("shape mismatch"));
2003 }
2004 let mut out = vec![0.0; abuf.len()];
2005 for i in 0..abuf.len() {
2006 out[i] = abuf[i] / bbuf[i];
2007 }
2012 drop(guard);
2013 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2014 let mut guard2 = registry().lock().unwrap();
2015 guard2.insert(id, out);
2016 Ok(GpuTensorHandle {
2017 shape: a.shape.clone(),
2018 device_id: 0,
2019 buffer_id: id,
2020 })
2021 }
2022
2023 fn elem_pow(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2024 let guard = registry().lock().unwrap();
2025 let abuf = guard
2026 .get(&a.buffer_id)
2027 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2028 let bbuf = guard
2029 .get(&b.buffer_id)
2030 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2031 if a.shape != b.shape {
2032 return Err(anyhow::anyhow!("shape mismatch"));
2033 }
2034 let mut out = vec![0.0; abuf.len()];
2035 for i in 0..abuf.len() {
2036 out[i] = abuf[i].powf(bbuf[i]);
2037 }
2038 drop(guard);
2039 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2040 let mut guard2 = registry().lock().unwrap();
2041 guard2.insert(id, out);
2042 Ok(GpuTensorHandle {
2043 shape: a.shape.clone(),
2044 device_id: 0,
2045 buffer_id: id,
2046 })
2047 }
2048
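// Broadcasting sketch for the comparison/logical kernels below: the output
// shape comes from runtime_broadcast_shapes and each column-major output index
// is mapped back to a source offset with runtime_broadcast_index (singleton
// dimensions presumably re-use offset 0). For example, comparing a [2,1]
// operand with a [1,3] operand yields a [2,3] mask of 0.0/1.0 values; output
// index 5 (row 1, col 2) reads lhs offset 1 and rhs offset 2.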
2049 fn elem_ne(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2050 let (adata, bdata) = {
2051 let guard = registry().lock().unwrap();
2052 let a_vec = guard
2053 .get(&a.buffer_id)
2054 .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", a.buffer_id))?
2055 .clone();
2056 let b_vec = guard
2057 .get(&b.buffer_id)
2058 .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", b.buffer_id))?
2059 .clone();
2060 (a_vec, b_vec)
2061 };
2062
2063 let shape = runtime_broadcast_shapes("ne", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2064 let total: usize = shape.iter().copied().product();
2065 if total == 0 {
2066 return Ok(self.allocate_tensor(Vec::new(), shape));
2067 }
2068
2069 let strides_a = runtime_compute_strides(&a.shape);
2070 let strides_b = runtime_compute_strides(&b.shape);
2071 let len_a = adata.len();
2072 let len_b = bdata.len();
2073
2074 let mut out = Vec::with_capacity(total);
2075 for idx in 0..total {
2076 let lhs = if len_a == 0 {
2077 0.0
2078 } else {
2079 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2080 *adata.get(offset).unwrap_or(&0.0)
2081 };
2082 let rhs = if len_b == 0 {
2083 0.0
2084 } else {
2085 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2086 *bdata.get(offset).unwrap_or(&0.0)
2087 };
2088 out.push(if lhs != rhs { 1.0 } else { 0.0 });
2089 }
2090
2091 Ok(self.allocate_tensor(out, shape))
2092 }
2093
2094 fn elem_ge(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2095 let (adata, bdata) = {
2096 let guard = registry().lock().unwrap();
2097 let a_vec = guard
2098 .get(&a.buffer_id)
2099 .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", a.buffer_id))?
2100 .clone();
2101 let b_vec = guard
2102 .get(&b.buffer_id)
2103 .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", b.buffer_id))?
2104 .clone();
2105 (a_vec, b_vec)
2106 };
2107
2108 let shape = runtime_broadcast_shapes("ge", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2109 let total: usize = shape.iter().copied().product();
2110 if total == 0 {
2111 return Ok(self.allocate_tensor(Vec::new(), shape));
2112 }
2113
2114 let strides_a = runtime_compute_strides(&a.shape);
2115 let strides_b = runtime_compute_strides(&b.shape);
2116 let len_a = adata.len();
2117 let len_b = bdata.len();
2118
2119 let mut out = Vec::with_capacity(total);
2120 for idx in 0..total {
2121 let lhs = if len_a == 0 {
2122 0.0
2123 } else {
2124 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2125 *adata.get(offset).unwrap_or(&0.0)
2126 };
2127 let rhs = if len_b == 0 {
2128 0.0
2129 } else {
2130 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2131 *bdata.get(offset).unwrap_or(&0.0)
2132 };
2133 out.push(if lhs >= rhs { 1.0 } else { 0.0 });
2134 }
2135
2136 Ok(self.allocate_tensor(out, shape))
2137 }
2138
2139 fn elem_le(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2140 let (adata, bdata) = {
2141 let guard = registry().lock().unwrap();
2142 let a_vec = guard
2143 .get(&a.buffer_id)
2144 .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", a.buffer_id))?
2145 .clone();
2146 let b_vec = guard
2147 .get(&b.buffer_id)
2148 .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", b.buffer_id))?
2149 .clone();
2150 (a_vec, b_vec)
2151 };
2152
2153 let shape = runtime_broadcast_shapes("le", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2154 let total: usize = shape.iter().copied().product();
2155 if total == 0 {
2156 return Ok(self.allocate_tensor(Vec::new(), shape));
2157 }
2158
2159 let strides_a = runtime_compute_strides(&a.shape);
2160 let strides_b = runtime_compute_strides(&b.shape);
2161 let len_a = adata.len();
2162 let len_b = bdata.len();
2163
2164 let mut out = Vec::with_capacity(total);
2165 for idx in 0..total {
2166 let lhs = if len_a == 0 {
2167 0.0
2168 } else {
2169 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2170 *adata.get(offset).unwrap_or(&0.0)
2171 };
2172 let rhs = if len_b == 0 {
2173 0.0
2174 } else {
2175 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2176 *bdata.get(offset).unwrap_or(&0.0)
2177 };
2178 out.push(if lhs <= rhs { 1.0 } else { 0.0 });
2179 }
2180
2181 Ok(self.allocate_tensor(out, shape))
2182 }
2183
2184 fn elem_lt(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2185 let (adata, bdata) = {
2186 let guard = registry().lock().unwrap();
2187 let a_vec = guard
2188 .get(&a.buffer_id)
2189 .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", a.buffer_id))?
2190 .clone();
2191 let b_vec = guard
2192 .get(&b.buffer_id)
2193 .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", b.buffer_id))?
2194 .clone();
2195 (a_vec, b_vec)
2196 };
2197
2198 let shape = runtime_broadcast_shapes("lt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2199 let total: usize = shape.iter().copied().product();
2200 if total == 0 {
2201 return Ok(self.allocate_tensor(Vec::new(), shape));
2202 }
2203
2204 let strides_a = runtime_compute_strides(&a.shape);
2205 let strides_b = runtime_compute_strides(&b.shape);
2206 let len_a = adata.len();
2207 let len_b = bdata.len();
2208
2209 let mut out = Vec::with_capacity(total);
2210 for idx in 0..total {
2211 let lhs = if len_a == 0 {
2212 0.0
2213 } else {
2214 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2215 *adata.get(offset).unwrap_or(&0.0)
2216 };
2217 let rhs = if len_b == 0 {
2218 0.0
2219 } else {
2220 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2221 *bdata.get(offset).unwrap_or(&0.0)
2222 };
2223 out.push(if lhs < rhs { 1.0 } else { 0.0 });
2224 }
2225
2226 Ok(self.allocate_tensor(out, shape))
2227 }
2228
2229 fn elem_gt(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2230 let (adata, bdata) = {
2231 let guard = registry().lock().unwrap();
2232 let a_vec = guard
2233 .get(&a.buffer_id)
2234 .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", a.buffer_id))?
2235 .clone();
2236 let b_vec = guard
2237 .get(&b.buffer_id)
2238 .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", b.buffer_id))?
2239 .clone();
2240 (a_vec, b_vec)
2241 };
2242
2243 let shape = runtime_broadcast_shapes("gt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2244 let total: usize = shape.iter().copied().product();
2245 if total == 0 {
2246 return Ok(self.allocate_tensor(Vec::new(), shape));
2247 }
2248
2249 let strides_a = runtime_compute_strides(&a.shape);
2250 let strides_b = runtime_compute_strides(&b.shape);
2251 let len_a = adata.len();
2252 let len_b = bdata.len();
2253
2254 let mut out = Vec::with_capacity(total);
2255 for idx in 0..total {
2256 let lhs = if len_a == 0 {
2257 0.0
2258 } else {
2259 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2260 *adata.get(offset).unwrap_or(&0.0)
2261 };
2262 let rhs = if len_b == 0 {
2263 0.0
2264 } else {
2265 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2266 *bdata.get(offset).unwrap_or(&0.0)
2267 };
2268 out.push(if lhs > rhs { 1.0 } else { 0.0 });
2269 }
2270
2271 Ok(self.allocate_tensor(out, shape))
2272 }
2273
2274 fn elem_eq(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2275 let (adata, bdata) = {
2276 let guard = registry().lock().unwrap();
2277 let a_vec = guard
2278 .get(&a.buffer_id)
2279 .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", a.buffer_id))?
2280 .clone();
2281 let b_vec = guard
2282 .get(&b.buffer_id)
2283 .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", b.buffer_id))?
2284 .clone();
2285 (a_vec, b_vec)
2286 };
2287
2288 let shape = runtime_broadcast_shapes("eq", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2289 let total: usize = shape.iter().copied().product();
2290 if total == 0 {
2291 return Ok(self.allocate_tensor(Vec::new(), shape));
2292 }
2293
2294 let strides_a = runtime_compute_strides(&a.shape);
2295 let strides_b = runtime_compute_strides(&b.shape);
2296 let len_a = adata.len();
2297 let len_b = bdata.len();
2298
2299 let mut out = Vec::with_capacity(total);
2300 for idx in 0..total {
2301 let lhs = if len_a == 0 {
2302 0.0
2303 } else {
2304 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2305 *adata.get(offset).unwrap_or(&0.0)
2306 };
2307 let rhs = if len_b == 0 {
2308 0.0
2309 } else {
2310 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2311 *bdata.get(offset).unwrap_or(&0.0)
2312 };
2313 out.push(if lhs == rhs { 1.0 } else { 0.0 });
2314 }
2315
2316 Ok(self.allocate_tensor(out, shape))
2317 }
2318
2319 fn logical_and(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2320 let (adata, bdata) = {
2321 let guard = registry().lock().unwrap();
2322 let a_vec = guard
2323 .get(&a.buffer_id)
2324 .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", a.buffer_id))?
2325 .clone();
2326 let b_vec = guard
2327 .get(&b.buffer_id)
2328 .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", b.buffer_id))?
2329 .clone();
2330 (a_vec, b_vec)
2331 };
2332
2333 let shape =
2334 runtime_broadcast_shapes("logical_and", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2335 let total: usize = shape.iter().copied().product();
2336 if total == 0 {
2337 return Ok(self.allocate_tensor(Vec::new(), shape));
2338 }
2339
2340 let strides_a = runtime_compute_strides(&a.shape);
2341 let strides_b = runtime_compute_strides(&b.shape);
2342 let len_a = adata.len();
2343 let len_b = bdata.len();
2344
2345 let mut out = Vec::with_capacity(total);
2346 for idx in 0..total {
2347 let lhs = if len_a == 0 {
2348 0.0
2349 } else {
2350 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2351 *adata.get(offset).unwrap_or(&0.0)
2352 };
2353 let rhs = if len_b == 0 {
2354 0.0
2355 } else {
2356 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2357 *bdata.get(offset).unwrap_or(&0.0)
2358 };
2359 out.push(if lhs != 0.0 && rhs != 0.0 { 1.0 } else { 0.0 });
2360 }
2361
2362 Ok(self.allocate_tensor(out, shape))
2363 }
2364
2365 fn logical_or(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2366 let (adata, bdata) = {
2367 let guard = registry().lock().unwrap();
2368 let a_vec = guard
2369 .get(&a.buffer_id)
2370 .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", a.buffer_id))?
2371 .clone();
2372 let b_vec = guard
2373 .get(&b.buffer_id)
2374 .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", b.buffer_id))?
2375 .clone();
2376 (a_vec, b_vec)
2377 };
2378
2379 let shape =
2380 runtime_broadcast_shapes("logical_or", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2381 let total: usize = shape.iter().copied().product();
2382 if total == 0 {
2383 return Ok(self.allocate_tensor(Vec::new(), shape));
2384 }
2385
2386 let strides_a = runtime_compute_strides(&a.shape);
2387 let strides_b = runtime_compute_strides(&b.shape);
2388 let len_a = adata.len();
2389 let len_b = bdata.len();
2390
2391 let mut out = Vec::with_capacity(total);
2392 for idx in 0..total {
2393 let lhs = if len_a == 0 {
2394 0.0
2395 } else {
2396 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2397 *adata.get(offset).unwrap_or(&0.0)
2398 };
2399 let rhs = if len_b == 0 {
2400 0.0
2401 } else {
2402 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2403 *bdata.get(offset).unwrap_or(&0.0)
2404 };
2405 out.push(if lhs != 0.0 || rhs != 0.0 { 1.0 } else { 0.0 });
2406 }
2407
2408 Ok(self.allocate_tensor(out, shape))
2409 }
2410
2411 fn logical_xor(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2412 let (adata, bdata) = {
2413 let guard = registry().lock().unwrap();
2414 let a_vec = guard
2415 .get(&a.buffer_id)
2416 .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", a.buffer_id))?
2417 .clone();
2418 let b_vec = guard
2419 .get(&b.buffer_id)
2420 .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", b.buffer_id))?
2421 .clone();
2422 (a_vec, b_vec)
2423 };
2424
2425 let shape =
2426 runtime_broadcast_shapes("logical_xor", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2427 let total: usize = shape.iter().copied().product();
2428 if total == 0 {
2429 return Ok(self.allocate_tensor(Vec::new(), shape));
2430 }
2431
2432 let strides_a = runtime_compute_strides(&a.shape);
2433 let strides_b = runtime_compute_strides(&b.shape);
2434 let len_a = adata.len();
2435 let len_b = bdata.len();
2436
2437 let mut out = Vec::with_capacity(total);
2438 for idx in 0..total {
2439 let lhs = if len_a == 0 {
2440 0.0
2441 } else {
2442 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2443 *adata.get(offset).unwrap_or(&0.0)
2444 };
2445 let rhs = if len_b == 0 {
2446 0.0
2447 } else {
2448 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2449 *bdata.get(offset).unwrap_or(&0.0)
2450 };
2451 let lhs_true = lhs != 0.0;
2452 let rhs_true = rhs != 0.0;
2453 out.push(if lhs_true ^ rhs_true { 1.0 } else { 0.0 });
2454 }
2455
2456 Ok(self.allocate_tensor(out, shape))
2457 }
2458
2459 fn logical_not(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2460 let data = {
2461 let guard = registry().lock().unwrap();
2462 guard
2463 .get(&a.buffer_id)
2464 .ok_or_else(|| anyhow!("logical_not: unknown buffer {}", a.buffer_id))?
2465 .clone()
2466 };
2467
2468 let shape = a.shape.clone();
2469 if data.is_empty() {
2470 return Ok(self.allocate_tensor(Vec::new(), shape));
2471 }
2472
2473 let mut out = Vec::with_capacity(data.len());
2474 for value in data {
2475 out.push(if value == 0.0 { 1.0 } else { 0.0 });
2476 }
2477
2478 Ok(self.allocate_tensor(out, shape))
2479 }
2480
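// This provider only ever stores real f64 buffers, so isreal reduces to
// validating the handle against the registry.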
2481 fn logical_isreal(&self, a: &GpuTensorHandle) -> Result<bool> {
2482 {
2483 let guard = registry().lock().unwrap();
2484 guard
2485 .get(&a.buffer_id)
2486 .ok_or_else(|| anyhow!("logical_isreal: unknown buffer {}", a.buffer_id))?;
2487 }
2488 Ok(true)
2489 }
2490
2491 fn logical_isfinite(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2492 let data = {
2493 let guard = registry().lock().unwrap();
2494 guard
2495 .get(&a.buffer_id)
2496 .ok_or_else(|| anyhow!("logical_isfinite: unknown buffer {}", a.buffer_id))?
2497 .clone()
2498 };
2499
2500 let shape = a.shape.clone();
2501 if data.is_empty() {
2502 return Ok(self.allocate_tensor(Vec::new(), shape));
2503 }
2504
2505 let mut out = Vec::with_capacity(data.len());
2506 for value in data {
2507 out.push(if value.is_finite() { 1.0 } else { 0.0 });
2508 }
2509
2510 Ok(self.allocate_tensor(out, shape))
2511 }
2512
2513 fn logical_isnan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2514 let data = {
2515 let guard = registry().lock().unwrap();
2516 guard
2517 .get(&a.buffer_id)
2518 .ok_or_else(|| anyhow!("logical_isnan: unknown buffer {}", a.buffer_id))?
2519 .clone()
2520 };
2521
2522 let shape = a.shape.clone();
2523 if data.is_empty() {
2524 return Ok(self.allocate_tensor(Vec::new(), shape));
2525 }
2526
2527 let mut out = Vec::with_capacity(data.len());
2528 for value in data {
2529 out.push(if value.is_nan() { 1.0 } else { 0.0 });
2530 }
2531
2532 Ok(self.allocate_tensor(out, shape))
2533 }
2534
2535 fn logical_isinf(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2536 let data = {
2537 let guard = registry().lock().unwrap();
2538 guard
2539 .get(&a.buffer_id)
2540 .ok_or_else(|| anyhow!("logical_isinf: unknown buffer {}", a.buffer_id))?
2541 .clone()
2542 };
2543
2544 let shape = a.shape.clone();
2545 if data.is_empty() {
2546 return Ok(self.allocate_tensor(Vec::new(), shape));
2547 }
2548
2549 let mut out = Vec::with_capacity(data.len());
2550 for value in data {
2551 out.push(if value.is_infinite() { 1.0 } else { 0.0 });
2552 }
2553
2554 Ok(self.allocate_tensor(out, shape))
2555 }
2556
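// hypot uses f64::hypot, which computes sqrt(a^2 + b^2) without intermediate
// overflow or underflow; like the arithmetic kernels above it requires the two
// shapes to match exactly (no implicit expansion).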
2557 fn elem_hypot(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2558 let guard = registry().lock().unwrap();
2559 let abuf = guard
2560 .get(&a.buffer_id)
2561 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2562 let bbuf = guard
2563 .get(&b.buffer_id)
2564 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2565 if a.shape != b.shape {
2566 return Err(anyhow::anyhow!("shape mismatch"));
2567 }
2568 let mut out = vec![0.0; abuf.len()];
2569 for i in 0..abuf.len() {
2570 out[i] = abuf[i].hypot(bbuf[i]);
2571 }
2572 drop(guard);
2573 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2574 let mut guard2 = registry().lock().unwrap();
2575 guard2.insert(id, out);
2576 Ok(GpuTensorHandle {
2577 shape: a.shape.clone(),
2578 device_id: 0,
2579 buffer_id: id,
2580 })
2581 }
2582 fn elem_atan2(&self, y: &GpuTensorHandle, x: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2583 let guard = registry().lock().unwrap();
2584 let ybuf = guard
2585 .get(&y.buffer_id)
2586 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", y.buffer_id))?;
2587 let xbuf = guard
2588 .get(&x.buffer_id)
2589 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", x.buffer_id))?;
2590 if y.shape != x.shape {
2591 return Err(anyhow::anyhow!("shape mismatch"));
2592 }
2593 let mut out = vec![0.0; ybuf.len()];
2594 for idx in 0..ybuf.len() {
2595 out[idx] = ybuf[idx].atan2(xbuf[idx]);
2596 }
2597 drop(guard);
2598 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2599 let mut guard2 = registry().lock().unwrap();
2600 guard2.insert(id, out);
2601 Ok(GpuTensorHandle {
2602 shape: y.shape.clone(),
2603 device_id: 0,
2604 buffer_id: id,
2605 })
2606 }
2607
2608 fn unary_sin(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2609 let guard = registry().lock().unwrap();
2610 let abuf = guard
2611 .get(&a.buffer_id)
2612 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2613 let out: Vec<f64> = abuf.iter().map(|&x| x.sin()).collect();
2614 drop(guard);
2615 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2616 let mut guard2 = registry().lock().unwrap();
2617 guard2.insert(id, out);
2618 Ok(GpuTensorHandle {
2619 shape: a.shape.clone(),
2620 device_id: 0,
2621 buffer_id: id,
2622 })
2623 }
2624 fn unary_gamma(&self, _a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2625 Err(anyhow::anyhow!("unary_gamma not supported by provider"))
2626 }
2627 fn unary_factorial(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2628 let guard = registry().lock().unwrap();
2629 let abuf = guard
2630 .get(&a.buffer_id)
2631 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2632 let out: Vec<f64> = abuf.iter().copied().map(factorial_scalar_host).collect();
2633 drop(guard);
2634 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2635 let mut guard2 = registry().lock().unwrap();
2636 guard2.insert(id, out);
2637 Ok(GpuTensorHandle {
2638 shape: a.shape.clone(),
2639 device_id: 0,
2640 buffer_id: id,
2641 })
2642 }
2643 fn unary_asinh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2644 let guard = registry().lock().unwrap();
2645 let abuf = guard
2646 .get(&a.buffer_id)
2647 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2648 let out: Vec<f64> = abuf.iter().map(|&x| x.asinh()).collect();
2649 drop(guard);
2650 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2651 let mut guard2 = registry().lock().unwrap();
2652 guard2.insert(id, out);
2653 Ok(GpuTensorHandle {
2654 shape: a.shape.clone(),
2655 device_id: 0,
2656 buffer_id: id,
2657 })
2658 }
2659 fn unary_sinh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2660 let guard = registry().lock().unwrap();
2661 let abuf = guard
2662 .get(&a.buffer_id)
2663 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2664 let out: Vec<f64> = abuf.iter().map(|&x| x.sinh()).collect();
2665 drop(guard);
2666 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2667 let mut guard2 = registry().lock().unwrap();
2668 guard2.insert(id, out);
2669 Ok(GpuTensorHandle {
2670 shape: a.shape.clone(),
2671 device_id: 0,
2672 buffer_id: id,
2673 })
2674 }
2675 fn unary_cosh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2676 let guard = registry().lock().unwrap();
2677 let abuf = guard
2678 .get(&a.buffer_id)
2679 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2680 let out: Vec<f64> = abuf.iter().map(|&x| x.cosh()).collect();
2681 drop(guard);
2682 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2683 let mut guard2 = registry().lock().unwrap();
2684 guard2.insert(id, out);
2685 Ok(GpuTensorHandle {
2686 shape: a.shape.clone(),
2687 device_id: 0,
2688 buffer_id: id,
2689 })
2690 }
2691
2692 fn unary_asin(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2693 let guard = registry().lock().unwrap();
2694 let abuf = guard
2695 .get(&a.buffer_id)
2696 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2697 let out: Vec<f64> = abuf.iter().map(|&x| x.asin()).collect();
2698 drop(guard);
2699 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2700 let mut guard2 = registry().lock().unwrap();
2701 guard2.insert(id, out);
2702 Ok(GpuTensorHandle {
2703 shape: a.shape.clone(),
2704 device_id: 0,
2705 buffer_id: id,
2706 })
2707 }
2708 fn unary_acos(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2709 let guard = registry().lock().unwrap();
2710 let abuf = guard
2711 .get(&a.buffer_id)
2712 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2713 let out: Vec<f64> = abuf.iter().map(|&x| x.acos()).collect();
2714 drop(guard);
2715 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2716 let mut guard2 = registry().lock().unwrap();
2717 guard2.insert(id, out);
2718 Ok(GpuTensorHandle {
2719 shape: a.shape.clone(),
2720 device_id: 0,
2721 buffer_id: id,
2722 })
2723 }
2724 fn unary_acosh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2725 let guard = registry().lock().unwrap();
2726 let abuf = guard
2727 .get(&a.buffer_id)
2728 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2729 let out: Vec<f64> = abuf.iter().map(|&x| x.acosh()).collect();
2730 drop(guard);
2731 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2732 let mut guard2 = registry().lock().unwrap();
2733 guard2.insert(id, out);
2734 Ok(GpuTensorHandle {
2735 shape: a.shape.clone(),
2736 device_id: 0,
2737 buffer_id: id,
2738 })
2739 }
2740
2741 fn unary_tan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2742 let guard = registry().lock().unwrap();
2743 let abuf = guard
2744 .get(&a.buffer_id)
2745 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2746 let out: Vec<f64> = abuf.iter().map(|&x| x.tan()).collect();
2747 drop(guard);
2748 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2749 let mut guard2 = registry().lock().unwrap();
2750 guard2.insert(id, out);
2751 Ok(GpuTensorHandle {
2752 shape: a.shape.clone(),
2753 device_id: 0,
2754 buffer_id: id,
2755 })
2756 }
2757 fn unary_tanh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2758 let guard = registry().lock().unwrap();
2759 let abuf = guard
2760 .get(&a.buffer_id)
2761 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2762 let out: Vec<f64> = abuf.iter().map(|&x| x.tanh()).collect();
2763 drop(guard);
2764 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2765 let mut guard2 = registry().lock().unwrap();
2766 guard2.insert(id, out);
2767 Ok(GpuTensorHandle {
2768 shape: a.shape.clone(),
2769 device_id: 0,
2770 buffer_id: id,
2771 })
2772 }
2773
2774 fn unary_atan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2775 let guard = registry().lock().unwrap();
2776 let abuf = guard
2777 .get(&a.buffer_id)
2778 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2779 let out: Vec<f64> = abuf.iter().map(|&x| x.atan()).collect();
2780 drop(guard);
2781 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2782 let mut guard2 = registry().lock().unwrap();
2783 guard2.insert(id, out);
2784 Ok(GpuTensorHandle {
2785 shape: a.shape.clone(),
2786 device_id: 0,
2787 buffer_id: id,
2788 })
2789 }
2790 fn unary_atanh(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2791 let guard = registry().lock().unwrap();
2792 let abuf = guard
2793 .get(&a.buffer_id)
2794 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2795 let out: Vec<f64> = abuf.iter().map(|&x| x.atanh()).collect();
2796 drop(guard);
2797 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2798 let mut guard2 = registry().lock().unwrap();
2799 guard2.insert(id, out);
2800 Ok(GpuTensorHandle {
2801 shape: a.shape.clone(),
2802 device_id: 0,
2803 buffer_id: id,
2804 })
2805 }
2806
2807 fn unary_ceil(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2808 let guard = registry().lock().unwrap();
2809 let abuf = guard
2810 .get(&a.buffer_id)
2811 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2812 let out: Vec<f64> = abuf.iter().map(|&x| x.ceil()).collect();
2813 drop(guard);
2814 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2815 let mut guard2 = registry().lock().unwrap();
2816 guard2.insert(id, out);
2817 Ok(GpuTensorHandle {
2818 shape: a.shape.clone(),
2819 device_id: 0,
2820 buffer_id: id,
2821 })
2822 }
2823
2824 fn unary_floor(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2825 let guard = registry().lock().unwrap();
2826 let abuf = guard
2827 .get(&a.buffer_id)
2828 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2829 let out: Vec<f64> = abuf.iter().map(|&x| x.floor()).collect();
2830 drop(guard);
2831 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2832 let mut guard2 = registry().lock().unwrap();
2833 guard2.insert(id, out);
2834 Ok(GpuTensorHandle {
2835 shape: a.shape.clone(),
2836 device_id: 0,
2837 buffer_id: id,
2838 })
2839 }
2840
2841 fn unary_round(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2842 let guard = registry().lock().unwrap();
2843 let abuf = guard
2844 .get(&a.buffer_id)
2845 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2846 let out: Vec<f64> = abuf.iter().map(|&x| x.round()).collect();
2847 drop(guard);
2848 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2849 let mut guard2 = registry().lock().unwrap();
2850 guard2.insert(id, out);
2851 Ok(GpuTensorHandle {
2852 shape: a.shape.clone(),
2853 device_id: 0,
2854 buffer_id: id,
2855 })
2856 }
2857
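// fix rounds toward zero; the extra branch only normalizes a -0.0 truncation
// result to +0.0, while NaN and +/-Inf pass through unchanged.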
2858 fn unary_fix(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2859 let guard = registry().lock().unwrap();
2860 let abuf = guard
2861 .get(&a.buffer_id)
2862 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2863 let out: Vec<f64> = abuf
2864 .iter()
2865 .map(|&x| {
2866 if !x.is_finite() {
2867 x
2868 } else {
2869 let truncated = x.trunc();
2870 if truncated == 0.0 {
2871 0.0
2872 } else {
2873 truncated
2874 }
2875 }
2876 })
2877 .collect();
2878 drop(guard);
2879 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2880 let mut guard2 = registry().lock().unwrap();
2881 guard2.insert(id, out);
2882 Ok(GpuTensorHandle {
2883 shape: a.shape.clone(),
2884 device_id: 0,
2885 buffer_id: id,
2886 })
2887 }
2888
2889 fn unary_cos(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2890 let guard = registry().lock().unwrap();
2891 let abuf = guard
2892 .get(&a.buffer_id)
2893 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2894 let out: Vec<f64> = abuf.iter().map(|&x| x.cos()).collect();
2895 drop(guard);
2896 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2897 let mut guard2 = registry().lock().unwrap();
2898 guard2.insert(id, out);
2899 Ok(GpuTensorHandle {
2900 shape: a.shape.clone(),
2901 device_id: 0,
2902 buffer_id: id,
2903 })
2904 }
2905
2906 fn unary_abs(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2907 let guard = registry().lock().unwrap();
2908 let abuf = guard
2909 .get(&a.buffer_id)
2910 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2911 let out: Vec<f64> = abuf.iter().map(|&x| x.abs()).collect();
2912 drop(guard);
2913 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2914 let mut guard2 = registry().lock().unwrap();
2915 guard2.insert(id, out);
2916 Ok(GpuTensorHandle {
2917 shape: a.shape.clone(),
2918 device_id: 0,
2919 buffer_id: id,
2920 })
2921 }
2922
2923 fn unary_exp(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2924 let guard = registry().lock().unwrap();
2925 let abuf = guard
2926 .get(&a.buffer_id)
2927 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2928 let out: Vec<f64> = abuf.iter().map(|&x| x.exp()).collect();
2929 drop(guard);
2930 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2931 let mut guard2 = registry().lock().unwrap();
2932 guard2.insert(id, out);
2933 Ok(GpuTensorHandle {
2934 shape: a.shape.clone(),
2935 device_id: 0,
2936 buffer_id: id,
2937 })
2938 }
2939
2940 fn unary_log(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2941 let guard = registry().lock().unwrap();
2942 let abuf = guard
2943 .get(&a.buffer_id)
2944 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2945 let out: Vec<f64> = abuf.iter().map(|&x| x.ln()).collect();
2946 drop(guard);
2947 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2948 let mut guard2 = registry().lock().unwrap();
2949 guard2.insert(id, out);
2950 Ok(GpuTensorHandle {
2951 shape: a.shape.clone(),
2952 device_id: 0,
2953 buffer_id: id,
2954 })
2955 }
2956
2957 fn unary_sqrt(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2958 let guard = registry().lock().unwrap();
2959 let abuf = guard
2960 .get(&a.buffer_id)
2961 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2962 let out: Vec<f64> = abuf.iter().map(|&x| x.sqrt()).collect();
2963 drop(guard);
2964 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2965 let mut guard2 = registry().lock().unwrap();
2966 guard2.insert(id, out);
2967 Ok(GpuTensorHandle {
2968 shape: a.shape.clone(),
2969 device_id: 0,
2970 buffer_id: id,
2971 })
2972 }
2973
2974 fn unary_double(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2975 let guard = registry().lock().unwrap();
2976 let abuf = guard
2977 .get(&a.buffer_id)
2978 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
2979 .clone();
2980 drop(guard);
2981 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2982 let mut guard2 = registry().lock().unwrap();
2983 guard2.insert(id, abuf);
2984 Ok(GpuTensorHandle {
2985 shape: a.shape.clone(),
2986 device_id: 0,
2987 buffer_id: id,
2988 })
2989 }
2990
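// single is emulated by rounding through f32 and widening back to f64, so
// values keep only single-precision significand bits while the registry stays
// a Vec<f64>.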
2991 fn unary_single(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2992 let guard = registry().lock().unwrap();
2993 let abuf = guard
2994 .get(&a.buffer_id)
2995 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2996 let out: Vec<f64> = abuf.iter().map(|&x| (x as f32) as f64).collect();
2997 drop(guard);
2998 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2999 let mut guard2 = registry().lock().unwrap();
3000 guard2.insert(id, out);
3001 Ok(GpuTensorHandle {
3002 shape: a.shape.clone(),
3003 device_id: 0,
3004 buffer_id: id,
3005 })
3006 }
3007
3008 fn unary_pow2(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3009 let guard = registry().lock().unwrap();
3010 let abuf = guard
3011 .get(&a.buffer_id)
3012 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3013 let out: Vec<f64> = abuf.iter().map(|&x| x.exp2()).collect();
3014 drop(guard);
3015 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3016 let mut guard2 = registry().lock().unwrap();
3017 guard2.insert(id, out);
3018 Ok(GpuTensorHandle {
3019 shape: a.shape.clone(),
3020 device_id: 0,
3021 buffer_id: id,
3022 })
3023 }
3024
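// Two-argument pow2: like ldexp, the result is mantissa .* 2 .^ exponent
// computed element-wise; shapes must match exactly.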
3025 fn pow2_scale(
3026 &self,
3027 mantissa: &GpuTensorHandle,
3028 exponent: &GpuTensorHandle,
3029 ) -> Result<GpuTensorHandle> {
3030 let guard = registry().lock().unwrap();
3031 let mbuf = guard
3032 .get(&mantissa.buffer_id)
3033 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", mantissa.buffer_id))?;
3034 let ebuf = guard
3035 .get(&exponent.buffer_id)
3036 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", exponent.buffer_id))?;
3037 if mantissa.shape != exponent.shape {
3038 return Err(anyhow::anyhow!("shape mismatch"));
3039 }
3040 let mut out = vec![0.0f64; mbuf.len()];
3041 for i in 0..mbuf.len() {
3042 out[i] = mbuf[i] * ebuf[i].exp2();
3043 }
3044 drop(guard);
3045 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3046 let mut guard2 = registry().lock().unwrap();
3047 guard2.insert(id, out);
3048 Ok(GpuTensorHandle {
3049 shape: mantissa.shape.clone(),
3050 device_id: 0,
3051 buffer_id: id,
3052 })
3053 }
3054
3055 fn scalar_add(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3056 let guard = registry().lock().unwrap();
3057 let abuf = guard
3058 .get(&a.buffer_id)
3059 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3060 let out: Vec<f64> = abuf.iter().map(|&x| x + scalar).collect();
3061 drop(guard);
3062 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3063 let mut guard2 = registry().lock().unwrap();
3064 guard2.insert(id, out);
3065 Ok(GpuTensorHandle {
3066 shape: a.shape.clone(),
3067 device_id: 0,
3068 buffer_id: id,
3069 })
3070 }
3071
3072 fn scalar_sub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3073 let guard = registry().lock().unwrap();
3074 let abuf = guard
3075 .get(&a.buffer_id)
3076 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3077 let out: Vec<f64> = abuf.iter().map(|&x| x - scalar).collect();
3078 drop(guard);
3079 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3080 let mut guard2 = registry().lock().unwrap();
3081 guard2.insert(id, out);
3082 Ok(GpuTensorHandle {
3083 shape: a.shape.clone(),
3084 device_id: 0,
3085 buffer_id: id,
3086 })
3087 }
3088
3089 fn scalar_mul(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3090 let guard = registry().lock().unwrap();
3091 let abuf = guard
3092 .get(&a.buffer_id)
3093 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3094 let out: Vec<f64> = abuf.iter().map(|&x| x * scalar).collect();
3095 drop(guard);
3096 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3097 let mut guard2 = registry().lock().unwrap();
3098 guard2.insert(id, out);
3099 Ok(GpuTensorHandle {
3100 shape: a.shape.clone(),
3101 device_id: 0,
3102 buffer_id: id,
3103 })
3104 }
3105
3106 fn scalar_div(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3107 let guard = registry().lock().unwrap();
3108 let abuf = guard
3109 .get(&a.buffer_id)
3110 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3111 let out: Vec<f64> = abuf.iter().map(|&x| x / scalar).collect();
3116 drop(guard);
3117 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3118 let mut guard2 = registry().lock().unwrap();
3119 guard2.insert(id, out);
3120 Ok(GpuTensorHandle {
3121 shape: a.shape.clone(),
3122 device_id: 0,
3123 buffer_id: id,
3124 })
3125 }
3126
3127 fn scalar_rsub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3128 let guard = registry().lock().unwrap();
3130 let abuf = guard
3131 .get(&a.buffer_id)
3132 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3133 let out: Vec<f64> = abuf.iter().map(|&x| scalar - x).collect();
3134 drop(guard);
3135 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3136 let mut guard2 = registry().lock().unwrap();
3137 guard2.insert(id, out);
3138 Ok(GpuTensorHandle {
3139 shape: a.shape.clone(),
3140 device_id: 0,
3141 buffer_id: id,
3142 })
3143 }
3144
3145 fn scalar_rdiv(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3146 let guard = registry().lock().unwrap();
3148 let abuf = guard
3149 .get(&a.buffer_id)
3150 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3151 let out: Vec<f64> = abuf.iter().map(|&x| scalar / x).collect();
3161 drop(guard);
3162 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3163 let mut guard2 = registry().lock().unwrap();
3164 guard2.insert(id, out);
3165 Ok(GpuTensorHandle {
3166 shape: a.shape.clone(),
3167 device_id: 0,
3168 buffer_id: id,
3169 })
3170 }
3171
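// Column-major transpose: element (i, j) of an m-by-n matrix lives at linear
// index i + j*m, and its transposed position (j, i) in the n-by-m result is
// j + i*n, which is exactly the src/dst mapping in the loop below.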
3172 fn transpose(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3173 if a.shape.len() != 2 {
3174 return Err(anyhow::anyhow!("transpose: only 2D supported"));
3175 }
3176 let rows = a.shape[0];
3177 let cols = a.shape[1];
3178 let guard = registry().lock().unwrap();
3179 let abuf = guard
3180 .get(&a.buffer_id)
3181 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3182 let mut out = vec![0.0; abuf.len()];
3183 for i in 0..rows {
3184 for j in 0..cols {
3185 let src = i + j * rows;
3186 let dst = j + i * cols;
3187 out[dst] = abuf[src];
3188 }
3189 }
3190 drop(guard);
3191 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3192 let mut guard2 = registry().lock().unwrap();
3193 guard2.insert(id, out);
3194 Ok(GpuTensorHandle {
3195 shape: vec![cols, rows],
3196 device_id: 0,
3197 buffer_id: id,
3198 })
3199 }
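// conv1d output lengths: a full convolution of an n-sample signal with a
// k-sample kernel has n + k - 1 samples; 'same' presumably keeps the centered
// n samples and 'valid' keeps the n - k + 1 samples that never touch the
// implicit zero padding (empty when n < k, handled by the early return below).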
3200 fn conv1d(
3201 &self,
3202 signal: &GpuTensorHandle,
3203 kernel: &GpuTensorHandle,
3204 options: ProviderConv1dOptions,
3205 ) -> Result<GpuTensorHandle> {
3206 let signal_len: usize = signal.shape.iter().copied().product();
3207 let kernel_len: usize = kernel.shape.iter().copied().product();
3208
3209 if signal_len == 0 || kernel_len == 0 {
3210 let shape = conv1d_output_shape(0, options.orientation);
3211 return Ok(self.allocate_tensor(Vec::new(), shape));
3212 }
3213
3214 if matches!(options.mode, ProviderConvMode::Valid) && signal_len < kernel_len {
3215 let shape = conv1d_output_shape(0, options.orientation);
3216 return Ok(self.allocate_tensor(Vec::new(), shape));
3217 }
3218
3219 let (signal_data, kernel_data) = {
3220 let guard = registry().lock().unwrap();
3221 let signal_buf = guard
3222 .get(&signal.buffer_id)
3223 .cloned()
3224 .ok_or_else(|| anyhow!("conv1d: unknown signal buffer {}", signal.buffer_id))?;
3225 let kernel_buf = guard
3226 .get(&kernel.buffer_id)
3227 .cloned()
3228 .ok_or_else(|| anyhow!("conv1d: unknown kernel buffer {}", kernel.buffer_id))?;
3229 (signal_buf, kernel_buf)
3230 };
3231
3232 ensure!(
3233 signal_data.len() == signal_len,
3234 "conv1d: signal length mismatch (shape implies {}, buffer has {})",
3235 signal_len,
3236 signal_data.len()
3237 );
3238 ensure!(
3239 kernel_data.len() == kernel_len,
3240 "conv1d: kernel length mismatch (shape implies {}, buffer has {})",
3241 kernel_len,
3242 kernel_data.len()
3243 );
3244
3245 let full = convolve_real(&signal_data, &kernel_data)?;
3246 let shaped = apply_conv_mode_real(&full, options.mode, signal_len, kernel_len);
3247 let shape = conv1d_output_shape(shaped.len(), options.orientation);
3248 Ok(self.allocate_tensor(shaped, shape))
3249 }
3250 fn conv2d(
3251 &self,
3252 signal: &GpuTensorHandle,
3253 kernel: &GpuTensorHandle,
3254 mode: ProviderConvMode,
3255 ) -> Result<GpuTensorHandle> {
3256 ensure_diag_shape("conv2d", &signal.shape)?;
3257 ensure_diag_shape("conv2d", &kernel.shape)?;
3258 let (signal_rows, signal_cols) = rows_cols(&signal.shape);
3259 let (kernel_rows, kernel_cols) = rows_cols(&kernel.shape);
3260
3261 let signal_len: usize = signal.shape.iter().copied().product();
3262 let kernel_len: usize = kernel.shape.iter().copied().product();
3263
3264 let empty_shape = match mode {
3265 ProviderConvMode::Full | ProviderConvMode::Valid => vec![0, 0],
3266 ProviderConvMode::Same => vec![signal_rows, signal_cols],
3267 };
3268
3269 if signal_len == 0
3270 || kernel_len == 0
3271 || signal_rows == 0
3272 || signal_cols == 0
3273 || kernel_rows == 0
3274 || kernel_cols == 0
3275 {
3276 let len = empty_shape.iter().copied().product::<usize>();
3277 return Ok(self.allocate_tensor(vec![0.0; len], empty_shape));
3278 }
3279
3280 let (signal_data, kernel_data) = {
3281 let guard = registry().lock().unwrap();
3282 let signal_buf = guard
3283 .get(&signal.buffer_id)
3284 .cloned()
3285 .ok_or_else(|| anyhow!("conv2d: unknown signal buffer {}", signal.buffer_id))?;
3286 let kernel_buf = guard
3287 .get(&kernel.buffer_id)
3288 .cloned()
3289 .ok_or_else(|| anyhow!("conv2d: unknown kernel buffer {}", kernel.buffer_id))?;
3290 (signal_buf, kernel_buf)
3291 };
3292
3293 ensure!(
3294 signal_data.len() == signal_len,
3295 "conv2d: signal length mismatch (shape implies {}, buffer has {})",
3296 signal_len,
3297 signal_data.len()
3298 );
3299 ensure!(
3300 kernel_data.len() == kernel_len,
3301 "conv2d: kernel length mismatch (shape implies {}, buffer has {})",
3302 kernel_len,
3303 kernel_data.len()
3304 );
3305
3306 let full_rows = signal_rows
3307 .checked_add(kernel_rows)
3308 .and_then(|v| v.checked_sub(1))
3309 .ok_or_else(|| anyhow!("conv2d: row size overflow"))?;
3310 let full_cols = signal_cols
3311 .checked_add(kernel_cols)
3312 .and_then(|v| v.checked_sub(1))
3313 .ok_or_else(|| anyhow!("conv2d: column size overflow"))?;
3314 full_rows
3315 .checked_mul(full_cols)
3316 .ok_or_else(|| anyhow!("conv2d: output size overflow"))?;
3317
3318 let full = conv2d_full_real(
3319 &signal_data,
3320 signal_rows,
3321 signal_cols,
3322 &kernel_data,
3323 kernel_rows,
3324 kernel_cols,
3325 full_rows,
3326 full_cols,
3327 );
3328
3329 let (shaped, out_rows, out_cols) = apply_conv2_mode_real_2d(
3330 &full,
3331 full_rows,
3332 full_cols,
3333 mode,
3334 signal_rows,
3335 signal_cols,
3336 kernel_rows,
3337 kernel_cols,
3338 );
3339 Ok(self.allocate_tensor(shaped, vec![out_rows, out_cols]))
3340 }
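// iir_filter implements a Direct Form II Transposed filter. With coefficients
// normalized by a(1):
//   y[n]   = b[0]*x[n] + z[0]
//   z[i-1] = b[i]*x[n] + z[i] - a[i]*y[n]   for i = 1 .. order-1 (inclusive),
//            with z[order-1] taken as 0.
// Every channel (the product of all dimensions except `dim`) keeps its own
// state slice of length max(numel(b), numel(a)) - 1.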
3341 fn iir_filter(
3342 &self,
3343 b: &GpuTensorHandle,
3344 a: &GpuTensorHandle,
3345 x: &GpuTensorHandle,
3346 options: ProviderIirFilterOptions,
3347 ) -> Result<ProviderIirFilterResult> {
3348 let ProviderIirFilterOptions { dim, zi } = options;
3349
3350 let nb = product(&b.shape);
3351 let na = product(&a.shape);
3352 ensure!(
3353 nb > 0,
3354 "iir_filter: numerator coefficients must not be empty"
3355 );
3356 ensure!(
3357 na > 0,
3358 "iir_filter: denominator coefficients must not be empty"
3359 );
3360
3361 let signal_elems = product(&x.shape);
3362 let zi_shape = zi.as_ref().map(|handle| handle.shape.clone());
3363
3364 let (b_data, a_data, x_data, zi_data) =
3365 {
3366 let guard = registry().lock().unwrap();
3367 let b_buf = guard.get(&b.buffer_id).cloned().ok_or_else(|| {
3368 anyhow!("iir_filter: unknown numerator buffer {}", b.buffer_id)
3369 })?;
3370 let a_buf = guard.get(&a.buffer_id).cloned().ok_or_else(|| {
3371 anyhow!("iir_filter: unknown denominator buffer {}", a.buffer_id)
3372 })?;
3373 let x_buf = guard
3374 .get(&x.buffer_id)
3375 .cloned()
3376 .ok_or_else(|| anyhow!("iir_filter: unknown signal buffer {}", x.buffer_id))?;
3377 ensure!(
3378 b_buf.len() == nb,
3379 "iir_filter: numerator length mismatch (shape implies {}, buffer has {})",
3380 nb,
3381 b_buf.len()
3382 );
3383 ensure!(
3384 a_buf.len() == na,
3385 "iir_filter: denominator length mismatch (shape implies {}, buffer has {})",
3386 na,
3387 a_buf.len()
3388 );
3389 ensure!(
3390 x_buf.len() == signal_elems,
3391 "iir_filter: signal length mismatch (shape implies {}, buffer has {})",
3392 signal_elems,
3393 x_buf.len()
3394 );
3395 let zi_buf = if let Some(ref zi_handle) = zi {
3396 Some(guard.get(&zi_handle.buffer_id).cloned().ok_or_else(|| {
3397 anyhow!(
3398 "iir_filter: unknown initial state buffer {}",
3399 zi_handle.buffer_id
3400 )
3401 })?)
3402 } else {
3403 None
3404 };
3405 (b_buf, a_buf, x_buf, zi_buf)
3406 };
3407
3408 ensure!(
3409 !a_data.is_empty() && a_data[0] != 0.0,
3410 "iir_filter: denominator coefficient a(1) must be non-zero"
3411 );
3412
3413 let mut shape_ext = x.shape.clone();
3414 if dim >= shape_ext.len() {
3415 shape_ext.extend(std::iter::repeat_n(1, dim + 1 - shape_ext.len()));
3416 }
3417 let dim_idx = dim;
3418 let dim_len = shape_ext.get(dim_idx).copied().unwrap_or(1);
3419
3420 let leading = if dim_idx == 0 {
3421 1
3422 } else {
3423 shape_ext[..dim_idx].iter().copied().product()
3424 };
3425 let trailing = if dim_idx + 1 >= shape_ext.len() {
3426 1
3427 } else {
3428 shape_ext[dim_idx + 1..].iter().copied().product()
3429 };
3430 let channel_count = leading * trailing;
3431
3432 let order = nb.max(na);
3433 let state_len = order.saturating_sub(1);
3434 let state_shape = filter_state_shape(shape_ext.clone(), dim_idx, state_len);
3435 let expected_states = state_len.saturating_mul(channel_count);
3436
3437 if let Some(ref shape) = zi_shape {
3438 ensure!(
3439 shapes_compatible(&state_shape, shape),
3440 "iir_filter: initial conditions are not compatible with the signal shape"
3441 );
3442 let zi_dim = if dim_idx < shape.len() {
3443 shape[dim_idx]
3444 } else {
3445 1
3446 };
3447 ensure!(
3448 zi_dim == state_len,
3449 "iir_filter: initial conditions must have {} states along dimension {}",
3450 state_len,
3451 dim + 1
3452 );
3453 }
3454
3455 let mut states = if state_len == 0 {
3456 Vec::new()
3457 } else if let Some(ref zi_buf) = zi_data {
3458 ensure!(
3459 zi_buf.len() == expected_states,
3460 "iir_filter: initial state vector length mismatch (expected {}, found {})",
3461 expected_states,
3462 zi_buf.len()
3463 );
3464 states_from_column_major(zi_buf, state_len, dim_idx, &shape_ext)
3465 } else {
3466 vec![0.0; expected_states]
3467 };
3468
3469 let mut b_norm = vec![0.0f64; order];
3470 let mut a_norm = vec![0.0f64; order];
3471 let a0 = a_data[0];
3472 for i in 0..order {
3473 let b_coeff = if i < nb { b_data[i] } else { 0.0 };
3474 b_norm[i] = b_coeff / a0;
3475 if i == 0 {
3476 a_norm[0] = 1.0;
3477 } else {
3478 let a_coeff = if i < na { a_data[i] } else { 0.0 };
3479 a_norm[i] = a_coeff / a0;
3480 }
3481 }
3482
3483 let mut output = vec![0.0f64; x_data.len()];
3484
3485 if state_len == 0 {
3486 let gain = b_norm[0];
3487 for (dst, &src) in output.iter_mut().zip(x_data.iter()) {
3488 *dst = gain * src;
3489 }
3490 } else if dim_len == 0 || channel_count == 0 {
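// intentionally empty: no samples along the filtered dimension, so the
// zero-initialized output is already correct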
3491 } else {
3493 for t in 0..trailing {
3494 let base = t
3495 .checked_mul(dim_len)
3496 .and_then(|v| v.checked_mul(leading))
3497 .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3498 for l in 0..leading {
3499 let channel_idx = t
3500 .checked_mul(leading)
3501 .and_then(|v| v.checked_add(l))
3502 .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3503 if channel_idx >= channel_count {
3504 continue;
3505 }
3506 let state_base = channel_idx
3507 .checked_mul(state_len)
3508 .ok_or_else(|| anyhow!("iir_filter: state index overflow"))?;
3509 for step in 0..dim_len {
3510 let idx = base
3511 .checked_add(l)
3512 .and_then(|v| v.checked_add(step.saturating_mul(leading)))
3513 .ok_or_else(|| anyhow!("iir_filter: signal index overflow"))?;
3514 if idx >= x_data.len() {
3515 break;
3516 }
3517 let x_n = x_data[idx];
3518 let y = b_norm[0] * x_n + states.get(state_base).copied().unwrap_or(0.0);
3519 output[idx] = y;
3520 for i in 1..order {
3521 let next_state = if i < state_len {
3522 states[state_base + i]
3523 } else {
3524 0.0
3525 };
3526 let new_state = b_norm[i] * x_n + next_state - a_norm[i] * y;
3527 states[state_base + i - 1] = new_state;
3528 }
3529 }
3530 }
3531 }
3532 }
3533
3534 let final_state_data = if state_len == 0 {
3535 Vec::new()
3536 } else {
3537 states_to_column_major(&states, state_len, dim_idx, &shape_ext)
3538 };
3539
3540 let output_handle = self.allocate_tensor(output, x.shape.clone());
3541 let final_state_handle = self.allocate_tensor(final_state_data, state_shape);
3542
3543 Ok(ProviderIirFilterResult {
3544 output: output_handle,
3545 final_state: Some(final_state_handle),
3546 })
3547 }
3548 fn permute(&self, handle: &GpuTensorHandle, order: &[usize]) -> Result<GpuTensorHandle> {
3549 let data = {
3550 let guard = registry().lock().unwrap();
3551 guard
3552 .get(&handle.buffer_id)
3553 .ok_or_else(|| anyhow!("permute: unknown tensor handle {}", handle.buffer_id))?
3554 .clone()
3555 };
3556 let (permuted, new_shape) = permute_data(&data, &handle.shape, order)?;
3557 Ok(self.allocate_tensor(permuted, new_shape))
3558 }
3559
3560 fn flip(&self, handle: &GpuTensorHandle, axes: &[usize]) -> Result<GpuTensorHandle> {
3561 let data = {
3562 let guard = registry().lock().unwrap();
3563 guard
3564 .get(&handle.buffer_id)
3565 .ok_or_else(|| anyhow!("flip: unknown tensor handle {}", handle.buffer_id))?
3566 .clone()
3567 };
3568 let flipped = flip_data(&data, &handle.shape, axes)?;
3569 Ok(self.allocate_tensor(flipped, handle.shape.clone()))
3570 }
3571
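// circshift pads the shape with trailing singleton dimensions when more shifts
// than dimensions are supplied, defaults missing shifts to zero, and delegates
// the actual rotation to circshift_data.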
3572 fn circshift(&self, handle: &GpuTensorHandle, shifts: &[isize]) -> Result<GpuTensorHandle> {
3573 let data = {
3574 let guard = registry().lock().unwrap();
3575 guard
3576 .get(&handle.buffer_id)
3577 .ok_or_else(|| anyhow!("circshift: unknown tensor handle {}", handle.buffer_id))?
3578 .clone()
3579 };
3580 let mut shape = handle.shape.clone();
3581 if shifts.len() > shape.len() {
3582 shape.extend(std::iter::repeat_n(1, shifts.len() - shape.len()));
3583 }
3584 let mut full_shifts = vec![0isize; shape.len()];
3585 for (idx, &shift) in shifts.iter().enumerate() {
3586 if idx < full_shifts.len() {
3587 full_shifts[idx] = shift;
3588 }
3589 }
3590 let rotated = circshift_data(&data, &shape, &full_shifts)?;
3591 Ok(self.allocate_tensor(rotated, shape))
3592 }
3593
3594 fn diff_dim(
3595 &self,
3596 handle: &GpuTensorHandle,
3597 order: usize,
3598 dim: usize,
3599 ) -> Result<GpuTensorHandle> {
3600 if order == 0 {
3601 return Ok(handle.clone());
3602 }
3603 let data = {
3604 let guard = registry().lock().unwrap();
3605 guard
3606 .get(&handle.buffer_id)
3607 .ok_or_else(|| anyhow!("diff_dim: unknown tensor handle {}", handle.buffer_id))?
3608 .clone()
3609 };
3610 let tensor =
3611 Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("diff_dim: {e}"))?;
3612 let diffed =
3613 diff_tensor_host(tensor, order, Some(dim + 1)).map_err(|e| anyhow!("diff_dim: {e}"))?;
3614 let Tensor { data, shape, .. } = diffed;
3615 Ok(self.allocate_tensor(data, shape))
3616 }
3617
3618 fn unique(&self, handle: &GpuTensorHandle, options: &UniqueOptions) -> Result<UniqueResult> {
3619 let data = {
3620 let guard = registry().lock().unwrap();
3621 guard
3622 .get(&handle.buffer_id)
3623 .ok_or_else(|| anyhow!("unique: unknown tensor handle {}", handle.buffer_id))?
3624 .clone()
3625 };
3626 let tensor = Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("unique: {e}"))?;
3627 unique::unique_numeric_from_tensor(tensor, options)
3628 .and_then(|eval| eval.into_numeric_unique_result())
3629 .map_err(|e| anyhow!("{e}"))
3630 }
3631
3632 fn setdiff(
3633 &self,
3634 a: &GpuTensorHandle,
3635 b: &GpuTensorHandle,
3636 options: &SetdiffOptions,
3637 ) -> Result<SetdiffResult> {
3638 let data_a = {
3639 let guard = registry().lock().unwrap();
3640 guard
3641 .get(&a.buffer_id)
3642 .cloned()
3643 .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", a.buffer_id))?
3644 };
3645 let data_b = {
3646 let guard = registry().lock().unwrap();
3647 guard
3648 .get(&b.buffer_id)
3649 .cloned()
3650 .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", b.buffer_id))?
3651 };
3652 let tensor_a = Tensor::new(data_a, a.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
3653 let tensor_b = Tensor::new(data_b, b.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
3654 runmat_runtime::builtins::array::sorting_sets::setdiff::setdiff_numeric_from_tensors(
3655 tensor_a, tensor_b, options,
3656 )
3657 .and_then(|eval| eval.into_numeric_setdiff_result())
3658 .map_err(|e| anyhow!("setdiff: {e}"))
3659 }
3660
3661 fn repmat(&self, handle: &GpuTensorHandle, reps: &[usize]) -> Result<GpuTensorHandle> {
3662 let data = {
3663 let guard = registry().lock().unwrap();
3664 guard
3665 .get(&handle.buffer_id)
3666 .ok_or_else(|| anyhow!("repmat: unknown tensor handle {}", handle.buffer_id))?
3667 .clone()
3668 };
3669 let (tiled, shape) = repmat_numeric(&data, &handle.shape, reps)?;
3670 Ok(self.allocate_tensor(tiled, shape))
3671 }
3672
3673 fn dot(
3674 &self,
3675 lhs: &GpuTensorHandle,
3676 rhs: &GpuTensorHandle,
3677 dim: Option<usize>,
3678 ) -> Result<GpuTensorHandle> {
3679 let (lhs_buf, rhs_buf) = {
3680 let guard = registry().lock().unwrap();
3681 let lhs_buf = guard
3682 .get(&lhs.buffer_id)
3683 .cloned()
3684 .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", lhs.buffer_id))?;
3685 let rhs_buf = guard
3686 .get(&rhs.buffer_id)
3687 .cloned()
3688 .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", rhs.buffer_id))?;
3689 (lhs_buf, rhs_buf)
3690 };
3691 let lhs_tensor =
3692 Tensor::new(lhs_buf, lhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
3693 let rhs_tensor =
3694 Tensor::new(rhs_buf, rhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
3695 let result =
3696 dot_host_real_for_provider(&lhs_tensor, &rhs_tensor, dim).map_err(|e| anyhow!(e))?;
3697 Ok(self.allocate_tensor(result.data.clone(), result.shape.clone()))
3698 }
3699
3700 fn reduce_sum(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3701 let guard = registry().lock().unwrap();
3702 let abuf = guard
3703 .get(&a.buffer_id)
3704 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3705 let s: f64 = abuf.iter().sum();
3706 drop(guard);
3707 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3708 let mut guard2 = registry().lock().unwrap();
3709 guard2.insert(id, vec![s]);
3710 Ok(GpuTensorHandle {
3711 shape: vec![1, 1],
3712 device_id: 0,
3713 buffer_id: id,
3714 })
3715 }
3716
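// Dimension-wise reductions assume column-major storage: dim <= 1 collapses
// each column (summing r + c*rows over r) into a 1-by-cols row vector, while
// any larger dim collapses across columns into a rows-by-1 column vector. The
// same layout logic is reused by reduce_prod_dim and reduce_mean_dim below.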
    fn reduce_sum_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_sum_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let out = if dim <= 1 {
            let mut v = vec![0.0f64; cols];
            for c in 0..cols {
                let mut s = 0.0;
                for r in 0..rows {
                    s += abuf[r + c * rows];
                }
                v[c] = s;
            }
            v
        } else {
            let mut v = vec![0.0f64; rows];
            for r in 0..rows {
                let mut s = 0.0;
                for c in 0..cols {
                    s += abuf[r + c * rows];
                }
                v[r] = s;
            }
            v
        };
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard2 = registry().lock().unwrap();
        guard2.insert(id, out);
        let shape = if dim <= 1 {
            vec![1, cols]
        } else {
            vec![rows, 1]
        };
        Ok(GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_prod(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let p: f64 = abuf.iter().product();
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard2 = registry().lock().unwrap();
        guard2.insert(id, vec![p]);
        Ok(GpuTensorHandle {
            shape: vec![1, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_prod_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_prod_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let out = if dim <= 1 {
            let mut v = vec![1.0f64; cols];
            for c in 0..cols {
                let mut prod = 1.0;
                for r in 0..rows {
                    prod *= abuf[r + c * rows];
                }
                v[c] = prod;
            }
            v
        } else {
            let mut v = vec![1.0f64; rows];
            for r in 0..rows {
                let mut prod = 1.0;
                for c in 0..cols {
                    prod *= abuf[r + c * rows];
                }
                v[r] = prod;
            }
            v
        };
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard2 = registry().lock().unwrap();
        guard2.insert(id, out);
        let shape = if dim <= 1 {
            vec![1, cols]
        } else {
            vec![rows, 1]
        };
        Ok(GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_mean(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let mean = if abuf.is_empty() {
            0.0
        } else {
            abuf.iter().sum::<f64>() / (abuf.len() as f64)
        };
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, vec![mean]);
        Ok(GpuTensorHandle {
            shape: vec![1, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_mean_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_mean_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let out = if dim <= 1 {
            let mut v = vec![0.0f64; cols];
            for c in 0..cols {
                let mut s = 0.0;
                for r in 0..rows {
                    s += abuf[r + c * rows];
                }
                v[c] = s / (rows as f64);
            }
            v
        } else {
            let mut v = vec![0.0f64; rows];
            for r in 0..rows {
                let mut s = 0.0;
                for c in 0..cols {
                    s += abuf[r + c * rows];
                }
                v[r] = s / (cols as f64);
            }
            v
        };
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, out);
        let shape = if dim <= 1 {
            vec![1, cols]
        } else {
            vec![rows, 1]
        };
        Ok(GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        })
    }

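    // `any`/`all` follow the runtime's truthiness rules: NaN counts as nonzero
    // unless `omit_nan` is set, and `all` over a slice that becomes empty after
    // omitting NaNs is vacuously true.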
    fn reduce_any(&self, a: &GpuTensorHandle, omit_nan: bool) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("reduce_any: unknown tensor handle {}", a.buffer_id))?
        };
        let mut truthy = false;
        for v in &data {
            if v.is_nan() {
                if !omit_nan {
                    truthy = true;
                    break;
                }
            } else if *v != 0.0 {
                truthy = true;
                break;
            }
        }
        let result = if truthy { 1.0 } else { 0.0 };
        Ok(self.allocate_tensor(vec![result], vec![1, 1]))
    }

    fn reduce_any_dim(
        &self,
        a: &GpuTensorHandle,
        dim: usize,
        omit_nan: bool,
    ) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow!("reduce_any_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("reduce_any_dim: unknown tensor handle {}", a.buffer_id))?
        };
        let (out, shape) = if dim == 0 {
            let mut v = vec![0.0f64; cols];
            for c in 0..cols {
                let mut truth = false;
                for r in 0..rows {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if !omit_nan {
                            truth = true;
                            break;
                        }
                    } else if val != 0.0 {
                        truth = true;
                        break;
                    }
                }
                v[c] = if truth { 1.0 } else { 0.0 };
            }
            (v, vec![1, cols])
        } else if dim == 1 {
            let mut v = vec![0.0f64; rows];
            for r in 0..rows {
                let mut truth = false;
                for c in 0..cols {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if !omit_nan {
                            truth = true;
                            break;
                        }
                    } else if val != 0.0 {
                        truth = true;
                        break;
                    }
                }
                v[r] = if truth { 1.0 } else { 0.0 };
            }
            (v, vec![rows, 1])
        } else {
            return Err(anyhow!("reduce_any_dim: invalid dimension {}", dim));
        };
        Ok(self.allocate_tensor(out, shape))
    }

    fn reduce_all(&self, a: &GpuTensorHandle, omit_nan: bool) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("reduce_all: unknown tensor handle {}", a.buffer_id))?
        };
        let mut all_true = true;
        let mut saw_value = false;
        for v in &data {
            if v.is_nan() {
                if omit_nan {
                    continue;
                }
            } else {
                saw_value = true;
                if *v == 0.0 {
                    all_true = false;
                    break;
                }
                continue;
            }
        }
        if omit_nan && !saw_value {
            all_true = true;
        }
        let result = if all_true { 1.0 } else { 0.0 };
        Ok(self.allocate_tensor(vec![result], vec![1, 1]))
    }

    fn reduce_all_dim(
        &self,
        a: &GpuTensorHandle,
        dim: usize,
        omit_nan: bool,
    ) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow!("reduce_all_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("reduce_all_dim: unknown tensor handle {}", a.buffer_id))?
        };
        let (out, shape) = if dim == 0 {
            let mut v = vec![0.0f64; cols];
            for c in 0..cols {
                let mut all_true = true;
                let mut saw_value = false;
                for r in 0..rows {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if omit_nan {
                            continue;
                        }
                    } else {
                        saw_value = true;
                        if val == 0.0 {
                            all_true = false;
                            break;
                        }
                        continue;
                    }
                }
                if omit_nan && !saw_value {
                    all_true = true;
                }
                v[c] = if all_true { 1.0 } else { 0.0 };
            }
            (v, vec![1, cols])
        } else if dim == 1 {
            let mut v = vec![0.0f64; rows];
            for r in 0..rows {
                let mut all_true = true;
                let mut saw_value = false;
                for c in 0..cols {
                    let val = data[r + c * rows];
                    if val.is_nan() {
                        if omit_nan {
                            continue;
                        }
                    } else {
                        saw_value = true;
                        if val == 0.0 {
                            all_true = false;
                            break;
                        }
                        continue;
                    }
                }
                if omit_nan && !saw_value {
                    all_true = true;
                }
                v[r] = if all_true { 1.0 } else { 0.0 };
            }
            (v, vec![rows, 1])
        } else {
            return Err(anyhow!("reduce_all_dim: invalid dimension {}", dim));
        };
        Ok(self.allocate_tensor(out, shape))
    }

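    // Median reductions use "includenan" semantics: any NaN in the reduced
    // slice (or an empty slice) produces NaN; otherwise the median is taken
    // over a scratch copy via `compute_median_inplace`.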
    fn reduce_median(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
                .clone()
        };
        let median = if data.is_empty() || data.iter().any(|v| v.is_nan()) {
            f64::NAN
        } else {
            let mut scratch = data.clone();
            compute_median_inplace(&mut scratch)
        };
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, vec![median]);
        Ok(GpuTensorHandle {
            shape: vec![1, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_median_dim(&self, a: &GpuTensorHandle, dim: usize) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_median_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let mut scratch = Vec::<f64>::with_capacity(rows.max(cols));
        let out = if dim <= 1 {
            let mut v = vec![f64::NAN; cols];
            for c in 0..cols {
                scratch.clear();
                let mut saw_nan = false;
                for r in 0..rows {
                    let val = abuf[r + c * rows];
                    if val.is_nan() {
                        saw_nan = true;
                        scratch.clear();
                        break;
                    }
                    scratch.push(val);
                }
                v[c] = if saw_nan || scratch.is_empty() {
                    f64::NAN
                } else {
                    compute_median_inplace(&mut scratch)
                };
            }
            v
        } else {
            let mut v = vec![f64::NAN; rows];
            for r in 0..rows {
                scratch.clear();
                let mut saw_nan = false;
                for c in 0..cols {
                    let val = abuf[r + c * rows];
                    if val.is_nan() {
                        saw_nan = true;
                        scratch.clear();
                        break;
                    }
                    scratch.push(val);
                }
                v[r] = if saw_nan || scratch.is_empty() {
                    f64::NAN
                } else {
                    compute_median_inplace(&mut scratch)
                };
            }
            v
        };
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, out);
        let shape = if dim <= 1 {
            vec![1, cols]
        } else {
            vec![rows, 1]
        };
        Ok(GpuTensorHandle {
            shape,
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_min(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let m = abuf.iter().cloned().fold(f64::INFINITY, f64::min);
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, vec![m]);
        Ok(GpuTensorHandle {
            shape: vec![1, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

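    // `reduce_min_dim`/`reduce_max_dim` return both the extreme values and
    // their 1-based positions along the reduced dimension; the strict
    // comparisons mean ties keep the first occurrence.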
    fn reduce_min_dim(
        &self,
        a: &GpuTensorHandle,
        dim: usize,
    ) -> Result<runmat_accelerate_api::ReduceDimResult> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_min_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let (vals, inds, vshape) = if dim <= 1 {
            let mut m: Vec<f64> = vec![f64::INFINITY; cols];
            let mut idx: Vec<f64> = vec![1.0; cols];
            for c in 0..cols {
                for r in 0..rows {
                    let v = abuf[r + c * rows];
                    if v < m[c] {
                        m[c] = v;
                        idx[c] = (r + 1) as f64;
                    }
                }
            }
            (m, idx, vec![1, cols])
        } else {
            let mut m: Vec<f64> = vec![f64::INFINITY; rows];
            let mut idx: Vec<f64> = vec![1.0; rows];
            for r in 0..rows {
                for c in 0..cols {
                    let v = abuf[r + c * rows];
                    if v < m[r] {
                        m[r] = v;
                        idx[r] = (c + 1) as f64;
                    }
                }
            }
            (m, idx, vec![rows, 1])
        };
        drop(guard);
        let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
        let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut g = registry().lock().unwrap();
        g.insert(idv, vals);
        g.insert(idi, inds);
        let shape_vals = vshape.clone();
        let shape_inds = vshape;
        Ok(runmat_accelerate_api::ReduceDimResult {
            values: GpuTensorHandle {
                shape: shape_vals,
                device_id: 0,
                buffer_id: idv,
            },
            indices: GpuTensorHandle {
                shape: shape_inds,
                device_id: 0,
                buffer_id: idi,
            },
        })
    }

    fn reduce_max(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let m = abuf.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, vec![m]);
        Ok(GpuTensorHandle {
            shape: vec![1, 1],
            device_id: 0,
            buffer_id: id,
        })
    }

    fn reduce_max_dim(
        &self,
        a: &GpuTensorHandle,
        dim: usize,
    ) -> Result<runmat_accelerate_api::ReduceDimResult> {
        if a.shape.len() != 2 {
            return Err(anyhow::anyhow!("reduce_max_dim: only 2D supported"));
        }
        let rows = a.shape[0];
        let cols = a.shape[1];
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let (vals, inds, vshape) = if dim <= 1 {
            let mut m: Vec<f64> = vec![f64::NEG_INFINITY; cols];
            let mut idx: Vec<f64> = vec![1.0; cols];
            for c in 0..cols {
                for r in 0..rows {
                    let v = abuf[r + c * rows];
                    if v > m[c] {
                        m[c] = v;
                        idx[c] = (r + 1) as f64;
                    }
                }
            }
            (m, idx, vec![1, cols])
        } else {
            let mut m: Vec<f64> = vec![f64::NEG_INFINITY; rows];
            let mut idx: Vec<f64> = vec![1.0; rows];
            for r in 0..rows {
                for c in 0..cols {
                    let v = abuf[r + c * rows];
                    if v > m[r] {
                        m[r] = v;
                        idx[r] = (c + 1) as f64;
                    }
                }
            }
            (m, idx, vec![rows, 1])
        };
        drop(guard);
        let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
        let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut g = registry().lock().unwrap();
        g.insert(idv, vals);
        g.insert(idi, inds);
        let shape_vals = vshape.clone();
        let shape_inds = vshape;
        Ok(runmat_accelerate_api::ReduceDimResult {
            values: GpuTensorHandle {
                shape: shape_vals,
                device_id: 0,
                buffer_id: idv,
            },
            indices: GpuTensorHandle {
                shape: shape_inds,
                device_id: 0,
                buffer_id: idi,
            },
        })
    }

    fn cumsum_scan(
        &self,
        _input: &GpuTensorHandle,
        _dim: usize,
        _direction: ProviderScanDirection,
        _nan_mode: ProviderNanMode,
    ) -> Result<GpuTensorHandle> {
        Err(anyhow!("cumsum_scan not supported by provider"))
    }

    fn cummin_scan(
        &self,
        _input: &GpuTensorHandle,
        _dim: usize,
        _direction: ProviderScanDirection,
        _nan_mode: ProviderNanMode,
    ) -> Result<runmat_accelerate_api::ProviderCumminResult> {
        Err(anyhow!("cummin_scan not supported by provider"))
    }

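    // `find` reports 1-based linear indices plus row/column subscripts derived
    // from the column-major linear index (`row = idx % rows + 1`,
    // `col = idx / rows + 1`). The `Last` direction walks the buffer in
    // reverse and defaults to a single hit when no limit is given.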
    fn find(
        &self,
        a: &GpuTensorHandle,
        limit: Option<usize>,
        direction: FindDirection,
    ) -> Result<ProviderFindResult> {
        let shape = a.shape.clone();
        let row_extent = shape.first().copied().unwrap_or(1).max(1);
        let (indices, rows, cols, values) = {
            let guard = registry().lock().unwrap();
            let data = guard
                .get(&a.buffer_id)
                .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
            let total = data.len();
            let cap = match direction {
                FindDirection::First => limit.unwrap_or(total),
                FindDirection::Last => limit.unwrap_or(1),
            }
            .min(total);

            let mut indices = Vec::new();
            let mut rows_out = Vec::new();
            let mut cols_out = Vec::new();
            let mut values_out = Vec::new();

            if cap == 0 || total == 0 {
                (indices, rows_out, cols_out, values_out)
            } else {
                match direction {
                    FindDirection::First => {
                        for (idx, &value) in data.iter().enumerate() {
                            if value != 0.0 {
                                indices.push((idx + 1) as f64);
                                rows_out.push(((idx % row_extent) + 1) as f64);
                                cols_out.push(((idx / row_extent) + 1) as f64);
                                values_out.push(value);
                                if indices.len() >= cap {
                                    break;
                                }
                            }
                        }
                    }
                    FindDirection::Last => {
                        for (idx, &value) in data.iter().enumerate().rev() {
                            if value != 0.0 {
                                indices.push((idx + 1) as f64);
                                rows_out.push(((idx % row_extent) + 1) as f64);
                                cols_out.push(((idx / row_extent) + 1) as f64);
                                values_out.push(value);
                                if indices.len() >= cap {
                                    break;
                                }
                            }
                        }
                    }
                }
                (indices, rows_out, cols_out, values_out)
            }
        };

        let count = indices.len();
        let shape_out = vec![count, 1];
        let linear = self.allocate_tensor(indices, shape_out.clone());
        let rows_handle = self.allocate_tensor(rows, shape_out.clone());
        let cols_handle = self.allocate_tensor(cols, shape_out.clone());
        let values_handle = self.allocate_tensor(values, shape_out);

        Ok(ProviderFindResult {
            linear,
            rows: rows_handle,
            cols: cols_handle,
            values: Some(values_handle),
        })
    }

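    // The dense factorizations (`lu`, `chol`, `qr`) run entirely on the host:
    // `lu` uses the crate-local `lu_factor_host`, while `chol` and `qr`
    // delegate to the runtime builtins and upload each returned factor as a
    // separate registry buffer.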
    fn lu(&self, a: &GpuTensorHandle) -> Result<ProviderLuResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("lu: unknown buffer {}", a.buffer_id))?
        };
        let LuHostFactors {
            combined,
            lower,
            upper,
            perm_matrix,
            pivot_vector,
            combined_shape,
            lower_shape,
            upper_shape,
            perm_shape,
            pivot_shape,
        } = lu_factor_host(&data, &a.shape)?;
        let combined = self.allocate_tensor(combined, combined_shape);
        let lower = self.allocate_tensor(lower, lower_shape);
        let upper = self.allocate_tensor(upper, upper_shape);
        let perm_matrix = self.allocate_tensor(perm_matrix, perm_shape);
        let perm_vector = self.allocate_tensor(pivot_vector, pivot_shape);
        Ok(ProviderLuResult {
            combined,
            lower,
            upper,
            perm_matrix,
            perm_vector,
        })
    }

    fn chol(&self, a: &GpuTensorHandle, lower: bool) -> Result<ProviderCholResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&a.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("chol: unknown buffer {}", a.buffer_id))?
        };
        let tensor = Tensor::new(data, a.shape.clone()).map_err(|e| anyhow!("chol: {e}"))?;
        let mut args = Vec::new();
        if lower {
            args.push(Value::from("lower"));
        }
        let eval = runmat_runtime::builtins::math::linalg::factor::chol::evaluate(
            Value::Tensor(tensor),
            &args,
        )
        .map_err(|e| anyhow!("chol: {e}"))?;
        let factor_tensor = tensor_from_value("chol", eval.factor())?;
        let factor = self.allocate_tensor(factor_tensor.data.clone(), factor_tensor.shape.clone());
        Ok(ProviderCholResult {
            factor,
            info: eval.flag_index() as u32,
        })
    }

    fn qr(&self, handle: &GpuTensorHandle, options: ProviderQrOptions) -> Result<ProviderQrResult> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&handle.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("qr: unknown buffer {}", handle.buffer_id))?
        };
        let tensor = Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("qr: {e}"))?;
        let mut args = Vec::new();
        if options.economy {
            args.push(Value::Num(0.0));
        }
        if matches!(options.pivot, ProviderQrPivot::Vector) {
            args.push(Value::from("vector"));
        }
        let eval = runmat_runtime::builtins::math::linalg::factor::qr::evaluate(
            Value::Tensor(tensor),
            &args,
        )
        .map_err(|e| anyhow!("qr: {e}"))?;

        let q_tensor = tensor_from_value("qr", eval.q())?;
        let r_tensor = tensor_from_value("qr", eval.r())?;
        let perm_matrix_tensor = tensor_from_value("qr", eval.permutation_matrix())?;
        let perm_vector_tensor = tensor_from_value("qr", eval.permutation_vector())?;

        let q_handle = self.allocate_tensor(q_tensor.data.clone(), q_tensor.shape.clone());
        let r_handle = self.allocate_tensor(r_tensor.data.clone(), r_tensor.shape.clone());
        let perm_matrix_handle = self.allocate_tensor(
            perm_matrix_tensor.data.clone(),
            perm_matrix_tensor.shape.clone(),
        );
        let perm_vector_handle = self.allocate_tensor(
            perm_vector_tensor.data.clone(),
            perm_vector_tensor.shape.clone(),
        );

        Ok(ProviderQrResult {
            q: q_handle,
            r: r_handle,
            perm_matrix: perm_matrix_handle,
            perm_vector: perm_vector_handle,
        })
    }

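    // Naive column-major GEMM: C(i, j) = sum_k A(i, k) * B(k, j), with
    // A(i, k) at `i + k * ar` and B(k, j) at `k + j * br`. The j-i-k loop
    // order writes each output column contiguously.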
    fn matmul(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        if a.shape.len() != 2 || b.shape.len() != 2 {
            return Err(anyhow::anyhow!("matmul: only 2D supported"));
        }
        let (ar, ac) = (a.shape[0], a.shape[1]);
        let (br, bc) = (b.shape[0], b.shape[1]);
        if ac != br {
            return Err(anyhow::anyhow!("matmul: inner dims must agree"));
        }
        let guard = registry().lock().unwrap();
        let abuf = guard
            .get(&a.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
        let bbuf = guard
            .get(&b.buffer_id)
            .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
        let mut out = vec![0.0; ar * bc];
        for j in 0..bc {
            for i in 0..ar {
                let mut sum = 0.0;
                for k in 0..ac {
                    sum += abuf[i + k * ar] * bbuf[k + j * br];
                }
                out[i + j * ar] = sum;
            }
        }
        drop(guard);
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        let mut guard2 = registry().lock().unwrap();
        guard2.insert(id, out);
        Ok(GpuTensorHandle {
            shape: vec![ar, bc],
            device_id: 0,
            buffer_id: id,
        })
    }

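    // `matmul_epilogue` applies the fused post-processing steps in order:
    // `alpha * C + beta`, optional row/column scaling (multiply or divide),
    // clamping to `[clamp_min, clamp_max]`, an optional power, and finally a
    // copy of the diagonal entries into `diag_output` when requested.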
    fn matmul_epilogue(
        &self,
        a: &GpuTensorHandle,
        b: &GpuTensorHandle,
        ep: &runmat_accelerate_api::MatmulEpilogue,
    ) -> Result<GpuTensorHandle> {
        let base = self.matmul(a, b)?;
        let (rows, cols) = (base.shape[0], base.shape[1]);
        let mut data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&base.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("matmul_epilogue: unknown buffer {}", base.buffer_id))?
        };

        let row_scale: Option<Vec<f64>> = if let Some(ref h) = ep.row_scale {
            let guard = registry().lock().unwrap();
            Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!("matmul_epilogue: unknown row scale buffer {}", h.buffer_id)
            })?)
        } else {
            None
        };
        let col_scale: Option<Vec<f64>> = if let Some(ref h) = ep.col_scale {
            let guard = registry().lock().unwrap();
            Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!("matmul_epilogue: unknown col scale buffer {}", h.buffer_id)
            })?)
        } else {
            None
        };
        let mut diag_output: Option<Vec<f64>> = if let Some(ref h) = ep.diag_output {
            let guard = registry().lock().unwrap();
            let vec = guard.get(&h.buffer_id).cloned().ok_or_else(|| {
                anyhow!(
                    "matmul_epilogue: unknown diag output buffer {}",
                    h.buffer_id
                )
            })?;
            Some(vec)
        } else {
            None
        };
        if let Some(ref diag) = diag_output {
            let expected = rows.min(cols);
            if diag.len() < expected {
                return Err(anyhow!(
                    "matmul_epilogue: diag_output length {} insufficient for diag size {}",
                    diag.len(),
                    expected
                ));
            }
        }

        for j in 0..cols {
            for i in 0..rows {
                let idx = i + j * rows;
                let mut v = data[idx] * ep.alpha + ep.beta;
                if let Some(ref rs) = row_scale {
                    let s = *rs.get(i).unwrap_or(&1.0);
                    v = match ep.row_op {
                        runmat_accelerate_api::ScaleOp::Multiply => v * s,
                        runmat_accelerate_api::ScaleOp::Divide => v / s,
                    };
                }
                if let Some(ref cs) = col_scale {
                    let s = *cs.get(j).unwrap_or(&1.0);
                    v = match ep.col_op {
                        runmat_accelerate_api::ScaleOp::Multiply => v * s,
                        runmat_accelerate_api::ScaleOp::Divide => v / s,
                    };
                }
                if let Some(min_v) = ep.clamp_min {
                    v = v.max(min_v);
                }
                if let Some(max_v) = ep.clamp_max {
                    v = v.min(max_v);
                }
                if let Some(pow_v) = ep.pow_exponent {
                    v = v.powf(pow_v);
                }
                if let Some(ref mut diag) = diag_output {
                    if i == j && i < diag.len() {
                        diag[i] = v;
                    }
                }
                data[idx] = v;
            }
        }

        {
            let mut guard = registry().lock().unwrap();
            guard.insert(base.buffer_id, data);
            if let Some(vec) = diag_output {
                if let Some(ref h) = ep.diag_output {
                    guard.insert(h.buffer_id, vec);
                }
            }
        }
        Ok(base)
    }

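    // Power-iteration step: after the matmul, every column is divided by
    // `sqrt(sum(column^2) + epsilon)` so the iterate stays normalized.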
    fn matmul_power_step(
        &self,
        lhs: &GpuTensorHandle,
        rhs: &GpuTensorHandle,
        ep: &runmat_accelerate_api::PowerStepEpilogue,
    ) -> Result<GpuTensorHandle> {
        let base = self.matmul(lhs, rhs)?;
        let rows = base.shape[0];
        let cols = base.shape[1];
        let mut data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&base.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("matmul_power_step: unknown buffer {}", base.buffer_id))?
        };
        let mut norms = vec![0.0f64; cols];
        for (col, norm) in norms.iter_mut().enumerate().take(cols) {
            let mut acc = 0.0f64;
            for row in 0..rows {
                let idx = row + col * rows;
                let val = data[idx];
                acc += val * val;
            }
            acc += ep.epsilon;
            *norm = acc.sqrt();
        }
        for (col, norm) in norms.iter().enumerate().take(cols) {
            for row in 0..rows {
                let idx = row + col * rows;
                data[idx] /= norm;
            }
        }
        {
            let mut guard = registry().lock().unwrap();
            guard.insert(base.buffer_id, data);
        }
        Ok(base)
    }

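    // `image_normalize` standardizes each batch slice over its height-by-width
    // plane ((x - mean) / sqrt(var + epsilon)), then applies the optional gain,
    // bias, a clamp at zero, and an optional gamma. The layout is batch-major:
    // element (b, h, w) lives at `b + h * batch + w * batch * height`.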
    fn image_normalize(
        &self,
        input: &GpuTensorHandle,
        desc: &runmat_accelerate_api::ImageNormalizeDescriptor,
    ) -> Result<GpuTensorHandle> {
        ensure!(
            input.shape.len() == 3,
            "image_normalize: expected 3-D tensor, got {:?}",
            input.shape
        );
        ensure!(
            input.shape[0] == desc.batch
                && input.shape[1] == desc.height
                && input.shape[2] == desc.width,
            "image_normalize: descriptor dims {:?} do not match tensor shape {:?}",
            (desc.batch, desc.height, desc.width),
            input.shape
        );

        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&input.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("image_normalize: unknown buffer {}", input.buffer_id))?
        };

        let batch = desc.batch;
        let height = desc.height;
        let width = desc.width;
        let plane = height * width;
        if plane == 0 {
            return Ok(self.allocate_tensor(vec![], input.shape.clone()));
        }

        let stride_h = batch;
        let stride_w = batch * height;

        let gain = desc.gain.unwrap_or(1.0);
        let bias = desc.bias.unwrap_or(0.0);
        let gamma = desc.gamma;

        let mut output = data.clone();

        for b in 0..batch {
            let mut sum = 0.0;
            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    sum += data[idx];
                }
            }
            let mean = sum / plane as f64;

            let mut sq_sum = 0.0;
            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    let diff = data[idx] - mean;
                    sq_sum += diff * diff;
                }
            }
            let variance = sq_sum / plane as f64;
            let sigma = (variance + desc.epsilon).sqrt();
            let inv_sigma = if sigma > 0.0 { 1.0 / sigma } else { 0.0 };

            for w in 0..width {
                let base_w = w * stride_w;
                for h in 0..height {
                    let idx = b + h * stride_h + base_w;
                    let mut value = (data[idx] - mean) * inv_sigma;
                    if desc.gain.is_some() {
                        value *= gain;
                    }
                    if desc.bias.is_some() {
                        value += bias;
                    }
                    value = value.max(0.0);
                    if let Some(gamma) = gamma {
                        value = value.powf(gamma);
                    }
                    output[idx] = value;
                }
            }
        }

        Ok(self.allocate_tensor(output, input.shape.clone()))
    }

    fn pagefun(&self, _request: &PagefunRequest) -> Result<GpuTensorHandle> {
        Err(anyhow::anyhow!(
            "pagefun: in-process provider does not implement GPU page operations"
        ))
    }

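    // The solver entry points below (`linsolve`, `inv`, `pinv`, `cond`, `norm`,
    // `rank`, `rcond`, `mldivide`, `mrdivide`) share one shape: download the
    // operands, rebuild host `Tensor`s, call the matching
    // `*_host_real_for_provider` helper, and upload the result.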
    fn linsolve(
        &self,
        lhs: &GpuTensorHandle,
        rhs: &GpuTensorHandle,
        options: &ProviderLinsolveOptions,
    ) -> Result<ProviderLinsolveResult> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;

        let (solution, rcond) = linsolve_host_real_for_provider(&lhs_tensor, &rhs_tensor, options)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = solution;
        let handle = self.allocate_tensor(data, shape);
        Ok(ProviderLinsolveResult {
            solution: handle,
            reciprocal_condition: rcond,
        })
    }

    fn inv(
        &self,
        matrix: &GpuTensorHandle,
        _options: ProviderInvOptions,
    ) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("inv: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("inv: {e}"))?;
        let result = inv_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn pinv(
        &self,
        matrix: &GpuTensorHandle,
        options: ProviderPinvOptions,
    ) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("pinv: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("pinv: {e}"))?;
        let result =
            pinv_host_real_for_provider(&tensor, options.tolerance).map_err(|e| anyhow!("{e}"))?;
        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn cond(&self, matrix: &GpuTensorHandle, norm: ProviderCondNorm) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("cond: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("cond: {e}"))?;
        let cond_value = cond_host_real_for_provider(&tensor, norm).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![cond_value], vec![1, 1]))
    }

    fn norm(&self, tensor: &GpuTensorHandle, order: ProviderNormOrder) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&tensor.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("norm: unknown buffer {}", tensor.buffer_id))?
        };
        let host_tensor =
            Tensor::new(data, tensor.shape.clone()).map_err(|e| anyhow!("norm: {e}"))?;
        let value = norm_host_real_for_provider(&host_tensor, order).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![value], vec![1, 1]))
    }

    fn rank(&self, matrix: &GpuTensorHandle, tolerance: Option<f64>) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("rank: unknown buffer {}", matrix.buffer_id))?
        };

        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rank: {e}"))?;
        let rank =
            rank_host_real_for_provider(&tensor, tolerance).map_err(|e| anyhow!("{e}"))? as f64;

        Ok(self.allocate_tensor(vec![rank], vec![1, 1]))
    }

    fn rcond(&self, matrix: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let data = {
            let guard = registry().lock().unwrap();
            guard
                .get(&matrix.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("rcond: unknown buffer {}", matrix.buffer_id))?
        };
        let tensor = Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rcond: {e}"))?;
        let estimate = rcond_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
        Ok(self.allocate_tensor(vec![estimate], vec![1, 1]))
    }

    fn mldivide(&self, lhs: &GpuTensorHandle, rhs: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;

        let result = mldivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn mrdivide(&self, lhs: &GpuTensorHandle, rhs: &GpuTensorHandle) -> Result<GpuTensorHandle> {
        let (lhs_data, rhs_data) = {
            let guard = registry().lock().unwrap();
            let lhs_buf = guard
                .get(&lhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", lhs.buffer_id))?;
            let rhs_buf = guard
                .get(&rhs.buffer_id)
                .cloned()
                .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", rhs.buffer_id))?;
            (lhs_buf, rhs_buf)
        };

        let lhs_tensor =
            Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;
        let rhs_tensor =
            Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;

        let result = mrdivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
            .map_err(|e| anyhow!("{e}"))?;

        let Tensor { data, shape, .. } = result;
        Ok(self.allocate_tensor(data, shape))
    }

    fn eig(&self, _a: &GpuTensorHandle, _compute_left: bool) -> Result<ProviderEigResult> {
        Err(anyhow!("eig: not supported by in-process provider"))
    }

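    // `sub2ind` folds 1-based subscripts into 1-based linear indices using the
    // caller-supplied strides; `scalar_mask` marks subscripts that broadcast as
    // scalars, and all arithmetic is checked so oversized indices fail instead
    // of wrapping.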
    fn sub2ind(
        &self,
        dims: &[usize],
        strides: &[usize],
        inputs: &[&GpuTensorHandle],
        scalar_mask: &[bool],
        len: usize,
        output_shape: &[usize],
    ) -> Result<GpuTensorHandle> {
        if inputs.len() != dims.len() || inputs.len() != scalar_mask.len() {
            return Err(anyhow::anyhow!(
                "sub2ind: expected {} subscripts for {} dimensions",
                dims.len(),
                dims.len()
            ));
        }
        let expected_len: usize = output_shape.iter().copied().product();
        if expected_len != len {
            return Err(anyhow::anyhow!(
                "sub2ind: output shape does not match subscript sizes"
            ));
        }
        if len == 0 {
            let id = self.next_id.fetch_add(1, Ordering::Relaxed);
            registry().lock().unwrap().insert(id, Vec::new());
            return Ok(GpuTensorHandle {
                shape: output_shape.to_vec(),
                device_id: 0,
                buffer_id: id,
            });
        }

        let mut host_values: Vec<Vec<f64>> = Vec::with_capacity(inputs.len());
        {
            let guard = registry().lock().unwrap();
            for handle in inputs {
                let data = guard
                    .get(&handle.buffer_id)
                    .ok_or_else(|| anyhow::anyhow!("sub2ind: unknown buffer {}", handle.buffer_id))?
                    .clone();
                host_values.push(data);
            }
        }

        let mut output = Vec::with_capacity(len);
        for idx in 0..len {
            let mut offset: usize = 0;
            for (dim_index, ((&dim_size, &stride), data)) in dims
                .iter()
                .zip(strides.iter())
                .zip(host_values.iter())
                .enumerate()
            {
                let raw = if scalar_mask[dim_index] {
                    *data.first().unwrap_or(&0.0)
                } else {
                    data[idx]
                };
                let coerced = coerce_sub2ind_value(raw, dim_index + 1, dim_size)?;
                let term = coerced
                    .checked_sub(1)
                    .and_then(|base| base.checked_mul(stride))
                    .ok_or_else(|| {
                        anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
                    })?;
                offset = offset.checked_add(term).ok_or_else(|| {
                    anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
                })?;
            }
            output.push((offset + 1) as f64);
        }

        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        registry().lock().unwrap().insert(id, output);
        Ok(GpuTensorHandle {
            shape: output_shape.to_vec(),
            device_id: 0,
            buffer_id: id,
        })
    }

    fn fused_elementwise(
        &self,
        _shader: &str,
        _inputs: &[GpuTensorHandle],
        _output_shape: &[usize],
        _len: usize,
    ) -> Result<GpuTensorHandle> {
        Err(anyhow::anyhow!(
            "fused_elementwise not supported by in-process provider"
        ))
    }
}

static INSTANCE: OnceCell<InProcessProvider> = OnceCell::new();

pub fn register_inprocess_provider() {
    let provider: &'static InProcessProvider = INSTANCE.get_or_init(InProcessProvider::new);
    unsafe { runmat_accelerate_api::register_provider(provider) };
}

pub fn reset_inprocess_rng() {
    if let Ok(mut guard) = rng_state().lock() {
        *guard = 0x9e3779b97f4a7c15;
    }
}