1use crate::host_lu::{lu_factor_host, LuHostFactors};
2use crate::sortrows_host::{sort_rows_host, SortRowsHostOutputs};
3use crate::telemetry::AccelTelemetry;
4use anyhow::{anyhow, ensure, Result};
5use once_cell::sync::OnceCell;
6use runmat_accelerate_api::{
7 AccelDownloadFuture, AccelProvider, AccelProviderFuture, CorrcoefOptions, CovarianceOptions,
8 FindDirection, FspecialRequest, GpuTensorHandle, GpuTensorStorage, HostTensorOwned,
9 HostTensorView, ImfilterOptions, PagefunRequest, ProviderBandwidth, ProviderCholResult,
10 ProviderCondNorm, ProviderConv1dOptions, ProviderConvMode, ProviderConvOrientation,
11 ProviderEigResult, ProviderFindResult, ProviderHermitianKind, ProviderIirFilterOptions,
12 ProviderIirFilterResult, ProviderInvOptions, ProviderLinsolveOptions, ProviderLinsolveResult,
13 ProviderLuResult, ProviderNanMode, ProviderNormOrder, ProviderPinvOptions,
14 ProviderPolyderQuotient, ProviderPrecision, ProviderQrOptions, ProviderQrPivot,
15 ProviderQrResult, ProviderScanDirection, ProviderSymmetryKind, SetdiffOptions, SetdiffResult,
16 SortComparison, SortResult, SortRowsColumnSpec, UniqueOptions, UniqueResult,
17};
18use runmat_builtins::{Tensor, Value};
19use runmat_runtime::builtins::array::sorting_sets::unique;
20use runmat_runtime::builtins::common::broadcast::{
21 broadcast_index as runtime_broadcast_index, broadcast_shapes as runtime_broadcast_shapes,
22 compute_strides as runtime_compute_strides,
23};
24use runmat_runtime::builtins::stats::summary::{
25 corrcoef_from_tensors as runtime_corrcoef_from_tensors,
26 cov_from_tensors as runtime_cov_from_tensors, CovWeightSpec,
27};
28use runmat_runtime::RuntimeError;
29
30use runmat_runtime::builtins::math::linalg::ops::{
31 dot_host_real_for_provider, mldivide_host_real_for_provider, mrdivide_host_real_for_provider,
32};
33use runmat_runtime::builtins::math::linalg::solve::cond::cond_host_real_for_provider;
34use runmat_runtime::builtins::math::linalg::solve::inv::inv_host_real_for_provider;
35use runmat_runtime::builtins::math::linalg::solve::linsolve::linsolve_host_real_for_provider;
36use runmat_runtime::builtins::math::linalg::solve::norm::norm_host_real_for_provider;
37use runmat_runtime::builtins::math::linalg::solve::pinv::pinv_host_real_for_provider;
38use runmat_runtime::builtins::math::linalg::solve::rank_host_real_for_provider;
39use runmat_runtime::builtins::math::linalg::solve::rcond::rcond_host_real_for_provider;
40use runmat_runtime::builtins::math::linalg::structure::bandwidth::bandwidth_host_real_data;
41use runmat_runtime::builtins::math::linalg::structure::ishermitian::ishermitian_host_real_data;
42use runmat_runtime::builtins::math::linalg::structure::issymmetric::issymmetric_host_real_data;
43use runmat_runtime::builtins::math::linalg::structure::symrcm::symrcm_host_real_data;
44use runmat_runtime::builtins::math::reduction::{
45 compute_median_inplace, diff_tensor_host, gradient_real_tensor_host,
46};
47use std::collections::HashMap;
48use std::sync::atomic::{AtomicU64, Ordering};
49use std::sync::Mutex;
50use std::time::Instant;
51
52const PROVIDER_DEFAULT_SEED: u64 = 0x9e3779b97f4a7c15;
53
54static REGISTRY: OnceCell<Mutex<HashMap<u64, Vec<f64>>>> = OnceCell::new();
55
56fn registry() -> &'static Mutex<HashMap<u64, Vec<f64>>> {
57 REGISTRY.get_or_init(|| Mutex::new(HashMap::new()))
58}
59
60const POLYDER_EPS: f64 = 1.0e-12;
61const FACTORIAL_MAX_HOST: usize = 170;
62
63#[derive(Clone, Copy)]
64enum WindowKind {
65 Hann,
66 Hamming,
67 Blackman,
68}
69
70fn generate_window_data(kind: WindowKind, len: usize, periodic: bool) -> Vec<f64> {
71 match len {
72 0 => Vec::new(),
73 1 => vec![1.0],
74 _ => {
75 let effective_len = if periodic { len + 1 } else { len };
76 let denom = (effective_len - 1) as f64;
77 let mut data = (0..effective_len)
78 .map(|idx| {
79 let phase = 2.0 * std::f64::consts::PI * idx as f64 / denom;
80 match kind {
81 WindowKind::Hann => 0.5 - 0.5 * phase.cos(),
82 WindowKind::Hamming => 0.54 - 0.46 * phase.cos(),
83 WindowKind::Blackman => {
84 0.42 - 0.5 * phase.cos() + 0.08 * (2.0 * phase).cos()
85 }
86 }
87 })
88 .collect::<Vec<_>>();
89 if periodic {
90 data.pop();
91 }
92 data
93 }
94 }
95}
96const FACTORIAL_INT_TOL: f64 = 1.0e-10;
97
98fn sinc_scalar_host(value: f64) -> f64 {
99 if value == 0.0 {
100 return 1.0;
101 }
102 if value.is_finite() && value.fract() == 0.0 {
103 return 0.0;
104 }
105 let scaled = std::f64::consts::PI * value;
106 scaled.sin() / scaled
107}
108
109fn runtime_flow_to_anyhow(_context: &str, err: RuntimeError) -> anyhow::Error {
110 anyhow::Error::new(err)
111}
112
113#[derive(Clone, Copy)]
114enum PolyOrientation {
115 Scalar,
116 Row,
117 Column,
118}
119
120fn poly_orientation_from_shape(shape: &[usize]) -> Result<PolyOrientation> {
121 let mut non_unit = 0usize;
122 let mut orientation = PolyOrientation::Scalar;
123 for (idx, &dim) in shape.iter().enumerate() {
124 if dim > 1 {
125 non_unit += 1;
126 orientation = if idx == 0 {
127 PolyOrientation::Column
128 } else {
129 PolyOrientation::Row
130 };
131 }
132 }
133 if non_unit > 1 {
134 Err(anyhow!("polyder: coefficient inputs must be vectors"))
135 } else {
136 Ok(orientation)
137 }
138}
139
140fn poly_shape_for_len(orientation: PolyOrientation, len: usize) -> Vec<usize> {
141 if len <= 1 {
142 return vec![1, 1];
143 }
144 match orientation {
145 PolyOrientation::Scalar | PolyOrientation::Row => vec![1, len],
146 PolyOrientation::Column => vec![len, 1],
147 }
148}
149
150fn poly_trim_slice(coeffs: &[f64]) -> Vec<f64> {
151 if coeffs.is_empty() {
152 return vec![0.0];
153 }
154 if let Some(idx) = coeffs.iter().position(|c| c.abs() > POLYDER_EPS) {
155 coeffs[idx..].to_vec()
156 } else {
157 vec![0.0]
158 }
159}
160
161fn poly_raw_derivative(coeffs: &[f64]) -> Vec<f64> {
162 if coeffs.len() <= 1 {
163 return vec![0.0];
164 }
165 let mut out = Vec::with_capacity(coeffs.len() - 1);
166 let mut power = coeffs.len() - 1;
167 for coeff in coeffs.iter().take(coeffs.len() - 1) {
168 out.push(*coeff * power as f64);
169 power -= 1;
170 }
171 out
172}
173
174fn poly_integral_real(coeffs: &[f64], constant: f64) -> Vec<f64> {
175 if coeffs.is_empty() {
176 return vec![constant];
177 }
178 let mut out = Vec::with_capacity(coeffs.len() + 1);
179 for (idx, &coeff) in coeffs.iter().enumerate() {
180 let power = (coeffs.len() - idx) as f64;
181 if power == 0.0 {
182 out.push(0.0);
183 } else {
184 out.push(coeff / power);
185 }
186 }
187 out.push(constant);
188 out
189}
190
191fn poly_convolve_real(a: &[f64], b: &[f64]) -> Vec<f64> {
192 if a.is_empty() || b.is_empty() {
193 return Vec::new();
194 }
195 let mut result = vec![0.0; a.len() + b.len() - 1];
196 for (i, &ai) in a.iter().enumerate() {
197 for (j, &bj) in b.iter().enumerate() {
198 result[i + j] += ai * bj;
199 }
200 }
201 result
202}
203
204fn poly_add_real(a: &[f64], b: &[f64]) -> Vec<f64> {
205 let len = a.len().max(b.len());
206 let mut result = vec![0.0; len];
207 for (idx, &value) in a.iter().enumerate() {
208 result[len - a.len() + idx] += value;
209 }
210 for (idx, &value) in b.iter().enumerate() {
211 result[len - b.len() + idx] += value;
212 }
213 result
214}
215
216fn poly_sub_real(a: &[f64], b: &[f64]) -> Vec<f64> {
217 let len = a.len().max(b.len());
218 let mut result = vec![0.0; len];
219 for (idx, &value) in a.iter().enumerate() {
220 result[len - a.len() + idx] += value;
221 }
222 for (idx, &value) in b.iter().enumerate() {
223 result[len - b.len() + idx] -= value;
224 }
225 result
226}
227
228fn rng_state() -> &'static Mutex<u64> {
229 static RNG: OnceCell<Mutex<u64>> = OnceCell::new();
230 RNG.get_or_init(|| Mutex::new(0x9e3779b97f4a7c15))
231}
232
233fn factorial_scalar_host(value: f64) -> f64 {
234 if value.is_nan() {
235 return f64::NAN;
236 }
237 if value == 0.0 {
238 return 1.0;
239 }
240 if value.is_infinite() {
241 return if value.is_sign_positive() {
242 f64::INFINITY
243 } else {
244 f64::NAN
245 };
246 }
247 if value < 0.0 {
248 return f64::NAN;
249 }
250 let rounded = value.round();
251 if (value - rounded).abs() > FACTORIAL_INT_TOL {
252 return f64::NAN;
253 }
254 if rounded < 0.0 {
255 return f64::NAN;
256 }
257 let n = rounded as usize;
258 if n > FACTORIAL_MAX_HOST {
259 return f64::INFINITY;
260 }
261 if n == 0 {
262 return 1.0;
263 }
264 let mut acc = 1.0f64;
265 for k in 2..=n {
266 acc *= k as f64;
267 }
268 acc
269}
270
271fn tensor_to_weight_vector(tensor: &Tensor) -> Result<Vec<f64>> {
272 if tensor.shape.len() > 2 {
273 return Err(anyhow!("covariance: weight vector must be 1-D"));
274 }
275 let rows = tensor.rows();
276 let cols = tensor.cols();
277 if rows != 1 && cols != 1 {
278 return Err(anyhow!(
279 "covariance: weight vector must be 1-D (received shape {}x{})",
280 rows,
281 cols
282 ));
283 }
284 Ok(tensor.data.clone())
285}
286
287fn next_uniform(state: &mut u64) -> f64 {
288 const MULTIPLIER: u64 = 6364136223846793005;
289 const INCREMENT: u64 = 1;
290 const SHIFT: u32 = 11;
291 const SCALE: f64 = 1.0 / ((1u64 << 53) as f64);
292
293 *state = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
294 let bits = *state >> SHIFT;
295 (bits as f64) * SCALE
296}
297
298fn next_normal_pair(state: &mut u64) -> (f64, f64) {
299 let mut u1 = next_uniform(state);
300 if u1 <= 0.0 {
301 u1 = f64::MIN_POSITIVE;
302 }
303 let u2 = next_uniform(state);
304 let radius = (-2.0 * u1.ln()).sqrt();
305 let angle = 2.0 * std::f64::consts::PI * u2;
306 (radius * angle.cos(), radius * angle.sin())
307}
308
309pub struct InProcessProvider {
310 next_id: AtomicU64,
311 telemetry: AccelTelemetry,
312}
313
314impl InProcessProvider {
315 pub fn new() -> Self {
316 Self {
317 next_id: AtomicU64::new(1),
318 telemetry: AccelTelemetry::new(),
319 }
320 }
321
322 fn allocate_tensor(&self, data: Vec<f64>, shape: Vec<usize>) -> GpuTensorHandle {
323 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
324 registry()
325 .lock()
326 .unwrap_or_else(|e| e.into_inner())
327 .insert(id, data);
328 GpuTensorHandle {
329 shape,
330 device_id: 0,
331 buffer_id: id,
332 }
333 }
334
335 fn load_polynomial(&self, handle: &GpuTensorHandle) -> Result<(Vec<f64>, PolyOrientation)> {
336 let data = {
337 let guard = registry().lock().unwrap();
338 guard
339 .get(&handle.buffer_id)
340 .cloned()
341 .ok_or_else(|| anyhow!("polyder: unknown tensor handle {}", handle.buffer_id))?
342 };
343 let orientation = poly_orientation_from_shape(&handle.shape)?;
344 let coeffs = if data.is_empty() { vec![0.0] } else { data };
345 Ok((coeffs, orientation))
346 }
347
348 fn allocate_polynomial(
349 &self,
350 coeffs: Vec<f64>,
351 orientation: PolyOrientation,
352 ) -> GpuTensorHandle {
353 let shape = poly_shape_for_len(orientation, coeffs.len());
354 self.allocate_tensor(coeffs, shape)
355 }
356}
357
358impl Default for InProcessProvider {
359 fn default() -> Self {
360 Self::new()
361 }
362}
363
364fn normalize_shape(shape: &[usize]) -> Vec<usize> {
365 match shape.len() {
366 0 => vec![1, 1],
367 1 => {
368 let n = shape[0];
369 vec![n, n]
370 }
371 _ => shape.to_vec(),
372 }
373}
374
375fn compute_strides(shape: &[usize]) -> Vec<usize> {
376 let mut strides = Vec::with_capacity(shape.len());
377 let mut stride = 1usize;
378 for &dim in shape {
379 strides.push(stride);
380 stride = stride.saturating_mul(dim);
381 }
382 strides
383}
384
385fn product(shape: &[usize]) -> usize {
386 shape.iter().copied().product()
387}
388
389fn decode_indices(mut index: usize, dims: &[usize]) -> Vec<usize> {
390 if dims.is_empty() {
391 return Vec::new();
392 }
393 let mut coords = Vec::with_capacity(dims.len());
394 for &dim in dims {
395 if dim == 0 {
396 coords.push(0);
397 } else {
398 let coord = index % dim;
399 coords.push(coord);
400 index /= dim.max(1);
401 }
402 }
403 coords
404}
405
406fn shapes_compatible(expected: &[usize], actual: &[usize]) -> bool {
407 let max_len = expected.len().max(actual.len());
408 for i in 0..max_len {
409 let e = expected.get(i).copied().unwrap_or(1);
410 let a = actual.get(i).copied().unwrap_or(1);
411 if e != a {
412 return false;
413 }
414 }
415 true
416}
417
418fn filter_state_shape(mut base: Vec<usize>, dim_idx: usize, state_len: usize) -> Vec<usize> {
419 if base.len() <= dim_idx {
420 base.extend(std::iter::repeat_n(1, dim_idx + 1 - base.len()));
421 }
422 if !base.is_empty() {
423 base[dim_idx] = state_len;
424 }
425 base
426}
427
428fn states_from_column_major(
429 data: &[f64],
430 state_len: usize,
431 dim_idx: usize,
432 shape_ext: &[usize],
433) -> Vec<f64> {
434 if state_len == 0 {
435 return Vec::new();
436 }
437 let dims_before = &shape_ext[..dim_idx];
438 let dims_after = if dim_idx + 1 < shape_ext.len() {
439 &shape_ext[dim_idx + 1..]
440 } else {
441 &[]
442 };
443 let leading = if dims_before.is_empty() {
444 1
445 } else {
446 dims_before.iter().copied().product()
447 };
448 let trailing = if dims_after.is_empty() {
449 1
450 } else {
451 dims_after.iter().copied().product()
452 };
453 let channel_count = leading * trailing;
454 let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
455 let mut states = vec![0.0; state_len * channel_count];
456 for channel in 0..channel_count {
457 let before_idx = if dims_before.is_empty() {
458 0
459 } else {
460 channel % leading
461 };
462 let after_idx = if dims_after.is_empty() {
463 0
464 } else {
465 channel / leading
466 };
467 let before_coords = decode_indices(before_idx, dims_before);
468 let after_coords = decode_indices(after_idx, dims_after);
469 for s in 0..state_len {
470 let mut offset = 0usize;
471 let mut stride = 1usize;
472 for (d, size) in shape.iter().copied().enumerate() {
473 let coord = if d < dim_idx {
474 before_coords.get(d).copied().unwrap_or(0)
475 } else if d == dim_idx {
476 s
477 } else {
478 let idx = d - dim_idx - 1;
479 after_coords.get(idx).copied().unwrap_or(0)
480 };
481 offset += coord * stride;
482 stride *= size;
483 }
484 states[channel * state_len + s] = data[offset];
485 }
486 }
487 states
488}
489
490fn states_to_column_major(
491 states: &[f64],
492 state_len: usize,
493 dim_idx: usize,
494 shape_ext: &[usize],
495) -> Vec<f64> {
496 if state_len == 0 {
497 return Vec::new();
498 }
499 let dims_before = &shape_ext[..dim_idx];
500 let dims_after = if dim_idx + 1 < shape_ext.len() {
501 &shape_ext[dim_idx + 1..]
502 } else {
503 &[]
504 };
505 let leading = if dims_before.is_empty() {
506 1
507 } else {
508 dims_before.iter().copied().product()
509 };
510 let trailing = if dims_after.is_empty() {
511 1
512 } else {
513 dims_after.iter().copied().product()
514 };
515 let channel_count = leading * trailing;
516 let shape = filter_state_shape(shape_ext.to_vec(), dim_idx, state_len);
517 let mut out = vec![0.0; states.len()];
518 for channel in 0..channel_count {
519 let before_idx = if dims_before.is_empty() {
520 0
521 } else {
522 channel % leading
523 };
524 let after_idx = if dims_after.is_empty() {
525 0
526 } else {
527 channel / leading
528 };
529 let before_coords = decode_indices(before_idx, dims_before);
530 let after_coords = decode_indices(after_idx, dims_after);
531 for s in 0..state_len {
532 let mut offset = 0usize;
533 let mut stride = 1usize;
534 for (d, size) in shape.iter().copied().enumerate() {
535 let coord = if d < dim_idx {
536 before_coords.get(d).copied().unwrap_or(0)
537 } else if d == dim_idx {
538 s
539 } else {
540 let idx = d - dim_idx - 1;
541 after_coords.get(idx).copied().unwrap_or(0)
542 };
543 offset += coord * stride;
544 stride *= size;
545 }
546 out[offset] = states[channel * state_len + s];
547 }
548 }
549 out
550}
551
552fn permute_data(data: &[f64], shape: &[usize], order: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
553 ensure!(!order.is_empty(), "permute: order must not be empty");
554 let rank = order.len();
555 ensure!(
556 shape.len() <= rank,
557 "permute: order length must be at least the number of dimensions"
558 );
559 let mut seen = vec![false; rank];
560 for &dim in order {
561 ensure!(dim < rank, "permute: invalid dimension index {}", dim + 1);
562 ensure!(
563 !seen[dim],
564 "permute: duplicate dimension index {} encountered",
565 dim + 1
566 );
567 seen[dim] = true;
568 }
569 ensure!(
570 seen.iter().all(|v| *v),
571 "permute: order must include every dimension exactly once"
572 );
573
574 let mut src_shape = shape.to_vec();
575 if src_shape.len() < rank {
576 src_shape.extend(std::iter::repeat_n(1, rank - src_shape.len()));
577 }
578
579 let total = product(&src_shape);
580 ensure!(
581 total == data.len(),
582 "permute: shape/product mismatch ({} vs {})",
583 total,
584 data.len()
585 );
586
587 let mut dst_shape = vec![0usize; rank];
588 for (dst_dim, &src_dim) in order.iter().enumerate() {
589 dst_shape[dst_dim] = src_shape[src_dim];
590 }
591
592 let src_strides = compute_strides(&src_shape);
593 let dst_total = product(&dst_shape);
594 let mut out = vec![0.0f64; dst_total];
595 let mut dst_coords = vec![0usize; rank];
596 let mut src_coords = vec![0usize; rank];
597
598 for (dst_index, out_value) in out.iter_mut().enumerate() {
599 let mut rem = dst_index;
600 for (dim, &size) in dst_shape.iter().enumerate() {
601 if size == 0 {
602 dst_coords[dim] = 0;
603 } else {
604 dst_coords[dim] = rem % size;
605 rem /= size;
606 }
607 }
608 for (dst_dim, &src_dim) in order.iter().enumerate() {
609 src_coords[src_dim] = dst_coords[dst_dim];
610 }
611 let mut src_index = 0usize;
612 for (dim, &coord) in src_coords.iter().enumerate() {
613 src_index += coord * src_strides[dim];
614 }
615 *out_value = data[src_index];
616 }
617
618 Ok((out, dst_shape))
619}
620
621fn flip_data(data: &[f64], shape: &[usize], axes: &[usize]) -> Result<Vec<f64>> {
622 if axes.is_empty() || data.is_empty() {
623 return Ok(data.to_vec());
624 }
625 let mut ext_shape = shape.to_vec();
626 if let Some(max_dim) = axes.iter().copied().max() {
627 let needed = max_dim + 1;
628 if needed > ext_shape.len() {
629 ext_shape.extend(std::iter::repeat_n(1, needed - ext_shape.len()));
630 }
631 }
632 let total = product(&ext_shape);
633 ensure!(
634 total == data.len(),
635 "flip: shape/product mismatch ({} vs {})",
636 total,
637 data.len()
638 );
639 let mut flip_flags = vec![false; ext_shape.len()];
640 for &axis in axes {
641 if axis < flip_flags.len() {
642 flip_flags[axis] = !flip_flags[axis];
643 }
644 }
645 if !flip_flags.iter().any(|&flag| flag) {
646 return Ok(data.to_vec());
647 }
648 let mut out = Vec::with_capacity(total);
649 for idx in 0..total {
650 let mut coords = unravel_index(idx, &ext_shape);
651 for (axis, flag) in flip_flags.iter().enumerate() {
652 if *flag && ext_shape[axis] > 1 {
653 coords[axis] = ext_shape[axis] - 1 - coords[axis];
654 }
655 }
656 let src_idx = ravel_index(&coords, &ext_shape);
657 out.push(data[src_idx]);
658 }
659 Ok(out)
660}
661
662fn conv1d_output_shape(len: usize, orientation: ProviderConvOrientation) -> Vec<usize> {
663 match (orientation, len) {
664 (ProviderConvOrientation::Row, 0) => vec![1, 0],
665 (ProviderConvOrientation::Row, _) => vec![1, len],
666 (ProviderConvOrientation::Column, 0) => vec![0, 1],
667 (ProviderConvOrientation::Column, _) => vec![len, 1],
668 }
669}
670
671fn convolve_real(signal: &[f64], kernel: &[f64]) -> Result<Vec<f64>> {
672 if signal.is_empty() || kernel.is_empty() {
673 return Ok(Vec::new());
674 }
675 let full_len = signal
676 .len()
677 .checked_add(kernel.len())
678 .and_then(|v| v.checked_sub(1))
679 .ok_or_else(|| anyhow!("conv1d: length overflow"))?;
680 let mut out = vec![0.0; full_len];
681 for (i, &ai) in signal.iter().enumerate() {
682 for (j, &bj) in kernel.iter().enumerate() {
683 out[i + j] += ai * bj;
684 }
685 }
686 Ok(out)
687}
688
689fn apply_conv_mode_real(
690 full: &[f64],
691 mode: ProviderConvMode,
692 len_a: usize,
693 len_b: usize,
694) -> Vec<f64> {
695 match mode {
696 ProviderConvMode::Full => full.to_vec(),
697 ProviderConvMode::Same => {
698 if len_a == 0 {
699 return Vec::new();
700 }
701 let start = if len_b == 0 { 0 } else { (len_b - 1) / 2 };
702 let end = (start + len_a).min(full.len());
703 if start >= end {
704 Vec::new()
705 } else {
706 full[start..end].to_vec()
707 }
708 }
709 ProviderConvMode::Valid => {
710 if len_a < len_b {
711 Vec::new()
712 } else {
713 let start = len_b - 1;
714 let valid_len = len_a - len_b + 1;
715 let end = (start + valid_len).min(full.len());
716 if start >= end {
717 Vec::new()
718 } else {
719 full[start..end].to_vec()
720 }
721 }
722 }
723 }
724}
725
726#[allow(clippy::too_many_arguments)]
727fn conv2d_full_real(
728 signal: &[f64],
729 signal_rows: usize,
730 signal_cols: usize,
731 kernel: &[f64],
732 kernel_rows: usize,
733 kernel_cols: usize,
734 full_rows: usize,
735 full_cols: usize,
736) -> Vec<f64> {
737 let mut out = vec![0.0; full_rows * full_cols];
738 for sc in 0..signal_cols {
739 for sr in 0..signal_rows {
740 let aval = signal[sc * signal_rows + sr];
741 if aval == 0.0 {
742 continue;
743 }
744 for kc in 0..kernel_cols {
745 let out_c = sc + kc;
746 let kcol = kernel_cols - 1 - kc;
747 for kr in 0..kernel_rows {
748 let out_r = sr + kr;
749 let krow = kernel_rows - 1 - kr;
750 let kval = kernel[kcol * kernel_rows + krow];
752 out[out_c * full_rows + out_r] += aval * kval;
753 }
754 }
755 }
756 }
757 out
758}
759
760fn slice_matrix_real(
761 full: &[f64],
762 full_rows: usize,
763 full_cols: usize,
764 row_start: usize,
765 row_end: usize,
766 col_start: usize,
767 col_end: usize,
768) -> (Vec<f64>, usize, usize) {
769 let row_end = row_end.min(full_rows);
770 let col_end = col_end.min(full_cols);
771 if row_start >= row_end || col_start >= col_end {
772 let rows = row_end.saturating_sub(row_start);
773 let cols = col_end.saturating_sub(col_start);
774 return (vec![0.0; rows * cols], rows, cols);
775 }
776 let rows = row_end - row_start;
777 let cols = col_end - col_start;
778 let mut out = vec![0.0; rows * cols];
779 for c in 0..cols {
780 for r in 0..rows {
781 let src = (col_start + c) * full_rows + (row_start + r);
782 let dst = c * rows + r;
783 out[dst] = full[src];
784 }
785 }
786 (out, rows, cols)
787}
788
789#[allow(clippy::too_many_arguments)]
790fn apply_conv2_mode_real_2d(
791 full: &[f64],
792 full_rows: usize,
793 full_cols: usize,
794 mode: ProviderConvMode,
795 a_rows: usize,
796 a_cols: usize,
797 b_rows: usize,
798 b_cols: usize,
799) -> (Vec<f64>, usize, usize) {
800 match mode {
801 ProviderConvMode::Full => (full.to_vec(), full_rows, full_cols),
802 ProviderConvMode::Same => {
803 if a_rows == 0 || a_cols == 0 {
804 return (Vec::new(), a_rows, a_cols);
805 }
806 let row_start = if b_rows == 0 { 0 } else { (b_rows - 1) / 2 };
807 let col_start = if b_cols == 0 { 0 } else { (b_cols - 1) / 2 };
808 slice_matrix_real(
809 full,
810 full_rows,
811 full_cols,
812 row_start,
813 row_start + a_rows,
814 col_start,
815 col_start + a_cols,
816 )
817 }
818 ProviderConvMode::Valid => {
819 if a_rows < b_rows || a_cols < b_cols {
820 (Vec::new(), 0, 0)
821 } else {
822 let rows = a_rows - b_rows + 1;
823 let cols = a_cols - b_cols + 1;
824 let row_start = b_rows.saturating_sub(1);
825 let col_start = b_cols.saturating_sub(1);
826 slice_matrix_real(
827 full,
828 full_rows,
829 full_cols,
830 row_start,
831 row_start + rows,
832 col_start,
833 col_start + cols,
834 )
835 }
836 }
837 }
838}
839
840fn tensor_from_value(label: &str, value: Value) -> Result<Tensor> {
841 match value {
842 Value::Tensor(tensor) => Ok(tensor),
843 Value::Num(n) => Tensor::new(vec![n], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}")),
844 Value::Int(i) => {
845 Tensor::new(vec![i.to_f64()], vec![1, 1]).map_err(|e| anyhow!("{label}: {e}"))
846 }
847 Value::Bool(b) => Tensor::new(vec![if b { 1.0 } else { 0.0 }], vec![1, 1])
848 .map_err(|e| anyhow!("{label}: {e}")),
849 Value::ComplexTensor(_) => Err(anyhow!(
850 "{label}: complex outputs are not supported by the in-process provider"
851 )),
852 other => Err(anyhow!("{label}: unexpected value {other:?}")),
853 }
854}
855
856fn tril_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
857 if data.is_empty() {
858 return Ok(Vec::new());
859 }
860 let rows = shape.first().copied().unwrap_or(1);
861 let cols = shape.get(1).copied().unwrap_or(1);
862 let plane = rows.saturating_mul(cols);
863 if plane == 0 {
864 ensure!(
865 data.is_empty(),
866 "tril: shape/product mismatch ({} vs {})",
867 0,
868 data.len()
869 );
870 return Ok(Vec::new());
871 }
872 let pages = if shape.len() <= 2 {
873 1usize
874 } else {
875 shape[2..].iter().product::<usize>()
876 };
877 if pages == 0 {
878 ensure!(
879 data.is_empty(),
880 "tril: shape/product mismatch ({} vs {})",
881 0,
882 data.len()
883 );
884 return Ok(Vec::new());
885 }
886 let expected = plane
887 .checked_mul(pages)
888 .ok_or_else(|| anyhow!("tril: dimension product overflow"))?;
889 ensure!(
890 expected == data.len(),
891 "tril: shape/product mismatch ({} vs {})",
892 expected,
893 data.len()
894 );
895 let mut out = data.to_vec();
896 for page in 0..pages {
897 let base = page * plane;
898 for col in 0..cols {
899 let col_base = base + col * rows;
900 for row in 0..rows {
901 if (row as isize) - (col as isize) < -offset {
902 out[col_base + row] = 0.0;
903 }
904 }
905 }
906 }
907 Ok(out)
908}
909
910fn triu_data(data: &[f64], shape: &[usize], offset: isize) -> Result<Vec<f64>> {
911 if data.is_empty() {
912 return Ok(Vec::new());
913 }
914 let rows = shape.first().copied().unwrap_or(1);
915 let cols = shape.get(1).copied().unwrap_or(1);
916 let plane = rows.saturating_mul(cols);
917 if plane == 0 {
918 ensure!(
919 data.is_empty(),
920 "triu: shape/product mismatch ({} vs {})",
921 0,
922 data.len()
923 );
924 return Ok(Vec::new());
925 }
926 let pages = if shape.len() <= 2 {
927 1usize
928 } else {
929 shape[2..].iter().product::<usize>()
930 };
931 if pages == 0 {
932 ensure!(
933 data.is_empty(),
934 "triu: shape/product mismatch ({} vs {})",
935 0,
936 data.len()
937 );
938 return Ok(Vec::new());
939 }
940 let expected = plane
941 .checked_mul(pages)
942 .ok_or_else(|| anyhow!("triu: dimension product overflow"))?;
943 ensure!(
944 expected == data.len(),
945 "triu: shape/product mismatch ({} vs {})",
946 expected,
947 data.len()
948 );
949 let mut out = data.to_vec();
950 for page in 0..pages {
951 let base = page * plane;
952 for col in 0..cols {
953 let col_base = base + col * rows;
954 for row in 0..rows {
955 let diff = (col as isize) - (row as isize);
956 if diff < offset {
957 out[col_base + row] = 0.0;
958 }
959 }
960 }
961 }
962 Ok(out)
963}
964
965fn circshift_data(data: &[f64], shape: &[usize], shifts: &[isize]) -> Result<Vec<f64>> {
966 ensure!(
967 shape.len() == shifts.len(),
968 "circshift: shift vector length must match tensor rank"
969 );
970 let mut total = 1usize;
971 for &dim in shape {
972 total = total
973 .checked_mul(dim)
974 .ok_or_else(|| anyhow!("circshift: requested output exceeds maximum size"))?;
975 }
976 ensure!(
977 total == data.len(),
978 "circshift: shape/product mismatch ({} vs {})",
979 total,
980 data.len()
981 );
982 if data.is_empty() {
983 return Ok(Vec::new());
984 }
985
986 let mut normalized = Vec::with_capacity(shape.len());
987 for (len, &shift) in shape.iter().zip(shifts.iter()) {
988 if *len <= 1 {
989 normalized.push(0usize);
990 continue;
991 }
992 let len_isize = *len as isize;
993 let mut value = shift % len_isize;
994 if value < 0 {
995 value += len_isize;
996 }
997 normalized.push(value as usize);
998 }
999 if normalized.iter().all(|&s| s == 0) {
1000 return Ok(data.to_vec());
1001 }
1002
1003 let strides = compute_strides(shape);
1004 let mut out = vec![0.0f64; data.len()];
1005 for (idx, out_value) in out.iter_mut().enumerate() {
1006 let coords = unravel_index(idx, shape);
1007 let mut src_idx = 0usize;
1008 for (axis, &coord) in coords.iter().enumerate() {
1009 let len = shape[axis];
1010 let stride = strides[axis];
1011 if len <= 1 || normalized[axis] == 0 {
1012 src_idx += coord * stride;
1013 } else {
1014 let shift = normalized[axis] % len;
1015 let src_coord = (coord + len - shift) % len;
1016 src_idx += src_coord * stride;
1017 }
1018 }
1019 *out_value = data[src_idx];
1020 }
1021 Ok(out)
1022}
1023
1024fn unravel_index(mut index: usize, shape: &[usize]) -> Vec<usize> {
1025 let mut coords = Vec::with_capacity(shape.len());
1026 for &extent in shape {
1027 if extent == 0 {
1028 coords.push(0);
1029 } else {
1030 coords.push(index % extent);
1031 index /= extent;
1032 }
1033 }
1034 coords
1035}
1036
1037fn ravel_index(coords: &[usize], shape: &[usize]) -> usize {
1038 let mut index = 0usize;
1039 let mut stride = 1usize;
1040 for (coord, extent) in coords.iter().zip(shape.iter()) {
1041 if *extent > 0 {
1042 index += coord * stride;
1043 stride *= extent;
1044 }
1045 }
1046 index
1047}
1048
1049fn checked_total(shape: &[usize]) -> Result<usize> {
1050 shape.iter().try_fold(1usize, |acc, dim| {
1051 acc.checked_mul(*dim)
1052 .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))
1053 })
1054}
1055
1056fn repmat_numeric(data: &[f64], shape: &[usize], reps: &[usize]) -> Result<(Vec<f64>, Vec<usize>)> {
1057 ensure!(
1058 !reps.is_empty(),
1059 "repmat: replication factors must be specified"
1060 );
1061 let orig_rank = if shape.is_empty() { 1 } else { shape.len() };
1062 let rank = if reps.len() == 1 {
1063 orig_rank.max(2)
1064 } else {
1065 orig_rank.max(reps.len())
1066 };
1067
1068 let mut base_shape = vec![1usize; rank];
1069 for (idx, &dim) in shape.iter().enumerate() {
1070 if idx < rank {
1071 base_shape[idx] = dim;
1072 }
1073 }
1074
1075 let mut factors = vec![1usize; rank];
1076 if reps.len() == 1 {
1077 factors.fill(reps[0]);
1078 } else {
1079 for (idx, &factor) in reps.iter().enumerate() {
1080 if idx < rank {
1081 factors[idx] = factor;
1082 }
1083 }
1084 }
1085
1086 let mut new_shape = Vec::with_capacity(rank);
1087 for i in 0..rank {
1088 let scaled = base_shape[i]
1089 .checked_mul(factors[i])
1090 .ok_or_else(|| anyhow!("repmat: requested output exceeds maximum size"))?;
1091 new_shape.push(scaled);
1092 }
1093
1094 let orig_total = checked_total(&base_shape)?;
1095 ensure!(
1096 orig_total == data.len() || (orig_total == 0 && data.is_empty()),
1097 "repmat: internal shape mismatch (expected {} elements, found {})",
1098 orig_total,
1099 data.len()
1100 );
1101
1102 let new_total = checked_total(&new_shape)?;
1103 if new_total == 0 {
1104 return Ok((Vec::new(), new_shape));
1105 }
1106
1107 let strides = compute_strides(&base_shape);
1108 let mut out = Vec::with_capacity(new_total);
1109 for idx in 0..new_total {
1110 let mut rem = idx;
1111 let mut src_index = 0usize;
1112 for dim in 0..rank {
1113 let dim_size = new_shape[dim];
1114 let coord = rem % dim_size;
1115 rem /= dim_size;
1116 let base = base_shape[dim];
1117 let orig_coord = if base == 0 { 0 } else { coord % base };
1118 src_index += orig_coord * strides[dim];
1119 }
1120 out.push(data[src_index]);
1121 }
1122 Ok((out, new_shape))
1123}
1124
1125fn coerce_sub2ind_value(value: f64, dim_number: usize, dim_size: usize) -> Result<usize> {
1126 if !value.is_finite() {
1127 return Err(anyhow!(
1128 "sub2ind: subscript in dimension {} must be finite",
1129 dim_number
1130 ));
1131 }
1132 let rounded = value.round();
1133 if (rounded - value).abs() > f64::EPSILON {
1134 return Err(anyhow!(
1135 "sub2ind: subscript in dimension {} must be an integer",
1136 dim_number
1137 ));
1138 }
1139 if rounded < 1.0 || rounded > dim_size as f64 {
1140 return Err(anyhow!(
1141 "sub2ind: subscript {} exceeds dimension {} (size {})",
1142 rounded as isize,
1143 dim_number,
1144 dim_size
1145 ));
1146 }
1147 Ok(rounded as usize)
1148}
1149
1150fn identity_data(shape: &[usize]) -> Vec<f64> {
1151 let shape = normalize_shape(shape);
1152 let total: usize = shape.iter().copied().product();
1153 let mut data = vec![0.0; total];
1154 if shape.is_empty() {
1155 return data;
1156 }
1157 let rows = shape[0];
1158 let cols = shape[1];
1159 let diag_len = rows.min(cols);
1160 if diag_len == 0 {
1161 return data;
1162 }
1163 let strides = compute_strides(&shape);
1164 let extra_dims = &shape[2..];
1165 let extra_count = if extra_dims.is_empty() {
1166 1
1167 } else {
1168 extra_dims.iter().copied().product()
1169 };
1170 let mut coords = vec![0usize; shape.len()];
1171 for mut extra_idx in 0..extra_count {
1172 for (offset, size) in extra_dims.iter().copied().enumerate() {
1173 let dim = offset + 2;
1174 if size == 0 {
1175 coords[dim] = 0;
1176 continue;
1177 }
1178 coords[dim] = extra_idx % size;
1179 extra_idx /= size;
1180 }
1181 for diag in 0..diag_len {
1182 coords[0] = diag;
1183 coords[1] = diag;
1184 let mut linear = 0usize;
1185 for (dim, &coord) in coords.iter().enumerate() {
1186 linear += coord * strides[dim];
1187 }
1188 data[linear] = 1.0;
1189 }
1190 }
1191 data
1192}
1193
1194fn offset_abs(offset: isize) -> usize {
1195 if offset >= 0 {
1196 offset as usize
1197 } else {
1198 let magnitude = -(offset as i128);
1199 magnitude as usize
1200 }
1201}
1202
1203fn diag_matrix_size(len: usize, offset: isize) -> Result<(usize, usize)> {
1204 let shift = offset_abs(offset);
1205 let size = len
1206 .checked_add(shift)
1207 .ok_or_else(|| anyhow!("diag: result dimension exceeds limits"))?;
1208 let total = size
1209 .checked_mul(size)
1210 .ok_or_else(|| anyhow!("diag: result size exceeds limits"))?;
1211 Ok((size, total))
1212}
1213
1214fn diagonal_length(rows: usize, cols: usize, offset: isize) -> usize {
1215 if rows == 0 || cols == 0 {
1216 return 0;
1217 }
1218 if offset >= 0 {
1219 let shift = offset as usize;
1220 if shift >= cols {
1221 0
1222 } else {
1223 rows.min(cols - shift)
1224 }
1225 } else {
1226 let shift = offset_abs(offset);
1227 if shift >= rows {
1228 0
1229 } else {
1230 (rows - shift).min(cols)
1231 }
1232 }
1233}
1234
1235fn diagonal_target_index(idx: usize, offset: isize) -> (usize, usize) {
1236 if offset >= 0 {
1237 (idx, idx + offset as usize)
1238 } else {
1239 (idx + offset_abs(offset), idx)
1240 }
1241}
1242
1243fn diagonal_source_index(idx: usize, offset: isize) -> (usize, usize) {
1244 if offset >= 0 {
1245 (idx, idx + offset as usize)
1246 } else {
1247 (idx + offset_abs(offset), idx)
1248 }
1249}
1250
1251fn ensure_diag_shape(label: &str, shape: &[usize]) -> Result<()> {
1252 if shape.len() > 2 && shape.iter().skip(2).any(|&d| d != 1) {
1253 Err(anyhow!("{label}: input must be 2-D"))
1254 } else {
1255 Ok(())
1256 }
1257}
1258
1259fn rows_cols(shape: &[usize]) -> (usize, usize) {
1260 match shape.len() {
1261 0 => (1, 1),
1262 1 => (shape[0], 1),
1263 _ => (shape[0], shape[1]),
1264 }
1265}
1266
1267fn is_vector_like(rows: usize, cols: usize, dims: usize) -> bool {
1268 rows == 1 || cols == 1 || dims <= 1
1269}
1270
1271impl AccelProvider for InProcessProvider {
1272 fn device_id(&self) -> u32 {
1273 0
1274 }
1275
1276 fn spawn_handle_concurrency(&self) -> runmat_accelerate_api::SpawnHandleConcurrency {
1277 runmat_accelerate_api::SpawnHandleConcurrency::SynchronizedMutation
1278 }
1279
1280 fn gather_linear(
1281 &self,
1282 source: &GpuTensorHandle,
1283 indices: &[u32],
1284 output_shape: &[usize],
1285 ) -> Result<GpuTensorHandle> {
1286 let data = {
1287 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1288 guard
1289 .get(&source.buffer_id)
1290 .cloned()
1291 .ok_or_else(|| anyhow!("gather_linear: unknown buffer {}", source.buffer_id))?
1292 };
1293 let mut out = Vec::with_capacity(indices.len());
1294 for (pos, &idx) in indices.iter().enumerate() {
1295 let lin = idx as usize;
1296 ensure!(
1297 lin < data.len(),
1298 "gather_linear: index {} (position {}) out of bounds for buffer {} (len={})",
1299 lin,
1300 pos,
1301 source.buffer_id,
1302 data.len()
1303 );
1304 out.push(data[lin]);
1305 }
1306 Ok(self.allocate_tensor(out, output_shape.to_vec()))
1307 }
1308
1309 fn scatter_linear(
1310 &self,
1311 target: &GpuTensorHandle,
1312 indices: &[u32],
1313 values: &GpuTensorHandle,
1314 ) -> Result<()> {
1315 let values_data = {
1316 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1317 guard.get(&values.buffer_id).cloned().ok_or_else(|| {
1318 anyhow!("scatter_linear: unknown values buffer {}", values.buffer_id)
1319 })?
1320 };
1321 ensure!(
1322 values_data.len() == indices.len(),
1323 "scatter_linear: values length {} does not match indices length {}",
1324 values_data.len(),
1325 indices.len()
1326 );
1327 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1328 let target_buf = guard
1329 .get_mut(&target.buffer_id)
1330 .ok_or_else(|| anyhow!("scatter_linear: unknown target buffer {}", target.buffer_id))?;
1331 for (pos, &idx) in indices.iter().enumerate() {
1332 let lin = idx as usize;
1333 ensure!(
1334 lin < target_buf.len(),
1335 "scatter_linear: index {} (position {}) out of bounds for target len {}",
1336 lin,
1337 pos,
1338 target_buf.len()
1339 );
1340 target_buf[lin] = values_data[pos];
1341 }
1342 Ok(())
1343 }
1344
1345 fn precision(&self) -> ProviderPrecision {
1346 ProviderPrecision::F64
1347 }
1348
1349 fn upload(&self, host: &HostTensorView) -> Result<GpuTensorHandle> {
1350 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1351 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1352 guard.insert(id, host.data.to_vec());
1353 let bytes = std::mem::size_of_val(host.data) as u64;
1354 self.telemetry.record_upload_bytes(bytes);
1355 let handle = GpuTensorHandle {
1356 shape: host.shape.to_vec(),
1357 device_id: 0,
1358 buffer_id: id,
1359 };
1360 runmat_accelerate_api::set_handle_logical(&handle, false);
1361 Ok(handle)
1362 }
1363
1364 fn download<'a>(&'a self, h: &'a GpuTensorHandle) -> AccelDownloadFuture<'a> {
1365 Box::pin(async move {
1366 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1367 if let Some(buf) = guard.get(&h.buffer_id) {
1368 let bytes = (buf.len() * std::mem::size_of::<f64>()) as u64;
1369 self.telemetry.record_download_bytes(bytes);
1370 Ok(HostTensorOwned {
1371 data: buf.clone(),
1372 shape: h.shape.clone(),
1373 storage: runmat_accelerate_api::handle_storage(h),
1374 })
1375 } else {
1376 Err(anyhow::anyhow!("buffer not found: {}", h.buffer_id))
1377 }
1378 })
1379 }
1380
1381 fn free(&self, h: &GpuTensorHandle) -> Result<()> {
1382 let mut guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1383 guard.remove(&h.buffer_id);
1384 runmat_accelerate_api::clear_handle_logical(h);
1385 runmat_accelerate_api::clear_handle_storage(h);
1386 Ok(())
1387 }
1388
1389 fn device_info(&self) -> String {
1390 "in-process provider (host registry)".to_string()
1391 }
1392
1393 fn device_info_struct(&self) -> runmat_accelerate_api::ApiDeviceInfo {
1394 runmat_accelerate_api::ApiDeviceInfo {
1395 device_id: 0,
1396 name: "InProcess".to_string(),
1397 vendor: "RunMat".to_string(),
1398 memory_bytes: None,
1399 backend: Some("inprocess".to_string()),
1400 }
1401 }
1402
1403 fn telemetry_snapshot(&self) -> runmat_accelerate_api::ProviderTelemetry {
1404 self.telemetry.snapshot(0, 0, 0, 0, None)
1405 }
1406
1407 fn reset_telemetry(&self) {
1408 self.telemetry.reset();
1409 }
1410
1411 fn sort_rows<'a>(
1412 &'a self,
1413 handle: &'a GpuTensorHandle,
1414 columns: &'a [SortRowsColumnSpec],
1415 comparison: SortComparison,
1416 ) -> AccelProviderFuture<'a, SortResult> {
1417 Box::pin(async move {
1418 let data = {
1419 let guard = registry().lock().unwrap();
1420 guard
1421 .get(&handle.buffer_id)
1422 .cloned()
1423 .ok_or_else(|| anyhow!("sortrows: unknown buffer {}", handle.buffer_id))?
1424 };
1425 let SortRowsHostOutputs {
1426 values,
1427 indices,
1428 indices_shape,
1429 } = sort_rows_host(&data, &handle.shape, columns, comparison)?;
1430 Ok(SortResult {
1431 values: HostTensorOwned {
1432 data: values,
1433 shape: handle.shape.clone(),
1434 storage: GpuTensorStorage::Real,
1435 },
1436 indices: HostTensorOwned {
1437 data: indices,
1438 shape: indices_shape,
1439 storage: GpuTensorStorage::Real,
1440 },
1441 })
1442 })
1443 }
1444
1445 fn polyder_single<'a>(
1446 &'a self,
1447 polynomial: &'a GpuTensorHandle,
1448 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
1449 Box::pin(async move {
1450 let (coeffs, orientation) = self.load_polynomial(polynomial)?;
1451 let raw = poly_raw_derivative(&coeffs);
1452 let trimmed = poly_trim_slice(&raw);
1453 Ok(self.allocate_polynomial(trimmed, orientation))
1454 })
1455 }
1456
1457 fn polyder_product<'a>(
1458 &'a self,
1459 p: &'a GpuTensorHandle,
1460 q: &'a GpuTensorHandle,
1461 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
1462 Box::pin(async move {
1463 let (p_coeffs, orientation) = self.load_polynomial(p)?;
1464 let (q_coeffs, _) = self.load_polynomial(q)?;
1465 let dp = poly_raw_derivative(&p_coeffs);
1466 let dq = poly_raw_derivative(&q_coeffs);
1467 let term1 = poly_convolve_real(&dp, &q_coeffs);
1468 let term2 = poly_convolve_real(&p_coeffs, &dq);
1469 let sum = poly_add_real(&term1, &term2);
1470 let trimmed = poly_trim_slice(&sum);
1471 Ok(self.allocate_polynomial(trimmed, orientation))
1472 })
1473 }
1474
1475 fn polyder_quotient<'a>(
1476 &'a self,
1477 u: &'a GpuTensorHandle,
1478 v: &'a GpuTensorHandle,
1479 ) -> AccelProviderFuture<'a, ProviderPolyderQuotient> {
1480 Box::pin(async move {
1481 let (u_coeffs, orientation_u) = self.load_polynomial(u)?;
1482 let (v_coeffs, orientation_v) = self.load_polynomial(v)?;
1483 let du = poly_raw_derivative(&u_coeffs);
1484 let dv = poly_raw_derivative(&v_coeffs);
1485 let term1 = poly_convolve_real(&du, &v_coeffs);
1486 let term2 = poly_convolve_real(&u_coeffs, &dv);
1487 let numerator_vec = poly_trim_slice(&poly_sub_real(&term1, &term2));
1488 let denominator_vec = poly_trim_slice(&poly_convolve_real(&v_coeffs, &v_coeffs));
1489 let numerator = self.allocate_polynomial(numerator_vec, orientation_u);
1490 let denominator = self.allocate_polynomial(denominator_vec, orientation_v);
1491 Ok(ProviderPolyderQuotient {
1492 numerator,
1493 denominator,
1494 })
1495 })
1496 }
1497
1498 fn polyint(&self, polynomial: &GpuTensorHandle, constant: f64) -> Result<GpuTensorHandle> {
1499 let orientation = poly_orientation_from_shape(&polynomial.shape)?;
1500 let coeffs = {
1501 let guard = registry().lock().unwrap();
1502 guard
1503 .get(&polynomial.buffer_id)
1504 .cloned()
1505 .ok_or_else(|| anyhow!("polyint: unknown tensor handle {}", polynomial.buffer_id))?
1506 };
1507 let integrated = poly_integral_real(&coeffs, constant);
1508 Ok(self.allocate_polynomial(integrated, orientation))
1509 }
1510
1511 fn diag_from_vector(&self, vector: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1512 ensure_diag_shape("diag", &vector.shape)?;
1513 let (rows, cols) = rows_cols(&vector.shape);
1514 ensure!(
1515 is_vector_like(rows, cols, vector.shape.len()),
1516 "diag: input must be a vector"
1517 );
1518
1519 let data = {
1520 let guard = registry().lock().unwrap();
1521 guard
1522 .get(&vector.buffer_id)
1523 .cloned()
1524 .ok_or_else(|| anyhow!("diag: unknown buffer {}", vector.buffer_id))?
1525 };
1526 let len = data.len();
1527 let (size, total) = diag_matrix_size(len, offset)?;
1528 let mut out = vec![0.0; total];
1529 for (idx, &value) in data.iter().enumerate() {
1530 let (row, col) = diagonal_target_index(idx, offset);
1531 if row < size && col < size {
1532 out[row + col * size] = value;
1533 }
1534 }
1535 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1536 registry().lock().unwrap().insert(id, out);
1537 Ok(GpuTensorHandle {
1538 shape: vec![size, size],
1539 device_id: 0,
1540 buffer_id: id,
1541 })
1542 }
1543
1544 fn diag_extract(&self, matrix: &GpuTensorHandle, offset: isize) -> Result<GpuTensorHandle> {
1545 ensure_diag_shape("diag", &matrix.shape)?;
1546 let (rows, cols) = rows_cols(&matrix.shape);
1547 ensure!(
1548 !is_vector_like(rows, cols, matrix.shape.len()),
1549 "diag: matrix input required"
1550 );
1551 let diag_len = diagonal_length(rows, cols, offset);
1552 if diag_len == 0 {
1553 return self.zeros(&[0, 1]);
1554 }
1555 let data = {
1556 let guard = registry().lock().unwrap();
1557 guard
1558 .get(&matrix.buffer_id)
1559 .cloned()
1560 .ok_or_else(|| anyhow!("diag: unknown buffer {}", matrix.buffer_id))?
1561 };
1562 let mut out = Vec::with_capacity(diag_len);
1563 for idx in 0..diag_len {
1564 let (row, col) = diagonal_source_index(idx, offset);
1565 let linear = row + col * rows;
1566 out.push(*data.get(linear).unwrap_or(&0.0));
1567 }
1568 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1569 registry().lock().unwrap().insert(id, out);
1570 Ok(GpuTensorHandle {
1571 shape: vec![diag_len, 1],
1572 device_id: 0,
1573 buffer_id: id,
1574 })
1575 }
1576
1577 fn tril<'a>(
1578 &'a self,
1579 handle: &'a GpuTensorHandle,
1580 offset: isize,
1581 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
1582 Box::pin(async move {
1583 let data = {
1584 let guard = registry().lock().unwrap();
1585 guard
1586 .get(&handle.buffer_id)
1587 .cloned()
1588 .ok_or_else(|| anyhow!("tril: unknown tensor handle {}", handle.buffer_id))?
1589 };
1590 let masked = tril_data(&data, &handle.shape, offset)?;
1591 Ok(self.allocate_tensor(masked, handle.shape.clone()))
1592 })
1593 }
1594
1595 fn triu<'a>(
1596 &'a self,
1597 handle: &'a GpuTensorHandle,
1598 offset: isize,
1599 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
1600 Box::pin(async move {
1601 let data = {
1602 let guard = registry().lock().unwrap();
1603 guard
1604 .get(&handle.buffer_id)
1605 .cloned()
1606 .ok_or_else(|| anyhow!("triu: unknown tensor handle {}", handle.buffer_id))?
1607 };
1608 let masked = triu_data(&data, &handle.shape, offset)?;
1609 Ok(self.allocate_tensor(masked, handle.shape.clone()))
1610 })
1611 }
1612
1613 fn issymmetric(
1614 &self,
1615 matrix: &GpuTensorHandle,
1616 kind: ProviderSymmetryKind,
1617 tolerance: f64,
1618 ) -> Result<bool> {
1619 let data = {
1620 let guard = registry().lock().unwrap();
1621 guard
1622 .get(&matrix.buffer_id)
1623 .cloned()
1624 .ok_or_else(|| anyhow!("issymmetric: unknown tensor handle {}", matrix.buffer_id))?
1625 };
1626 let skew = matches!(kind, ProviderSymmetryKind::Skew);
1627 issymmetric_host_real_data(&matrix.shape, &data, skew, tolerance).map_err(|e| anyhow!(e))
1628 }
1629
1630 fn ishermitian<'a>(
1631 &'a self,
1632 matrix: &'a GpuTensorHandle,
1633 kind: ProviderHermitianKind,
1634 tolerance: f64,
1635 ) -> AccelProviderFuture<'a, bool> {
1636 Box::pin(async move {
1637 let data = {
1638 let guard = registry().lock().unwrap();
1639 guard.get(&matrix.buffer_id).cloned().ok_or_else(|| {
1640 anyhow!("ishermitian: unknown tensor handle {}", matrix.buffer_id)
1641 })?
1642 };
1643 let skew = matches!(kind, ProviderHermitianKind::Skew);
1644 ishermitian_host_real_data(&matrix.shape, &data, skew, tolerance)
1645 .map_err(|e| anyhow!(e))
1646 })
1647 }
1648
1649 fn bandwidth(&self, matrix: &GpuTensorHandle) -> Result<ProviderBandwidth> {
1650 let data = {
1651 let guard = registry().lock().unwrap();
1652 guard
1653 .get(&matrix.buffer_id)
1654 .cloned()
1655 .ok_or_else(|| anyhow!("bandwidth: unknown tensor handle {}", matrix.buffer_id))?
1656 };
1657 let (lower, upper) =
1658 bandwidth_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))?;
1659 Ok(ProviderBandwidth {
1660 lower: lower as u32,
1661 upper: upper as u32,
1662 })
1663 }
1664
1665 fn sym_rcm<'a>(&'a self, matrix: &'a GpuTensorHandle) -> AccelProviderFuture<'a, Vec<usize>> {
1666 Box::pin(async move {
1667 let data = {
1668 let guard = registry().lock().unwrap();
1669 guard
1670 .get(&matrix.buffer_id)
1671 .cloned()
1672 .ok_or_else(|| anyhow!("symrcm: unknown tensor handle {}", matrix.buffer_id))?
1673 };
1674 symrcm_host_real_data(&matrix.shape, &data).map_err(|e| anyhow!(e))
1675 })
1676 }
1677
1678 fn read_scalar(&self, h: &GpuTensorHandle, linear_index: usize) -> Result<f64> {
1679 let guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1680 let buf = guard
1681 .get(&h.buffer_id)
1682 .ok_or_else(|| anyhow!("read_scalar: unknown buffer {}", h.buffer_id))?;
1683 if linear_index >= buf.len() {
1684 return Err(anyhow!(
1685 "read_scalar: index {} out of bounds (len {})",
1686 linear_index + 1,
1687 buf.len()
1688 ));
1689 }
1690 Ok(buf[linear_index])
1691 }
1692
1693 fn zeros(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1694 let len: usize = shape.iter().copied().product();
1695 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1696 let mut guard = registry().lock().unwrap();
1697 guard.insert(id, vec![0.0; len]);
1698 Ok(GpuTensorHandle {
1699 shape: shape.to_vec(),
1700 device_id: 0,
1701 buffer_id: id,
1702 })
1703 }
1704
1705 fn zeros_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1706 self.zeros(&prototype.shape)
1707 }
1708
1709 fn ones(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1710 let len: usize = shape.iter().copied().product();
1711 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1712 let mut guard = registry().lock().unwrap();
1713 guard.insert(id, vec![1.0; len]);
1714 Ok(GpuTensorHandle {
1715 shape: shape.to_vec(),
1716 device_id: 0,
1717 buffer_id: id,
1718 })
1719 }
1720
1721 fn ones_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1722 self.ones(&prototype.shape)
1723 }
1724
1725 fn eye(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1726 let shape = normalize_shape(shape);
1727 let data = identity_data(&shape);
1728 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1729 let mut guard = registry().lock().unwrap();
1730 guard.insert(id, data);
1731 Ok(GpuTensorHandle {
1732 shape,
1733 device_id: 0,
1734 buffer_id: id,
1735 })
1736 }
1737
1738 fn eye_like(&self, prototype: &GpuTensorHandle) -> Result<GpuTensorHandle> {
1739 self.eye(&prototype.shape)
1740 }
1741
1742 fn linspace(&self, start: f64, stop: f64, count: usize) -> Result<GpuTensorHandle> {
1743 let data = if count == 0 {
1744 Vec::new()
1745 } else if count == 1 {
1746 vec![stop]
1747 } else {
1748 let step = (stop - start) / ((count - 1) as f64);
1749 let mut seq = Vec::with_capacity(count);
1750 for idx in 0..count {
1751 seq.push(start + (idx as f64) * step);
1752 }
1753 if let Some(last) = seq.last_mut() {
1754 *last = stop;
1755 }
1756 seq
1757 };
1758
1759 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1760 registry().lock().unwrap().insert(id, data);
1761 Ok(GpuTensorHandle {
1762 shape: vec![1, count],
1763 device_id: 0,
1764 buffer_id: id,
1765 })
1766 }
1767
1768 fn random_uniform(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1769 let len: usize = shape.iter().copied().product();
1770 let mut data = vec![0.0; len];
1771 {
1772 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1773 for slot in &mut data {
1774 *slot = next_uniform(&mut guard);
1775 }
1776 }
1777
1778 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1779 let mut buf_guard = registry().lock().unwrap_or_else(|e| e.into_inner());
1780 buf_guard.insert(id, data);
1781 Ok(GpuTensorHandle {
1782 shape: shape.to_vec(),
1783 device_id: 0,
1784 buffer_id: id,
1785 })
1786 }
1787
1788 fn random_normal(&self, shape: &[usize]) -> Result<GpuTensorHandle> {
1789 let len: usize = shape.iter().copied().product();
1790 let mut data = Vec::with_capacity(len);
1791 if len > 0 {
1792 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1793 while data.len() < len {
1794 let (z0, z1) = next_normal_pair(&mut guard);
1795 data.push(z0);
1796 if data.len() < len {
1797 data.push(z1);
1798 }
1799 }
1800 }
1801
1802 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1803 registry()
1804 .lock()
1805 .unwrap_or_else(|e| e.into_inner())
1806 .insert(id, data);
1807 Ok(GpuTensorHandle {
1808 shape: shape.to_vec(),
1809 device_id: 0,
1810 buffer_id: id,
1811 })
1812 }
1813
1814 fn random_exponential(&self, mu: f64, shape: &[usize]) -> Result<GpuTensorHandle> {
1815 let len: usize = shape.iter().copied().product();
1816 let mut data = vec![0.0f64; len];
1817 {
1818 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1819 for slot in &mut data {
1820 let u = next_uniform(&mut guard).max(f64::MIN_POSITIVE);
1821 *slot = -mu * u.ln();
1822 }
1823 }
1824 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1825 registry()
1826 .lock()
1827 .unwrap_or_else(|e| e.into_inner())
1828 .insert(id, data);
1829 Ok(GpuTensorHandle {
1830 shape: shape.to_vec(),
1831 device_id: 0,
1832 buffer_id: id,
1833 })
1834 }
1835
1836 fn random_normrnd(&self, mu: f64, sigma: f64, shape: &[usize]) -> Result<GpuTensorHandle> {
1837 let len: usize = shape.iter().copied().product();
1838 let mut data = Vec::with_capacity(len);
1839 if len > 0 {
1840 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1841 while data.len() < len {
1842 let (z0, z1) = next_normal_pair(&mut guard);
1843 data.push(mu + sigma * z0);
1844 if data.len() < len {
1845 data.push(mu + sigma * z1);
1846 }
1847 }
1848 }
1849 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1850 registry()
1851 .lock()
1852 .unwrap_or_else(|e| e.into_inner())
1853 .insert(id, data);
1854 Ok(GpuTensorHandle {
1855 shape: shape.to_vec(),
1856 device_id: 0,
1857 buffer_id: id,
1858 })
1859 }
1860
1861 fn random_unifrnd(&self, a: f64, b: f64, shape: &[usize]) -> Result<GpuTensorHandle> {
1862 let len: usize = shape.iter().copied().product();
1863 let mut data = vec![0.0f64; len];
1864 {
1865 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1866 for slot in &mut data {
1867 *slot = a + (b - a) * next_uniform(&mut guard);
1868 }
1869 }
1870 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1871 registry()
1872 .lock()
1873 .unwrap_or_else(|e| e.into_inner())
1874 .insert(id, data);
1875 Ok(GpuTensorHandle {
1876 shape: shape.to_vec(),
1877 device_id: 0,
1878 buffer_id: id,
1879 })
1880 }
1881
1882 fn set_rng_state(&self, state: u64) -> Result<()> {
1883 let mut guard = rng_state()
1884 .lock()
1885 .map_err(|_| anyhow::anyhow!("set_rng_state: RNG mutex poisoned"))?;
1886 *guard = if state == 0 {
1887 PROVIDER_DEFAULT_SEED
1888 } else {
1889 state
1890 };
1891 Ok(())
1892 }
1893
1894 fn fspecial(&self, request: &FspecialRequest) -> Result<GpuTensorHandle> {
1895 let spec =
1896 runmat_runtime::builtins::image::filters::fspecial::spec_from_request(&request.filter)
1897 .map_err(|err| anyhow!(err))?;
1898 let tensor = spec.generate_tensor().map_err(|err| anyhow!(err))?;
1899 Ok(self.allocate_tensor(tensor.data.clone(), tensor.shape.clone()))
1900 }
1901
1902 fn imfilter<'a>(
1903 &'a self,
1904 image: &'a GpuTensorHandle,
1905 kernel: &'a GpuTensorHandle,
1906 options: &'a ImfilterOptions,
1907 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
1908 Box::pin(async move {
1909 let (image_vec, kernel_vec) = {
1910 let guard = registry().lock().unwrap();
1911 let image_buf = guard
1912 .get(&image.buffer_id)
1913 .cloned()
1914 .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", image.buffer_id))?;
1915 let kernel_buf = guard
1916 .get(&kernel.buffer_id)
1917 .cloned()
1918 .ok_or_else(|| anyhow!("imfilter: unknown buffer {}", kernel.buffer_id))?;
1919 (image_buf, kernel_buf)
1920 };
1921 let image_tensor = Tensor::new(image_vec, image.shape.clone())
1922 .map_err(|e| anyhow!("imfilter: {e}"))?;
1923 let kernel_tensor = Tensor::new(kernel_vec, kernel.shape.clone())
1924 .map_err(|e| anyhow!("imfilter: {e}"))?;
1925 let result = runmat_runtime::builtins::image::filters::imfilter::apply_imfilter_tensor(
1926 &image_tensor,
1927 &kernel_tensor,
1928 options,
1929 "imfilter",
1930 )
1931 .map_err(|err| anyhow!(err))?;
1932 let Tensor { data, shape, .. } = result;
1933 Ok(self.allocate_tensor(data, shape))
1934 })
1935 }
1936
1937 fn random_integer_range(
1938 &self,
1939 lower: i64,
1940 upper: i64,
1941 shape: &[usize],
1942 ) -> Result<GpuTensorHandle> {
1943 ensure!(lower <= upper, "lower bound must be <= upper bound");
1944 let span_i128 = (upper as i128)
1945 .checked_sub(lower as i128)
1946 .and_then(|delta| delta.checked_add(1))
1947 .ok_or_else(|| anyhow!("integer range overflow"))?;
1948 ensure!(span_i128 > 0, "integer range must be non-empty");
1949 ensure!(
1950 span_i128 <= (1i128 << 53),
1951 "integer range exceeds 2^53 and cannot be represented exactly"
1952 );
1953 let span = span_i128 as u64;
1954
1955 let len: usize = shape.iter().copied().product();
1956 let mut data = Vec::with_capacity(len);
1957 if span == 1 {
1958 data.resize(len, lower as f64);
1959 } else if len > 0 {
1960 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1961 let span_f64 = span as f64;
1962 for _ in 0..len {
1963 let mut offset = (next_uniform(&mut guard) * span_f64).floor() as u64;
1964 if offset >= span {
1965 offset = span - 1;
1966 }
1967 let value = (lower as i128 + offset as i128) as f64;
1968 data.push(value);
1969 }
1970 }
1971
1972 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
1973 registry().lock().unwrap().insert(id, data);
1974 Ok(GpuTensorHandle {
1975 shape: shape.to_vec(),
1976 device_id: 0,
1977 buffer_id: id,
1978 })
1979 }
1980
1981 fn random_permutation(&self, n: usize, k: usize) -> Result<GpuTensorHandle> {
1982 ensure!(k <= n, "randperm: K must satisfy 0 <= K <= N");
1983 let k = k.min(n);
1984 let mut values: Vec<f64> = if n == 0 {
1985 Vec::new()
1986 } else {
1987 (1..=n).map(|v| v as f64).collect()
1988 };
1989
1990 if k > 0 {
1991 let mut guard = rng_state().lock().unwrap_or_else(|e| e.into_inner());
1992 for i in 0..k {
1993 let span = n - i;
1994 if span == 0 {
1995 break;
1996 }
1997 let mut u = next_uniform(&mut guard);
1998 if u >= 1.0 {
1999 u = 0.9999999999999999;
2000 }
2001 let mut offset = (u * span as f64).floor() as usize;
2002 if offset >= span {
2003 offset = span - 1;
2004 }
2005 let j = i + offset;
2006 values.swap(i, j);
2007 }
2008 }
2009
2010 if values.len() > k {
2011 values.truncate(k);
2012 }
2013
2014 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2015 registry().lock().unwrap().insert(id, values);
2016 Ok(GpuTensorHandle {
2017 shape: vec![1, k],
2018 device_id: 0,
2019 buffer_id: id,
2020 })
2021 }
2022
2023 fn random_permutation_like(
2024 &self,
2025 _prototype: &GpuTensorHandle,
2026 n: usize,
2027 k: usize,
2028 ) -> Result<GpuTensorHandle> {
2029 self.random_permutation(n, k)
2030 }
2031
2032 fn covariance<'a>(
2033 &'a self,
2034 matrix: &'a GpuTensorHandle,
2035 second: Option<&'a GpuTensorHandle>,
2036 weights: Option<&'a GpuTensorHandle>,
2037 options: &'a CovarianceOptions,
2038 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2039 Box::pin(async move {
2040 let host_matrix = <Self as AccelProvider>::download(self, matrix).await?;
2041 let left = Tensor::new(host_matrix.data.clone(), host_matrix.shape.clone())
2042 .map_err(|e| anyhow!("covariance: {e}"))?;
2043
2044 let right = if let Some(handle) = second {
2045 let host = <Self as AccelProvider>::download(self, handle).await?;
2046 Some(
2047 Tensor::new(host.data.clone(), host.shape.clone())
2048 .map_err(|e| anyhow!("covariance: {e}"))?,
2049 )
2050 } else {
2051 None
2052 };
2053
2054 let weight_spec = if let Some(handle) = weights {
2055 let host = <Self as AccelProvider>::download(self, handle).await?;
2056 let tensor = Tensor::new(host.data.clone(), host.shape.clone())
2057 .map_err(|e| anyhow!("covariance: {e}"))?;
2058 let vec =
2059 tensor_to_weight_vector(&tensor).map_err(|e| anyhow!("covariance: {e}"))?;
2060 CovWeightSpec::Vector(vec)
2061 } else {
2062 CovWeightSpec::Scalar(options.normalization)
2063 };
2064
2065 let result = runtime_cov_from_tensors(left, right, options.rows, weight_spec)
2066 .map_err(|flow| runtime_flow_to_anyhow("covariance", flow))?;
2067
2068 let view = HostTensorView {
2069 data: &result.data,
2070 shape: &result.shape,
2071 };
2072 self.upload(&view)
2073 })
2074 }
2075
2076 fn corrcoef<'a>(
2077 &'a self,
2078 matrix: &'a GpuTensorHandle,
2079 options: &'a CorrcoefOptions,
2080 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2081 Box::pin(async move {
2082 let host = <Self as AccelProvider>::download(self, matrix).await?;
2083 let tensor = Tensor::new(host.data.clone(), host.shape.clone())
2084 .map_err(|e| anyhow!("corrcoef: {e}"))?;
2085 let result =
2086 runtime_corrcoef_from_tensors(tensor, None, options.normalization, options.rows)
2087 .map_err(|flow| runtime_flow_to_anyhow("corrcoef", flow))?;
2088 let view = HostTensorView {
2089 data: &result.data,
2090 shape: &result.shape,
2091 };
2092 self.upload(&view)
2093 })
2094 }
2095
2096 fn elem_add<'a>(
2097 &'a self,
2098 a: &'a GpuTensorHandle,
2099 b: &'a GpuTensorHandle,
2100 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2101 Box::pin(async move {
2102 let guard = registry().lock().unwrap();
2103 let abuf = guard
2104 .get(&a.buffer_id)
2105 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2106 let bbuf = guard
2107 .get(&b.buffer_id)
2108 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2109 if a.shape != b.shape {
2110 return Err(anyhow::anyhow!("shape mismatch"));
2111 }
2112 let mut out = vec![0.0; abuf.len()];
2113 for i in 0..abuf.len() {
2114 out[i] = abuf[i] + bbuf[i];
2115 }
2116 drop(guard);
2117 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2119 let mut guard2 = registry().lock().unwrap();
2120 guard2.insert(id, out);
2121 Ok(GpuTensorHandle {
2122 shape: a.shape.clone(),
2123 device_id: 0,
2124 buffer_id: id,
2125 })
2126 })
2127 }
2128
2129 fn elem_mul<'a>(
2130 &'a self,
2131 a: &'a GpuTensorHandle,
2132 b: &'a GpuTensorHandle,
2133 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2134 Box::pin(async move {
2135 let guard = registry().lock().unwrap();
2136 let abuf = guard
2137 .get(&a.buffer_id)
2138 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2139 let bbuf = guard
2140 .get(&b.buffer_id)
2141 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2142 if a.shape != b.shape {
2143 return Err(anyhow::anyhow!("shape mismatch"));
2144 }
2145 let mut out = vec![0.0; abuf.len()];
2146 for i in 0..abuf.len() {
2147 out[i] = abuf[i] * bbuf[i];
2148 }
2149 drop(guard);
2150 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2151 let mut guard2 = registry().lock().unwrap();
2152 guard2.insert(id, out);
2153 Ok(GpuTensorHandle {
2154 shape: a.shape.clone(),
2155 device_id: 0,
2156 buffer_id: id,
2157 })
2158 })
2159 }
2160
2161 fn elem_sub<'a>(
2162 &'a self,
2163 a: &'a GpuTensorHandle,
2164 b: &'a GpuTensorHandle,
2165 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2166 Box::pin(async move {
2167 let guard = registry().lock().unwrap();
2168 let abuf = guard
2169 .get(&a.buffer_id)
2170 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2171 let bbuf = guard
2172 .get(&b.buffer_id)
2173 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2174 if a.shape != b.shape {
2175 return Err(anyhow::anyhow!("shape mismatch"));
2176 }
2177 let mut out = vec![0.0; abuf.len()];
2178 for i in 0..abuf.len() {
2179 out[i] = abuf[i] - bbuf[i];
2180 }
2181 drop(guard);
2182 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2183 let mut guard2 = registry().lock().unwrap();
2184 guard2.insert(id, out);
2185 Ok(GpuTensorHandle {
2186 shape: a.shape.clone(),
2187 device_id: 0,
2188 buffer_id: id,
2189 })
2190 })
2191 }
2192
2193 fn elem_div<'a>(
2194 &'a self,
2195 a: &'a GpuTensorHandle,
2196 b: &'a GpuTensorHandle,
2197 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2198 Box::pin(async move {
2199 let guard = registry().lock().unwrap();
2200 let abuf = guard
2201 .get(&a.buffer_id)
2202 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2203 let bbuf = guard
2204 .get(&b.buffer_id)
2205 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2206 if a.shape != b.shape {
2207 return Err(anyhow::anyhow!("shape mismatch"));
2208 }
2209 let mut out = vec![0.0; abuf.len()];
2210 for i in 0..abuf.len() {
2211 out[i] = if bbuf[i] == 0.0 {
2212 f64::INFINITY * abuf[i].signum()
2213 } else {
2214 abuf[i] / bbuf[i]
2215 };
2216 }
2217 drop(guard);
2218 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2219 let mut guard2 = registry().lock().unwrap();
2220 guard2.insert(id, out);
2221 Ok(GpuTensorHandle {
2222 shape: a.shape.clone(),
2223 device_id: 0,
2224 buffer_id: id,
2225 })
2226 })
2227 }
2228
2229 fn elem_pow<'a>(
2230 &'a self,
2231 a: &'a GpuTensorHandle,
2232 b: &'a GpuTensorHandle,
2233 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2234 Box::pin(async move {
2235 let guard = registry().lock().unwrap();
2236 let abuf = guard
2237 .get(&a.buffer_id)
2238 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2239 let bbuf = guard
2240 .get(&b.buffer_id)
2241 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2242 if a.shape != b.shape {
2243 return Err(anyhow::anyhow!("shape mismatch"));
2244 }
2245 let mut out = vec![0.0; abuf.len()];
2246 for i in 0..abuf.len() {
2247 out[i] = abuf[i].powf(bbuf[i]);
2248 }
2249 drop(guard);
2250 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2251 let mut guard2 = registry().lock().unwrap();
2252 guard2.insert(id, out);
2253 Ok(GpuTensorHandle {
2254 shape: a.shape.clone(),
2255 device_id: 0,
2256 buffer_id: id,
2257 })
2258 })
2259 }
2260
2261 fn elem_ne<'a>(
2262 &'a self,
2263 a: &'a GpuTensorHandle,
2264 b: &'a GpuTensorHandle,
2265 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2266 Box::pin(async move {
2267 let (adata, bdata) = {
2268 let guard = registry().lock().unwrap();
2269 let a_vec = guard
2270 .get(&a.buffer_id)
2271 .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", a.buffer_id))?
2272 .clone();
2273 let b_vec = guard
2274 .get(&b.buffer_id)
2275 .ok_or_else(|| anyhow!("elem_ne: unknown buffer {}", b.buffer_id))?
2276 .clone();
2277 (a_vec, b_vec)
2278 };
2279
2280 let shape =
2281 runtime_broadcast_shapes("ne", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2282 let total: usize = shape.iter().copied().product();
2283 if total == 0 {
2284 return Ok(self.allocate_tensor(Vec::new(), shape));
2285 }
2286
2287 let strides_a = runtime_compute_strides(&a.shape);
2288 let strides_b = runtime_compute_strides(&b.shape);
2289 let len_a = adata.len();
2290 let len_b = bdata.len();
2291
2292 let mut out = Vec::with_capacity(total);
2293 for idx in 0..total {
2294 let lhs = if len_a == 0 {
2295 0.0
2296 } else {
2297 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2298 *adata.get(offset).unwrap_or(&0.0)
2299 };
2300 let rhs = if len_b == 0 {
2301 0.0
2302 } else {
2303 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2304 *bdata.get(offset).unwrap_or(&0.0)
2305 };
2306 out.push(if lhs != rhs { 1.0 } else { 0.0 });
2307 }
2308
2309 Ok(self.allocate_tensor(out, shape))
2310 })
2311 }
2312
2313 fn elem_ge<'a>(
2314 &'a self,
2315 a: &'a GpuTensorHandle,
2316 b: &'a GpuTensorHandle,
2317 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2318 Box::pin(async move {
2319 let (adata, bdata) = {
2320 let guard = registry().lock().unwrap();
2321 let a_vec = guard
2322 .get(&a.buffer_id)
2323 .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", a.buffer_id))?
2324 .clone();
2325 let b_vec = guard
2326 .get(&b.buffer_id)
2327 .ok_or_else(|| anyhow!("elem_ge: unknown buffer {}", b.buffer_id))?
2328 .clone();
2329 (a_vec, b_vec)
2330 };
2331
2332 let shape =
2333 runtime_broadcast_shapes("ge", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2334 let total: usize = shape.iter().copied().product();
2335 if total == 0 {
2336 return Ok(self.allocate_tensor(Vec::new(), shape));
2337 }
2338
2339 let strides_a = runtime_compute_strides(&a.shape);
2340 let strides_b = runtime_compute_strides(&b.shape);
2341 let len_a = adata.len();
2342 let len_b = bdata.len();
2343
2344 let mut out = Vec::with_capacity(total);
2345 for idx in 0..total {
2346 let lhs = if len_a == 0 {
2347 0.0
2348 } else {
2349 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2350 *adata.get(offset).unwrap_or(&0.0)
2351 };
2352 let rhs = if len_b == 0 {
2353 0.0
2354 } else {
2355 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2356 *bdata.get(offset).unwrap_or(&0.0)
2357 };
2358 out.push(if lhs >= rhs { 1.0 } else { 0.0 });
2359 }
2360
2361 Ok(self.allocate_tensor(out, shape))
2362 })
2363 }
2364
2365 fn elem_le<'a>(
2366 &'a self,
2367 a: &'a GpuTensorHandle,
2368 b: &'a GpuTensorHandle,
2369 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2370 Box::pin(async move {
2371 let (adata, bdata) = {
2372 let guard = registry().lock().unwrap();
2373 let a_vec = guard
2374 .get(&a.buffer_id)
2375 .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", a.buffer_id))?
2376 .clone();
2377 let b_vec = guard
2378 .get(&b.buffer_id)
2379 .ok_or_else(|| anyhow!("elem_le: unknown buffer {}", b.buffer_id))?
2380 .clone();
2381 (a_vec, b_vec)
2382 };
2383
2384 let shape =
2385 runtime_broadcast_shapes("le", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2386 let total: usize = shape.iter().copied().product();
2387 if total == 0 {
2388 return Ok(self.allocate_tensor(Vec::new(), shape));
2389 }
2390
2391 let strides_a = runtime_compute_strides(&a.shape);
2392 let strides_b = runtime_compute_strides(&b.shape);
2393 let len_a = adata.len();
2394 let len_b = bdata.len();
2395
2396 let mut out = Vec::with_capacity(total);
2397 for idx in 0..total {
2398 let lhs = if len_a == 0 {
2399 0.0
2400 } else {
2401 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2402 *adata.get(offset).unwrap_or(&0.0)
2403 };
2404 let rhs = if len_b == 0 {
2405 0.0
2406 } else {
2407 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2408 *bdata.get(offset).unwrap_or(&0.0)
2409 };
2410 out.push(if lhs <= rhs { 1.0 } else { 0.0 });
2411 }
2412
2413 Ok(self.allocate_tensor(out, shape))
2414 })
2415 }
2416
2417 fn elem_lt<'a>(
2418 &'a self,
2419 a: &'a GpuTensorHandle,
2420 b: &'a GpuTensorHandle,
2421 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2422 Box::pin(async move {
2423 let (adata, bdata) = {
2424 let guard = registry().lock().unwrap();
2425 let a_vec = guard
2426 .get(&a.buffer_id)
2427 .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", a.buffer_id))?
2428 .clone();
2429 let b_vec = guard
2430 .get(&b.buffer_id)
2431 .ok_or_else(|| anyhow!("elem_lt: unknown buffer {}", b.buffer_id))?
2432 .clone();
2433 (a_vec, b_vec)
2434 };
2435
2436 let shape =
2437 runtime_broadcast_shapes("lt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2438 let total: usize = shape.iter().copied().product();
2439 if total == 0 {
2440 return Ok(self.allocate_tensor(Vec::new(), shape));
2441 }
2442
2443 let strides_a = runtime_compute_strides(&a.shape);
2444 let strides_b = runtime_compute_strides(&b.shape);
2445 let len_a = adata.len();
2446 let len_b = bdata.len();
2447
2448 let mut out = Vec::with_capacity(total);
2449 for idx in 0..total {
2450 let lhs = if len_a == 0 {
2451 0.0
2452 } else {
2453 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2454 *adata.get(offset).unwrap_or(&0.0)
2455 };
2456 let rhs = if len_b == 0 {
2457 0.0
2458 } else {
2459 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2460 *bdata.get(offset).unwrap_or(&0.0)
2461 };
2462 out.push(if lhs < rhs { 1.0 } else { 0.0 });
2463 }
2464
2465 Ok(self.allocate_tensor(out, shape))
2466 })
2467 }
2468
2469 fn elem_gt<'a>(
2470 &'a self,
2471 a: &'a GpuTensorHandle,
2472 b: &'a GpuTensorHandle,
2473 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2474 Box::pin(async move {
2475 let (adata, bdata) = {
2476 let guard = registry().lock().unwrap();
2477 let a_vec = guard
2478 .get(&a.buffer_id)
2479 .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", a.buffer_id))?
2480 .clone();
2481 let b_vec = guard
2482 .get(&b.buffer_id)
2483 .ok_or_else(|| anyhow!("elem_gt: unknown buffer {}", b.buffer_id))?
2484 .clone();
2485 (a_vec, b_vec)
2486 };
2487
2488 let shape =
2489 runtime_broadcast_shapes("gt", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2490 let total: usize = shape.iter().copied().product();
2491 if total == 0 {
2492 return Ok(self.allocate_tensor(Vec::new(), shape));
2493 }
2494
2495 let strides_a = runtime_compute_strides(&a.shape);
2496 let strides_b = runtime_compute_strides(&b.shape);
2497 let len_a = adata.len();
2498 let len_b = bdata.len();
2499
2500 let mut out = Vec::with_capacity(total);
2501 for idx in 0..total {
2502 let lhs = if len_a == 0 {
2503 0.0
2504 } else {
2505 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2506 *adata.get(offset).unwrap_or(&0.0)
2507 };
2508 let rhs = if len_b == 0 {
2509 0.0
2510 } else {
2511 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2512 *bdata.get(offset).unwrap_or(&0.0)
2513 };
2514 out.push(if lhs > rhs { 1.0 } else { 0.0 });
2515 }
2516
2517 Ok(self.allocate_tensor(out, shape))
2518 })
2519 }
2520
2521 fn elem_eq<'a>(
2522 &'a self,
2523 a: &'a GpuTensorHandle,
2524 b: &'a GpuTensorHandle,
2525 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2526 Box::pin(async move {
2527 let (adata, bdata) = {
2528 let guard = registry().lock().unwrap();
2529 let a_vec = guard
2530 .get(&a.buffer_id)
2531 .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", a.buffer_id))?
2532 .clone();
2533 let b_vec = guard
2534 .get(&b.buffer_id)
2535 .ok_or_else(|| anyhow!("elem_eq: unknown buffer {}", b.buffer_id))?
2536 .clone();
2537 (a_vec, b_vec)
2538 };
2539
2540 let shape =
2541 runtime_broadcast_shapes("eq", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2542 let total: usize = shape.iter().copied().product();
2543 if total == 0 {
2544 return Ok(self.allocate_tensor(Vec::new(), shape));
2545 }
2546
2547 let strides_a = runtime_compute_strides(&a.shape);
2548 let strides_b = runtime_compute_strides(&b.shape);
2549 let len_a = adata.len();
2550 let len_b = bdata.len();
2551
2552 let mut out = Vec::with_capacity(total);
2553 for idx in 0..total {
2554 let lhs = if len_a == 0 {
2555 0.0
2556 } else {
2557 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2558 *adata.get(offset).unwrap_or(&0.0)
2559 };
2560 let rhs = if len_b == 0 {
2561 0.0
2562 } else {
2563 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2564 *bdata.get(offset).unwrap_or(&0.0)
2565 };
2566 out.push(if lhs == rhs { 1.0 } else { 0.0 });
2567 }
2568
2569 Ok(self.allocate_tensor(out, shape))
2570 })
2571 }
2572
2573 fn logical_and(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2574 let (adata, bdata) = {
2575 let guard = registry().lock().unwrap();
2576 let a_vec = guard
2577 .get(&a.buffer_id)
2578 .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", a.buffer_id))?
2579 .clone();
2580 let b_vec = guard
2581 .get(&b.buffer_id)
2582 .ok_or_else(|| anyhow!("logical_and: unknown buffer {}", b.buffer_id))?
2583 .clone();
2584 (a_vec, b_vec)
2585 };
2586
2587 let shape =
2588 runtime_broadcast_shapes("logical_and", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2589 let total: usize = shape.iter().copied().product();
2590 if total == 0 {
2591 return Ok(self.allocate_tensor(Vec::new(), shape));
2592 }
2593
2594 let strides_a = runtime_compute_strides(&a.shape);
2595 let strides_b = runtime_compute_strides(&b.shape);
2596 let len_a = adata.len();
2597 let len_b = bdata.len();
2598
2599 let mut out = Vec::with_capacity(total);
2600 for idx in 0..total {
2601 let lhs = if len_a == 0 {
2602 0.0
2603 } else {
2604 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2605 *adata.get(offset).unwrap_or(&0.0)
2606 };
2607 let rhs = if len_b == 0 {
2608 0.0
2609 } else {
2610 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2611 *bdata.get(offset).unwrap_or(&0.0)
2612 };
2613 out.push(if lhs != 0.0 && rhs != 0.0 { 1.0 } else { 0.0 });
2614 }
2615
2616 Ok(self.allocate_tensor(out, shape))
2617 }
2618
2619 fn logical_or(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2620 let (adata, bdata) = {
2621 let guard = registry().lock().unwrap();
2622 let a_vec = guard
2623 .get(&a.buffer_id)
2624 .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", a.buffer_id))?
2625 .clone();
2626 let b_vec = guard
2627 .get(&b.buffer_id)
2628 .ok_or_else(|| anyhow!("logical_or: unknown buffer {}", b.buffer_id))?
2629 .clone();
2630 (a_vec, b_vec)
2631 };
2632
2633 let shape =
2634 runtime_broadcast_shapes("logical_or", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2635 let total: usize = shape.iter().copied().product();
2636 if total == 0 {
2637 return Ok(self.allocate_tensor(Vec::new(), shape));
2638 }
2639
2640 let strides_a = runtime_compute_strides(&a.shape);
2641 let strides_b = runtime_compute_strides(&b.shape);
2642 let len_a = adata.len();
2643 let len_b = bdata.len();
2644
2645 let mut out = Vec::with_capacity(total);
2646 for idx in 0..total {
2647 let lhs = if len_a == 0 {
2648 0.0
2649 } else {
2650 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2651 *adata.get(offset).unwrap_or(&0.0)
2652 };
2653 let rhs = if len_b == 0 {
2654 0.0
2655 } else {
2656 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2657 *bdata.get(offset).unwrap_or(&0.0)
2658 };
2659 out.push(if lhs != 0.0 || rhs != 0.0 { 1.0 } else { 0.0 });
2660 }
2661
2662 Ok(self.allocate_tensor(out, shape))
2663 }
2664
2665 fn logical_xor(&self, a: &GpuTensorHandle, b: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2666 let (adata, bdata) = {
2667 let guard = registry().lock().unwrap();
2668 let a_vec = guard
2669 .get(&a.buffer_id)
2670 .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", a.buffer_id))?
2671 .clone();
2672 let b_vec = guard
2673 .get(&b.buffer_id)
2674 .ok_or_else(|| anyhow!("logical_xor: unknown buffer {}", b.buffer_id))?
2675 .clone();
2676 (a_vec, b_vec)
2677 };
2678
2679 let shape =
2680 runtime_broadcast_shapes("logical_xor", &a.shape, &b.shape).map_err(|e| anyhow!(e))?;
2681 let total: usize = shape.iter().copied().product();
2682 if total == 0 {
2683 return Ok(self.allocate_tensor(Vec::new(), shape));
2684 }
2685
2686 let strides_a = runtime_compute_strides(&a.shape);
2687 let strides_b = runtime_compute_strides(&b.shape);
2688 let len_a = adata.len();
2689 let len_b = bdata.len();
2690
2691 let mut out = Vec::with_capacity(total);
2692 for idx in 0..total {
2693 let lhs = if len_a == 0 {
2694 0.0
2695 } else {
2696 let offset = runtime_broadcast_index(idx, &shape, &a.shape, &strides_a);
2697 *adata.get(offset).unwrap_or(&0.0)
2698 };
2699 let rhs = if len_b == 0 {
2700 0.0
2701 } else {
2702 let offset = runtime_broadcast_index(idx, &shape, &b.shape, &strides_b);
2703 *bdata.get(offset).unwrap_or(&0.0)
2704 };
2705 let lhs_true = lhs != 0.0;
2706 let rhs_true = rhs != 0.0;
2707 out.push(if lhs_true ^ rhs_true { 1.0 } else { 0.0 });
2708 }
2709
2710 Ok(self.allocate_tensor(out, shape))
2711 }
2712
2713 fn logical_not(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2714 let data = {
2715 let guard = registry().lock().unwrap();
2716 guard
2717 .get(&a.buffer_id)
2718 .ok_or_else(|| anyhow!("logical_not: unknown buffer {}", a.buffer_id))?
2719 .clone()
2720 };
2721
2722 let shape = a.shape.clone();
2723 if data.is_empty() {
2724 return Ok(self.allocate_tensor(Vec::new(), shape));
2725 }
2726
2727 let mut out = Vec::with_capacity(data.len());
2728 for value in data {
2729 out.push(if value == 0.0 { 1.0 } else { 0.0 });
2730 }
2731
2732 Ok(self.allocate_tensor(out, shape))
2733 }
2734
2735 fn logical_isreal(&self, a: &GpuTensorHandle) -> Result<bool> {
2736 {
2737 let guard = registry().lock().unwrap();
2738 guard
2739 .get(&a.buffer_id)
2740 .ok_or_else(|| anyhow!("logical_isreal: unknown buffer {}", a.buffer_id))?;
2741 }
2742 Ok(true)
2743 }
2744
2745 fn logical_isfinite(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2746 let data = {
2747 let guard = registry().lock().unwrap();
2748 guard
2749 .get(&a.buffer_id)
2750 .ok_or_else(|| anyhow!("logical_isfinite: unknown buffer {}", a.buffer_id))?
2751 .clone()
2752 };
2753
2754 let shape = a.shape.clone();
2755 if data.is_empty() {
2756 return Ok(self.allocate_tensor(Vec::new(), shape));
2757 }
2758
2759 let mut out = Vec::with_capacity(data.len());
2760 for value in data {
2761 out.push(if value.is_finite() { 1.0 } else { 0.0 });
2762 }
2763
2764 Ok(self.allocate_tensor(out, shape))
2765 }
2766
2767 fn logical_isnan(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2768 let data = {
2769 let guard = registry().lock().unwrap();
2770 guard
2771 .get(&a.buffer_id)
2772 .ok_or_else(|| anyhow!("logical_isnan: unknown buffer {}", a.buffer_id))?
2773 .clone()
2774 };
2775
2776 let shape = a.shape.clone();
2777 if data.is_empty() {
2778 return Ok(self.allocate_tensor(Vec::new(), shape));
2779 }
2780
2781 let mut out = Vec::with_capacity(data.len());
2782 for value in data {
2783 out.push(if value.is_nan() { 1.0 } else { 0.0 });
2784 }
2785
2786 Ok(self.allocate_tensor(out, shape))
2787 }
2788
2789 fn logical_isinf(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
2790 let data = {
2791 let guard = registry().lock().unwrap();
2792 guard
2793 .get(&a.buffer_id)
2794 .ok_or_else(|| anyhow!("logical_isinf: unknown buffer {}", a.buffer_id))?
2795 .clone()
2796 };
2797
2798 let shape = a.shape.clone();
2799 if data.is_empty() {
2800 return Ok(self.allocate_tensor(Vec::new(), shape));
2801 }
2802
2803 let mut out = Vec::with_capacity(data.len());
2804 for value in data {
2805 out.push(if value.is_infinite() { 1.0 } else { 0.0 });
2806 }
2807
2808 Ok(self.allocate_tensor(out, shape))
2809 }
2810
2811 fn elem_hypot<'a>(
2812 &'a self,
2813 a: &'a GpuTensorHandle,
2814 b: &'a GpuTensorHandle,
2815 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2816 Box::pin(async move {
2817 let guard = registry().lock().unwrap();
2818 let abuf = guard
2819 .get(&a.buffer_id)
2820 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2821 let bbuf = guard
2822 .get(&b.buffer_id)
2823 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
2824 if a.shape != b.shape {
2825 return Err(anyhow::anyhow!("shape mismatch"));
2826 }
2827 let mut out = vec![0.0; abuf.len()];
2828 for i in 0..abuf.len() {
2829 out[i] = abuf[i].hypot(bbuf[i]);
2830 }
2831 drop(guard);
2832 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2833 let mut guard2 = registry().lock().unwrap();
2834 guard2.insert(id, out);
2835 Ok(GpuTensorHandle {
2836 shape: a.shape.clone(),
2837 device_id: 0,
2838 buffer_id: id,
2839 })
2840 })
2841 }
2842 fn elem_atan2<'a>(
2843 &'a self,
2844 y: &'a GpuTensorHandle,
2845 x: &'a GpuTensorHandle,
2846 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2847 Box::pin(async move {
2848 let guard = registry().lock().unwrap();
2849 let ybuf = guard
2850 .get(&y.buffer_id)
2851 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", y.buffer_id))?;
2852 let xbuf = guard
2853 .get(&x.buffer_id)
2854 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", x.buffer_id))?;
2855 if y.shape != x.shape {
2856 return Err(anyhow::anyhow!("shape mismatch"));
2857 }
2858 let mut out = vec![0.0; ybuf.len()];
2859 for idx in 0..ybuf.len() {
2860 out[idx] = ybuf[idx].atan2(xbuf[idx]);
2861 }
2862 drop(guard);
2863 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2864 let mut guard2 = registry().lock().unwrap();
2865 guard2.insert(id, out);
2866 Ok(GpuTensorHandle {
2867 shape: y.shape.clone(),
2868 device_id: 0,
2869 buffer_id: id,
2870 })
2871 })
2872 }
2873
2874 fn unary_sin<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
2875 Box::pin(async move {
2876 let guard = registry().lock().unwrap();
2877 let abuf = guard
2878 .get(&a.buffer_id)
2879 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2880 let out: Vec<f64> = abuf.iter().map(|&x| x.sin()).collect();
2881 drop(guard);
2882 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2883 let mut guard2 = registry().lock().unwrap();
2884 guard2.insert(id, out);
2885 Ok(GpuTensorHandle {
2886 shape: a.shape.clone(),
2887 device_id: 0,
2888 buffer_id: id,
2889 })
2890 })
2891 }
2892 fn unary_sinc<'a>(
2893 &'a self,
2894 a: &'a GpuTensorHandle,
2895 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2896 Box::pin(async move {
2897 let guard = registry().lock().unwrap();
2898 let abuf = guard
2899 .get(&a.buffer_id)
2900 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2901 let out: Vec<f64> = abuf.iter().copied().map(sinc_scalar_host).collect();
2902 drop(guard);
2903 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2904 let mut guard2 = registry().lock().unwrap();
2905 guard2.insert(id, out);
2906 Ok(GpuTensorHandle {
2907 shape: a.shape.clone(),
2908 device_id: 0,
2909 buffer_id: id,
2910 })
2911 })
2912 }
2913 fn unary_gamma<'a>(
2914 &'a self,
2915 _a: &'a GpuTensorHandle,
2916 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2917 Box::pin(async move { Err(anyhow::anyhow!("unary_gamma not supported by provider")) })
2918 }
2919 fn unary_factorial<'a>(
2920 &'a self,
2921 a: &'a GpuTensorHandle,
2922 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2923 Box::pin(async move {
2924 let guard = registry().lock().unwrap();
2925 let abuf = guard
2926 .get(&a.buffer_id)
2927 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2928 let out: Vec<f64> = abuf.iter().copied().map(factorial_scalar_host).collect();
2929 drop(guard);
2930 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2931 let mut guard2 = registry().lock().unwrap();
2932 guard2.insert(id, out);
2933 Ok(GpuTensorHandle {
2934 shape: a.shape.clone(),
2935 device_id: 0,
2936 buffer_id: id,
2937 })
2938 })
2939 }
2940 fn unary_asinh<'a>(
2941 &'a self,
2942 a: &'a GpuTensorHandle,
2943 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2944 Box::pin(async move {
2945 let guard = registry().lock().unwrap();
2946 let abuf = guard
2947 .get(&a.buffer_id)
2948 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2949 let out: Vec<f64> = abuf.iter().map(|&x| x.asinh()).collect();
2950 drop(guard);
2951 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2952 let mut guard2 = registry().lock().unwrap();
2953 guard2.insert(id, out);
2954 Ok(GpuTensorHandle {
2955 shape: a.shape.clone(),
2956 device_id: 0,
2957 buffer_id: id,
2958 })
2959 })
2960 }
2961 fn unary_sinh<'a>(
2962 &'a self,
2963 a: &'a GpuTensorHandle,
2964 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2965 Box::pin(async move {
2966 let guard = registry().lock().unwrap();
2967 let abuf = guard
2968 .get(&a.buffer_id)
2969 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2970 let out: Vec<f64> = abuf.iter().map(|&x| x.sinh()).collect();
2971 drop(guard);
2972 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2973 let mut guard2 = registry().lock().unwrap();
2974 guard2.insert(id, out);
2975 Ok(GpuTensorHandle {
2976 shape: a.shape.clone(),
2977 device_id: 0,
2978 buffer_id: id,
2979 })
2980 })
2981 }
2982 fn unary_cosh<'a>(
2983 &'a self,
2984 a: &'a GpuTensorHandle,
2985 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
2986 Box::pin(async move {
2987 let guard = registry().lock().unwrap();
2988 let abuf = guard
2989 .get(&a.buffer_id)
2990 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
2991 let out: Vec<f64> = abuf.iter().map(|&x| x.cosh()).collect();
2992 drop(guard);
2993 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
2994 let mut guard2 = registry().lock().unwrap();
2995 guard2.insert(id, out);
2996 Ok(GpuTensorHandle {
2997 shape: a.shape.clone(),
2998 device_id: 0,
2999 buffer_id: id,
3000 })
3001 })
3002 }
3003
3004 fn unary_asin<'a>(
3005 &'a self,
3006 a: &'a GpuTensorHandle,
3007 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3008 Box::pin(async move {
3009 let guard = registry().lock().unwrap();
3010 let abuf = guard
3011 .get(&a.buffer_id)
3012 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3013 let out: Vec<f64> = abuf.iter().map(|&x| x.asin()).collect();
3014 drop(guard);
3015 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3016 let mut guard2 = registry().lock().unwrap();
3017 guard2.insert(id, out);
3018 Ok(GpuTensorHandle {
3019 shape: a.shape.clone(),
3020 device_id: 0,
3021 buffer_id: id,
3022 })
3023 })
3024 }
3025 fn unary_acos<'a>(
3026 &'a self,
3027 a: &'a GpuTensorHandle,
3028 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3029 Box::pin(async move {
3030 let guard = registry().lock().unwrap();
3031 let abuf = guard
3032 .get(&a.buffer_id)
3033 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3034 let out: Vec<f64> = abuf.iter().map(|&x| x.acos()).collect();
3035 drop(guard);
3036 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3037 let mut guard2 = registry().lock().unwrap();
3038 guard2.insert(id, out);
3039 Ok(GpuTensorHandle {
3040 shape: a.shape.clone(),
3041 device_id: 0,
3042 buffer_id: id,
3043 })
3044 })
3045 }
3046 fn unary_acosh<'a>(
3047 &'a self,
3048 a: &'a GpuTensorHandle,
3049 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3050 Box::pin(async move {
3051 let guard = registry().lock().unwrap();
3052 let abuf = guard
3053 .get(&a.buffer_id)
3054 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3055 let out: Vec<f64> = abuf.iter().map(|&x| x.acosh()).collect();
3056 drop(guard);
3057 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3058 let mut guard2 = registry().lock().unwrap();
3059 guard2.insert(id, out);
3060 Ok(GpuTensorHandle {
3061 shape: a.shape.clone(),
3062 device_id: 0,
3063 buffer_id: id,
3064 })
3065 })
3066 }
3067
3068 fn unary_tan<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3069 Box::pin(async move {
3070 let guard = registry().lock().unwrap();
3071 let abuf = guard
3072 .get(&a.buffer_id)
3073 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3074 let out: Vec<f64> = abuf.iter().map(|&x| x.tan()).collect();
3075 drop(guard);
3076 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3077 let mut guard2 = registry().lock().unwrap();
3078 guard2.insert(id, out);
3079 Ok(GpuTensorHandle {
3080 shape: a.shape.clone(),
3081 device_id: 0,
3082 buffer_id: id,
3083 })
3084 })
3085 }
3086 fn unary_tanh<'a>(
3087 &'a self,
3088 a: &'a GpuTensorHandle,
3089 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3090 Box::pin(async move {
3091 let guard = registry().lock().unwrap();
3092 let abuf = guard
3093 .get(&a.buffer_id)
3094 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3095 let out: Vec<f64> = abuf.iter().map(|&x| x.tanh()).collect();
3096 drop(guard);
3097 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3098 let mut guard2 = registry().lock().unwrap();
3099 guard2.insert(id, out);
3100 Ok(GpuTensorHandle {
3101 shape: a.shape.clone(),
3102 device_id: 0,
3103 buffer_id: id,
3104 })
3105 })
3106 }
3107
3108 fn unary_atan<'a>(
3109 &'a self,
3110 a: &'a GpuTensorHandle,
3111 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3112 Box::pin(async move {
3113 let guard = registry().lock().unwrap();
3114 let abuf = guard
3115 .get(&a.buffer_id)
3116 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3117 let out: Vec<f64> = abuf.iter().map(|&x| x.atan()).collect();
3118 drop(guard);
3119 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3120 let mut guard2 = registry().lock().unwrap();
3121 guard2.insert(id, out);
3122 Ok(GpuTensorHandle {
3123 shape: a.shape.clone(),
3124 device_id: 0,
3125 buffer_id: id,
3126 })
3127 })
3128 }
3129 fn unary_atanh<'a>(
3130 &'a self,
3131 a: &'a GpuTensorHandle,
3132 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3133 Box::pin(async move {
3134 let guard = registry().lock().unwrap();
3135 let abuf = guard
3136 .get(&a.buffer_id)
3137 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3138 let out: Vec<f64> = abuf.iter().map(|&x| x.atanh()).collect();
3139 drop(guard);
3140 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3141 let mut guard2 = registry().lock().unwrap();
3142 guard2.insert(id, out);
3143 Ok(GpuTensorHandle {
3144 shape: a.shape.clone(),
3145 device_id: 0,
3146 buffer_id: id,
3147 })
3148 })
3149 }
3150
3151 fn unary_ceil<'a>(
3152 &'a self,
3153 a: &'a GpuTensorHandle,
3154 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3155 Box::pin(async move {
3156 let guard = registry().lock().unwrap();
3157 let abuf = guard
3158 .get(&a.buffer_id)
3159 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3160 let out: Vec<f64> = abuf.iter().map(|&x| x.ceil()).collect();
3161 drop(guard);
3162 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3163 let mut guard2 = registry().lock().unwrap();
3164 guard2.insert(id, out);
3165 Ok(GpuTensorHandle {
3166 shape: a.shape.clone(),
3167 device_id: 0,
3168 buffer_id: id,
3169 })
3170 })
3171 }
3172
3173 fn unary_floor<'a>(
3174 &'a self,
3175 a: &'a GpuTensorHandle,
3176 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3177 Box::pin(async move {
3178 let guard = registry().lock().unwrap();
3179 let abuf = guard
3180 .get(&a.buffer_id)
3181 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3182 let out: Vec<f64> = abuf.iter().map(|&x| x.floor()).collect();
3183 drop(guard);
3184 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3185 let mut guard2 = registry().lock().unwrap();
3186 guard2.insert(id, out);
3187 Ok(GpuTensorHandle {
3188 shape: a.shape.clone(),
3189 device_id: 0,
3190 buffer_id: id,
3191 })
3192 })
3193 }
3194
3195 fn unary_round<'a>(
3196 &'a self,
3197 a: &'a GpuTensorHandle,
3198 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3199 Box::pin(async move {
3200 let guard = registry().lock().unwrap();
3201 let abuf = guard
3202 .get(&a.buffer_id)
3203 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3204 let out: Vec<f64> = abuf.iter().map(|&x| x.round()).collect();
3205 drop(guard);
3206 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3207 let mut guard2 = registry().lock().unwrap();
3208 guard2.insert(id, out);
3209 Ok(GpuTensorHandle {
3210 shape: a.shape.clone(),
3211 device_id: 0,
3212 buffer_id: id,
3213 })
3214 })
3215 }
3216
3217 fn unary_fix<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3218 Box::pin(async move {
3219 let guard = registry().lock().unwrap();
3220 let abuf = guard
3221 .get(&a.buffer_id)
3222 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3223 let out: Vec<f64> = abuf
3224 .iter()
3225 .map(|&x| {
3226 if !x.is_finite() {
3227 x
3228 } else {
3229 let truncated = x.trunc();
3230 if truncated == 0.0 {
3231 0.0
3232 } else {
3233 truncated
3234 }
3235 }
3236 })
3237 .collect();
3238 drop(guard);
3239 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3240 let mut guard2 = registry().lock().unwrap();
3241 guard2.insert(id, out);
3242 Ok(GpuTensorHandle {
3243 shape: a.shape.clone(),
3244 device_id: 0,
3245 buffer_id: id,
3246 })
3247 })
3248 }
3249
3250 fn unary_cos<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3251 Box::pin(async move {
3252 let guard = registry().lock().unwrap();
3253 let abuf = guard
3254 .get(&a.buffer_id)
3255 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3256 let out: Vec<f64> = abuf.iter().map(|&x| x.cos()).collect();
3257 drop(guard);
3258 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3259 let mut guard2 = registry().lock().unwrap();
3260 guard2.insert(id, out);
3261 Ok(GpuTensorHandle {
3262 shape: a.shape.clone(),
3263 device_id: 0,
3264 buffer_id: id,
3265 })
3266 })
3267 }
3268
3269 fn unary_abs<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3270 Box::pin(async move {
3271 let guard = registry().lock().unwrap();
3272 let abuf = guard
3273 .get(&a.buffer_id)
3274 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3275 let out: Vec<f64> = abuf.iter().map(|&x| x.abs()).collect();
3276 drop(guard);
3277 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3278 let mut guard2 = registry().lock().unwrap();
3279 guard2.insert(id, out);
3280 Ok(GpuTensorHandle {
3281 shape: a.shape.clone(),
3282 device_id: 0,
3283 buffer_id: id,
3284 })
3285 })
3286 }
3287
3288 fn unary_exp<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3289 Box::pin(async move {
3290 let guard = registry().lock().unwrap();
3291 let abuf = guard
3292 .get(&a.buffer_id)
3293 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3294 let out: Vec<f64> = abuf.iter().map(|&x| x.exp()).collect();
3295 drop(guard);
3296 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3297 let mut guard2 = registry().lock().unwrap();
3298 guard2.insert(id, out);
3299 Ok(GpuTensorHandle {
3300 shape: a.shape.clone(),
3301 device_id: 0,
3302 buffer_id: id,
3303 })
3304 })
3305 }
3306
3307 fn unary_log<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, GpuTensorHandle> {
3308 Box::pin(async move {
3309 let guard = registry().lock().unwrap();
3310 let abuf = guard
3311 .get(&a.buffer_id)
3312 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3313 let out: Vec<f64> = abuf.iter().map(|&x| x.ln()).collect();
3314 drop(guard);
3315 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3316 let mut guard2 = registry().lock().unwrap();
3317 guard2.insert(id, out);
3318 Ok(GpuTensorHandle {
3319 shape: a.shape.clone(),
3320 device_id: 0,
3321 buffer_id: id,
3322 })
3323 })
3324 }
3325
3326 fn unary_sqrt<'a>(
3327 &'a self,
3328 a: &'a GpuTensorHandle,
3329 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3330 Box::pin(async move {
3331 let guard = registry().lock().unwrap();
3332 let abuf = guard
3333 .get(&a.buffer_id)
3334 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3335 let out: Vec<f64> = abuf.iter().map(|&x| x.sqrt()).collect();
3336 drop(guard);
3337 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3338 let mut guard2 = registry().lock().unwrap();
3339 guard2.insert(id, out);
3340 Ok(GpuTensorHandle {
3341 shape: a.shape.clone(),
3342 device_id: 0,
3343 buffer_id: id,
3344 })
3345 })
3346 }
3347
3348 fn unary_double<'a>(
3349 &'a self,
3350 a: &'a GpuTensorHandle,
3351 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3352 Box::pin(async move {
3353 let guard = registry().lock().unwrap();
3354 let abuf = guard
3355 .get(&a.buffer_id)
3356 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
3357 .clone();
3358 drop(guard);
3359 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3360 let mut guard2 = registry().lock().unwrap();
3361 guard2.insert(id, abuf);
3362 Ok(GpuTensorHandle {
3363 shape: a.shape.clone(),
3364 device_id: 0,
3365 buffer_id: id,
3366 })
3367 })
3368 }
3369
3370 fn unary_single<'a>(
3371 &'a self,
3372 a: &'a GpuTensorHandle,
3373 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3374 Box::pin(async move {
3375 let guard = registry().lock().unwrap();
3376 let abuf = guard
3377 .get(&a.buffer_id)
3378 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3379 let out: Vec<f64> = abuf.iter().map(|&x| (x as f32) as f64).collect();
3380 drop(guard);
3381 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3382 let mut guard2 = registry().lock().unwrap();
3383 guard2.insert(id, out);
3384 Ok(GpuTensorHandle {
3385 shape: a.shape.clone(),
3386 device_id: 0,
3387 buffer_id: id,
3388 })
3389 })
3390 }
3391
3392 fn unary_pow2<'a>(
3393 &'a self,
3394 a: &'a GpuTensorHandle,
3395 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3396 Box::pin(async move {
3397 let guard = registry().lock().unwrap();
3398 let abuf = guard
3399 .get(&a.buffer_id)
3400 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3401 let out: Vec<f64> = abuf.iter().map(|&x| x.exp2()).collect();
3402 drop(guard);
3403 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3404 let mut guard2 = registry().lock().unwrap();
3405 guard2.insert(id, out);
3406 Ok(GpuTensorHandle {
3407 shape: a.shape.clone(),
3408 device_id: 0,
3409 buffer_id: id,
3410 })
3411 })
3412 }
3413
3414 fn unary_nextpow2<'a>(
3415 &'a self,
3416 a: &'a GpuTensorHandle,
3417 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
3418 Box::pin(async move {
3419 let guard = registry().lock().unwrap();
3420 let abuf = guard
3421 .get(&a.buffer_id)
3422 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3423 let out: Vec<f64> = abuf
3424 .iter()
3425 .map(|&x| {
3426 let ax = x.abs();
3427 if ax == 0.0 {
3428 0.0
3429 } else {
3430 ax.log2().ceil()
3431 }
3432 })
3433 .collect();
3434 drop(guard);
3435 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3436 let mut guard2 = registry().lock().unwrap();
3437 guard2.insert(id, out);
3438 Ok(GpuTensorHandle {
3439 shape: a.shape.clone(),
3440 device_id: 0,
3441 buffer_id: id,
3442 })
3443 })
3444 }
3445
3446 fn pow2_scale(
3447 &self,
3448 mantissa: &GpuTensorHandle,
3449 exponent: &GpuTensorHandle,
3450 ) -> Result<GpuTensorHandle> {
3451 let guard = registry().lock().unwrap();
3452 let mbuf = guard
3453 .get(&mantissa.buffer_id)
3454 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", mantissa.buffer_id))?;
3455 let ebuf = guard
3456 .get(&exponent.buffer_id)
3457 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", exponent.buffer_id))?;
3458 if mantissa.shape != exponent.shape {
3459 return Err(anyhow::anyhow!("shape mismatch"));
3460 }
3461 let mut out = vec![0.0f64; mbuf.len()];
3462 for i in 0..mbuf.len() {
3463 out[i] = mbuf[i] * ebuf[i].exp2();
3464 }
3465 drop(guard);
3466 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3467 let mut guard2 = registry().lock().unwrap();
3468 guard2.insert(id, out);
3469 Ok(GpuTensorHandle {
3470 shape: mantissa.shape.clone(),
3471 device_id: 0,
3472 buffer_id: id,
3473 })
3474 }
3475
3476 fn scalar_add(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3477 let guard = registry().lock().unwrap();
3478 let abuf = guard
3479 .get(&a.buffer_id)
3480 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3481 let out: Vec<f64> = abuf.iter().map(|&x| x + scalar).collect();
3482 drop(guard);
3483 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3484 let mut guard2 = registry().lock().unwrap();
3485 guard2.insert(id, out);
3486 Ok(GpuTensorHandle {
3487 shape: a.shape.clone(),
3488 device_id: 0,
3489 buffer_id: id,
3490 })
3491 }
3492
3493 fn scalar_sub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3494 let guard = registry().lock().unwrap();
3495 let abuf = guard
3496 .get(&a.buffer_id)
3497 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3498 let out: Vec<f64> = abuf.iter().map(|&x| x - scalar).collect();
3499 drop(guard);
3500 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3501 let mut guard2 = registry().lock().unwrap();
3502 guard2.insert(id, out);
3503 Ok(GpuTensorHandle {
3504 shape: a.shape.clone(),
3505 device_id: 0,
3506 buffer_id: id,
3507 })
3508 }
3509
3510 fn scalar_mul(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3511 let guard = registry().lock().unwrap();
3512 let abuf = guard
3513 .get(&a.buffer_id)
3514 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3515 let out: Vec<f64> = abuf.iter().map(|&x| x * scalar).collect();
3516 drop(guard);
3517 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3518 let mut guard2 = registry().lock().unwrap();
3519 guard2.insert(id, out);
3520 Ok(GpuTensorHandle {
3521 shape: a.shape.clone(),
3522 device_id: 0,
3523 buffer_id: id,
3524 })
3525 }
3526
3527 fn scalar_div(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3528 let guard = registry().lock().unwrap();
3529 let abuf = guard
3530 .get(&a.buffer_id)
3531 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3532 let out: Vec<f64> = if scalar == 0.0 {
3533 abuf.iter().map(|&x| f64::INFINITY * x.signum()).collect()
3534 } else {
3535 abuf.iter().map(|&x| x / scalar).collect()
3536 };
3537 drop(guard);
3538 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3539 let mut guard2 = registry().lock().unwrap();
3540 guard2.insert(id, out);
3541 Ok(GpuTensorHandle {
3542 shape: a.shape.clone(),
3543 device_id: 0,
3544 buffer_id: id,
3545 })
3546 }
3547
3548 fn scalar_rsub(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3549 let guard = registry().lock().unwrap();
3551 let abuf = guard
3552 .get(&a.buffer_id)
3553 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3554 let out: Vec<f64> = abuf.iter().map(|&x| scalar - x).collect();
3555 drop(guard);
3556 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3557 let mut guard2 = registry().lock().unwrap();
3558 guard2.insert(id, out);
3559 Ok(GpuTensorHandle {
3560 shape: a.shape.clone(),
3561 device_id: 0,
3562 buffer_id: id,
3563 })
3564 }
3565
3566 fn scalar_rdiv(&self, a: &GpuTensorHandle, scalar: f64) -> Result<GpuTensorHandle> {
3567 let guard = registry().lock().unwrap();
3569 let abuf = guard
3570 .get(&a.buffer_id)
3571 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3572 let out: Vec<f64> = abuf
3573 .iter()
3574 .map(|&x| {
3575 if x == 0.0 {
3576 f64::INFINITY * scalar.signum()
3577 } else {
3578 scalar / x
3579 }
3580 })
3581 .collect();
3582 drop(guard);
3583 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3584 let mut guard2 = registry().lock().unwrap();
3585 guard2.insert(id, out);
3586 Ok(GpuTensorHandle {
3587 shape: a.shape.clone(),
3588 device_id: 0,
3589 buffer_id: id,
3590 })
3591 }
3592
3593 fn transpose(&self, a: &GpuTensorHandle) -> Result<GpuTensorHandle> {
3594 if a.shape.len() != 2 {
3595 return Err(anyhow::anyhow!("transpose: only 2D supported"));
3596 }
3597 let rows = a.shape[0];
3598 let cols = a.shape[1];
3599 let guard = registry().lock().unwrap();
3600 let abuf = guard
3601 .get(&a.buffer_id)
3602 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
3603 let mut out = vec![0.0; abuf.len()];
3604 for i in 0..rows {
3605 for j in 0..cols {
3606 let src = i + j * rows;
3607 let dst = j + i * cols;
3608 out[dst] = abuf[src];
3609 }
3610 }
3611 drop(guard);
3612 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
3613 let mut guard2 = registry().lock().unwrap();
3614 guard2.insert(id, out);
3615 Ok(GpuTensorHandle {
3616 shape: vec![cols, rows],
3617 device_id: 0,
3618 buffer_id: id,
3619 })
3620 }
3621 fn conv1d(
3622 &self,
3623 signal: &GpuTensorHandle,
3624 kernel: &GpuTensorHandle,
3625 options: ProviderConv1dOptions,
3626 ) -> Result<GpuTensorHandle> {
3627 let signal_len: usize = signal.shape.iter().copied().product();
3628 let kernel_len: usize = kernel.shape.iter().copied().product();
3629
3630 if signal_len == 0 || kernel_len == 0 {
3631 let shape = conv1d_output_shape(0, options.orientation);
3632 return Ok(self.allocate_tensor(Vec::new(), shape));
3633 }
3634
3635 if matches!(options.mode, ProviderConvMode::Valid) && signal_len < kernel_len {
3636 let shape = conv1d_output_shape(0, options.orientation);
3637 return Ok(self.allocate_tensor(Vec::new(), shape));
3638 }
3639
3640 let (signal_data, kernel_data) = {
3641 let guard = registry().lock().unwrap();
3642 let signal_buf = guard
3643 .get(&signal.buffer_id)
3644 .cloned()
3645 .ok_or_else(|| anyhow!("conv1d: unknown signal buffer {}", signal.buffer_id))?;
3646 let kernel_buf = guard
3647 .get(&kernel.buffer_id)
3648 .cloned()
3649 .ok_or_else(|| anyhow!("conv1d: unknown kernel buffer {}", kernel.buffer_id))?;
3650 (signal_buf, kernel_buf)
3651 };
3652
3653 ensure!(
3654 signal_data.len() == signal_len,
3655 "conv1d: signal length mismatch (shape implies {}, buffer has {})",
3656 signal_len,
3657 signal_data.len()
3658 );
3659 ensure!(
3660 kernel_data.len() == kernel_len,
3661 "conv1d: kernel length mismatch (shape implies {}, buffer has {})",
3662 kernel_len,
3663 kernel_data.len()
3664 );
3665
3666 let full = convolve_real(&signal_data, &kernel_data)?;
3667 let shaped = apply_conv_mode_real(&full, options.mode, signal_len, kernel_len);
3668 let shape = conv1d_output_shape(shaped.len(), options.orientation);
3669 Ok(self.allocate_tensor(shaped, shape))
3670 }
3671 fn conv2d(
3672 &self,
3673 signal: &GpuTensorHandle,
3674 kernel: &GpuTensorHandle,
3675 mode: ProviderConvMode,
3676 ) -> Result<GpuTensorHandle> {
3677 ensure_diag_shape("conv2d", &signal.shape)?;
3678 ensure_diag_shape("conv2d", &kernel.shape)?;
3679 let (signal_rows, signal_cols) = rows_cols(&signal.shape);
3680 let (kernel_rows, kernel_cols) = rows_cols(&kernel.shape);
3681
3682 let signal_len: usize = signal.shape.iter().copied().product();
3683 let kernel_len: usize = kernel.shape.iter().copied().product();
3684
3685 let empty_shape = match mode {
3686 ProviderConvMode::Full | ProviderConvMode::Valid => vec![0, 0],
3687 ProviderConvMode::Same => vec![signal_rows, signal_cols],
3688 };
3689
3690 if signal_len == 0
3691 || kernel_len == 0
3692 || signal_rows == 0
3693 || signal_cols == 0
3694 || kernel_rows == 0
3695 || kernel_cols == 0
3696 {
3697 let len = empty_shape.iter().copied().product::<usize>();
3698 return Ok(self.allocate_tensor(vec![0.0; len], empty_shape));
3699 }
3700
3701 let (signal_data, kernel_data) = {
3702 let guard = registry().lock().unwrap();
3703 let signal_buf = guard
3704 .get(&signal.buffer_id)
3705 .cloned()
3706 .ok_or_else(|| anyhow!("conv2d: unknown signal buffer {}", signal.buffer_id))?;
3707 let kernel_buf = guard
3708 .get(&kernel.buffer_id)
3709 .cloned()
3710 .ok_or_else(|| anyhow!("conv2d: unknown kernel buffer {}", kernel.buffer_id))?;
3711 (signal_buf, kernel_buf)
3712 };
3713
3714 ensure!(
3715 signal_data.len() == signal_len,
3716 "conv2d: signal length mismatch (shape implies {}, buffer has {})",
3717 signal_len,
3718 signal_data.len()
3719 );
3720 ensure!(
3721 kernel_data.len() == kernel_len,
3722 "conv2d: kernel length mismatch (shape implies {}, buffer has {})",
3723 kernel_len,
3724 kernel_data.len()
3725 );
3726
3727 let full_rows = signal_rows
3728 .checked_add(kernel_rows)
3729 .and_then(|v| v.checked_sub(1))
3730 .ok_or_else(|| anyhow!("conv2d: row size overflow"))?;
3731 let full_cols = signal_cols
3732 .checked_add(kernel_cols)
3733 .and_then(|v| v.checked_sub(1))
3734 .ok_or_else(|| anyhow!("conv2d: column size overflow"))?;
3735 full_rows
3736 .checked_mul(full_cols)
3737 .ok_or_else(|| anyhow!("conv2d: output size overflow"))?;
3738 let full = conv2d_full_real(
3739 &signal_data,
3740 signal_rows,
3741 signal_cols,
3742 &kernel_data,
3743 kernel_rows,
3744 kernel_cols,
3745 full_rows,
3746 full_cols,
3747 );
3748
3749 let (shaped, out_rows, out_cols) = apply_conv2_mode_real_2d(
3750 &full,
3751 full_rows,
3752 full_cols,
3753 mode,
3754 signal_rows,
3755 signal_cols,
3756 kernel_rows,
3757 kernel_cols,
3758 );
3759 Ok(self.allocate_tensor(shaped, vec![out_rows, out_cols]))
3760 }
3761 fn iir_filter<'a>(
3762 &'a self,
3763 b: &'a GpuTensorHandle,
3764 a: &'a GpuTensorHandle,
3765 x: &'a GpuTensorHandle,
3766 options: ProviderIirFilterOptions,
3767 ) -> AccelProviderFuture<'a, ProviderIirFilterResult> {
3768 Box::pin(async move {
3769 let ProviderIirFilterOptions { dim, zi } = options;
3770
3771 let nb = product(&b.shape);
3772 let na = product(&a.shape);
3773 ensure!(
3774 nb > 0,
3775 "iir_filter: numerator coefficients must not be empty"
3776 );
3777 ensure!(
3778 na > 0,
3779 "iir_filter: denominator coefficients must not be empty"
3780 );
3781
3782 let signal_elems = product(&x.shape);
3783 let zi_shape = zi.as_ref().map(|handle| handle.shape.clone());
3784
3785 let (b_data, a_data, x_data, zi_data) = {
3786 let guard = registry().lock().unwrap();
3787 let b_buf = guard.get(&b.buffer_id).cloned().ok_or_else(|| {
3788 anyhow!("iir_filter: unknown numerator buffer {}", b.buffer_id)
3789 })?;
3790 let a_buf = guard.get(&a.buffer_id).cloned().ok_or_else(|| {
3791 anyhow!("iir_filter: unknown denominator buffer {}", a.buffer_id)
3792 })?;
3793 let x_buf = guard
3794 .get(&x.buffer_id)
3795 .cloned()
3796 .ok_or_else(|| anyhow!("iir_filter: unknown signal buffer {}", x.buffer_id))?;
3797 ensure!(
3798 b_buf.len() == nb,
3799 "iir_filter: numerator length mismatch (shape implies {}, buffer has {})",
3800 nb,
3801 b_buf.len()
3802 );
3803 ensure!(
3804 a_buf.len() == na,
3805 "iir_filter: denominator length mismatch (shape implies {}, buffer has {})",
3806 na,
3807 a_buf.len()
3808 );
3809 ensure!(
3810 x_buf.len() == signal_elems,
3811 "iir_filter: signal length mismatch (shape implies {}, buffer has {})",
3812 signal_elems,
3813 x_buf.len()
3814 );
3815 let zi_buf = if let Some(ref zi_handle) = zi {
3816 Some(guard.get(&zi_handle.buffer_id).cloned().ok_or_else(|| {
3817 anyhow!(
3818 "iir_filter: unknown initial state buffer {}",
3819 zi_handle.buffer_id
3820 )
3821 })?)
3822 } else {
3823 None
3824 };
3825 (b_buf, a_buf, x_buf, zi_buf)
3826 };
3827
3828 ensure!(
3829 !a_data.is_empty() && a_data[0] != 0.0,
3830 "iir_filter: denominator coefficient a(1) must be non-zero"
3831 );
3832
3833 let mut shape_ext = x.shape.clone();
3834 if dim >= shape_ext.len() {
3835 shape_ext.extend(std::iter::repeat_n(1, dim + 1 - shape_ext.len()));
3836 }
3837 let dim_idx = dim;
3838 let dim_len = shape_ext.get(dim_idx).copied().unwrap_or(1);
3839
3840 let leading = if dim_idx == 0 {
3841 1
3842 } else {
3843 shape_ext[..dim_idx].iter().copied().product()
3844 };
3845 let trailing = if dim_idx + 1 >= shape_ext.len() {
3846 1
3847 } else {
3848 shape_ext[dim_idx + 1..].iter().copied().product()
3849 };
3850 let channel_count = leading * trailing;
3851
3852 let order = nb.max(na);
3853 let state_len = order.saturating_sub(1);
3854 let state_shape = filter_state_shape(shape_ext.clone(), dim_idx, state_len);
3855 let expected_states = state_len.saturating_mul(channel_count);
3856
3857 if let Some(ref shape) = zi_shape {
3858 ensure!(
3859 shapes_compatible(&state_shape, shape),
3860 "iir_filter: initial conditions are not compatible with the signal shape"
3861 );
3862 let zi_dim = if dim_idx < shape.len() {
3863 shape[dim_idx]
3864 } else {
3865 1
3866 };
3867 ensure!(
3868 zi_dim == state_len,
3869 "iir_filter: initial conditions must have {} states along dimension {}",
3870 state_len,
3871 dim + 1
3872 );
3873 }
3874
3875 let mut states = if state_len == 0 {
3876 Vec::new()
3877 } else if let Some(ref zi_buf) = zi_data {
3878 ensure!(
3879 zi_buf.len() == expected_states,
3880 "iir_filter: initial state vector length mismatch (expected {}, found {})",
3881 expected_states,
3882 zi_buf.len()
3883 );
3884 states_from_column_major(zi_buf, state_len, dim_idx, &shape_ext)
3885 } else {
3886 vec![0.0; expected_states]
3887 };
3888
3889 let mut b_norm = vec![0.0f64; order];
3890 let mut a_norm = vec![0.0f64; order];
3891 let a0 = a_data[0];
3892 for i in 0..order {
3893 let b_coeff = if i < nb { b_data[i] } else { 0.0 };
3894 b_norm[i] = b_coeff / a0;
3895 if i == 0 {
3896 a_norm[0] = 1.0;
3897 } else {
3898 let a_coeff = if i < na { a_data[i] } else { 0.0 };
3899 a_norm[i] = a_coeff / a0;
3900 }
3901 }
3902
3903 let mut output = vec![0.0f64; x_data.len()];
3904
3905 if state_len == 0 {
3906 let gain = b_norm[0];
3907 for (dst, &src) in output.iter_mut().zip(x_data.iter()) {
3908 *dst = gain * src;
3909 }
3910 } else if dim_len == 0 || channel_count == 0 {
3911 } else {
3913 for t in 0..trailing {
3914 let base = t
3915 .checked_mul(dim_len)
3916 .and_then(|v| v.checked_mul(leading))
3917 .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3918 for l in 0..leading {
3919 let channel_idx = t
3920 .checked_mul(leading)
3921 .and_then(|v| v.checked_add(l))
3922 .ok_or_else(|| anyhow!("iir_filter: index overflow"))?;
3923 if channel_idx >= channel_count {
3924 continue;
3925 }
3926 let state_base = channel_idx
3927 .checked_mul(state_len)
3928 .ok_or_else(|| anyhow!("iir_filter: state index overflow"))?;
3929 for step in 0..dim_len {
3930 let idx = base
3931 .checked_add(l)
3932 .and_then(|v| v.checked_add(step.saturating_mul(leading)))
3933 .ok_or_else(|| anyhow!("iir_filter: signal index overflow"))?;
3934 if idx >= x_data.len() {
3935 break;
3936 }
3937 let x_n = x_data[idx];
3938 let y =
3939 b_norm[0] * x_n + states.get(state_base).copied().unwrap_or(0.0);
3940 output[idx] = y;
3941 for i in 1..order {
3942 let next_state = if i < state_len {
3943 states[state_base + i]
3944 } else {
3945 0.0
3946 };
3947 let new_state = b_norm[i] * x_n + next_state - a_norm[i] * y;
3948 states[state_base + i - 1] = new_state;
3949 }
3950 }
3951 }
3952 }
3953 }
3954
3955 let final_state_data = if state_len == 0 {
3956 Vec::new()
3957 } else {
3958 states_to_column_major(&states, state_len, dim_idx, &shape_ext)
3959 };
3960
3961 let output_handle = self.allocate_tensor(output, x.shape.clone());
3962 let final_state_handle = self.allocate_tensor(final_state_data, state_shape);
3963
3964 Ok(ProviderIirFilterResult {
3965 output: output_handle,
3966 final_state: Some(final_state_handle),
3967 })
3968 })
3969 }
3970 fn permute(&self, handle: &GpuTensorHandle, order: &[usize]) -> Result<GpuTensorHandle> {
3971 let data = {
3972 let guard = registry().lock().unwrap();
3973 guard
3974 .get(&handle.buffer_id)
3975 .ok_or_else(|| anyhow!("permute: unknown tensor handle {}", handle.buffer_id))?
3976 .clone()
3977 };
3978 let (permuted, new_shape) = permute_data(&data, &handle.shape, order)?;
3979 Ok(self.allocate_tensor(permuted, new_shape))
3980 }
3981
3982 fn flip(&self, handle: &GpuTensorHandle, axes: &[usize]) -> Result<GpuTensorHandle> {
3983 let data = {
3984 let guard = registry().lock().unwrap();
3985 guard
3986 .get(&handle.buffer_id)
3987 .ok_or_else(|| anyhow!("flip: unknown tensor handle {}", handle.buffer_id))?
3988 .clone()
3989 };
3990 let flipped = flip_data(&data, &handle.shape, axes)?;
3991 Ok(self.allocate_tensor(flipped, handle.shape.clone()))
3992 }
3993
3994 fn circshift(&self, handle: &GpuTensorHandle, shifts: &[isize]) -> Result<GpuTensorHandle> {
3995 let data = {
3996 let guard = registry().lock().unwrap();
3997 guard
3998 .get(&handle.buffer_id)
3999 .ok_or_else(|| anyhow!("circshift: unknown tensor handle {}", handle.buffer_id))?
4000 .clone()
4001 };
4002 let mut shape = handle.shape.clone();
4003 if shifts.len() > shape.len() {
4004 shape.extend(std::iter::repeat_n(1, shifts.len() - shape.len()));
4005 }
4006 let mut full_shifts = vec![0isize; shape.len()];
4007 for (idx, &shift) in shifts.iter().enumerate() {
4008 if idx < full_shifts.len() {
4009 full_shifts[idx] = shift;
4010 }
4011 }
4012 let rotated = circshift_data(&data, &shape, &full_shifts)?;
4013 Ok(self.allocate_tensor(rotated, shape))
4014 }
4015
4016 fn hann_window(&self, len: usize, periodic: bool) -> Result<GpuTensorHandle> {
4017 Ok(self.allocate_tensor(
4018 generate_window_data(WindowKind::Hann, len, periodic),
4019 vec![len, 1],
4020 ))
4021 }
4022
4023 fn hamming_window(&self, len: usize, periodic: bool) -> Result<GpuTensorHandle> {
4024 Ok(self.allocate_tensor(
4025 generate_window_data(WindowKind::Hamming, len, periodic),
4026 vec![len, 1],
4027 ))
4028 }
4029
4030 fn blackman_window(&self, len: usize, periodic: bool) -> Result<GpuTensorHandle> {
4031 Ok(self.allocate_tensor(
4032 generate_window_data(WindowKind::Blackman, len, periodic),
4033 vec![len, 1],
4034 ))
4035 }
4036
4037 fn diff_dim(
4038 &self,
4039 handle: &GpuTensorHandle,
4040 order: usize,
4041 dim: usize,
4042 ) -> Result<GpuTensorHandle> {
4043 if order == 0 {
4044 return Ok(handle.clone());
4045 }
4046 let data = {
4047 let guard = registry().lock().unwrap();
4048 guard
4049 .get(&handle.buffer_id)
4050 .ok_or_else(|| anyhow!("diff_dim: unknown tensor handle {}", handle.buffer_id))?
4051 .clone()
4052 };
4053 let tensor =
4054 Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("diff_dim: {e}"))?;
4055 let diffed =
4056 diff_tensor_host(tensor, order, Some(dim + 1)).map_err(|e| anyhow!("diff_dim: {e}"))?;
4057 let Tensor { data, shape, .. } = diffed;
4058 Ok(self.allocate_tensor(data, shape))
4059 }
4060
4061 fn gradient_dim(
4062 &self,
4063 handle: &GpuTensorHandle,
4064 dim: usize,
4065 spacing: f64,
4066 ) -> Result<GpuTensorHandle> {
4067 let data = {
4068 let guard = registry().lock().unwrap();
4069 guard
4070 .get(&handle.buffer_id)
4071 .ok_or_else(|| anyhow!("gradient_dim: unknown tensor handle {}", handle.buffer_id))?
4072 .clone()
4073 };
4074 let tensor =
4075 Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("gradient_dim: {e}"))?;
4076 let gradiented = gradient_real_tensor_host(tensor, dim + 1, spacing)
4077 .map_err(|e| anyhow!("gradient_dim: {e}"))?;
4078 let Tensor { data, shape, .. } = gradiented;
4079 Ok(self.allocate_tensor(data, shape))
4080 }
4081
4082 fn unique<'a>(
4083 &'a self,
4084 handle: &'a GpuTensorHandle,
4085 options: &'a UniqueOptions,
4086 ) -> AccelProviderFuture<'a, UniqueResult> {
4087 Box::pin(async move {
4088 let data = {
4089 let guard = registry().lock().unwrap();
4090 guard
4091 .get(&handle.buffer_id)
4092 .ok_or_else(|| anyhow!("unique: unknown tensor handle {}", handle.buffer_id))?
4093 .clone()
4094 };
4095 let tensor =
4096 Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("unique: {e}"))?;
4097 let eval = match unique::unique_numeric_from_tensor(tensor, options) {
4098 Ok(eval) => eval,
4099 Err(err) => {
4100 return Err(anyhow!("{err}"));
4101 }
4102 };
4103 match eval.into_numeric_unique_result() {
4104 Ok(result) => Ok(result),
4105 Err(err) => Err(anyhow!("{err}")),
4106 }
4107 })
4108 }
4109
4110 fn setdiff<'a>(
4111 &'a self,
4112 a: &'a GpuTensorHandle,
4113 b: &'a GpuTensorHandle,
4114 options: &'a SetdiffOptions,
4115 ) -> AccelProviderFuture<'a, SetdiffResult> {
4116 Box::pin(async move {
4117 let data_a = {
4118 let guard = registry().lock().unwrap();
4119 guard
4120 .get(&a.buffer_id)
4121 .cloned()
4122 .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", a.buffer_id))?
4123 };
4124 let data_b = {
4125 let guard = registry().lock().unwrap();
4126 guard
4127 .get(&b.buffer_id)
4128 .cloned()
4129 .ok_or_else(|| anyhow!("setdiff: unknown tensor handle {}", b.buffer_id))?
4130 };
4131 let tensor_a =
4132 Tensor::new(data_a, a.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
4133 let tensor_b =
4134 Tensor::new(data_b, b.shape.clone()).map_err(|e| anyhow!("setdiff: {e}"))?;
4135 let eval = match runmat_runtime::builtins::array::sorting_sets::setdiff::setdiff_numeric_from_tensors(
4136 tensor_a, tensor_b, options,
4137 ) {
4138 Ok(eval) => eval,
4139 Err(err) => {
4140 return Err(anyhow!("setdiff: {err}"));
4141 }
4142 };
4143 match eval.into_numeric_setdiff_result() {
4144 Ok(result) => Ok(result),
4145 Err(err) => Err(anyhow!("setdiff: {err}")),
4146 }
4147 })
4148 }
4149
4150 fn repmat(&self, handle: &GpuTensorHandle, reps: &[usize]) -> Result<GpuTensorHandle> {
4151 let data = {
4152 let guard = registry().lock().unwrap();
4153 guard
4154 .get(&handle.buffer_id)
4155 .ok_or_else(|| anyhow!("repmat: unknown tensor handle {}", handle.buffer_id))?
4156 .clone()
4157 };
4158 let (tiled, shape) = repmat_numeric(&data, &handle.shape, reps)?;
4159 Ok(self.allocate_tensor(tiled, shape))
4160 }
4161
4162 fn dot<'a>(
4163 &'a self,
4164 lhs: &'a GpuTensorHandle,
4165 rhs: &'a GpuTensorHandle,
4166 dim: Option<usize>,
4167 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4168 Box::pin(async move {
4169 let (lhs_buf, rhs_buf) = {
4170 let guard = registry().lock().unwrap();
4171 let lhs_buf = guard
4172 .get(&lhs.buffer_id)
4173 .cloned()
4174 .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", lhs.buffer_id))?;
4175 let rhs_buf = guard
4176 .get(&rhs.buffer_id)
4177 .cloned()
4178 .ok_or_else(|| anyhow!("dot: unknown tensor handle {}", rhs.buffer_id))?;
4179 (lhs_buf, rhs_buf)
4180 };
4181 let lhs_tensor =
4182 Tensor::new(lhs_buf, lhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
4183 let rhs_tensor =
4184 Tensor::new(rhs_buf, rhs.shape.clone()).map_err(|e| anyhow!("dot: {e}"))?;
4185 let result = dot_host_real_for_provider(&lhs_tensor, &rhs_tensor, dim)
4186 .map_err(|e| anyhow!(e))?;
4187 Ok(self.allocate_tensor(result.data.clone(), result.shape.clone()))
4188 })
4189 }
4190
4191 fn reduce_sum<'a>(
4192 &'a self,
4193 a: &'a GpuTensorHandle,
4194 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4195 Box::pin(async move {
4196 let guard = registry().lock().unwrap();
4197 let abuf = guard
4198 .get(&a.buffer_id)
4199 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4200 let s: f64 = abuf.iter().sum();
4201 drop(guard);
4202 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4203 let mut guard2 = registry().lock().unwrap();
4204 guard2.insert(id, vec![s]);
4205 Ok(GpuTensorHandle {
4206 shape: vec![1, 1],
4207 device_id: 0,
4208 buffer_id: id,
4209 })
4210 })
4211 }
4212
4213 fn reduce_sum_dim<'a>(
4214 &'a self,
4215 a: &'a GpuTensorHandle,
4216 dim: usize,
4217 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4218 Box::pin(async move {
4219 if a.shape.len() != 2 {
4220 return Err(anyhow::anyhow!("reduce_sum_dim: only 2D supported"));
4221 }
4222 let rows = a.shape[0];
4223 let cols = a.shape[1];
4224 let guard = registry().lock().unwrap();
4225 let abuf = guard
4226 .get(&a.buffer_id)
4227 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4228 let (out, shape) = match dim {
4229 0 => {
4230 let mut v = vec![0.0f64; cols];
4232 for c in 0..cols {
4233 let mut s = 0.0;
4234 for r in 0..rows {
4235 s += abuf[r + c * rows];
4236 }
4237 v[c] = s;
4238 }
4239 (v, vec![1, cols])
4240 }
4241 1 => {
4242 let mut v = vec![0.0f64; rows];
4244 for r in 0..rows {
4245 let mut s = 0.0;
4246 for c in 0..cols {
4247 s += abuf[r + c * rows];
4248 }
4249 v[r] = s;
4250 }
4251 (v, vec![rows, 1])
4252 }
4253 _ => {
4254 return Err(anyhow::anyhow!(
4255 "reduce_sum_dim: only dims 0 or 1 supported"
4256 ))
4257 }
4258 };
4259 drop(guard);
4260 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4261 let mut guard2 = registry().lock().unwrap();
4262 guard2.insert(id, out);
4263 Ok(GpuTensorHandle {
4264 shape,
4265 device_id: 0,
4266 buffer_id: id,
4267 })
4268 })
4269 }
4270
4271 fn reduce_prod<'a>(
4272 &'a self,
4273 a: &'a GpuTensorHandle,
4274 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4275 Box::pin(async move {
4276 let guard = registry().lock().unwrap();
4277 let abuf = guard
4278 .get(&a.buffer_id)
4279 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4280 let p: f64 = abuf.iter().product();
4281 drop(guard);
4282 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4283 let mut guard2 = registry().lock().unwrap();
4284 guard2.insert(id, vec![p]);
4285 Ok(GpuTensorHandle {
4286 shape: vec![1, 1],
4287 device_id: 0,
4288 buffer_id: id,
4289 })
4290 })
4291 }
4292
4293 fn reduce_prod_dim<'a>(
4294 &'a self,
4295 a: &'a GpuTensorHandle,
4296 dim: usize,
4297 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4298 Box::pin(async move {
4299 if a.shape.len() != 2 {
4300 return Err(anyhow::anyhow!("reduce_prod_dim: only 2D supported"));
4301 }
4302 let rows = a.shape[0];
4303 let cols = a.shape[1];
4304 let guard = registry().lock().unwrap();
4305 let abuf = guard
4306 .get(&a.buffer_id)
4307 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4308 let out = if dim <= 1 {
4309 let mut v = vec![1.0f64; cols];
4310 for c in 0..cols {
4311 let mut prod = 1.0;
4312 for r in 0..rows {
4313 prod *= abuf[r + c * rows];
4314 }
4315 v[c] = prod;
4316 }
4317 v
4318 } else {
4319 let mut v = vec![1.0f64; rows];
4320 for r in 0..rows {
4321 let mut prod = 1.0;
4322 for c in 0..cols {
4323 prod *= abuf[r + c * rows];
4324 }
4325 v[r] = prod;
4326 }
4327 v
4328 };
4329 drop(guard);
4330 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4331 let mut guard2 = registry().lock().unwrap();
4332 guard2.insert(id, out);
4333 let shape = if dim <= 1 {
4334 vec![1, cols]
4335 } else {
4336 vec![rows, 1]
4337 };
4338 Ok(GpuTensorHandle {
4339 shape,
4340 device_id: 0,
4341 buffer_id: id,
4342 })
4343 })
4344 }
4345
4346 fn reduce_mean<'a>(
4347 &'a self,
4348 a: &'a GpuTensorHandle,
4349 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4350 Box::pin(async move {
4351 let guard = registry().lock().unwrap();
4352 let abuf = guard
4353 .get(&a.buffer_id)
4354 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4355 let mean = if abuf.is_empty() {
4356 0.0
4357 } else {
4358 abuf.iter().sum::<f64>() / (abuf.len() as f64)
4359 };
4360 drop(guard);
4361 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4362 registry().lock().unwrap().insert(id, vec![mean]);
4363 Ok(GpuTensorHandle {
4364 shape: vec![1, 1],
4365 device_id: 0,
4366 buffer_id: id,
4367 })
4368 })
4369 }
4370
4371 fn reduce_mean_dim<'a>(
4372 &'a self,
4373 a: &'a GpuTensorHandle,
4374 dim: usize,
4375 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4376 Box::pin(async move {
4377 if a.shape.len() != 2 {
4378 return Err(anyhow::anyhow!("reduce_mean_dim: only 2D supported"));
4379 }
4380 let rows = a.shape[0];
4381 let cols = a.shape[1];
4382 let guard = registry().lock().unwrap();
4383 let abuf = guard
4384 .get(&a.buffer_id)
4385 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4386 let out = if dim <= 1 {
4387 let mut v = vec![0.0f64; cols];
4388 for c in 0..cols {
4389 let mut s = 0.0;
4390 for r in 0..rows {
4391 s += abuf[r + c * rows];
4392 }
4393 v[c] = s / (rows as f64);
4394 }
4395 v
4396 } else {
4397 let mut v = vec![0.0f64; rows];
4398 for r in 0..rows {
4399 let mut s = 0.0;
4400 for c in 0..cols {
4401 s += abuf[r + c * rows];
4402 }
4403 v[r] = s / (cols as f64);
4404 }
4405 v
4406 };
4407 drop(guard);
4408 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4409 registry().lock().unwrap().insert(id, out);
4410 let shape = if dim <= 1 {
4411 vec![1, cols]
4412 } else {
4413 vec![rows, 1]
4414 };
4415 Ok(GpuTensorHandle {
4416 shape,
4417 device_id: 0,
4418 buffer_id: id,
4419 })
4420 })
4421 }
4422
4423 fn reduce_any<'a>(
4424 &'a self,
4425 a: &'a GpuTensorHandle,
4426 omit_nan: bool,
4427 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4428 Box::pin(async move {
4429 let data = {
4430 let guard = registry().lock().unwrap();
4431 guard
4432 .get(&a.buffer_id)
4433 .cloned()
4434 .ok_or_else(|| anyhow!("reduce_any: unknown tensor handle {}", a.buffer_id))?
4435 };
4436 let mut truthy = false;
4437 for v in &data {
4438 if v.is_nan() {
4439 if !omit_nan {
4440 truthy = true;
4441 break;
4442 }
4443 } else if *v != 0.0 {
4444 truthy = true;
4445 break;
4446 }
4447 }
4448 let result = if truthy { 1.0 } else { 0.0 };
4449 Ok(self.allocate_tensor(vec![result], vec![1, 1]))
4450 })
4451 }
4452
4453 fn reduce_any_dim<'a>(
4454 &'a self,
4455 a: &'a GpuTensorHandle,
4456 dim: usize,
4457 omit_nan: bool,
4458 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4459 Box::pin(async move {
4460 if a.shape.len() != 2 {
4461 return Err(anyhow!("reduce_any_dim: only 2D supported"));
4462 }
4463 let rows = a.shape[0];
4464 let cols = a.shape[1];
4465 let data = {
4466 let guard = registry().lock().unwrap();
4467 guard.get(&a.buffer_id).cloned().ok_or_else(|| {
4468 anyhow!("reduce_any_dim: unknown tensor handle {}", a.buffer_id)
4469 })?
4470 };
4471 let (out, shape) = if dim == 0 {
4472 let mut v = vec![0.0f64; cols];
4473 for c in 0..cols {
4474 let mut truth = false;
4475 for r in 0..rows {
4476 let val = data[r + c * rows];
4477 if val.is_nan() {
4478 if !omit_nan {
4479 truth = true;
4480 break;
4481 }
4482 } else if val != 0.0 {
4483 truth = true;
4484 break;
4485 }
4486 }
4487 v[c] = if truth { 1.0 } else { 0.0 };
4488 }
4489 (v, vec![1, cols])
4490 } else if dim == 1 {
4491 let mut v = vec![0.0f64; rows];
4492 for r in 0..rows {
4493 let mut truth = false;
4494 for c in 0..cols {
4495 let val = data[r + c * rows];
4496 if val.is_nan() {
4497 if !omit_nan {
4498 truth = true;
4499 break;
4500 }
4501 } else if val != 0.0 {
4502 truth = true;
4503 break;
4504 }
4505 }
4506 v[r] = if truth { 1.0 } else { 0.0 };
4507 }
4508 (v, vec![rows, 1])
4509 } else {
4510 return Err(anyhow!("reduce_any_dim: invalid dimension {}", dim));
4511 };
4512 Ok(self.allocate_tensor(out, shape))
4513 })
4514 }
4515
4516 fn reduce_all<'a>(
4517 &'a self,
4518 a: &'a GpuTensorHandle,
4519 omit_nan: bool,
4520 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4521 Box::pin(async move {
4522 let data = {
4523 let guard = registry().lock().unwrap();
4524 guard
4525 .get(&a.buffer_id)
4526 .cloned()
4527 .ok_or_else(|| anyhow!("reduce_all: unknown tensor handle {}", a.buffer_id))?
4528 };
4529 let mut all_true = true;
4530 let mut saw_value = false;
4531 for v in &data {
4532 if v.is_nan() {
4533 if omit_nan {
4534 continue;
4535 }
4536 } else {
4537 saw_value = true;
4538 if *v == 0.0 {
4539 all_true = false;
4540 break;
4541 }
4542 continue;
4543 }
4544 }
4545 if omit_nan && !saw_value {
4546 all_true = true;
4547 }
4548 let result = if all_true { 1.0 } else { 0.0 };
4549 Ok(self.allocate_tensor(vec![result], vec![1, 1]))
4550 })
4551 }
4552
4553 fn reduce_all_dim<'a>(
4554 &'a self,
4555 a: &'a GpuTensorHandle,
4556 dim: usize,
4557 omit_nan: bool,
4558 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4559 Box::pin(async move {
4560 if a.shape.len() != 2 {
4561 return Err(anyhow!("reduce_all_dim: only 2D supported"));
4562 }
4563 let rows = a.shape[0];
4564 let cols = a.shape[1];
4565 let data = {
4566 let guard = registry().lock().unwrap();
4567 guard.get(&a.buffer_id).cloned().ok_or_else(|| {
4568 anyhow!("reduce_all_dim: unknown tensor handle {}", a.buffer_id)
4569 })?
4570 };
4571 let (out, shape) = if dim == 0 {
4572 let mut v = vec![0.0f64; cols];
4573 for c in 0..cols {
4574 let mut all_true = true;
4575 let mut saw_value = false;
4576 for r in 0..rows {
4577 let val = data[r + c * rows];
4578 if val.is_nan() {
4579 if omit_nan {
4580 continue;
4581 }
4582 } else {
4583 saw_value = true;
4584 if val == 0.0 {
4585 all_true = false;
4586 break;
4587 }
4588 continue;
4589 }
4590 }
4591 if omit_nan && !saw_value {
4592 all_true = true;
4593 }
4594 v[c] = if all_true { 1.0 } else { 0.0 };
4595 }
4596 (v, vec![1, cols])
4597 } else if dim == 1 {
4598 let mut v = vec![0.0f64; rows];
4599 for r in 0..rows {
4600 let mut all_true = true;
4601 let mut saw_value = false;
4602 for c in 0..cols {
4603 let val = data[r + c * rows];
4604 if val.is_nan() {
4605 if omit_nan {
4606 continue;
4607 }
4608 } else {
4609 saw_value = true;
4610 if val == 0.0 {
4611 all_true = false;
4612 break;
4613 }
4614 continue;
4615 }
4616 }
4617 if omit_nan && !saw_value {
4618 all_true = true;
4619 }
4620 v[r] = if all_true { 1.0 } else { 0.0 };
4621 }
4622 (v, vec![rows, 1])
4623 } else {
4624 return Err(anyhow!("reduce_all_dim: invalid dimension {}", dim));
4625 };
4626 Ok(self.allocate_tensor(out, shape))
4627 })
4628 }
4629
4630 fn reduce_median<'a>(
4631 &'a self,
4632 a: &'a GpuTensorHandle,
4633 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4634 Box::pin(async move {
4635 let data = {
4636 let guard = registry().lock().unwrap();
4637 guard
4638 .get(&a.buffer_id)
4639 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?
4640 .clone()
4641 };
4642 let median = if data.is_empty() || data.iter().any(|v| v.is_nan()) {
4643 f64::NAN
4644 } else {
4645 let mut scratch = data.clone();
4646 compute_median_inplace(&mut scratch)
4647 };
4648 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4649 registry().lock().unwrap().insert(id, vec![median]);
4650 Ok(GpuTensorHandle {
4651 shape: vec![1, 1],
4652 device_id: 0,
4653 buffer_id: id,
4654 })
4655 })
4656 }
4657
4658 fn reduce_median_dim<'a>(
4659 &'a self,
4660 a: &'a GpuTensorHandle,
4661 dim: usize,
4662 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4663 Box::pin(async move {
4664 if a.shape.len() != 2 {
4665 return Err(anyhow::anyhow!("reduce_median_dim: only 2D supported"));
4666 }
4667 let rows = a.shape[0];
4668 let cols = a.shape[1];
4669 let guard = registry().lock().unwrap();
4670 let abuf = guard
4671 .get(&a.buffer_id)
4672 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4673 let mut scratch = Vec::<f64>::with_capacity(rows.max(cols));
4674 let out = if dim <= 1 {
4675 let mut v = vec![f64::NAN; cols];
4676 for c in 0..cols {
4677 scratch.clear();
4678 let mut saw_nan = false;
4679 for r in 0..rows {
4680 let val = abuf[r + c * rows];
4681 if val.is_nan() {
4682 saw_nan = true;
4683 scratch.clear();
4684 break;
4685 }
4686 scratch.push(val);
4687 }
4688 v[c] = if saw_nan || scratch.is_empty() {
4689 f64::NAN
4690 } else {
4691 compute_median_inplace(&mut scratch)
4692 };
4693 }
4694 v
4695 } else {
4696 let mut v = vec![f64::NAN; rows];
4697 for r in 0..rows {
4698 scratch.clear();
4699 let mut saw_nan = false;
4700 for c in 0..cols {
4701 let val = abuf[r + c * rows];
4702 if val.is_nan() {
4703 saw_nan = true;
4704 scratch.clear();
4705 break;
4706 }
4707 scratch.push(val);
4708 }
4709 v[r] = if saw_nan || scratch.is_empty() {
4710 f64::NAN
4711 } else {
4712 compute_median_inplace(&mut scratch)
4713 };
4714 }
4715 v
4716 };
4717 drop(guard);
4718 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4719 registry().lock().unwrap().insert(id, out);
4720 let shape = if dim <= 1 {
4721 vec![1, cols]
4722 } else {
4723 vec![rows, 1]
4724 };
4725 Ok(GpuTensorHandle {
4726 shape,
4727 device_id: 0,
4728 buffer_id: id,
4729 })
4730 })
4731 }
4732
4733 fn reduce_min<'a>(
4734 &'a self,
4735 a: &'a GpuTensorHandle,
4736 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4737 Box::pin(async move {
4738 let guard = registry().lock().unwrap();
4739 let abuf = guard
4740 .get(&a.buffer_id)
4741 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4742 let m = abuf.iter().cloned().fold(f64::INFINITY, f64::min);
4743 drop(guard);
4744 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4745 registry().lock().unwrap().insert(id, vec![m]);
4746 Ok(GpuTensorHandle {
4747 shape: vec![1, 1],
4748 device_id: 0,
4749 buffer_id: id,
4750 })
4751 })
4752 }
4753
4754 fn reduce_min_dim<'a>(
4755 &'a self,
4756 a: &'a GpuTensorHandle,
4757 dim: usize,
4758 ) -> AccelProviderFuture<'a, runmat_accelerate_api::ReduceDimResult> {
4759 Box::pin(async move {
4760 if a.shape.len() != 2 {
4761 return Err(anyhow::anyhow!("reduce_min_dim: only 2D supported"));
4762 }
4763 let rows = a.shape[0];
4764 let cols = a.shape[1];
4765 let guard = registry().lock().unwrap();
4766 let abuf = guard
4767 .get(&a.buffer_id)
4768 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4769 let (vals, inds, vshape) = if dim <= 1 {
4770 let mut m: Vec<f64> = vec![f64::INFINITY; cols];
4771 let mut idx: Vec<f64> = vec![1.0; cols];
4772 for c in 0..cols {
4773 for r in 0..rows {
4774 let v = abuf[r + c * rows];
4775 if v < m[c] {
4776 m[c] = v;
4777 idx[c] = (r + 1) as f64;
4778 }
4779 }
4780 }
4781 (m, idx, vec![1, cols])
4782 } else {
4783 let mut m: Vec<f64> = vec![f64::INFINITY; rows];
4784 let mut idx: Vec<f64> = vec![1.0; rows];
4785 for r in 0..rows {
4786 for c in 0..cols {
4787 let v = abuf[r + c * rows];
4788 if v < m[r] {
4789 m[r] = v;
4790 idx[r] = (c + 1) as f64;
4791 }
4792 }
4793 }
4794 (m, idx, vec![rows, 1])
4795 };
4796 drop(guard);
4797 let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
4798 let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
4799 let mut g = registry().lock().unwrap();
4800 g.insert(idv, vals);
4801 g.insert(idi, inds);
4802 let shape_vals = vshape.clone();
4803 let shape_inds = vshape;
4804 Ok(runmat_accelerate_api::ReduceDimResult {
4805 values: GpuTensorHandle {
4806 shape: shape_vals,
4807 device_id: 0,
4808 buffer_id: idv,
4809 },
4810 indices: GpuTensorHandle {
4811 shape: shape_inds,
4812 device_id: 0,
4813 buffer_id: idi,
4814 },
4815 })
4816 })
4817 }
4818
4819 fn reduce_max<'a>(
4820 &'a self,
4821 a: &'a GpuTensorHandle,
4822 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
4823 Box::pin(async move {
4824 let guard = registry().lock().unwrap();
4825 let abuf = guard
4826 .get(&a.buffer_id)
4827 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4828 let m = abuf.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
4829 drop(guard);
4830 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
4831 registry().lock().unwrap().insert(id, vec![m]);
4832 Ok(GpuTensorHandle {
4833 shape: vec![1, 1],
4834 device_id: 0,
4835 buffer_id: id,
4836 })
4837 })
4838 }
4839
4840 fn reduce_max_dim<'a>(
4841 &'a self,
4842 a: &'a GpuTensorHandle,
4843 dim: usize,
4844 ) -> AccelProviderFuture<'a, runmat_accelerate_api::ReduceDimResult> {
4845 Box::pin(async move {
4846 if a.shape.len() != 2 {
4847 return Err(anyhow::anyhow!("reduce_max_dim: only 2D supported"));
4848 }
4849 let rows = a.shape[0];
4850 let cols = a.shape[1];
4851 let guard = registry().lock().unwrap();
4852 let abuf = guard
4853 .get(&a.buffer_id)
4854 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4855 let (vals, inds, vshape) = if dim <= 1 {
4856 let mut m: Vec<f64> = vec![f64::NEG_INFINITY; cols];
4857 let mut idx: Vec<f64> = vec![1.0; cols];
4858 for c in 0..cols {
4859 for r in 0..rows {
4860 let v = abuf[r + c * rows];
4861 if v > m[c] {
4862 m[c] = v;
4863 idx[c] = (r + 1) as f64;
4864 }
4865 }
4866 }
4867 (m, idx, vec![1, cols])
4868 } else {
4869 let mut m: Vec<f64> = vec![f64::NEG_INFINITY; rows];
4870 let mut idx: Vec<f64> = vec![1.0; rows];
4871 for r in 0..rows {
4872 for c in 0..cols {
4873 let v = abuf[r + c * rows];
4874 if v > m[r] {
4875 m[r] = v;
4876 idx[r] = (c + 1) as f64;
4877 }
4878 }
4879 }
4880 (m, idx, vec![rows, 1])
4881 };
4882 drop(guard);
4883 let idv = self.next_id.fetch_add(1, Ordering::Relaxed);
4884 let idi = self.next_id.fetch_add(1, Ordering::Relaxed);
4885 let mut g = registry().lock().unwrap();
4886 g.insert(idv, vals);
4887 g.insert(idi, inds);
4888 let shape_vals = vshape.clone();
4889 let shape_inds = vshape;
4890 Ok(runmat_accelerate_api::ReduceDimResult {
4891 values: GpuTensorHandle {
4892 shape: shape_vals,
4893 device_id: 0,
4894 buffer_id: idv,
4895 },
4896 indices: GpuTensorHandle {
4897 shape: shape_inds,
4898 device_id: 0,
4899 buffer_id: idi,
4900 },
4901 })
4902 })
4903 }
4904
4905 fn cumsum_scan(
4906 &self,
4907 _input: &GpuTensorHandle,
4908 _dim: usize,
4909 _direction: ProviderScanDirection,
4910 _nan_mode: ProviderNanMode,
4911 ) -> Result<GpuTensorHandle> {
4912 Err(anyhow!("cumsum_scan not supported by provider"))
4913 }
4914
4915 fn cummin_scan(
4916 &self,
4917 _input: &GpuTensorHandle,
4918 _dim: usize,
4919 _direction: ProviderScanDirection,
4920 _nan_mode: ProviderNanMode,
4921 ) -> Result<runmat_accelerate_api::ProviderCumminResult> {
4922 Err(anyhow!("cummin_scan not supported by provider"))
4923 }
4924
4925 fn find(
4926 &self,
4927 a: &GpuTensorHandle,
4928 limit: Option<usize>,
4929 direction: FindDirection,
4930 ) -> Result<ProviderFindResult> {
4931 let shape = a.shape.clone();
4932 let row_extent = shape.first().copied().unwrap_or(1).max(1);
4933 let (indices, rows, cols, values) = {
4934 let guard = registry().lock().unwrap();
4935 let data = guard
4936 .get(&a.buffer_id)
4937 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
4938 let total = data.len();
4939 let cap = match direction {
4940 FindDirection::First => limit.unwrap_or(total),
4941 FindDirection::Last => limit.unwrap_or(1),
4942 }
4943 .min(total);
4944
4945 let mut indices = Vec::new();
4946 let mut rows_out = Vec::new();
4947 let mut cols_out = Vec::new();
4948 let mut values_out = Vec::new();
4949
4950 if cap == 0 || total == 0 {
4951 (indices, rows_out, cols_out, values_out)
4952 } else {
4953 match direction {
4954 FindDirection::First => {
4955 for (idx, &value) in data.iter().enumerate() {
4956 if value != 0.0 {
4957 indices.push((idx + 1) as f64);
4958 rows_out.push(((idx % row_extent) + 1) as f64);
4959 cols_out.push(((idx / row_extent) + 1) as f64);
4960 values_out.push(value);
4961 if indices.len() >= cap {
4962 break;
4963 }
4964 }
4965 }
4966 }
4967 FindDirection::Last => {
4968 for (idx, &value) in data.iter().enumerate().rev() {
4969 if value != 0.0 {
4970 indices.push((idx + 1) as f64);
4971 rows_out.push(((idx % row_extent) + 1) as f64);
4972 cols_out.push(((idx / row_extent) + 1) as f64);
4973 values_out.push(value);
4974 if indices.len() >= cap {
4975 break;
4976 }
4977 }
4978 }
4979 }
4980 }
4981 (indices, rows_out, cols_out, values_out)
4982 }
4983 };
4984
4985 let count = indices.len();
4986 let shape_out = vec![count, 1];
4987 let linear = self.allocate_tensor(indices, shape_out.clone());
4988 let rows_handle = self.allocate_tensor(rows, shape_out.clone());
4989 let cols_handle = self.allocate_tensor(cols, shape_out.clone());
4990 let values_handle = self.allocate_tensor(values, shape_out);
4991
4992 Ok(ProviderFindResult {
4993 linear,
4994 rows: rows_handle,
4995 cols: cols_handle,
4996 values: Some(values_handle),
4997 })
4998 }
4999
5000 fn lu<'a>(&'a self, a: &'a GpuTensorHandle) -> AccelProviderFuture<'a, ProviderLuResult> {
5001 Box::pin(async move {
5002 let data = {
5003 let guard = registry().lock().unwrap();
5004 guard
5005 .get(&a.buffer_id)
5006 .cloned()
5007 .ok_or_else(|| anyhow!("lu: unknown buffer {}", a.buffer_id))?
5008 };
5009 let LuHostFactors {
5010 combined,
5011 lower,
5012 upper,
5013 perm_matrix,
5014 pivot_vector,
5015 combined_shape,
5016 lower_shape,
5017 upper_shape,
5018 perm_shape,
5019 pivot_shape,
5020 } = lu_factor_host(&data, &a.shape)?;
5021 let combined = self.allocate_tensor(combined, combined_shape);
5022 let lower = self.allocate_tensor(lower, lower_shape);
5023 let upper = self.allocate_tensor(upper, upper_shape);
5024 let perm_matrix = self.allocate_tensor(perm_matrix, perm_shape);
5025 let perm_vector = self.allocate_tensor(pivot_vector, pivot_shape);
5026 Ok(ProviderLuResult {
5027 combined,
5028 lower,
5029 upper,
5030 perm_matrix,
5031 perm_vector,
5032 })
5033 })
5034 }
5035
5036 fn chol<'a>(
5037 &'a self,
5038 a: &'a GpuTensorHandle,
5039 lower: bool,
5040 ) -> AccelProviderFuture<'a, ProviderCholResult> {
5041 Box::pin(async move {
5042 let data = {
5043 let guard = registry().lock().unwrap();
5044 guard
5045 .get(&a.buffer_id)
5046 .cloned()
5047 .ok_or_else(|| anyhow!("chol: unknown buffer {}", a.buffer_id))?
5048 };
5049 let tensor = Tensor::new(data, a.shape.clone()).map_err(|e| anyhow!("chol: {e}"))?;
5050 let mut args = Vec::new();
5051 if lower {
5052 args.push(Value::from("lower"));
5053 }
5054 let eval = runmat_runtime::builtins::math::linalg::factor::chol::evaluate(
5055 Value::Tensor(tensor),
5056 &args,
5057 )
5058 .await
5059 .map_err(|err| runtime_flow_to_anyhow("chol", err))?;
5060 let factor_tensor = tensor_from_value("chol", eval.factor())?;
5061 let factor =
5062 self.allocate_tensor(factor_tensor.data.clone(), factor_tensor.shape.clone());
5063 Ok(ProviderCholResult {
5064 factor,
5065 info: eval.flag_index() as u32,
5066 })
5067 })
5068 }
5069
5070 fn qr<'a>(
5071 &'a self,
5072 handle: &'a GpuTensorHandle,
5073 options: ProviderQrOptions,
5074 ) -> AccelProviderFuture<'a, ProviderQrResult> {
5075 Box::pin(async move {
5076 let data = {
5077 let guard = registry().lock().unwrap();
5078 guard
5079 .get(&handle.buffer_id)
5080 .cloned()
5081 .ok_or_else(|| anyhow!("qr: unknown buffer {}", handle.buffer_id))?
5082 };
5083 let tensor = Tensor::new(data, handle.shape.clone()).map_err(|e| anyhow!("qr: {e}"))?;
5084 let mut args = Vec::new();
5085 if options.economy {
5086 args.push(Value::Num(0.0));
5087 }
5088 if matches!(options.pivot, ProviderQrPivot::Vector) {
5089 args.push(Value::from("vector"));
5090 }
5091 let eval = runmat_runtime::builtins::math::linalg::factor::qr::evaluate(
5092 Value::Tensor(tensor),
5093 &args,
5094 )
5095 .await
5096 .map_err(|err| runtime_flow_to_anyhow("qr", err))?;
5097
5098 let q_tensor = tensor_from_value("qr", eval.q())?;
5099 let r_tensor = tensor_from_value("qr", eval.r())?;
5100 let perm_matrix_tensor = tensor_from_value("qr", eval.permutation_matrix())?;
5101 let perm_vector_tensor = tensor_from_value("qr", eval.permutation_vector())?;
5102
5103 let q_handle = self.allocate_tensor(q_tensor.data.clone(), q_tensor.shape.clone());
5104 let r_handle = self.allocate_tensor(r_tensor.data.clone(), r_tensor.shape.clone());
5105 let perm_matrix_handle = self.allocate_tensor(
5106 perm_matrix_tensor.data.clone(),
5107 perm_matrix_tensor.shape.clone(),
5108 );
5109 let perm_vector_handle = self.allocate_tensor(
5110 perm_vector_tensor.data.clone(),
5111 perm_vector_tensor.shape.clone(),
5112 );
5113
5114 Ok(ProviderQrResult {
5115 q: q_handle,
5116 r: r_handle,
5117 perm_matrix: perm_matrix_handle,
5118 perm_vector: perm_vector_handle,
5119 })
5120 })
5121 }
5122
5123 fn matmul<'a>(
5124 &'a self,
5125 a: &'a GpuTensorHandle,
5126 b: &'a GpuTensorHandle,
5127 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5128 Box::pin(async move {
5129 if a.shape.len() != 2 || b.shape.len() != 2 {
5131 return Err(anyhow::anyhow!("matmul: only 2D supported"));
5132 }
5133 let (ar, ac) = (a.shape[0], a.shape[1]);
5134 let (br, bc) = (b.shape[0], b.shape[1]);
5135 if ac != br {
5136 return Err(anyhow::anyhow!("matmul: inner dims must agree"));
5137 }
5138 let guard = registry().lock().unwrap();
5139 let abuf = guard
5140 .get(&a.buffer_id)
5141 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", a.buffer_id))?;
5142 let bbuf = guard
5143 .get(&b.buffer_id)
5144 .ok_or_else(|| anyhow::anyhow!("buffer not found: {}", b.buffer_id))?;
5145 let mut out = vec![0.0; ar * bc];
5146 for j in 0..bc {
5148 for i in 0..ar {
5149 let mut sum = 0.0;
5150 for k in 0..ac {
5151 sum += abuf[i + k * ar] * bbuf[k + j * br];
5152 }
5153 out[i + j * ar] = sum;
5154 }
5155 }
5156 drop(guard);
5157 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
5158 let mut guard2 = registry().lock().unwrap();
5159 guard2.insert(id, out);
5160 Ok(GpuTensorHandle {
5161 shape: vec![ar, bc],
5162 device_id: 0,
5163 buffer_id: id,
5164 })
5165 })
5166 }
5167
5168 fn matmul_epilogue<'a>(
5169 &'a self,
5170 a: &'a GpuTensorHandle,
5171 b: &'a GpuTensorHandle,
5172 ep: &'a runmat_accelerate_api::MatmulEpilogue,
5173 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5174 Box::pin(async move {
5175 let base = self.matmul(a, b).await?;
5177 let (rows, cols) = (base.shape[0], base.shape[1]);
5178 let mut data = {
5179 let guard = registry().lock().unwrap();
5180 guard
5181 .get(&base.buffer_id)
5182 .cloned()
5183 .ok_or_else(|| anyhow!("matmul_epilogue: unknown buffer {}", base.buffer_id))?
5184 };
5185
5186 let row_scale: Option<Vec<f64>> = if let Some(ref h) = ep.row_scale {
5188 let guard = registry().lock().unwrap();
5189 Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
5190 anyhow!("matmul_epilogue: unknown row scale buffer {}", h.buffer_id)
5191 })?)
5192 } else {
5193 None
5194 };
5195 let col_scale: Option<Vec<f64>> = if let Some(ref h) = ep.col_scale {
5196 let guard = registry().lock().unwrap();
5197 Some(guard.get(&h.buffer_id).cloned().ok_or_else(|| {
5198 anyhow!("matmul_epilogue: unknown col scale buffer {}", h.buffer_id)
5199 })?)
5200 } else {
5201 None
5202 };
5203 let mut diag_output: Option<Vec<f64>> = if let Some(ref h) = ep.diag_output {
5204 let guard = registry().lock().unwrap();
5205 let vec = guard.get(&h.buffer_id).cloned().ok_or_else(|| {
5206 anyhow!(
5207 "matmul_epilogue: unknown diag output buffer {}",
5208 h.buffer_id
5209 )
5210 })?;
5211 Some(vec)
5212 } else {
5213 None
5214 };
5215 if let Some(ref diag) = diag_output {
5216 let expected = rows.min(cols);
5217 if diag.len() < expected {
5218 return Err(anyhow!(
5219 "matmul_epilogue: diag_output length {} insufficient for diag size {}",
5220 diag.len(),
5221 expected
5222 ));
5223 }
5224 }
5225
5226 for j in 0..cols {
5228 for i in 0..rows {
5229 let idx = i + j * rows;
5230 let mut v = data[idx] * ep.alpha + ep.beta;
5231 if let Some(ref rs) = row_scale {
5232 let s = *rs.get(i).unwrap_or(&1.0);
5233 v = match ep.row_op {
5234 runmat_accelerate_api::ScaleOp::Multiply => v * s,
5235 runmat_accelerate_api::ScaleOp::Divide => v / s,
5236 };
5237 }
5238 if let Some(ref cs) = col_scale {
5239 let s = *cs.get(j).unwrap_or(&1.0);
5240 v = match ep.col_op {
5241 runmat_accelerate_api::ScaleOp::Multiply => v * s,
5242 runmat_accelerate_api::ScaleOp::Divide => v / s,
5243 };
5244 }
5245 if let Some(min_v) = ep.clamp_min {
5246 v = v.max(min_v);
5247 }
5248 if let Some(max_v) = ep.clamp_max {
5249 v = v.min(max_v);
5250 }
5251 if let Some(pow_v) = ep.pow_exponent {
5252 v = v.powf(pow_v);
5253 }
5254 if let Some(ref mut diag) = diag_output {
5255 if i == j && i < diag.len() {
5256 diag[i] = v;
5257 }
5258 }
5259 data[idx] = v;
5260 }
5261 }
5262
5263 {
5265 let mut guard = registry().lock().unwrap();
5266 guard.insert(base.buffer_id, data);
5267 if let Some(vec) = diag_output {
5268 if let Some(ref h) = ep.diag_output {
5269 guard.insert(h.buffer_id, vec);
5270 }
5271 }
5272 }
5273 Ok(base)
5274 })
5275 }
5276
5277 fn matmul_power_step<'a>(
5278 &'a self,
5279 lhs: &'a GpuTensorHandle,
5280 rhs: &'a GpuTensorHandle,
5281 ep: &'a runmat_accelerate_api::PowerStepEpilogue,
5282 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5283 Box::pin(async move {
5284 let base = self.matmul(lhs, rhs).await?;
5285 let rows = base.shape[0];
5286 let cols = base.shape[1];
5287 let mut data = {
5288 let guard = registry().lock().unwrap();
5289 guard.get(&base.buffer_id).cloned().ok_or_else(|| {
5290 anyhow!("matmul_power_step: unknown buffer {}", base.buffer_id)
5291 })?
5292 };
5293 let mut norms = vec![0.0f64; cols];
5294 for (col, norm) in norms.iter_mut().enumerate().take(cols) {
5295 let mut acc = 0.0f64;
5296 for row in 0..rows {
5297 let idx = row + col * rows;
5298 let val = data[idx];
5299 acc += val * val;
5300 }
5301 acc += ep.epsilon;
5302 *norm = acc.sqrt();
5303 }
5304 for (col, norm) in norms.iter().enumerate().take(cols) {
5305 for row in 0..rows {
5306 let idx = row + col * rows;
5307 data[idx] /= norm;
5308 }
5309 }
5310 {
5311 let mut guard = registry().lock().unwrap();
5312 guard.insert(base.buffer_id, data);
5313 }
5314 Ok(base)
5315 })
5316 }
5317
5318 fn image_normalize<'a>(
5319 &'a self,
5320 input: &'a GpuTensorHandle,
5321 desc: &'a runmat_accelerate_api::ImageNormalizeDescriptor,
5322 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5323 Box::pin(async move {
5324 ensure!(
5325 input.shape.len() == 3,
5326 "image_normalize: expected 3-D tensor, got {:?}",
5327 input.shape
5328 );
5329 ensure!(
5330 input.shape[0] == desc.batch
5331 && input.shape[1] == desc.height
5332 && input.shape[2] == desc.width,
5333 "image_normalize: descriptor dims {:?} do not match tensor shape {:?}",
5334 (desc.batch, desc.height, desc.width),
5335 input.shape
5336 );
5337
5338 let data = {
5339 let guard = registry().lock().unwrap();
5340 guard
5341 .get(&input.buffer_id)
5342 .cloned()
5343 .ok_or_else(|| anyhow!("image_normalize: unknown buffer {}", input.buffer_id))?
5344 };
5345
5346 let batch = desc.batch;
5347 let height = desc.height;
5348 let width = desc.width;
5349 let plane = height * width;
5350 if plane == 0 {
5351 return Ok(self.allocate_tensor(vec![], input.shape.clone()));
5352 }
5353
5354 let stride_h = batch;
5355 let stride_w = batch * height;
5356
5357 let gain = desc.gain.unwrap_or(1.0);
5358 let bias = desc.bias.unwrap_or(0.0);
5359 let gamma = desc.gamma;
5360
5361 let mut output = data.clone();
5362
5363 for b in 0..batch {
5364 let mut sum = 0.0;
5365 for w in 0..width {
5366 let base_w = w * stride_w;
5367 for h in 0..height {
5368 let idx = b + h * stride_h + base_w;
5369 sum += data[idx];
5370 }
5371 }
5372 let mean = sum / plane as f64;
5373
5374 let mut sq_sum = 0.0;
5375 for w in 0..width {
5376 let base_w = w * stride_w;
5377 for h in 0..height {
5378 let idx = b + h * stride_h + base_w;
5379 let diff = data[idx] - mean;
5380 sq_sum += diff * diff;
5381 }
5382 }
5383 let variance = sq_sum / plane as f64;
5384 let sigma = (variance + desc.epsilon).sqrt();
5385 let inv_sigma = if sigma > 0.0 { 1.0 / sigma } else { 0.0 };
5386
5387 for w in 0..width {
5388 let base_w = w * stride_w;
5389 for h in 0..height {
5390 let idx = b + h * stride_h + base_w;
5391 let mut value = (data[idx] - mean) * inv_sigma;
5392 if desc.gain.is_some() {
5393 value *= gain;
5394 }
5395 if desc.bias.is_some() {
5396 value += bias;
5397 }
5398 value = value.max(0.0);
5399 if let Some(gamma) = gamma {
5400 value = value.powf(gamma);
5401 }
5402 output[idx] = value;
5403 }
5404 }
5405 }
5406
5407 Ok(self.allocate_tensor(output, input.shape.clone()))
5408 })
5409 }
5410
5411 fn pagefun(&self, _request: &PagefunRequest) -> Result<GpuTensorHandle> {
5412 Err(anyhow::anyhow!(
5413 "pagefun: in-process provider does not implement GPU page operations"
5414 ))
5415 }
5416
5417 fn linsolve<'a>(
5418 &'a self,
5419 lhs: &'a GpuTensorHandle,
5420 rhs: &'a GpuTensorHandle,
5421 options: &'a ProviderLinsolveOptions,
5422 ) -> AccelProviderFuture<'a, ProviderLinsolveResult> {
5423 Box::pin(async move {
5424 let start = Instant::now();
5425 let (lhs_data, rhs_data) = {
5426 let guard = registry().lock().unwrap();
5427 let lhs_buf = guard
5428 .get(&lhs.buffer_id)
5429 .cloned()
5430 .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", lhs.buffer_id))?;
5431 let rhs_buf = guard
5432 .get(&rhs.buffer_id)
5433 .cloned()
5434 .ok_or_else(|| anyhow!("linsolve: unknown buffer {}", rhs.buffer_id))?;
5435 (lhs_buf, rhs_buf)
5436 };
5437 self.telemetry
5438 .record_download_bytes((lhs_data.len() * std::mem::size_of::<f64>()) as u64);
5439 self.telemetry
5440 .record_download_bytes((rhs_data.len() * std::mem::size_of::<f64>()) as u64);
5441
5442 let lhs_tensor =
5443 Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;
5444 let rhs_tensor =
5445 Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("linsolve: {e}"))?;
5446
5447 let (solution, rcond) =
5448 linsolve_host_real_for_provider(&lhs_tensor, &rhs_tensor, options)
5449 .map_err(|e| anyhow!("{e}"))?;
5450 self.telemetry.record_linsolve_duration(start.elapsed());
5451 self.telemetry
5452 .record_solve_fallback("linsolve:host_reupload");
5453 self.telemetry
5454 .record_upload_bytes((solution.data.len() * std::mem::size_of::<f64>()) as u64);
5455
5456 let Tensor { data, shape, .. } = solution;
5457 let handle = self.allocate_tensor(data, shape);
5458 Ok(ProviderLinsolveResult {
5459 solution: handle,
5460 reciprocal_condition: rcond,
5461 })
5462 })
5463 }
5464
5465 fn inv<'a>(
5466 &'a self,
5467 matrix: &'a GpuTensorHandle,
5468 _options: ProviderInvOptions,
5469 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5470 Box::pin(async move {
5471 let data = {
5472 let guard = registry().lock().unwrap();
5473 guard
5474 .get(&matrix.buffer_id)
5475 .cloned()
5476 .ok_or_else(|| anyhow!("inv: unknown buffer {}", matrix.buffer_id))?
5477 };
5478 let tensor =
5479 Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("inv: {e}"))?;
5480 let result = inv_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
5481 let Tensor { data, shape, .. } = result;
5482 Ok(self.allocate_tensor(data, shape))
5483 })
5484 }
5485
5486 fn pinv<'a>(
5487 &'a self,
5488 matrix: &'a GpuTensorHandle,
5489 options: ProviderPinvOptions,
5490 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5491 Box::pin(async move {
5492 let data = {
5493 let guard = registry().lock().unwrap();
5494 guard
5495 .get(&matrix.buffer_id)
5496 .cloned()
5497 .ok_or_else(|| anyhow!("pinv: unknown buffer {}", matrix.buffer_id))?
5498 };
5499 let tensor =
5500 Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("pinv: {e}"))?;
5501 let result = pinv_host_real_for_provider(&tensor, options.tolerance)
5502 .map_err(|e| anyhow!("{e}"))?;
5503 let Tensor { data, shape, .. } = result;
5504 Ok(self.allocate_tensor(data, shape))
5505 })
5506 }
5507
5508 fn cond<'a>(
5509 &'a self,
5510 matrix: &'a GpuTensorHandle,
5511 norm: ProviderCondNorm,
5512 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5513 Box::pin(async move {
5514 let data = {
5515 let guard = registry().lock().unwrap();
5516 guard
5517 .get(&matrix.buffer_id)
5518 .cloned()
5519 .ok_or_else(|| anyhow!("cond: unknown buffer {}", matrix.buffer_id))?
5520 };
5521 let tensor =
5522 Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("cond: {e}"))?;
5523 let cond_value =
5524 cond_host_real_for_provider(&tensor, norm).map_err(|e| anyhow!("{e}"))?;
5525 Ok(self.allocate_tensor(vec![cond_value], vec![1, 1]))
5526 })
5527 }
5528
5529 fn norm<'a>(
5530 &'a self,
5531 tensor: &'a GpuTensorHandle,
5532 order: ProviderNormOrder,
5533 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5534 Box::pin(async move {
5535 let data = {
5536 let guard = registry().lock().unwrap();
5537 guard
5538 .get(&tensor.buffer_id)
5539 .cloned()
5540 .ok_or_else(|| anyhow!("norm: unknown buffer {}", tensor.buffer_id))?
5541 };
5542 let host_tensor =
5543 Tensor::new(data, tensor.shape.clone()).map_err(|e| anyhow!("norm: {e}"))?;
5544 let value =
5545 norm_host_real_for_provider(&host_tensor, order).map_err(|e| anyhow!("{e}"))?;
5546 Ok(self.allocate_tensor(vec![value], vec![1, 1]))
5547 })
5548 }
5549
5550 fn rank<'a>(
5551 &'a self,
5552 matrix: &'a GpuTensorHandle,
5553 tolerance: Option<f64>,
5554 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5555 Box::pin(async move {
5556 let data = {
5557 let guard = registry().lock().unwrap();
5558 guard
5559 .get(&matrix.buffer_id)
5560 .cloned()
5561 .ok_or_else(|| anyhow!("rank: unknown buffer {}", matrix.buffer_id))?
5562 };
5563
5564 let tensor =
5565 Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rank: {e}"))?;
5566 let rank =
5567 rank_host_real_for_provider(&tensor, tolerance).map_err(|e| anyhow!("{e}"))? as f64;
5568
5569 Ok(self.allocate_tensor(vec![rank], vec![1, 1]))
5570 })
5571 }
5572
5573 fn rcond<'a>(
5574 &'a self,
5575 matrix: &'a GpuTensorHandle,
5576 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5577 Box::pin(async move {
5578 let data = {
5579 let guard = registry().lock().unwrap();
5580 guard
5581 .get(&matrix.buffer_id)
5582 .cloned()
5583 .ok_or_else(|| anyhow!("rcond: unknown buffer {}", matrix.buffer_id))?
5584 };
5585 let tensor =
5586 Tensor::new(data, matrix.shape.clone()).map_err(|e| anyhow!("rcond: {e}"))?;
5587 let estimate = rcond_host_real_for_provider(&tensor).map_err(|e| anyhow!("{e}"))?;
5588 Ok(self.allocate_tensor(vec![estimate], vec![1, 1]))
5589 })
5590 }
5591
5592 fn mldivide<'a>(
5593 &'a self,
5594 lhs: &'a GpuTensorHandle,
5595 rhs: &'a GpuTensorHandle,
5596 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5597 Box::pin(async move {
5598 let start = Instant::now();
5599 let (lhs_data, rhs_data) = {
5600 let guard = registry().lock().unwrap();
5601 let lhs_buf = guard
5602 .get(&lhs.buffer_id)
5603 .cloned()
5604 .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", lhs.buffer_id))?;
5605 let rhs_buf = guard
5606 .get(&rhs.buffer_id)
5607 .cloned()
5608 .ok_or_else(|| anyhow!("mldivide: unknown buffer {}", rhs.buffer_id))?;
5609 (lhs_buf, rhs_buf)
5610 };
5611 self.telemetry
5612 .record_download_bytes((lhs_data.len() * std::mem::size_of::<f64>()) as u64);
5613 self.telemetry
5614 .record_download_bytes((rhs_data.len() * std::mem::size_of::<f64>()) as u64);
5615
5616 let lhs_tensor =
5617 Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;
5618 let rhs_tensor =
5619 Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mldivide: {e}"))?;
5620
5621 let result = mldivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
5622 .map_err(|e| anyhow!("{e}"))?;
5623 self.telemetry.record_mldivide_duration(start.elapsed());
5624 self.telemetry
5625 .record_solve_fallback("mldivide:host_reupload");
5626 self.telemetry
5627 .record_upload_bytes((result.data.len() * std::mem::size_of::<f64>()) as u64);
5628
5629 let Tensor { data, shape, .. } = result;
5630 Ok(self.allocate_tensor(data, shape))
5631 })
5632 }
5633
5634 fn mrdivide<'a>(
5635 &'a self,
5636 lhs: &'a GpuTensorHandle,
5637 rhs: &'a GpuTensorHandle,
5638 ) -> AccelProviderFuture<'a, GpuTensorHandle> {
5639 Box::pin(async move {
5640 let start = Instant::now();
5641 let (lhs_data, rhs_data) = {
5642 let guard = registry().lock().unwrap();
5643 let lhs_buf = guard
5644 .get(&lhs.buffer_id)
5645 .cloned()
5646 .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", lhs.buffer_id))?;
5647 let rhs_buf = guard
5648 .get(&rhs.buffer_id)
5649 .cloned()
5650 .ok_or_else(|| anyhow!("mrdivide: unknown buffer {}", rhs.buffer_id))?;
5651 (lhs_buf, rhs_buf)
5652 };
5653 self.telemetry
5654 .record_download_bytes((lhs_data.len() * std::mem::size_of::<f64>()) as u64);
5655 self.telemetry
5656 .record_download_bytes((rhs_data.len() * std::mem::size_of::<f64>()) as u64);
5657
5658 let lhs_tensor =
5659 Tensor::new(lhs_data, lhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;
5660 let rhs_tensor =
5661 Tensor::new(rhs_data, rhs.shape.clone()).map_err(|e| anyhow!("mrdivide: {e}"))?;
5662
5663 let result = mrdivide_host_real_for_provider(&lhs_tensor, &rhs_tensor)
5664 .map_err(|e| anyhow!("{e}"))?;
5665 self.telemetry.record_mrdivide_duration(start.elapsed());
5666 self.telemetry
5667 .record_solve_fallback("mrdivide:host_reupload");
5668 self.telemetry
5669 .record_upload_bytes((result.data.len() * std::mem::size_of::<f64>()) as u64);
5670
5671 let Tensor { data, shape, .. } = result;
5672 Ok(self.allocate_tensor(data, shape))
5673 })
5674 }
5675
5676 fn eig<'a>(
5677 &'a self,
5678 _a: &'a GpuTensorHandle,
5679 _compute_left: bool,
5680 ) -> AccelProviderFuture<'a, ProviderEigResult> {
5681 Box::pin(async move { Err(anyhow!("eig: not supported by in-process provider")) })
5682 }
5683
5684 fn sub2ind(
5685 &self,
5686 dims: &[usize],
5687 strides: &[usize],
5688 inputs: &[&GpuTensorHandle],
5689 scalar_mask: &[bool],
5690 len: usize,
5691 output_shape: &[usize],
5692 ) -> Result<GpuTensorHandle> {
5693 if inputs.len() != dims.len() || inputs.len() != scalar_mask.len() {
5694 return Err(anyhow::anyhow!(
5695 "sub2ind: expected {} subscripts for {} dimensions",
5696 dims.len(),
5697 dims.len()
5698 ));
5699 }
5700 let expected_len: usize = output_shape.iter().copied().product();
5701 if expected_len != len {
5702 return Err(anyhow::anyhow!(
5703 "sub2ind: output shape does not match subscript sizes"
5704 ));
5705 }
5706 if len == 0 {
5707 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
5708 registry().lock().unwrap().insert(id, Vec::new());
5709 return Ok(GpuTensorHandle {
5710 shape: output_shape.to_vec(),
5711 device_id: 0,
5712 buffer_id: id,
5713 });
5714 }
5715
5716 let mut host_values: Vec<Vec<f64>> = Vec::with_capacity(inputs.len());
5717 {
5718 let guard = registry().lock().unwrap();
5719 for handle in inputs {
5720 let data = guard
5721 .get(&handle.buffer_id)
5722 .ok_or_else(|| anyhow::anyhow!("sub2ind: unknown buffer {}", handle.buffer_id))?
5723 .clone();
5724 host_values.push(data);
5725 }
5726 }
5727
5728 let mut output = Vec::with_capacity(len);
5729 for idx in 0..len {
5730 let mut offset: usize = 0;
5731 for (dim_index, ((&dim_size, &stride), data)) in dims
5732 .iter()
5733 .zip(strides.iter())
5734 .zip(host_values.iter())
5735 .enumerate()
5736 {
5737 let raw = if scalar_mask[dim_index] {
5738 *data.first().unwrap_or(&0.0)
5739 } else {
5740 data[idx]
5741 };
5742 let coerced = coerce_sub2ind_value(raw, dim_index + 1, dim_size)?;
5743 let term = coerced
5744 .checked_sub(1)
5745 .and_then(|base| base.checked_mul(stride))
5746 .ok_or_else(|| {
5747 anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
5748 })?;
5749 offset = offset.checked_add(term).ok_or_else(|| {
5750 anyhow::anyhow!("sub2ind: computed index exceeds platform limits")
5751 })?;
5752 }
5753 output.push((offset + 1) as f64);
5754 }
5755
5756 let id = self.next_id.fetch_add(1, Ordering::Relaxed);
5757 registry().lock().unwrap().insert(id, output);
5758 Ok(GpuTensorHandle {
5759 shape: output_shape.to_vec(),
5760 device_id: 0,
5761 buffer_id: id,
5762 })
5763 }
5764
5765 fn fused_elementwise(
5766 &self,
5767 _shader: &str,
5768 _inputs: &[GpuTensorHandle],
5769 _output_shape: &[usize],
5770 _len: usize,
5771 ) -> Result<GpuTensorHandle> {
5772 Err(anyhow::anyhow!(
5773 "fused_elementwise not supported by in-process provider"
5774 ))
5775 }
5776}
5777
5778static INSTANCE: OnceCell<InProcessProvider> = OnceCell::new();
5779
5780pub fn register_inprocess_provider() {
5783 let provider: &'static InProcessProvider = INSTANCE.get_or_init(InProcessProvider::new);
5784 #[cfg(not(test))]
5785 unsafe {
5786 runmat_accelerate_api::register_provider(provider)
5788 };
5789 runmat_accelerate_api::set_thread_provider(Some(provider));
5790}
5791
5792pub fn reset_inprocess_rng() {
5794 if let Ok(mut guard) = rng_state().lock() {
5795 *guard = 0x9e3779b97f4a7c15;
5796 }
5797}