// simd_kernels/kernels/scientific/vector.rs

1// Copyright (c) 2025 SpaceCell Enterprises Ltd
2// SPDX-License-Identifier: AGPL-3.0-or-later
3// Commercial licensing available. See LICENSE and LICENSING.md.
4
5//! # **Vector Mathematics Module** - *High-Performance BLAS-Compatible Vector Operations*
6//!
7//! This module provides a streamlined set of vector operations without the need for BLAS/LAPACK linking.
8//! It includes SIMD-accelerated arms where appropriate.
9//!
10//! ## Usage Examples
11//!
12//! ### Basic Linear Algebra
13//! ```rust,ignore
14//! use minarrow::vec64;
15//! use crate::kernels::scientific::vector::{dot, scale, axpy, l2_norm};
16//!
17//! let x = vec64![1.0, 2.0, 3.0, 4.0];
18//! let mut y = vec64![5.0, 6.0, 7.0, 8.0];
19//!
20//! // Dot product computation
21//! let result = dot(&x, &y, None, None);
22//!
23//! // In-place vector scaling
24//! scale(&mut y, 2.0, None, None);
25//!
26//! // Scaled addition: y ← 2.0 * x + y
27//! axpy(4, 2.0, &x, 1, &mut y, 1, None, None)?;
28//!
29//! // Euclidean norm
30//! let norm = l2_norm(&x, None, None);
31//! ```
32
33include!(concat!(env!("OUT_DIR"), "/simd_lanes.rs"));
34
35#[cfg(feature = "simd")]
36use std::simd::{Mask, Select, Simd, num::SimdFloat};
37
38use minarrow::{Bitmask, FloatArray, Vec64};
39
40#[cfg(feature = "simd")]
41use crate::kernels::aggregate::neumaier_simd_add;
42use crate::kernels::aggregate::{neumaier_add, sum_squares};
43use crate::utils::has_nulls;
44use minarrow::enums::error::KernelError;
45
46#[cfg(feature = "simd")]
47use crate::utils::bitmask_to_simd_mask;
48
49#[cfg(feature = "simd")]
50use minarrow::utils::is_simd_aligned;
51
52// --- Vector analytics ---
53
/// Computes the dot product with another vector, propagating nulls via an optional Bitmask.
///
/// Entries whose bit in `null_mask` is unset contribute nothing to the sum.
/// Both the SIMD and scalar paths use Neumaier-compensated accumulation to
/// limit floating-point rounding error over long inputs.
///
/// # Panics
/// Panics if `v1.len() != v2.len()`, or if nulls are reported present but
/// `null_mask` is `None`.
#[inline(always)]
pub fn dot(v1: &[f64], v2: &[f64], null_mask: Option<&Bitmask>, null_count: Option<usize>) -> f64 {
    assert_eq!(v1.len(), v2.len(), "dot: input length mismatch");

    let len = v1.len();

    // SIMD path: taken only when both inputs pass the runtime alignment check.
    #[cfg(feature = "simd")]
    if is_simd_aligned(v1) && is_simd_aligned(v2) {
        let needs_nulls = has_nulls(null_count, null_mask);
        // Lane count comes from the build-script-generated simd_lanes.rs.
        const N: usize = W64;
        // Per-lane running sums and their Neumaier compensation terms.
        let mut sum_v = Simd::<f64, N>::splat(0.0);
        let mut comp_v = Simd::<f64, N>::splat(0.0);
        let mut i = 0;
        if !needs_nulls {
            // fast path: no nulls — compensated per-lane accumulation
            while i + N <= len {
                let a = Simd::<f64, N>::from_slice(&v1[i..i + N]);
                let b = Simd::<f64, N>::from_slice(&v2[i..i + N]);
                neumaier_simd_add(&mut sum_v, &mut comp_v, a * b);
                i += N;
            }
        } else {
            // null-aware path: compensated per-lane with masked products
            let mask_bytes = null_mask.unwrap().as_bytes();
            while i + N <= len {
                let a = Simd::<f64, N>::from_slice(&v1[i..i + N]);
                let b = Simd::<f64, N>::from_slice(&v2[i..i + N]);
                let prod = a * b;
                let lane_mask: Mask<i64, N> = bitmask_to_simd_mask::<N, i64>(mask_bytes, i, len);
                // Null lanes are replaced by 0.0 so they do not affect the sum.
                neumaier_simd_add(
                    &mut sum_v,
                    &mut comp_v,
                    lane_mask.select(prod, Simd::splat(0.0)),
                );
                i += N;
            }
        }
        // Horizontal reduction + scalar tail
        let mut acc = (sum_v + comp_v).reduce_sum();
        for idx in i..len {
            // SAFETY: idx < len; assumes the mask covers all `len` elements
            // (caller contract — TODO confirm against Bitmask construction).
            if !needs_nulls || unsafe { null_mask.unwrap().get_unchecked(idx) } {
                acc += v1[idx] * v2[idx];
            }
        }
        return acc;
    }

    // Scalar fallback — alignment check failed or no simd flag
    let needs_nulls = has_nulls(null_count, null_mask);
    let mut acc = 0.0_f64;
    // Neumaier compensation term for the scalar accumulation.
    let mut comp = 0.0_f64;
    if !needs_nulls {
        for i in 0..len {
            neumaier_add(&mut acc, &mut comp, v1[i] * v2[i]);
        }
    } else {
        let mb = null_mask.unwrap();
        for i in 0..len {
            // SAFETY: i < len; assumes the mask covers all `len` elements.
            if unsafe { mb.get_unchecked(i) } {
                neumaier_add(&mut acc, &mut comp, v1[i] * v2[i]);
            }
        }
    }
    acc + comp
}
120
121/// Computes a stable key-based argsort of indices
122#[inline(always)]
123pub fn argsort(
124    v: &[f64],
125    mask: Option<&Bitmask>,
126    null_count: Option<usize>,
127    descending: bool,
128) -> Vec64<usize> {
129    let mut idx: Vec64<usize> = (0..v.len()).collect();
130    if !has_nulls(null_count, mask) {
131        if descending {
132            idx.sort_unstable_by(|&i, &j| {
133                v[j].partial_cmp(&v[i]).unwrap_or(std::cmp::Ordering::Equal)
134            });
135        } else {
136            idx.sort_unstable_by(|&i, &j| {
137                v[i].partial_cmp(&v[j]).unwrap_or(std::cmp::Ordering::Equal)
138            });
139        }
140    } else {
141        let mb = mask.expect("argsort: mask required when nulls present");
142        idx.sort_unstable_by(|&i, &j| {
143            let vi_null = !unsafe { mb.get_unchecked(i) };
144            let vj_null = !unsafe { mb.get_unchecked(j) };
145            match (vi_null, vj_null) {
146                (true, true) => std::cmp::Ordering::Equal,
147                (true, false) => std::cmp::Ordering::Greater, // nulls last
148                (false, true) => std::cmp::Ordering::Less,
149                (false, false) => {
150                    if descending {
151                        v[j].partial_cmp(&v[i]).unwrap_or(std::cmp::Ordering::Equal)
152                    } else {
153                        v[i].partial_cmp(&v[j]).unwrap_or(std::cmp::Ordering::Equal)
154                    }
155                }
156            }
157        });
158    }
159    idx
160}
161
162/// Bins values into histogram buckets.
163#[inline(always)]
164pub fn histogram(
165    v: &[f64],
166    bins: &[f64],
167    mask: Option<&Bitmask>,
168    null_count: Option<usize>,
169) -> Vec<usize> {
170    assert!(!bins.is_empty(), "histogram: bins must be non-empty");
171    let has_nulls = match null_count {
172        Some(n) => n > 0,
173        None => mask.is_some(),
174    };
175
176    let bins_sorted = bins.windows(2).all(|w| w[0] <= w[1]);
177    let mut counts = vec![0; bins.len() + 1];
178    if !has_nulls {
179        if bins_sorted {
180            for &x in v {
181                match bins.binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
182                    Ok(pos) | Err(pos) => counts[pos] += 1,
183                }
184            }
185        } else {
186            for &x in v {
187                let mut placed = false;
188                for (i, &b) in bins.iter().enumerate() {
189                    if x < b {
190                        counts[i] += 1;
191                        placed = true;
192                        break;
193                    }
194                }
195                if !placed {
196                    counts[bins.len()] += 1;
197                }
198            }
199        }
200    } else {
201        let mb = mask.expect("histogram: mask required when nulls present");
202        if bins_sorted {
203            for (i, &x) in v.iter().enumerate() {
204                if unsafe { mb.get_unchecked(i) } {
205                    match bins.binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
206                        Ok(pos) | Err(pos) => counts[pos] += 1,
207                    }
208                }
209            }
210        } else {
211            for (i, &x) in v.iter().enumerate() {
212                if unsafe { mb.get_unchecked(i) } {
213                    let mut placed = false;
214                    for (j, &b) in bins.iter().enumerate() {
215                        if x < b {
216                            counts[j] += 1;
217                            placed = true;
218                            break;
219                        }
220                    }
221                    if !placed {
222                        counts[bins.len()] += 1;
223                    }
224                }
225            }
226        }
227    }
228    counts
229}
230
231/// Uniform reservoir sampling of size k.
232#[inline(always)]
233pub fn reservoir_sample(
234    v: &[f64],
235    k: usize,
236    mask: Option<&Bitmask>,
237    null_count: Option<usize>,
238) -> FloatArray<f64> {
239    use rand::prelude::*;
240    assert!(k > 0, "reservoir_sample: k must be positive");
241    let has_nulls = match null_count {
242        Some(n) => n > 0,
243        None => mask.is_some(),
244    };
245
246    let mut rng = rand::rng();
247    let mut out = Vec64::with_capacity(k);
248    let mut seen = 0usize;
249
250    for (idx, &val) in v.iter().enumerate() {
251        if has_nulls && !unsafe { mask.unwrap().get_unchecked(idx) } {
252            continue;
253        }
254        if out.len() < k {
255            out.push(val);
256        } else {
257            let j = rng.random_range(0..=seen);
258            if j < k {
259                out[j] = val;
260            }
261        }
262        seen += 1;
263    }
264    assert_eq!(out.len(), k, "reservoir_sample: not enough valid values");
265    FloatArray::from_vec64(out, None)
266}
267
268// --- Vector transformations ---
269
/// Scales the vector in place: x ← αx
/// Kernel: SCAL (L2-2)
///
/// Null entries (mask bit unset) are left untouched; only valid elements are
/// multiplied by `alpha`.
///
/// # Panics
/// Panics if nulls are reported present but `null_mask` is `None`.
#[inline(always)]
pub fn scale(v: &mut [f64], alpha: f64, null_mask: Option<&Bitmask>, null_count: Option<usize>) {
    // SIMD path: taken only when the buffer passes the runtime alignment check.
    #[cfg(feature = "simd")]
    if is_simd_aligned(v) {
        // Lane count from the build-script-generated simd_lanes.rs.
        const N: usize = W64;
        let len = v.len();
        let alpha_v = Simd::<f64, N>::splat(alpha);
        let mut i = 0;

        if !has_nulls(null_count, null_mask) {
            // Dense path: just scale every lane
            while i + N <= len {
                let chunk = Simd::from_slice(&v[i..i + N]);
                let scaled = chunk * alpha_v;
                v[i..i + N].copy_from_slice(&scaled.to_array());
                i += N;
            }
            // Remainder
            for x in &mut v[i..] {
                *x *= alpha;
            }
        } else {
            // Null‐aware path
            let mb = null_mask.expect("scale: mask required when nulls present");
            let mask_bytes = mb.as_bytes();

            // Vectorized blocks
            while i + N <= len {
                let chunk = Simd::from_slice(&v[i..i + N]);
                // Build SIMD mask<M,N> from the global bitmask
                let lane_mask: Mask<i64, N> = bitmask_to_simd_mask::<N, i64>(mask_bytes, i, len);
                // Only scale the valid lanes, leave the rest untouched
                let result = lane_mask.select(chunk * alpha_v, chunk);
                v[i..i + N].copy_from_slice(&result.to_array());
                i += N;
            }
            // Scalar remainder
            for idx in i..len {
                // SAFETY: idx < len; assumes the mask covers all elements.
                if unsafe { mb.get_unchecked(idx) } {
                    v[idx] *= alpha;
                }
            }
        }
        return;
    }

    // Scalar fallback - alignment check failed or no SIMD flag
    if !has_nulls(null_count, null_mask) {
        for x in v {
            *x *= alpha;
        }
    } else {
        let mb = null_mask.expect("scale: mask required when nulls present");
        for (i, x) in v.iter_mut().enumerate() {
            // SAFETY: i < v.len(); assumes the mask covers all elements.
            if unsafe { mb.get_unchecked(i) } {
                *x *= alpha;
            }
        }
    }
}
332
333/// In-place vector scaling: x ← α·x
334///
335/// This is the version for a matrix in a flat strided buffer,
336/// and therefore tackles incx ("increment x") the gap between values.
337///
338/// Multiplies each element of the input vector `x` by the scalar `alpha`.
339/// Equivalent to `x[i] ← alpha * x[i]` for all valid `i`.
340#[inline(always)]
341pub fn scale_vec(
342    n: i32,
343    alpha: f64,
344    x: &mut [f64],
345    incx: i32,
346    null_mask: Option<&Bitmask>,
347    null_count: Option<usize>,
348) -> Result<(), KernelError> {
349    if n < 0 {
350        return Err(KernelError::InvalidArguments(
351            "n must be non-negative".into(),
352        ));
353    }
354    let n = n as usize;
355    let incx = incx as usize;
356    if incx == 0 {
357        return Err(KernelError::InvalidArguments(
358            "incx must be positive".into(),
359        ));
360    }
361    if n == 0 {
362        return Ok(());
363    }
364    if (n - 1).saturating_mul(incx) >= x.len() {
365        return Err(KernelError::InvalidArguments(
366            "indexing out of bounds".into(),
367        ));
368    }
369
370    let mask_present = has_nulls(null_count, null_mask);
371
372    #[cfg(feature = "simd")]
373    if is_simd_aligned(x) {
374        const LANES: usize = W64;
375        let alpha_v = Simd::<f64, LANES>::splat(alpha);
376
377        // only vectorize when truly contiguous
378        if incx == 1 && !mask_present {
379            // dense contiguous path
380            let mut i = 0;
381            while i + LANES <= n {
382                let chunk = Simd::from_slice(&x[i..i + LANES]);
383                let out_v = chunk * alpha_v;
384                x[i..i + LANES].copy_from_slice(&out_v.to_array());
385                i += LANES;
386            }
387            for xi in &mut x[i..n] {
388                *xi *= alpha;
389            }
390            return Ok(());
391        }
392        if incx == 1 && mask_present {
393            // contiguous + null‐aware
394            let mb = null_mask.unwrap();
395            let bytes = mb.as_bytes();
396            let mut i = 0;
397            while i + LANES <= n {
398                // build a SIMD mask from the arrow bitmap
399                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
400                let chunk = Simd::from_slice(&x[i..i + LANES]);
401                let scaled = chunk * alpha_v;
402                // only scale where mask==true, else leave original
403                let out_v = lane_mask.select(scaled, chunk);
404                x[i..i + LANES].copy_from_slice(&out_v.to_array());
405                i += LANES;
406            }
407            // tail
408            for idx in i..n {
409                if unsafe { mb.get_unchecked(idx) } {
410                    x[idx] *= alpha;
411                }
412            }
413            return Ok(());
414        }
415    }
416
417    // scalar (strided) fallback
418    if !mask_present {
419        let mut idx = 0;
420        for _ in 0..n {
421            x[idx] *= alpha;
422            idx += incx;
423        }
424    } else {
425        let mb = null_mask.unwrap();
426        let mut idx = 0;
427        for _ in 0..n {
428            if unsafe { mb.get_unchecked(idx) } {
429                x[idx] *= alpha;
430            }
431            idx += incx;
432        }
433    }
434    Ok(())
435}
436
437/// Scaled vector addition: y ← α·x + y
438///
439/// Computes a linear combination of two vectors, scaling `x` by `alpha` and accumulating into `y`.
440/// Equivalent to `y[i] ← alpha * x[i] + y[i]` for all valid `i`.
441#[inline(always)]
442pub fn axpy(
443    n: i32,
444    alpha: f64,
445    x: &[f64],
446    incx: i32,
447    y: &mut [f64],
448    incy: i32,
449    null_mask: Option<&Bitmask>,
450    null_count: Option<usize>,
451) -> Result<(), KernelError> {
452    if n < 0 {
453        return Err(KernelError::InvalidArguments(
454            "n must be non-negative".into(),
455        ));
456    }
457    let n = n as usize;
458    let incx = incx as usize;
459    let incy = incy as usize;
460    if incx == 0 || incy == 0 {
461        return Err(KernelError::InvalidArguments(
462            "increments must be positive".into(),
463        ));
464    }
465    if n == 0 {
466        return Ok(());
467    }
468    if (n - 1).saturating_mul(incx) >= x.len() {
469        return Err(KernelError::InvalidArguments("x out of bounds".into()));
470    }
471    if (n - 1).saturating_mul(incy) >= y.len() {
472        return Err(KernelError::InvalidArguments("y out of bounds".into()));
473    }
474
475    let mask_present = has_nulls(null_count, null_mask);
476
477    #[cfg(feature = "simd")]
478    if is_simd_aligned(x) {
479        const LANES: usize = W64;
480        let alpha_v = Simd::<f64, LANES>::splat(alpha);
481
482        // only when both x and y are contiguous
483        if incx == 1 && incy == 1 && !mask_present {
484            // dense contiguous
485            let mut i = 0;
486            while i + LANES <= n {
487                let xv = Simd::from_slice(&x[i..i + LANES]);
488                let yv = Simd::from_slice(&y[i..i + LANES]);
489                let out_v = alpha_v * xv + yv;
490                y[i..i + LANES].copy_from_slice(&out_v.to_array());
491                i += LANES;
492            }
493            for j in i..n {
494                y[j] += alpha * x[j];
495            }
496            return Ok(());
497        }
498        if incx == 1 && incy == 1 && mask_present {
499            // contiguous + null‐aware
500            let mb = null_mask.unwrap();
501            let bytes = mb.as_bytes();
502            let mut i = 0;
503            while i + LANES <= n {
504                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
505                let xv = Simd::from_slice(&x[i..i + LANES]);
506                let yv = Simd::from_slice(&y[i..i + LANES]);
507                let computed = alpha_v * xv + yv;
508                let out_v = lane_mask.select(computed, yv);
509                y[i..i + LANES].copy_from_slice(&out_v.to_array());
510                i += LANES;
511            }
512            for idx in i..n {
513                if unsafe { mb.get_unchecked(idx) } {
514                    y[idx] += alpha * x[idx];
515                }
516            }
517            return Ok(());
518        }
519    }
520
521    // scalar (potentially strided) fallback
522    if !mask_present {
523        let mut ix = 0;
524        let mut iy = 0;
525        for _ in 0..n {
526            y[iy] += alpha * x[ix];
527            ix += incx;
528            iy += incy;
529        }
530    } else {
531        let mb = null_mask.unwrap();
532        let mut ix = 0;
533        let mut iy = 0;
534        for _ in 0..n {
535            if unsafe { mb.get_unchecked(ix) } {
536                y[iy] += alpha * x[ix];
537            }
538            ix += incx;
539            iy += incy;
540        }
541    }
542    Ok(())
543}
544
545/// Computes the L2 norm (‖x‖₂)
546#[inline(always)]
547pub fn l2_norm(v: &[f64], mask: Option<&Bitmask>, null_count: Option<usize>) -> f64 {
548    sum_squares(v, mask, null_count).sqrt()
549}
550
/// Computes the Euclidean (ℓ₂) norm of a vector: ‖x‖₂ = sqrt(Σ xᵢ²)
///
/// Returns the magnitude (Euclidean norm) of the vector `x`.
/// For an input vector `x`, computes `sqrt(sum(x_i^2))`.
///
/// Supports strided access via `incx` (must be strictly positive). Null
/// entries (mask bit unset) are excluded from the sum of squares. The
/// accumulation is Neumaier-compensated in both SIMD and scalar paths.
///
/// # Errors
/// Returns `KernelError::InvalidArguments` when `n` is negative, `incx` is
/// not strictly positive, or the last touched index falls outside `x`.
#[inline(always)]
pub fn vector_norm(
    n: i32,
    x: &[f64],
    incx: i32,
    null_mask: Option<&Bitmask>,
    null_count: Option<usize>,
) -> Result<f64, KernelError> {
    if n < 0 {
        return Err(KernelError::InvalidArguments(
            "n must be non-negative".into(),
        ));
    }
    // Checked before the usize casts below so negative strides cannot wrap.
    if incx <= 0 {
        return Err(KernelError::InvalidArguments(
            "incx must be positive".into(),
        ));
    }
    let n = n as usize;
    let incx = incx as usize;
    if n == 0 {
        return Ok(0.0);
    }
    if (n - 1).saturating_mul(incx) >= x.len() {
        return Err(KernelError::InvalidArguments(
            "indexing out of bounds for x".into(),
        ));
    }

    let mask_present = has_nulls(null_count, null_mask);

    // SIMD path: taken only when the buffer passes the alignment check.
    #[cfg(feature = "simd")]
    if is_simd_aligned(x) {
        use crate::utils::bitmask_to_simd_mask;
        use core::simd::Simd;

        // Lane count from the build-script-generated simd_lanes.rs.
        const LANES: usize = W64;

        // only vectorise when contiguous
        if incx == 1 && !mask_present {
            // dense contiguous path — compensated per-lane accumulation
            let mut sum_v = Simd::<f64, LANES>::splat(0.0);
            let mut comp_v = Simd::<f64, LANES>::splat(0.0);
            let mut i = 0;
            while i + LANES <= n {
                let v = Simd::from_slice(&x[i..i + LANES]);
                neumaier_simd_add(&mut sum_v, &mut comp_v, v * v);
                i += LANES;
            }
            // Horizontal reduction + scalar tail
            let mut sumsq = (sum_v + comp_v).reduce_sum();
            for &xi in &x[i..n] {
                sumsq += xi * xi;
            }
            return Ok(sumsq.sqrt());
        }

        if incx == 1 && mask_present {
            // contiguous + null-aware — compensated per-lane accumulation
            let mb = null_mask.unwrap();
            let bytes = mb.as_bytes();
            let mut sum_v = Simd::<f64, LANES>::splat(0.0);
            let mut comp_v = Simd::<f64, LANES>::splat(0.0);
            let mut i = 0;
            while i + LANES <= n {
                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
                let v = Simd::from_slice(&x[i..i + LANES]);
                // Null lanes contribute 0.0 to the sum of squares.
                let v = lane_mask.select(v, Simd::splat(0.0));
                neumaier_simd_add(&mut sum_v, &mut comp_v, v * v);
                i += LANES;
            }
            // Horizontal reduction + scalar tail
            let mut sumsq = (sum_v + comp_v).reduce_sum();
            for idx in i..n {
                // SAFETY: idx < n <= mask length (caller contract).
                if unsafe { mb.get_unchecked(idx) } {
                    sumsq += x[idx] * x[idx];
                }
            }
            return Ok(sumsq.sqrt());
        }
    }

    // scalar (possibly strided) fallback — compensated
    let mut sumsq = 0.0_f64;
    let mut comp = 0.0_f64;
    if !mask_present {
        let mut idx = 0;
        for _ in 0..n {
            let xi = x[idx];
            neumaier_add(&mut sumsq, &mut comp, xi * xi);
            idx += incx;
        }
    } else {
        let mb = null_mask.unwrap();
        let mut idx = 0;
        for _ in 0..n {
            // SAFETY: idx stays in bounds (validated above).
            if unsafe { mb.get_unchecked(idx) } {
                let xi = x[idx];
                neumaier_add(&mut sumsq, &mut comp, xi * xi);
            }
            idx += incx;
        }
    }
    Ok((sumsq + comp).sqrt())
}
660
// Unit tests covering the null-aware behaviour, strided variants, and
// compensated-summation accuracy of the kernels above.
#[cfg(test)]
mod tests {
    use minarrow::{Bitmask, vec64};

    use super::*;

    /// Builds a validity bitmask from bools (true = valid, false = null).
    fn make_mask(bits: &[bool]) -> Bitmask {
        Bitmask::from_bools(bits)
    }

    #[test]
    fn test_dot_no_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let expected = 1.0 * 2.0 + 2.0 * 0.5 + 3.0 * 1.0 + 4.0 * (-1.0);
        assert_eq!(dot(v1.as_slice(), v2.as_slice(), None, None), expected);

        // Null count is Some(0) should still use fast path
        assert_eq!(dot(&v1, &v2, None, Some(0)), expected);
    }

    #[test]
    fn test_dot_with_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let mask = make_mask(&[true, false, true, false]);
        let null_count = 2;
        // Only indices 0 and 2 are valid
        let expected = 1.0 * 2.0 + 3.0 * 1.0;
        assert_eq!(dot(&v1, &v2, Some(&mask), Some(null_count)), expected);

        // Null count None but mask provided, should also work
        assert_eq!(dot(&v1, &v2, Some(&mask), None), expected);
    }

    #[test]
    fn test_dot_all_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let mask = make_mask(&[false, false, false, false]);
        let expected = 0.0;
        assert_eq!(dot(&v1, &v2, Some(&mask), Some(4)), expected);
    }

    #[test]
    fn test_argsort_no_nulls() {
        let v = vec64![5.0, 2.0, 1.0, 4.0];
        assert_eq!(argsort(&v, None, None, false), vec64![2, 1, 3, 0]);
        assert_eq!(argsort(&v, None, None, true), vec64![0, 3, 1, 2]);
    }

    #[test]
    fn test_argsort_with_nulls() {
        let v = vec64![5.0, 2.0, 1.0, 4.0, 3.0];
        let mask = make_mask(&[true, false, true, true, false]);
        // valid indices: 0,2,3; so sorted: [2,3,0] (by value) then nulls [1,4]
        assert_eq!(
            argsort(&v, Some(&mask), Some(2), false),
            vec64![2, 3, 0, 1, 4]
        );
        assert_eq!(
            argsort(&v, Some(&mask), Some(2), true),
            vec64![0, 3, 2, 1, 4]
        );
    }

    #[test]
    fn test_histogram_no_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        // Buckets: [<2.5],[2.5,5.0),[>=5.0] => [2,1,1]
        // values: [1.0,2.0] in bucket 0, [4.0] in bucket 1, [6.0] in bucket 2
        assert_eq!(histogram(&v, &bins, None, None), vec![2, 1, 1]);
    }

    #[test]
    fn test_histogram_with_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        let mask = make_mask(&[false, true, true, false]);
        // Only v[1]=2.0 and v[2]=4.0: bins [<2.5]=1, [2.5,5.0)=1, [>=5.0]=0
        assert_eq!(histogram(&v, &bins, Some(&mask), Some(2)), vec![1, 1, 0]);
    }

    #[test]
    fn test_histogram_all_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        let mask = make_mask(&[false, false, false, false]);
        assert_eq!(histogram(&v, &bins, Some(&mask), Some(4)), vec![0, 0, 0]);
    }

    #[test]
    fn test_reservoir_sample_basic() {
        let v = vec64![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let arr = reservoir_sample(&v, 3, None, None);
        assert_eq!(arr.data.len(), 3);
        // Should be a subset of original input (since all valid)
        for &x in arr.data.iter() {
            assert!(v.contains(&x));
        }
    }

    #[test]
    fn test_reservoir_sample_with_nulls() {
        let v = vec64![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let mask = make_mask(&[false, true, false, true, false, true]);
        let arr = reservoir_sample(&v, 2, Some(&mask), Some(3));
        // Only indices 1,3,5 are valid. Sampled values must come from v[1], v[3], v[5].
        for &x in arr.data.iter() {
            assert!(x == 2.0 || x == 4.0 || x == 6.0);
        }
    }

    #[test]
    fn test_scale_no_nulls() {
        let mut v = vec64![1.0, 2.0, 3.0];
        scale(&mut v, 3.0, None, None);
        assert_eq!(*v, [3.0, 6.0, 9.0]);
    }

    #[test]
    fn test_scale_with_nulls() {
        let mut v = vec64![1.0, 2.0, 3.0];
        let mask = make_mask(&[false, true, true]);
        scale(&mut v, 4.0, Some(&mask), Some(1));
        // Only v[1] and v[2] are scaled
        assert_eq!(*v, [1.0, 8.0, 12.0]);
    }

    #[test]
    fn test_scale_vec_no_nulls() {
        let mut v = vec64![2.0, 4.0, 8.0, 16.0];
        // n=2, incx=2: touches indices 0 and 2 only.
        let r = scale_vec(2, 0.5, &mut v, 2, None, None);
        assert!(r.is_ok());
        assert_eq!(*v, [1.0, 4.0, 4.0, 16.0]);
    }

    #[test]
    fn test_scale_vec_with_nulls() {
        let mut v = vec64![2.0, 4.0, 8.0, 16.0];
        let mask = make_mask(&[false, true, true, false]);
        let r = scale_vec(2, 0.5, &mut v, 2, Some(&mask), Some(2));
        assert!(r.is_ok());
        // Only index 0 and 2 are processed but only 2 is valid
        assert_eq!(*v, [2.0, 4.0, 4.0, 16.0]);
    }

    #[test]
    fn test_axpy_no_nulls() {
        let mut y = vec64![10.0, 20.0, 30.0, 40.0];
        let x = vec64![1.0, 2.0, 3.0, 4.0];
        let r = axpy(4, 2.0, &x, 1, &mut y, 1, None, None);
        assert!(r.is_ok());
        assert_eq!(*y, [12.0, 24.0, 36.0, 48.0]);
    }

    #[test]
    fn test_axpy_with_nulls() {
        let mut y = vec64![10.0, 20.0, 30.0, 40.0];
        let x = vec64![1.0, 2.0, 3.0, 4.0];
        let mask = make_mask(&[false, true, false, true]);
        let r = axpy(4, 2.0, &x, 1, &mut y, 1, Some(&mask), Some(2));
        assert!(r.is_ok());
        // Only indices 1 and 3 are updated
        assert_eq!(*y, [10.0, 24.0, 30.0, 48.0]);
    }

    #[test]
    fn test_l2_norm_no_nulls() {
        let v = vec64![3.0, 4.0];
        let n = l2_norm(&v, None, None);
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_l2_norm_with_nulls() {
        let v = vec64![3.0, 4.0];
        let mask = make_mask(&[false, true]);
        let n = l2_norm(&v, Some(&mask), Some(1));
        assert!((n - 4.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_no_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let n = vector_norm(3, &v, 1, None, None).unwrap();
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_with_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let mask = make_mask(&[true, false, true]);
        let n = vector_norm(3, &v, 1, Some(&mask), Some(1)).unwrap();
        // Only indices 0 and 2 are used: sqrt(9+16)=5
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_all_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let mask = make_mask(&[false, false, false]);
        let n = vector_norm(3, &v, 1, Some(&mask), Some(3)).unwrap();
        assert_eq!(n, 0.0);
    }

    /// Verify Neumaier compensation on a large dot product where naive
    /// summation would accumulate significant floating-point error.
    ///
    /// Strategy: construct two 1M-element vectors where each product term
    /// equals 1.0, so the exact dot product is 1_000_000.0. Add a large
    /// offset to one vector to create catastrophic cancellation in naive
    /// summation, then verify the compensated result stays within 1 ULP.
    #[test]
    fn test_dot_large_compensated_accuracy() {
        let n = 1_000_000;
        let offset = 1e12;

        // v1[i] = offset + 1.0, v2[i] = 1.0 for all i
        // Each product = offset + 1.0. Sum = n * (offset + 1.0).
        // We compare against the known exact answer.
        let v1: Vec64<f64> = (0..n).map(|_| offset + 1.0).collect();
        let v2: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = dot(&v1, &v2, None, None);
        let exact = n as f64 * (offset + 1.0);

        // With Neumaier compensation, error should be within a few ULPs.
        // Without it, naive summation on 1M terms accumulates O(n) ULPs.
        let ulps = ((result - exact) / exact).abs() / f64::EPSILON;
        assert!(
            ulps < 4.0,
            "dot product off by {ulps:.1} ULPs: got {result}, expected {exact}",
        );
    }

    /// Same principle but with alternating signs to stress cancellation
    /// across SIMD lanes. Exact result should be 0.0 since pairs cancel.
    #[test]
    fn test_dot_large_cancellation() {
        let n = 1_000_000;

        // v1 = [1e8, -1e8, 1e8, -1e8, ...], v2 = [1.0, 1.0, 1.0, 1.0, ...]
        // Exact dot = 0 (pairs cancel). Naive summation drifts.
        let v1: Vec64<f64> = (0..n)
            .map(|i| if i % 2 == 0 { 1e8 } else { -1e8 })
            .collect();
        let v2: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = dot(&v1, &v2, None, None);
        // Exact answer is 0.0 since n is even
        assert!(
            result.abs() < 1e-4,
            "dot cancellation failed: got {result}, expected 0.0",
        );
    }

    /// Verify compensated l2_norm on a large vector where naive summation
    /// would lose precision in the sum-of-squares accumulation.
    #[test]
    fn test_l2_norm_large_compensated_accuracy() {
        let n = 1_000_000;

        // All elements = 1.0, so ||v||_2 = sqrt(n) = 1000.0
        let v: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = l2_norm(&v, None, None);
        let exact = (n as f64).sqrt();

        let ulps = ((result - exact) / exact).abs() / f64::EPSILON;
        assert!(
            ulps < 4.0,
            "l2_norm off by {ulps:.1} ULPs: got {result}, expected {exact}",
        );
    }
}