Skip to main content

simd_kernels/kernels/scientific/
vector.rs

1// Copyright (c) 2025 SpaceCell Enterprises Ltd
2// SPDX-License-Identifier: AGPL-3.0-or-later
3// Commercial licensing available. See LICENSE and LICENSING.md.
4
5//! # **Vector Mathematics Module** - *High-Performance BLAS-Compatible Vector Operations*
6//!
7//! This module provides a streamlined set of vector operations without the need for BLAS/LAPACK linking.
8//! It includes SIMD-accelerated arms where appropriate.
9//!
10//! ## Usage Examples
11//!
12//! ### Basic Linear Algebra
13//! ```rust,ignore
14//! use minarrow::vec64;
15//! use crate::kernels::scientific::vector::{dot, scale, axpy, l2_norm};
16//!
17//! let x = vec64![1.0, 2.0, 3.0, 4.0];
18//! let mut y = vec64![5.0, 6.0, 7.0, 8.0];
19//!
20//! // Dot product computation
21//! let result = dot(&x, &y, None, None);
22//!
23//! // In-place vector scaling
24//! scale(&mut y, 2.0, None, None);
25//!
26//! // Scaled addition: y ← 2.0 * x + y
27//! axpy(4, 2.0, &x, 1, &mut y, 1, None, None)?;
28//!
29//! // Euclidean norm
30//! let norm = l2_norm(&x, None, None);
31//! ```
32
33include!(concat!(env!("OUT_DIR"), "/simd_lanes.rs"));
34
35#[cfg(feature = "simd")]
36use std::simd::{Mask, Select, Simd, num::SimdFloat};
37
38use minarrow::{Bitmask, FloatArray, Vec64};
39
40use crate::kernels::aggregate::{neumaier_add, sum_squares};
41#[cfg(feature = "simd")]
42use crate::kernels::aggregate::neumaier_simd_add;
43use crate::utils::has_nulls;
44use minarrow::enums::error::KernelError;
45
46#[cfg(feature = "simd")]
47use crate::utils::bitmask_to_simd_mask;
48
49#[cfg(feature = "simd")]
50use minarrow::utils::is_simd_aligned;
51
52// --- Vector analytics ---
53
/// Computes the dot product with another vector, propagating nulls via an optional Bitmask.
///
/// Returns Σ v1[i]·v2[i] over all valid lanes. When `null_count`/`null_mask`
/// indicate nulls, elements whose mask bit is cleared are skipped (they
/// contribute 0.0). Accumulation is Neumaier-compensated in both the SIMD and
/// scalar paths so large-magnitude cancellation stays accurate.
///
/// # Panics
/// Panics if `v1` and `v2` differ in length, or if nulls are indicated but
/// `null_mask` is `None`.
#[inline(always)]
pub fn dot(v1: &[f64], v2: &[f64], null_mask: Option<&Bitmask>, null_count: Option<usize>) -> f64 {
    assert_eq!(v1.len(), v2.len(), "dot: input length mismatch");

    let len = v1.len();

    // SIMD path requires both buffers to satisfy the alignment contract.
    #[cfg(feature = "simd")]
    if is_simd_aligned(v1) && is_simd_aligned(v2) {
        let needs_nulls = has_nulls(null_count, null_mask);
        const N: usize = W64;
        // Per-lane running sum and Neumaier compensation term.
        let mut sum_v = Simd::<f64, N>::splat(0.0);
        let mut comp_v = Simd::<f64, N>::splat(0.0);
        let mut i = 0;
        if !needs_nulls {
            // fast path: no nulls — compensated per-lane accumulation
            while i + N <= len {
                let a = Simd::<f64, N>::from_slice(&v1[i..i + N]);
                let b = Simd::<f64, N>::from_slice(&v2[i..i + N]);
                neumaier_simd_add(&mut sum_v, &mut comp_v, a * b);
                i += N;
            }
        } else {
            // null-aware path: compensated per-lane with masked products
            let mask_bytes = null_mask.unwrap().as_bytes();
            while i + N <= len {
                let a = Simd::<f64, N>::from_slice(&v1[i..i + N]);
                let b = Simd::<f64, N>::from_slice(&v2[i..i + N]);
                let prod = a * b;
                // Null lanes are zeroed so they add nothing to the sum.
                let lane_mask: Mask<i64, N> = bitmask_to_simd_mask::<N, i64>(mask_bytes, i, len);
                neumaier_simd_add(&mut sum_v, &mut comp_v, lane_mask.select(prod, Simd::splat(0.0)));
                i += N;
            }
        }
        // Horizontal reduction + scalar tail
        let mut acc = (sum_v + comp_v).reduce_sum();
        for idx in i..len {
            // SAFETY: idx < len; the mask covers the input by contract.
            if !needs_nulls || unsafe { null_mask.unwrap().get_unchecked(idx) } {
                acc += v1[idx] * v2[idx];
            }
        }
        return acc;
    }

    // Scalar fallback — alignment check failed or no simd flag
    let needs_nulls = has_nulls(null_count, null_mask);
    let mut acc = 0.0_f64;
    let mut comp = 0.0_f64;
    if !needs_nulls {
        for i in 0..len {
            neumaier_add(&mut acc, &mut comp, v1[i] * v2[i]);
        }
    } else {
        let mb = null_mask.unwrap();
        for i in 0..len {
            // SAFETY: i < len; the mask covers the input by contract.
            if unsafe { mb.get_unchecked(i) } {
                neumaier_add(&mut acc, &mut comp, v1[i] * v2[i]);
            }
        }
    }
    // Fold the compensation term back into the sum.
    acc + comp
}
116
117/// Computes a stable key-based argsort of indices
118#[inline(always)]
119pub fn argsort(
120    v: &[f64],
121    mask: Option<&Bitmask>,
122    null_count: Option<usize>,
123    descending: bool,
124) -> Vec64<usize> {
125    let mut idx: Vec64<usize> = (0..v.len()).collect();
126    if !has_nulls(null_count, mask) {
127        if descending {
128            idx.sort_unstable_by(|&i, &j| {
129                v[j].partial_cmp(&v[i]).unwrap_or(std::cmp::Ordering::Equal)
130            });
131        } else {
132            idx.sort_unstable_by(|&i, &j| {
133                v[i].partial_cmp(&v[j]).unwrap_or(std::cmp::Ordering::Equal)
134            });
135        }
136    } else {
137        let mb = mask.expect("argsort: mask required when nulls present");
138        idx.sort_unstable_by(|&i, &j| {
139            let vi_null = !unsafe { mb.get_unchecked(i) };
140            let vj_null = !unsafe { mb.get_unchecked(j) };
141            match (vi_null, vj_null) {
142                (true, true) => std::cmp::Ordering::Equal,
143                (true, false) => std::cmp::Ordering::Greater, // nulls last
144                (false, true) => std::cmp::Ordering::Less,
145                (false, false) => {
146                    if descending {
147                        v[j].partial_cmp(&v[i]).unwrap_or(std::cmp::Ordering::Equal)
148                    } else {
149                        v[i].partial_cmp(&v[j]).unwrap_or(std::cmp::Ordering::Equal)
150                    }
151                }
152            }
153        });
154    }
155    idx
156}
157
158/// Bins values into histogram buckets.
159#[inline(always)]
160pub fn histogram(
161    v: &[f64],
162    bins: &[f64],
163    mask: Option<&Bitmask>,
164    null_count: Option<usize>,
165) -> Vec<usize> {
166    assert!(!bins.is_empty(), "histogram: bins must be non-empty");
167    let has_nulls = match null_count {
168        Some(n) => n > 0,
169        None => mask.is_some(),
170    };
171
172    let bins_sorted = bins.windows(2).all(|w| w[0] <= w[1]);
173    let mut counts = vec![0; bins.len() + 1];
174    if !has_nulls {
175        if bins_sorted {
176            for &x in v {
177                match bins.binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
178                    Ok(pos) | Err(pos) => counts[pos] += 1,
179                }
180            }
181        } else {
182            for &x in v {
183                let mut placed = false;
184                for (i, &b) in bins.iter().enumerate() {
185                    if x < b {
186                        counts[i] += 1;
187                        placed = true;
188                        break;
189                    }
190                }
191                if !placed {
192                    counts[bins.len()] += 1;
193                }
194            }
195        }
196    } else {
197        let mb = mask.expect("histogram: mask required when nulls present");
198        if bins_sorted {
199            for (i, &x) in v.iter().enumerate() {
200                if unsafe { mb.get_unchecked(i) } {
201                    match bins.binary_search_by(|b| b.partial_cmp(&x).unwrap()) {
202                        Ok(pos) | Err(pos) => counts[pos] += 1,
203                    }
204                }
205            }
206        } else {
207            for (i, &x) in v.iter().enumerate() {
208                if unsafe { mb.get_unchecked(i) } {
209                    let mut placed = false;
210                    for (j, &b) in bins.iter().enumerate() {
211                        if x < b {
212                            counts[j] += 1;
213                            placed = true;
214                            break;
215                        }
216                    }
217                    if !placed {
218                        counts[bins.len()] += 1;
219                    }
220                }
221            }
222        }
223    }
224    counts
225}
226
227/// Uniform reservoir sampling of size k.
228#[inline(always)]
229pub fn reservoir_sample(
230    v: &[f64],
231    k: usize,
232    mask: Option<&Bitmask>,
233    null_count: Option<usize>,
234) -> FloatArray<f64> {
235    use rand::prelude::*;
236    assert!(k > 0, "reservoir_sample: k must be positive");
237    let has_nulls = match null_count {
238        Some(n) => n > 0,
239        None => mask.is_some(),
240    };
241
242    let mut rng = rand::rng();
243    let mut out = Vec64::with_capacity(k);
244    let mut seen = 0usize;
245
246    for (idx, &val) in v.iter().enumerate() {
247        if has_nulls && !unsafe { mask.unwrap().get_unchecked(idx) } {
248            continue;
249        }
250        if out.len() < k {
251            out.push(val);
252        } else {
253            let j = rng.random_range(0..=seen);
254            if j < k {
255                out[j] = val;
256            }
257        }
258        seen += 1;
259    }
260    assert_eq!(out.len(), k, "reservoir_sample: not enough valid values");
261    FloatArray::from_vec64(out, None)
262}
263
264// --- Vector transformations ---
265
266/// Scales the vector in place: x ← αx
267/// Kernel: SCAL (L2-2)
268#[inline(always)]
269pub fn scale(v: &mut [f64], alpha: f64, null_mask: Option<&Bitmask>, null_count: Option<usize>) {
270    #[cfg(feature = "simd")]
271    if is_simd_aligned(v) {
272        const N: usize = W64;
273        let len = v.len();
274        let alpha_v = Simd::<f64, N>::splat(alpha);
275        let mut i = 0;
276
277        if !has_nulls(null_count, null_mask) {
278            // Dense path: just scale every lane
279            while i + N <= len {
280                let chunk = Simd::from_slice(&v[i..i + N]);
281                let scaled = chunk * alpha_v;
282                v[i..i + N].copy_from_slice(&scaled.to_array());
283                i += N;
284            }
285            // Remainder
286            for x in &mut v[i..] {
287                *x *= alpha;
288            }
289        } else {
290            // Null‐aware path
291            let mb = null_mask.expect("scale: mask required when nulls present");
292            let mask_bytes = mb.as_bytes();
293
294            // Vectorized blocks
295            while i + N <= len {
296                let chunk = Simd::from_slice(&v[i..i + N]);
297                // Build SIMD mask<M,N> from the global bitmask
298                let lane_mask: Mask<i64, N> = bitmask_to_simd_mask::<N, i64>(mask_bytes, i, len);
299                // Only scale the valid lanes, leave the rest untouched
300                let result = lane_mask.select(chunk * alpha_v, chunk);
301                v[i..i + N].copy_from_slice(&result.to_array());
302                i += N;
303            }
304            // Scalar remainder
305            for idx in i..len {
306                if unsafe { mb.get_unchecked(idx) } {
307                    v[idx] *= alpha;
308                }
309            }
310        }
311        return;
312    }
313
314    // Scalar fallback - alignment check failed or no SIMD flag
315    if !has_nulls(null_count, null_mask) {
316        for x in v {
317            *x *= alpha;
318        }
319    } else {
320        let mb = null_mask.expect("scale: mask required when nulls present");
321        for (i, x) in v.iter_mut().enumerate() {
322            if unsafe { mb.get_unchecked(i) } {
323                *x *= alpha;
324            }
325        }
326    }
327}
328
329/// In-place vector scaling: x ← α·x
330///
331/// This is the version for a matrix in a flat strided buffer,
332/// and therefore tackles incx ("increment x") the gap between values.
333///
334/// Multiplies each element of the input vector `x` by the scalar `alpha`.
335/// Equivalent to `x[i] ← alpha * x[i]` for all valid `i`.
336#[inline(always)]
337pub fn scale_vec(
338    n: i32,
339    alpha: f64,
340    x: &mut [f64],
341    incx: i32,
342    null_mask: Option<&Bitmask>,
343    null_count: Option<usize>,
344) -> Result<(), KernelError> {
345    if n < 0 {
346        return Err(KernelError::InvalidArguments(
347            "n must be non-negative".into(),
348        ));
349    }
350    let n = n as usize;
351    let incx = incx as usize;
352    if incx == 0 {
353        return Err(KernelError::InvalidArguments(
354            "incx must be positive".into(),
355        ));
356    }
357    if n == 0 {
358        return Ok(());
359    }
360    if (n - 1).saturating_mul(incx) >= x.len() {
361        return Err(KernelError::InvalidArguments(
362            "indexing out of bounds".into(),
363        ));
364    }
365
366    let mask_present = has_nulls(null_count, null_mask);
367
368    #[cfg(feature = "simd")]
369    if is_simd_aligned(x) {
370        const LANES: usize = W64;
371        let alpha_v = Simd::<f64, LANES>::splat(alpha);
372
373        // only vectorize when truly contiguous
374        if incx == 1 && !mask_present {
375            // dense contiguous path
376            let mut i = 0;
377            while i + LANES <= n {
378                let chunk = Simd::from_slice(&x[i..i + LANES]);
379                let out_v = chunk * alpha_v;
380                x[i..i + LANES].copy_from_slice(&out_v.to_array());
381                i += LANES;
382            }
383            for xi in &mut x[i..n] {
384                *xi *= alpha;
385            }
386            return Ok(());
387        }
388        if incx == 1 && mask_present {
389            // contiguous + null‐aware
390            let mb = null_mask.unwrap();
391            let bytes = mb.as_bytes();
392            let mut i = 0;
393            while i + LANES <= n {
394                // build a SIMD mask from the arrow bitmap
395                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
396                let chunk = Simd::from_slice(&x[i..i + LANES]);
397                let scaled = chunk * alpha_v;
398                // only scale where mask==true, else leave original
399                let out_v = lane_mask.select(scaled, chunk);
400                x[i..i + LANES].copy_from_slice(&out_v.to_array());
401                i += LANES;
402            }
403            // tail
404            for idx in i..n {
405                if unsafe { mb.get_unchecked(idx) } {
406                    x[idx] *= alpha;
407                }
408            }
409            return Ok(());
410        }
411    }
412
413    // scalar (strided) fallback
414    if !mask_present {
415        let mut idx = 0;
416        for _ in 0..n {
417            x[idx] *= alpha;
418            idx += incx;
419        }
420    } else {
421        let mb = null_mask.unwrap();
422        let mut idx = 0;
423        for _ in 0..n {
424            if unsafe { mb.get_unchecked(idx) } {
425                x[idx] *= alpha;
426            }
427            idx += incx;
428        }
429    }
430    Ok(())
431}
432
433/// Scaled vector addition: y ← α·x + y
434///
435/// Computes a linear combination of two vectors, scaling `x` by `alpha` and accumulating into `y`.
436/// Equivalent to `y[i] ← alpha * x[i] + y[i]` for all valid `i`.
437#[inline(always)]
438pub fn axpy(
439    n: i32,
440    alpha: f64,
441    x: &[f64],
442    incx: i32,
443    y: &mut [f64],
444    incy: i32,
445    null_mask: Option<&Bitmask>,
446    null_count: Option<usize>,
447) -> Result<(), KernelError> {
448    if n < 0 {
449        return Err(KernelError::InvalidArguments(
450            "n must be non-negative".into(),
451        ));
452    }
453    let n = n as usize;
454    let incx = incx as usize;
455    let incy = incy as usize;
456    if incx == 0 || incy == 0 {
457        return Err(KernelError::InvalidArguments(
458            "increments must be positive".into(),
459        ));
460    }
461    if n == 0 {
462        return Ok(());
463    }
464    if (n - 1).saturating_mul(incx) >= x.len() {
465        return Err(KernelError::InvalidArguments("x out of bounds".into()));
466    }
467    if (n - 1).saturating_mul(incy) >= y.len() {
468        return Err(KernelError::InvalidArguments("y out of bounds".into()));
469    }
470
471    let mask_present = has_nulls(null_count, null_mask);
472
473    #[cfg(feature = "simd")]
474    if is_simd_aligned(x) {
475        const LANES: usize = W64;
476        let alpha_v = Simd::<f64, LANES>::splat(alpha);
477
478        // only when both x and y are contiguous
479        if incx == 1 && incy == 1 && !mask_present {
480            // dense contiguous
481            let mut i = 0;
482            while i + LANES <= n {
483                let xv = Simd::from_slice(&x[i..i + LANES]);
484                let yv = Simd::from_slice(&y[i..i + LANES]);
485                let out_v = alpha_v * xv + yv;
486                y[i..i + LANES].copy_from_slice(&out_v.to_array());
487                i += LANES;
488            }
489            for j in i..n {
490                y[j] += alpha * x[j];
491            }
492            return Ok(());
493        }
494        if incx == 1 && incy == 1 && mask_present {
495            // contiguous + null‐aware
496            let mb = null_mask.unwrap();
497            let bytes = mb.as_bytes();
498            let mut i = 0;
499            while i + LANES <= n {
500                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
501                let xv = Simd::from_slice(&x[i..i + LANES]);
502                let yv = Simd::from_slice(&y[i..i + LANES]);
503                let computed = alpha_v * xv + yv;
504                let out_v = lane_mask.select(computed, yv);
505                y[i..i + LANES].copy_from_slice(&out_v.to_array());
506                i += LANES;
507            }
508            for idx in i..n {
509                if unsafe { mb.get_unchecked(idx) } {
510                    y[idx] += alpha * x[idx];
511                }
512            }
513            return Ok(());
514        }
515    }
516
517    // scalar (potentially strided) fallback
518    if !mask_present {
519        let mut ix = 0;
520        let mut iy = 0;
521        for _ in 0..n {
522            y[iy] += alpha * x[ix];
523            ix += incx;
524            iy += incy;
525        }
526    } else {
527        let mb = null_mask.unwrap();
528        let mut ix = 0;
529        let mut iy = 0;
530        for _ in 0..n {
531            if unsafe { mb.get_unchecked(ix) } {
532                y[iy] += alpha * x[ix];
533            }
534            ix += incx;
535            iy += incy;
536        }
537    }
538    Ok(())
539}
540
541/// Computes the L2 norm (‖x‖₂)
542#[inline(always)]
543pub fn l2_norm(v: &[f64], mask: Option<&Bitmask>, null_count: Option<usize>) -> f64 {
544    sum_squares(v, mask, null_count).sqrt()
545}
546
/// Computes the Euclidean (ℓ₂) norm of a vector: ‖x‖₂ = sqrt(Σ xᵢ²)
///
/// Returns the magnitude (Euclidean norm) of the vector `x`.
/// For an input vector `x`, computes `sqrt(sum(x_i^2))`.
///
/// Reads `n` elements from `x` with stride `incx`. Null entries (mask bit
/// cleared) are skipped when nulls are indicated. The sum of squares is
/// Neumaier-compensated in both the SIMD and scalar paths.
///
/// # Errors
/// Returns `KernelError::InvalidArguments` if `n < 0`, `incx <= 0`, or the
/// strided index range `(n - 1) * incx` would exceed `x.len()`.
#[inline(always)]
pub fn vector_norm(
    n: i32,
    x: &[f64],
    incx: i32,
    null_mask: Option<&Bitmask>,
    null_count: Option<usize>,
) -> Result<f64, KernelError> {
    if n < 0 {
        return Err(KernelError::InvalidArguments(
            "n must be non-negative".into(),
        ));
    }
    // Checked on the signed value before the cast below, so a negative
    // increment cannot wrap to a huge usize stride.
    if incx <= 0 {
        return Err(KernelError::InvalidArguments(
            "incx must be positive".into(),
        ));
    }
    let n = n as usize;
    let incx = incx as usize;
    if n == 0 {
        return Ok(0.0);
    }
    // Highest touched index is (n - 1) * incx; reject before any access.
    if (n - 1).saturating_mul(incx) >= x.len() {
        return Err(KernelError::InvalidArguments(
            "indexing out of bounds for x".into(),
        ));
    }

    let mask_present = has_nulls(null_count, null_mask);

    #[cfg(feature = "simd")]
    if is_simd_aligned(x) {
        use crate::utils::bitmask_to_simd_mask;
        use core::simd::Simd;

        const LANES: usize = W64;

        // only vectorise when contiguous
        if incx == 1 && !mask_present {
            // dense contiguous path — compensated per-lane accumulation
            let mut sum_v = Simd::<f64, LANES>::splat(0.0);
            let mut comp_v = Simd::<f64, LANES>::splat(0.0);
            let mut i = 0;
            while i + LANES <= n {
                let v = Simd::from_slice(&x[i..i + LANES]);
                neumaier_simd_add(&mut sum_v, &mut comp_v, v * v);
                i += LANES;
            }
            // Horizontal reduction + scalar tail
            let mut sumsq = (sum_v + comp_v).reduce_sum();
            for &xi in &x[i..n] {
                sumsq += xi * xi;
            }
            return Ok(sumsq.sqrt());
        }

        if incx == 1 && mask_present {
            // contiguous + null-aware — compensated per-lane accumulation
            let mb = null_mask.unwrap();
            let bytes = mb.as_bytes();
            let mut sum_v = Simd::<f64, LANES>::splat(0.0);
            let mut comp_v = Simd::<f64, LANES>::splat(0.0);
            let mut i = 0;
            while i + LANES <= n {
                let lane_mask: Mask<i64, LANES> = bitmask_to_simd_mask::<LANES, i64>(bytes, i, n);
                let v = Simd::from_slice(&x[i..i + LANES]);
                // null lanes are zeroed so they add nothing to the sum of squares
                let v = lane_mask.select(v, Simd::splat(0.0));
                neumaier_simd_add(&mut sum_v, &mut comp_v, v * v);
                i += LANES;
            }
            // Horizontal reduction + scalar tail
            let mut sumsq = (sum_v + comp_v).reduce_sum();
            for idx in i..n {
                // SAFETY: idx < n; the mask covers the buffer by contract.
                if unsafe { mb.get_unchecked(idx) } {
                    sumsq += x[idx] * x[idx];
                }
            }
            return Ok(sumsq.sqrt());
        }
    }

    // scalar (possibly strided) fallback — compensated
    let mut sumsq = 0.0_f64;
    let mut comp = 0.0_f64;
    if !mask_present {
        let mut idx = 0;
        for _ in 0..n {
            let xi = x[idx];
            neumaier_add(&mut sumsq, &mut comp, xi * xi);
            idx += incx;
        }
    } else {
        let mb = null_mask.unwrap();
        let mut idx = 0;
        for _ in 0..n {
            // SAFETY: idx stays within the validated strided range.
            if unsafe { mb.get_unchecked(idx) } {
                let xi = x[idx];
                neumaier_add(&mut sumsq, &mut comp, xi * xi);
            }
            idx += incx;
        }
    }
    Ok((sumsq + comp).sqrt())
}
656
#[cfg(test)]
mod tests {
    use minarrow::{Bitmask, vec64};

    use super::*;

    /// Builds a validity bitmask from plain bools (true = valid, false = null).
    fn make_mask(bits: &[bool]) -> Bitmask {
        Bitmask::from_bools(bits)
    }

    #[test]
    fn test_dot_no_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let expected = 1.0 * 2.0 + 2.0 * 0.5 + 3.0 * 1.0 + 4.0 * (-1.0);
        assert_eq!(dot(v1.as_slice(), v2.as_slice(), None, None), expected);

        // Null count is Some(0) should still use fast path
        assert_eq!(dot(&v1, &v2, None, Some(0)), expected);
    }

    #[test]
    fn test_dot_with_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let mask = make_mask(&[true, false, true, false]);
        let null_count = 2;
        // Only indices 0 and 2 are valid
        let expected = 1.0 * 2.0 + 3.0 * 1.0;
        assert_eq!(dot(&v1, &v2, Some(&mask), Some(null_count)), expected);

        // Null count None but mask provided, should also work
        assert_eq!(dot(&v1, &v2, Some(&mask), None), expected);
    }

    #[test]
    fn test_dot_all_nulls() {
        let v1 = vec64![1.0, 2.0, 3.0, 4.0];
        let v2 = vec64![2.0, 0.5, 1.0, -1.0];
        let mask = make_mask(&[false, false, false, false]);
        // Every lane is masked out, so the sum is empty.
        let expected = 0.0;
        assert_eq!(dot(&v1, &v2, Some(&mask), Some(4)), expected);
    }

    #[test]
    fn test_argsort_no_nulls() {
        let v = vec64![5.0, 2.0, 1.0, 4.0];
        assert_eq!(argsort(&v, None, None, false), vec64![2, 1, 3, 0]);
        assert_eq!(argsort(&v, None, None, true), vec64![0, 3, 1, 2]);
    }

    #[test]
    fn test_argsort_with_nulls() {
        let v = vec64![5.0, 2.0, 1.0, 4.0, 3.0];
        let mask = make_mask(&[true, false, true, true, false]);
        // valid indices: 0,2,3; so sorted: [2,3,0] (by value) then nulls [1,4]
        assert_eq!(
            argsort(&v, Some(&mask), Some(2), false),
            vec64![2, 3, 0, 1, 4]
        );
        assert_eq!(
            argsort(&v, Some(&mask), Some(2), true),
            vec64![0, 3, 2, 1, 4]
        );
    }

    #[test]
    fn test_histogram_no_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        // Buckets: [<2.5],[2.5,5.0),[>=5.0] => [2,1,1]
        // values: [1.0,2.0] in bucket 0, [4.0] in bucket 1, [6.0] in bucket 2
        assert_eq!(histogram(&v, &bins, None, None), vec![2, 1, 1]);
    }

    #[test]
    fn test_histogram_with_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        let mask = make_mask(&[false, true, true, false]);
        // Only v[1]=2.0 and v[2]=4.0: bins [<2.5]=1, [2.5,5.0)=1, [>=5.0]=0
        assert_eq!(histogram(&v, &bins, Some(&mask), Some(2)), vec![1, 1, 0]);
    }

    #[test]
    fn test_histogram_all_nulls() {
        let v = vec64![1.0, 2.0, 4.0, 6.0];
        let bins = [2.5, 5.0];
        let mask = make_mask(&[false, false, false, false]);
        // All values masked out: every bucket stays empty.
        assert_eq!(histogram(&v, &bins, Some(&mask), Some(4)), vec![0, 0, 0]);
    }

    #[test]
    fn test_reservoir_sample_basic() {
        let v = vec64![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let arr = reservoir_sample(&v, 3, None, None);
        assert_eq!(arr.data.len(), 3);
        // Should be a subset of original input (since all valid)
        for &x in arr.data.iter() {
            assert!(v.contains(&x));
        }
    }

    #[test]
    fn test_reservoir_sample_with_nulls() {
        let v = vec64![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let mask = make_mask(&[false, true, false, true, false, true]);
        let arr = reservoir_sample(&v, 2, Some(&mask), Some(3));
        // Only indices 1,3,5 are valid. Sampled values must come from v[1], v[3], v[5].
        for &x in arr.data.iter() {
            assert!(x == 2.0 || x == 4.0 || x == 6.0);
        }
    }

    #[test]
    fn test_scale_no_nulls() {
        let mut v = vec64![1.0, 2.0, 3.0];
        scale(&mut v, 3.0, None, None);
        assert_eq!(*v, [3.0, 6.0, 9.0]);
    }

    #[test]
    fn test_scale_with_nulls() {
        let mut v = vec64![1.0, 2.0, 3.0];
        let mask = make_mask(&[false, true, true]);
        scale(&mut v, 4.0, Some(&mask), Some(1));
        // Only v[1] and v[2] are scaled
        assert_eq!(*v, [1.0, 8.0, 12.0]);
    }

    #[test]
    fn test_scale_vec_no_nulls() {
        let mut v = vec64![2.0, 4.0, 8.0, 16.0];
        // n=2 with incx=2 touches indices 0 and 2 only.
        let r = scale_vec(2, 0.5, &mut v, 2, None, None);
        assert!(r.is_ok());
        assert_eq!(*v, [1.0, 4.0, 4.0, 16.0]);
    }

    #[test]
    fn test_scale_vec_with_nulls() {
        let mut v = vec64![2.0, 4.0, 8.0, 16.0];
        let mask = make_mask(&[false, true, true, false]);
        let r = scale_vec(2, 0.5, &mut v, 2, Some(&mask), Some(2));
        assert!(r.is_ok());
        // Only index 0 and 2 are processed but only 2 is valid
        assert_eq!(*v, [2.0, 4.0, 4.0, 16.0]);
    }

    #[test]
    fn test_axpy_no_nulls() {
        let mut y = vec64![10.0, 20.0, 30.0, 40.0];
        let x = vec64![1.0, 2.0, 3.0, 4.0];
        let r = axpy(4, 2.0, &x, 1, &mut y, 1, None, None);
        assert!(r.is_ok());
        assert_eq!(*y, [12.0, 24.0, 36.0, 48.0]);
    }

    #[test]
    fn test_axpy_with_nulls() {
        let mut y = vec64![10.0, 20.0, 30.0, 40.0];
        let x = vec64![1.0, 2.0, 3.0, 4.0];
        let mask = make_mask(&[false, true, false, true]);
        let r = axpy(4, 2.0, &x, 1, &mut y, 1, Some(&mask), Some(2));
        assert!(r.is_ok());
        // Only indices 1 and 3 are updated
        assert_eq!(*y, [10.0, 24.0, 30.0, 48.0]);
    }

    #[test]
    fn test_l2_norm_no_nulls() {
        // 3-4-5 triangle: ||(3,4)|| = 5.
        let v = vec64![3.0, 4.0];
        let n = l2_norm(&v, None, None);
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_l2_norm_with_nulls() {
        let v = vec64![3.0, 4.0];
        let mask = make_mask(&[false, true]);
        let n = l2_norm(&v, Some(&mask), Some(1));
        // Only v[1]=4.0 contributes: sqrt(16) = 4.
        assert!((n - 4.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_no_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let n = vector_norm(3, &v, 1, None, None).unwrap();
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_with_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let mask = make_mask(&[true, false, true]);
        let n = vector_norm(3, &v, 1, Some(&mask), Some(1)).unwrap();
        // Only indices 0 and 2 are used: sqrt(9+16)=5
        assert!((n - 5.0).abs() < 1e-12);
    }

    #[test]
    fn test_vector_norm_all_nulls() {
        let v = vec64![3.0, 0.0, 4.0];
        let mask = make_mask(&[false, false, false]);
        let n = vector_norm(3, &v, 1, Some(&mask), Some(3)).unwrap();
        assert_eq!(n, 0.0);
    }

    /// Verify Neumaier compensation on a large dot product where naive
    /// summation would accumulate significant floating-point error.
    ///
    /// Strategy: construct two 1M-element vectors where each product term
    /// equals 1.0, so the exact dot product is 1_000_000.0. Add a large
    /// offset to one vector to create catastrophic cancellation in naive
    /// summation, then verify the compensated result stays within 1 ULP.
    #[test]
    fn test_dot_large_compensated_accuracy() {
        let n = 1_000_000;
        let offset = 1e12;

        // v1[i] = offset + 1.0, v2[i] = 1.0 for all i
        // Each product = offset + 1.0. Sum = n * (offset + 1.0).
        // We compare against the known exact answer.
        let v1: Vec64<f64> = (0..n).map(|_| offset + 1.0).collect();
        let v2: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = dot(&v1, &v2, None, None);
        let exact = n as f64 * (offset + 1.0);

        // With Neumaier compensation, error should be within a few ULPs.
        // Without it, naive summation on 1M terms accumulates O(n) ULPs.
        let ulps = ((result - exact) / exact).abs() / f64::EPSILON;
        assert!(
            ulps < 4.0,
            "dot product off by {ulps:.1} ULPs: got {result}, expected {exact}",
        );
    }

    /// Same principle but with alternating signs to stress cancellation
    /// across SIMD lanes. Exact result should be 0.0 since pairs cancel.
    #[test]
    fn test_dot_large_cancellation() {
        let n = 1_000_000;

        // v1 = [1e8, -1e8, 1e8, -1e8, ...], v2 = [1, 1, 1, 1, ...]
        // Exact dot = 0 (pairs cancel). Naive summation drifts.
        let v1: Vec64<f64> = (0..n)
            .map(|i| if i % 2 == 0 { 1e8 } else { -1e8 })
            .collect();
        let v2: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = dot(&v1, &v2, None, None);
        // Exact answer is 0.0 since n is even
        assert!(
            result.abs() < 1e-4,
            "dot cancellation failed: got {result}, expected 0.0",
        );
    }

    /// Verify compensated l2_norm on a large vector where naive summation
    /// would lose precision in the sum-of-squares accumulation.
    #[test]
    fn test_l2_norm_large_compensated_accuracy() {
        let n = 1_000_000;

        // All elements = 1.0, so ||v||_2 = sqrt(n) = 1000.0
        let v: Vec64<f64> = (0..n).map(|_| 1.0).collect();

        let result = l2_norm(&v, None, None);
        let exact = (n as f64).sqrt();

        let ulps = ((result - exact) / exact).abs() / f64::EPSILON;
        assert!(
            ulps < 4.0,
            "l2_norm off by {ulps:.1} ULPs: got {result}, expected {exact}",
        );
    }

}