#[cfg(all(feature = "simd", not(feature = "complex")))]
use core::any::TypeId;
use std::time::Instant;
use crate::algebra::scalar::KrystScalar;
use crate::matrix::csr::CsrMatrix;
use super::scalar;
#[cfg(all(feature = "simd", not(feature = "complex")))]
use super::{sellc, simd_csr};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SpmvKernel {
Scalar,
#[cfg(all(feature = "simd", not(feature = "complex")))]
CsrSimdGather,
#[cfg(all(feature = "simd", not(feature = "complex")))]
SellC,
}
#[derive(Clone, Debug)]
pub struct SpmvPlan<S: KrystScalar> {
pub kernel: SpmvKernel,
pub nrows: usize,
pub ncols: usize,
pub row_ptr: Vec<usize>,
pub col_idx: Vec<usize>,
pub vals: Vec<S>,
#[cfg(all(feature = "simd", not(feature = "complex")))]
pub sell: Option<sellc::SellCStorage>,
#[cfg(all(feature = "simd", not(feature = "complex")))]
lanes: usize,
}
impl<S: KrystScalar> SpmvPlan<S> {
#[inline]
pub fn apply_scaled(&self, alpha: S, x: &[S], beta: S, y: &mut [S]) {
match self.kernel {
SpmvKernel::Scalar => scalar::spmv_scaled_csr(
self.nrows,
&self.row_ptr,
&self.col_idx,
&self.vals,
alpha,
x,
beta,
y,
),
#[cfg(all(feature = "simd", not(feature = "complex")))]
SpmvKernel::CsrSimdGather => {
debug_assert_eq!(TypeId::of::<S>(), TypeId::of::<f64>());
let alpha = unsafe { *(&alpha as *const S as *const f64) };
let beta = unsafe { *(&beta as *const S as *const f64) };
let x = unsafe { std::slice::from_raw_parts(x.as_ptr() as *const f64, x.len()) };
let y =
unsafe { std::slice::from_raw_parts_mut(y.as_mut_ptr() as *mut f64, y.len()) };
let vals = unsafe {
std::slice::from_raw_parts(self.vals.as_ptr() as *const f64, self.vals.len())
};
if self.lanes <= 1 {
simd_csr::fallback_scalar(
self.nrows,
&self.row_ptr,
&self.col_idx,
vals,
alpha,
x,
beta,
y,
);
} else {
simd_csr::dispatch_spmv_scaled_csr_simd_gather(
self.lanes,
self.nrows,
&self.row_ptr,
&self.col_idx,
vals,
alpha,
x,
beta,
y,
);
}
}
#[cfg(all(feature = "simd", not(feature = "complex")))]
SpmvKernel::SellC => {
let sell = self
.sell
.as_ref()
.expect("SELL-C plan missing storage for SellC kernel");
let alpha = unsafe { *(&alpha as *const S as *const f64) };
let beta = unsafe { *(&beta as *const S as *const f64) };
let x = unsafe { std::slice::from_raw_parts(x.as_ptr() as *const f64, x.len()) };
let y =
unsafe { std::slice::from_raw_parts_mut(y.as_mut_ptr() as *mut f64, y.len()) };
dispatch_sellc(self.lanes, sell, alpha, x, beta, y);
}
}
}
#[inline]
pub fn nrows(&self) -> usize {
self.nrows
}
pub fn build_scalar(matrix: &CsrMatrix<S>) -> Self {
Self {
kernel: SpmvKernel::Scalar,
nrows: matrix.nrows,
ncols: matrix.ncols,
row_ptr: matrix.rowptr.clone(),
col_idx: matrix.colind.clone(),
vals: matrix.values.clone(),
#[cfg(all(feature = "simd", not(feature = "complex")))]
sell: None,
#[cfg(all(feature = "simd", not(feature = "complex")))]
lanes: 1,
}
}
}
#[derive(Clone, Debug)]
pub struct SpmvTuning {
pub allow_simd: bool,
pub prefer_sellc: bool,
pub sell_c: usize,
pub sell_sigma: usize,
pub bench_nsamples: usize,
pub min_nnz_for_simd: usize,
}
impl Default for SpmvTuning {
fn default() -> Self {
Self {
allow_simd: cfg!(all(feature = "simd", not(feature = "complex"))),
prefer_sellc: true,
sell_c: 16,
sell_sigma: 64,
bench_nsamples: 3,
min_nnz_for_simd: 2_000,
}
}
}
pub fn build<S: KrystScalar>(matrix: &CsrMatrix<S>, tuning: &SpmvTuning) -> SpmvPlan<S> {
build_owned(matrix.clone(), tuning)
}
pub fn build_owned<S: KrystScalar>(matrix: CsrMatrix<S>, tuning: &SpmvTuning) -> SpmvPlan<S> {
let CsrMatrix {
nrows,
ncols,
rowptr,
colind,
values,
} = matrix;
#[allow(unused_mut)]
let mut plan = SpmvPlan {
kernel: SpmvKernel::Scalar,
nrows,
ncols,
row_ptr: rowptr,
col_idx: colind,
vals: values,
#[cfg(all(feature = "simd", not(feature = "complex")))]
sell: None,
#[cfg(all(feature = "simd", not(feature = "complex")))]
lanes: 1,
};
#[cfg(not(all(feature = "simd", not(feature = "complex"))))]
let _ = tuning;
#[cfg(all(feature = "simd", not(feature = "complex")))]
{
if TypeId::of::<S>() == TypeId::of::<f64>() && tuning.allow_simd {
let nnz = plan.col_idx.len();
if nnz >= tuning.min_nnz_for_simd {
let lanes = simd_csr::detect_simd_lanes();
if lanes > 1 && plan.nrows > 0 {
let mut lmin = usize::MAX;
let mut lmax = 0usize;
for row in 0..plan.nrows {
let len = plan.row_ptr[row + 1] - plan.row_ptr[row];
if len == 0 {
continue;
}
lmin = lmin.min(len);
lmax = lmax.max(len);
}
let is_uniformish = if lmin == usize::MAX {
true
} else {
lmax <= lmin.saturating_mul(2) && lmax <= 128
};
let mut best_kernel = SpmvKernel::CsrSimdGather;
let mut best_sell = None;
let bench_runs = tuning.bench_nsamples;
let mut y_buf = vec![0.0f64; plan.nrows];
let x_buf = vec![1.0f64; plan.ncols.max(1)];
let vals = unsafe {
std::slice::from_raw_parts(
plan.vals.as_ptr() as *const f64,
plan.vals.len(),
)
};
let gather_time = microbench(bench_runs, || {
simd_csr::dispatch_spmv_scaled_csr_simd_gather(
lanes,
plan.nrows,
&plan.row_ptr,
&plan.col_idx,
vals,
1.0,
&x_buf,
0.0,
&mut y_buf,
);
});
let prefer_sell = tuning.prefer_sellc || !is_uniformish;
if prefer_sell {
let sell_c = round_up_to_multiple(tuning.sell_c.max(lanes), lanes);
let sell_sigma = tuning.sell_sigma.max(sell_c);
let sell = sellc::csr_to_sellc(
plan.nrows,
plan.ncols,
&plan.row_ptr,
&plan.col_idx,
vals,
sell_c,
sell_sigma,
);
let sell_time = microbench(bench_runs, || {
dispatch_sellc(lanes, &sell, 1.0, &x_buf, 0.0, &mut y_buf);
});
if sell_time < gather_time {
best_kernel = SpmvKernel::SellC;
best_sell = Some(sell);
}
}
plan.kernel = best_kernel;
plan.lanes = lanes;
plan.sell = best_sell;
}
}
}
}
#[cfg(feature = "complex")]
debug_assert!(matches!(plan.kernel, SpmvKernel::Scalar));
plan
}
#[cfg(all(feature = "simd", not(feature = "complex")))]
fn dispatch_sellc(
lanes: usize,
storage: &sellc::SellCStorage,
alpha: f64,
x: &[f64],
beta: f64,
y: &mut [f64],
) {
sellc::spmv_scaled_sellc(
storage,
alpha,
x,
beta,
y,
match lanes {
4 => 4,
_ => 2,
},
);
}
#[cfg(all(feature = "simd", not(feature = "complex")))]
fn round_up_to_multiple(value: usize, multiple: usize) -> usize {
if multiple == 0 {
return value;
}
((value + multiple - 1) / multiple) * multiple
}
fn microbench<F: FnMut()>(nsamples: usize, mut f: F) -> f64 {
if nsamples == 0 {
f();
return 0.0;
}
let mut best = f64::INFINITY;
for _ in 0..nsamples {
let start = Instant::now();
f();
let elapsed = start.elapsed().as_secs_f64();
if elapsed < best {
best = elapsed;
}
}
best
}
#[cfg(all(test, feature = "complex"))]
mod tests {
use super::*;
use crate::algebra::prelude::*;
#[test]
fn complex_build_uses_scalar_kernel() {
let matrix = CsrMatrix::<S>::new(
2,
2,
vec![0, 1, 2],
vec![0, 1],
vec![S::from_real(1.0), S::from_real(2.0)],
);
let plan = build(&matrix, &SpmvTuning::default());
assert!(matches!(plan.kernel, SpmvKernel::Scalar));
}
}