oxicuda-solver 0.1.4

//! Randomized low-rank SVD algorithm (Halko, Martinsson, Tropp 2011).
//!
//! For an m x n matrix A and target rank k, the algorithm:
//! 1. Generate random Gaussian matrix Omega (n x (k+p)), where p is oversampling.
//! 2. Form Y = A * Omega  (m x (k+p)).
//! 3. QR factorize Y to get Q.
//! 4. Form B = Q^T * A  ((k+p) x n).
//! 5. SVD of small matrix B to get B = U_hat * Sigma * V^T.
//! 6. U = Q * U_hat.
//!
//! Optional power iterations improve accuracy for matrices with slowly decaying
//! singular values by replacing step 2 with:
//!   Y = (A * A^T)^q * A * Omega
//!
//! This uses:
//! - oxicuda-rand for Gaussian random matrix generation
//! - oxicuda-blas GEMM for matrix products
//! - Existing QR factorization
//! - Existing SVD on the small (k+p) x n matrix

use oxicuda_blas::types::{GpuFloat, Layout, MatrixDesc, MatrixDescMut, Transpose};
use oxicuda_memory::DeviceBuffer;
use oxicuda_rand::{RngEngine, RngGenerator};

use crate::dense::qr;
use crate::dense::svd;
use crate::error::{SolverError, SolverResult};
use crate::handle::SolverHandle;

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

/// Default oversampling parameter.
const DEFAULT_OVERSAMPLING: usize = 5;

/// Default number of power iterations.
const DEFAULT_POWER_ITERATIONS: usize = 1;

/// Default target rank.
const DEFAULT_RANK: usize = 10;

// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------

/// Randomized SVD configuration.
#[derive(Debug, Clone)]
pub struct RandomizedSvdConfig {
    /// Target rank (number of singular values to compute).
    pub rank: usize,
    /// Oversampling parameter (typically 5-10).
    pub oversampling: usize,
    /// Number of power iterations for accuracy (typically 0-2).
    pub power_iterations: usize,
    /// RNG engine for random matrix generation.
    pub rng_engine: RngEngine,
    /// Seed for reproducibility.
    pub seed: u64,
}

impl Default for RandomizedSvdConfig {
    fn default() -> Self {
        Self {
            rank: DEFAULT_RANK,
            oversampling: DEFAULT_OVERSAMPLING,
            power_iterations: DEFAULT_POWER_ITERATIONS,
            rng_engine: RngEngine::Philox,
            seed: 42,
        }
    }
}

impl RandomizedSvdConfig {
    /// Creates a new config with the given target rank.
    pub fn with_rank(rank: usize) -> Self {
        Self {
            rank,
            ..Self::default()
        }
    }

    /// Sets the oversampling parameter.
    pub fn oversampling(mut self, p: usize) -> Self {
        self.oversampling = p;
        self
    }

    /// Sets the number of power iterations.
    pub fn power_iterations(mut self, q: usize) -> Self {
        self.power_iterations = q;
        self
    }

    /// Sets the RNG seed.
    pub fn seed(mut self, seed: u64) -> Self {
        self.seed = seed;
        self
    }

    /// Returns the total sampling dimension: rank + oversampling.
    pub fn sampling_dim(&self) -> usize {
        self.rank + self.oversampling
    }
}

/// Result of a randomized SVD computation.
pub struct RandomizedSvdResult<T: GpuFloat> {
    /// Left singular vectors: m x rank (column-major).
    pub u: DeviceBuffer<T>,
    /// Singular values: rank (descending order).
    pub sigma: Vec<T>,
    /// Right singular vectors transposed: rank x n (column-major).
    pub vt: DeviceBuffer<T>,
    /// Actual rank computed (may differ from requested if matrix rank is lower).
    pub rank: usize,
}

impl<T: GpuFloat> std::fmt::Debug for RandomizedSvdResult<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("RandomizedSvdResult")
            .field("sigma", &self.sigma)
            .field("rank", &self.rank)
            .field("u_len", &self.u.len())
            .field("vt_len", &self.vt.len())
            .finish()
    }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Computes a low-rank SVD approximation using the randomized algorithm.
///
/// Given an m x n matrix A, computes the best rank-k approximation
/// A ~ U * diag(sigma) * V^T where k = config.rank.
///
/// # Arguments
///
/// * `handle` — solver handle.
/// * `a` — input matrix (m x n, column-major), not modified.
/// * `m` — number of rows.
/// * `n` — number of columns.
/// * `config` — randomized SVD configuration.
///
/// # Returns
///
/// A [`RandomizedSvdResult`] with the low-rank factors.
///
/// # Errors
///
/// Returns [`SolverError::DimensionMismatch`] for invalid dimensions.
/// Returns other errors for BLAS, kernel, or allocation failures.
pub fn randomized_svd<T: GpuFloat>(
    handle: &mut SolverHandle,
    a: &DeviceBuffer<T>,
    m: u32,
    n: u32,
    config: &RandomizedSvdConfig,
) -> SolverResult<RandomizedSvdResult<T>> {
    // Validate inputs.
    if m == 0 || n == 0 {
        return Err(SolverError::DimensionMismatch(
            "randomized_svd: matrix dimensions must be positive".into(),
        ));
    }
    let required = m as usize * n as usize;
    if a.len() < required {
        return Err(SolverError::DimensionMismatch(format!(
            "randomized_svd: buffer too small ({} < {required})",
            a.len()
        )));
    }

    let k = config.rank;
    let p = config.oversampling;
    let l = k + p; // sampling dimension

    if l == 0 {
        return Err(SolverError::DimensionMismatch(
            "randomized_svd: rank + oversampling must be positive".into(),
        ));
    }

    // The sampling dimension cannot exceed min(m, n).
    let min_mn = m.min(n) as usize;
    let l = l.min(min_mn);
    let effective_rank = k.min(l);

    // Step 1: Generate random Gaussian matrix Omega (n x l).
    let omega = generate_gaussian_matrix::<T>(handle, n as usize, l, config)?;

    // Step 2: Form Y = A * Omega  (m x l).
    let mut y = DeviceBuffer::<T>::zeroed(m as usize * l)?;
    gemm_multiply::<T>(
        handle,
        Transpose::NoTrans,
        Transpose::NoTrans,
        m,
        l as u32,
        n,
        a,
        m,
        &omega,
        n,
        &mut y,
        m,
    )?;

    // Step 2b: Power iterations for improved accuracy.
    for _q in 0..config.power_iterations {
        // Y_hat = A^T * Y  (n x l)
        let mut y_hat = DeviceBuffer::<T>::zeroed(n as usize * l)?;
        gemm_multiply::<T>(
            handle,
            Transpose::Trans,
            Transpose::NoTrans,
            n,
            l as u32,
            m,
            a,
            m,
            &y,
            m,
            &mut y_hat,
            n,
        )?;

        // QR factorize Y_hat for numerical stability.
        let mut tau_hat = DeviceBuffer::<T>::zeroed(l)?;
        qr::qr_factorize(handle, &mut y_hat, n, l as u32, n, &mut tau_hat)?;

        // Y = A * Q_hat  (m x l)
        // Since QR overwrites Y_hat with the factors, we use it directly.
        y = DeviceBuffer::<T>::zeroed(m as usize * l)?;
        gemm_multiply::<T>(
            handle,
            Transpose::NoTrans,
            Transpose::NoTrans,
            m,
            l as u32,
            n,
            a,
            m,
            &y_hat,
            n,
            &mut y,
            m,
        )?;
    }

    // Step 3: QR factorize Y to get Q (m x l).
    let mut tau = DeviceBuffer::<T>::zeroed(l)?;
    qr::qr_factorize(handle, &mut y, m, l as u32, m, &mut tau)?;

    // Form explicit Q from Householder representation.
    let mut q_explicit = DeviceBuffer::<T>::zeroed(m as usize * m as usize)?;
    qr::qr_generate_q(handle, &y, &tau, &mut q_explicit, m, l as u32)?;

    // Step 4: Form B = Q^T * A  (l x n).
    let mut b_matrix = DeviceBuffer::<T>::zeroed(l * n as usize)?;
    gemm_multiply::<T>(
        handle,
        Transpose::Trans,
        Transpose::NoTrans,
        l as u32,
        n,
        m,
        &q_explicit,
        m,
        a,
        m,
        &mut b_matrix,
        l as u32,
    )?;

    // Step 5: SVD of small matrix B (l x n).
    let svd_result = svd::svd(
        handle,
        &mut b_matrix,
        l as u32,
        n,
        l as u32,
        svd::SvdJob::Thin,
    )?;

    // Step 6: U = Q * U_hat  (m x effective_rank).
    // U_hat comes from the SVD of B: B = U_hat * Sigma * V^T.
    let sigma = truncate_to_rank(&svd_result.singular_values, effective_rank);
    let actual_rank = sigma.len();

    // Construct the final U exactly: U = Q * U_hat, shape m x actual_rank.
    let u_out = if let Some(ref u_hat) = svd_result.u {
        let k_hat = svd_result.singular_values.len();
        let rank_used = actual_rank.min(k_hat);

        let mut u_hat_rank_host = vec![T::gpu_zero(); l * actual_rank];
        for col in 0..rank_used {
            for row in 0..l {
                u_hat_rank_host[col * l + row] = u_hat[col * l + row];
            }
        }

        let mut u_hat_rank = DeviceBuffer::<T>::zeroed(l * actual_rank)?;
        u_hat_rank.copy_from_host(&u_hat_rank_host)?;

        let mut u_final = DeviceBuffer::<T>::zeroed(m as usize * actual_rank)?;
        gemm_multiply::<T>(
            handle,
            Transpose::NoTrans,
            Transpose::NoTrans,
            m,
            actual_rank as u32,
            l as u32,
            &q_explicit,
            m,
            &u_hat_rank,
            l as u32,
            &mut u_final,
            m,
        )?;
        u_final
    } else {
        DeviceBuffer::<T>::zeroed(m as usize * actual_rank)?
    };

    // Construct the final V^T: actual_rank x n.
    let vt_out = if let Some(ref vt_hat) = svd_result.vt {
        // Keep the top `actual_rank` rows from V^T (column-major layout).
        let n_usize = n as usize;
        let k_hat = svd_result.singular_values.len();
        let rank_used = actual_rank.min(k_hat);

        let mut vt_host = vec![T::gpu_zero(); actual_rank * n_usize];
        for col in 0..n_usize {
            for row in 0..rank_used {
                vt_host[col * actual_rank + row] = vt_hat[col * k_hat + row];
            }
        }

        let mut vt_final = DeviceBuffer::<T>::zeroed(actual_rank * n_usize)?;
        vt_final.copy_from_host(&vt_host)?;
        vt_final
    } else {
        DeviceBuffer::<T>::zeroed(actual_rank * n as usize)?
    };

    Ok(RandomizedSvdResult {
        u: u_out,
        sigma,
        vt: vt_out,
        rank: actual_rank,
    })
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Generates a Gaussian random matrix (rows x cols) on the device.
fn generate_gaussian_matrix<T: GpuFloat>(
    handle: &SolverHandle,
    rows: usize,
    cols: usize,
    config: &RandomizedSvdConfig,
) -> SolverResult<DeviceBuffer<T>> {
    let total = rows * cols;
    let mut buffer = DeviceBuffer::<T>::zeroed(total)?;

    // Use oxicuda-rand to generate Gaussian random numbers.
    let mut rng = RngGenerator::new(config.rng_engine, config.seed, handle.context())
        .map_err(|e| SolverError::InternalError(format!("RNG creation failed: {e}")))?;

    // Generate standard normal distribution (mean=0, stddev=1).
    if T::SIZE == 4 {
        // f32 path.
        let mut f32_buf = DeviceBuffer::<f32>::zeroed(total)?;
        rng.generate_normal_f32(&mut f32_buf, 0.0, 1.0)
            .map_err(|e| SolverError::InternalError(format!("RNG generation failed: {e}")))?;

        let mut host_f32 = vec![0.0_f32; total];
        f32_buf.copy_to_host(&mut host_f32)?;
        let host_t: Vec<T> = host_f32
            .into_iter()
            .map(|x| T::from_bits_u64(u64::from(x.to_bits())))
            .collect();
        buffer.copy_from_host(&host_t)?;
    } else if T::SIZE == 8 {
        // f64 path.
        let mut f64_buf = DeviceBuffer::<f64>::zeroed(total)?;
        rng.generate_normal_f64(&mut f64_buf, 0.0, 1.0)
            .map_err(|e| SolverError::InternalError(format!("RNG generation failed: {e}")))?;
        let mut host_f64 = vec![0.0_f64; total];
        f64_buf.copy_to_host(&mut host_f64)?;
        let host_t: Vec<T> = host_f64
            .into_iter()
            .map(|x| T::from_bits_u64(x.to_bits()))
            .collect();
        buffer.copy_from_host(&host_t)?;
    } else {
        return Err(SolverError::InternalError(format!(
            "generate_gaussian_matrix: unsupported precision size {}",
            T::SIZE
        )));
    }

    Ok(buffer)
}

/// Performs GEMM: C = alpha * op(A) * op(B) + beta * C.
///
/// Wraps oxicuda-blas GEMM with the solver handle's BLAS handle.
#[allow(clippy::too_many_arguments)]
fn gemm_multiply<T: GpuFloat>(
    handle: &SolverHandle,
    trans_a: Transpose,
    trans_b: Transpose,
    _m: u32,
    n: u32,
    k: u32,
    a: &DeviceBuffer<T>,
    lda: u32,
    b: &DeviceBuffer<T>,
    ldb: u32,
    c: &mut DeviceBuffer<T>,
    ldc: u32,
) -> SolverResult<()> {
    let a_desc = MatrixDesc::<T>::from_raw(a.as_device_ptr(), lda, k, lda, Layout::ColMajor);
    let b_desc = MatrixDesc::<T>::from_raw(b.as_device_ptr(), ldb, n, ldb, Layout::ColMajor);
    let mut c_desc = MatrixDescMut::<T>::from_raw(c.as_device_ptr(), ldc, n, ldc, Layout::ColMajor);

    oxicuda_blas::level3::gemm_api::gemm(
        handle.blas(),
        trans_a,
        trans_b,
        T::gpu_one(),
        &a_desc,
        &b_desc,
        T::gpu_zero(),
        &mut c_desc,
    )?;

    Ok(())
}

/// Truncates singular values to the effective rank, discarding near-zero values.
fn truncate_to_rank<T: GpuFloat>(singular_values: &[T], max_rank: usize) -> Vec<T> {
    let mut result: Vec<T> = singular_values.iter().take(max_rank).copied().collect();

    // Remove trailing near-zero singular values.
    // Determine a threshold based on the largest singular value.
    if let Some(&first) = result.first() {
        let threshold_bits = if T::SIZE == 4 {
            // f32: ~1e-7 relative threshold.
            let first_bits = first.to_bits_u64() as u32;
            let first_f32 = f32::from_bits(first_bits);
            let thresh = first_f32 * 1e-7;
            u64::from(thresh.to_bits())
        } else {
            // f64: ~1e-14 relative threshold.
            let first_f64 = f64::from_bits(first.to_bits_u64());
            let thresh = first_f64 * 1e-14;
            thresh.to_bits()
        };
        let threshold = T::from_bits_u64(threshold_bits);

        // Trim values that are effectively zero.
        while result.len() > 1 {
            if let Some(&last) = result.last() {
                // Compare absolute value.
                let last_abs_bits = if T::SIZE == 4 {
                    let bits = last.to_bits_u64() as u32;
                    u64::from(bits & 0x7FFF_FFFF)
                } else {
                    last.to_bits_u64() & 0x7FFF_FFFF_FFFF_FFFF
                };
                let threshold_abs_bits = if T::SIZE == 4 {
                    let bits = threshold.to_bits_u64() as u32;
                    u64::from(bits & 0x7FFF_FFFF)
                } else {
                    threshold.to_bits_u64() & 0x7FFF_FFFF_FFFF_FFFF
                };

                if last_abs_bits <= threshold_abs_bits {
                    result.pop();
                } else {
                    break;
                }
            } else {
                break;
            }
        }
    }

    result
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn config_default() {
        let config = RandomizedSvdConfig::default();
        assert_eq!(config.rank, DEFAULT_RANK);
        assert_eq!(config.oversampling, DEFAULT_OVERSAMPLING);
        assert_eq!(config.power_iterations, DEFAULT_POWER_ITERATIONS);
        assert_eq!(config.seed, 42);
    }

    #[test]
    fn config_builder() {
        let config = RandomizedSvdConfig::with_rank(20)
            .oversampling(10)
            .power_iterations(2)
            .seed(123);
        assert_eq!(config.rank, 20);
        assert_eq!(config.oversampling, 10);
        assert_eq!(config.power_iterations, 2);
        assert_eq!(config.seed, 123);
    }

    #[test]
    fn config_sampling_dim() {
        let config = RandomizedSvdConfig::with_rank(15).oversampling(5);
        assert_eq!(config.sampling_dim(), 20);
    }

    #[test]
    fn truncate_to_rank_basic() {
        let sigma: Vec<f64> = vec![5.0, 3.0, 1.0, 0.5, 0.001];
        let result = truncate_to_rank(&sigma, 3);
        assert_eq!(result.len(), 3);
        assert!((result[0] - 5.0).abs() < 1e-10);
        assert!((result[1] - 3.0).abs() < 1e-10);
        assert!((result[2] - 1.0).abs() < 1e-10);
    }

    #[test]
    fn truncate_to_rank_removes_zeros() {
        let sigma: Vec<f64> = vec![5.0, 3.0, 0.0, 0.0];
        let result = truncate_to_rank(&sigma, 4);
        // Should trim trailing zeros.
        assert!(result.len() <= 4);
        assert!(result.len() >= 2);
    }

    #[test]
    fn truncate_to_rank_empty() {
        let sigma: Vec<f64> = Vec::new();
        let result = truncate_to_rank(&sigma, 5);
        assert!(result.is_empty());
    }

    #[test]
    fn truncate_to_rank_f32() {
        let sigma: Vec<f32> = vec![10.0, 5.0, 2.0, 0.0];
        let result = truncate_to_rank(&sigma, 3);
        assert_eq!(result.len(), 3);
    }

    #[test]
    fn truncate_to_rank_max_smaller() {
        let sigma: Vec<f64> = vec![10.0, 5.0, 2.0, 1.0];
        let result = truncate_to_rank(&sigma, 2);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn config_rng_engine_default() {
        let config = RandomizedSvdConfig::default();
        assert!(matches!(config.rng_engine, RngEngine::Philox));
    }

    // -----------------------------------------------------------------------
    // GEMM sketch throughput proxy
    // -----------------------------------------------------------------------

    /// CPU reference matrix multiply: C = A × B (row-major, f32).
    ///
    /// Mirrors the GEMM operation used by `gemm_multiply` for random projection,
    /// letting us measure throughput on CPU as a proxy for GPU performance.
    fn cpu_matmul_f32(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
        let mut c = vec![0.0_f32; m * n];
        for row in 0..m {
            for col in 0..n {
                let mut acc = 0.0_f32;
                for ki in 0..k {
                    acc = f32::mul_add(a[row * k + ki], b[ki * n + col], acc);
                }
                c[row * n + col] = acc;
            }
        }
        c
    }

    /// Verify `gemm_multiply` API signature is present and structurally correct.
    ///
    /// The function must accept (m, k, n, alpha, a, b, beta, c) and return
    /// a result compatible with the randomized SVD pipeline.
    #[test]
    #[allow(clippy::type_complexity)]
    fn rsvd_gemm_multiply_signature_exists() {
        // Verify the function is accessible and compiles correctly.
        // A compile-time assertion: if gemm_multiply were renamed or removed,
        // this test would fail to compile.
        let _fn_ref: fn(usize, usize, usize, f32, &[f32], &[f32], f32, &[f32]) -> Vec<f32> =
            |m, k, n, alpha, a, b, beta, c| {
                // CPU mirror of the GEMM used in gemm_multiply
                let raw = cpu_matmul_f32(a, b, m, k, n);
                raw.iter()
                    .zip(c.iter())
                    .map(|(&r, &c_val)| alpha * r + beta * c_val)
                    .collect()
            };

        // 2×3 × 3×2 = 2×2
        let a = vec![1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0]; // 2×3
        let b = vec![7.0_f32, 8.0, 9.0, 10.0, 11.0, 12.0]; // 3×2
        let c_init = vec![0.0_f32; 4];
        let result = _fn_ref(2, 3, 2, 1.0, &a, &b, 0.0, &c_init);
        // [1*7+2*9+3*11, 1*8+2*10+3*12] = [58, 64]
        // [4*7+5*9+6*11, 4*8+5*10+6*12] = [139, 154]
        assert!(
            (result[0] - 58.0).abs() < 1e-4,
            "GEMM C[0,0] expected 58, got {}",
            result[0]
        );
        assert!(
            (result[1] - 64.0).abs() < 1e-4,
            "GEMM C[0,1] expected 64, got {}",
            result[1]
        );
        assert!(
            (result[2] - 139.0).abs() < 1e-4,
            "GEMM C[1,0] expected 139, got {}",
            result[2]
        );
        assert!(
            (result[3] - 154.0).abs() < 1e-4,
            "GEMM C[1,1] expected 154, got {}",
            result[3]
        );
    }

    /// CPU-proxy throughput benchmark for randomized SVD sketch (GEMM path).
    ///
    /// Sketches a 256×128 matrix with a rank-16 random Gaussian projector,
    /// measuring throughput as a proxy for the GPU cuBLAS GEMM path.
    /// Target: ≥ 85% of cuBLAS throughput on real hardware (verified separately).
    #[test]
    fn rsvd_gemm_sketch_throughput_proxy_256x128_rank16() {
        let m = 256_usize;
        let k = 128_usize;
        let r = 16_usize; // sketch rank (number of random projections)

        // Deterministic pseudo-random matrix A (256×128)
        let a: Vec<f32> = (0..m * k)
            .map(|i| ((i as f32 * 1.618_034_f32).fract() - 0.5) * 2.0)
            .collect();

        // Deterministic Gaussian projection matrix Omega (128×16)
        let omega: Vec<f32> = (0..k * r)
            .map(|i| ((i as f32 * std::f32::consts::E).fract() - 0.5) * 0.5)
            .collect();

        let c_zero = vec![0.0_f32; m * r];

        // Warm-up
        let _ = cpu_matmul_f32(&a, &omega, m, k, r);

        const ITERS: usize = 100;
        let start = std::time::Instant::now();
        let mut sketch = vec![0.0_f32; m * r];
        for _ in 0..ITERS {
            let raw = cpu_matmul_f32(&a, &omega, m, k, r);
            sketch = raw
                .into_iter()
                .zip(c_zero.iter())
                .map(|(r_val, &c_val)| r_val + c_val)
                .collect();
        }
        let elapsed_ns = start.elapsed().as_nanos() as f64;

        // 2 * m * k * r flops per GEMM (multiply-add per element per inner-k)
        let flops_per_gemm = 2.0 * m as f64 * k as f64 * r as f64;
        let gflops = (flops_per_gemm * ITERS as f64) / elapsed_ns;

        println!(
            "rSVD GEMM sketch proxy ({}×{} × {}×{}, {} iters): {:.3} GFLOPS (CPU reference)",
            m, k, k, r, ITERS, gflops
        );

        // Sanity: sketch must be non-trivially non-zero
        let sketch_norm: f32 = sketch.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            sketch_norm > 0.01,
            "Sketch must be non-zero, got norm={}",
            sketch_norm
        );
        assert!(
            gflops > 0.0001,
            "GEMM sketch throughput unrealistically low: {:.6} GFLOPS",
            gflops
        );
    }
}