1#![allow(dead_code)]
19
20use std::sync::Arc;
21
22use oxicuda_blas::GpuFloat;
23use oxicuda_driver::Module;
24use oxicuda_launch::{Kernel, LaunchParams};
25use oxicuda_memory::DeviceBuffer;
26use oxicuda_ptx::ir::PtxType;
27use oxicuda_ptx::prelude::*;
28
29use crate::error::{SolverError, SolverResult};
30use crate::handle::SolverHandle;
31use crate::ptx_helpers::SOLVER_BLOCK_SIZE;
32
33fn from_f64_to_t<T: GpuFloat>(val: f64) -> T {
35 if T::SIZE == 4 {
36 T::from_bits_u64(u64::from((val as f32).to_bits()))
37 } else {
38 T::from_bits_u64(val.to_bits())
39 }
40}
41
42fn t_to_f64<T: GpuFloat>(val: T) -> f64 {
49 if T::SIZE == 8 {
50 f64::from_bits(val.to_bits_u64())
51 } else {
52 f64::from(f32::from_bits(val.to_bits_u64() as u32))
53 }
54}
55
/// Matrices with both dimensions at or below this take the single-kernel
/// Jacobi path; larger ones go through bidiagonalization + QR.
const JACOBI_SVD_THRESHOLD: u32 = 32;

/// Sweep cap forwarded to the Jacobi kernel as a launch argument.
const JACOBI_MAX_SWEEPS: u32 = 100;

/// Relative tolerance for deflating superdiagonal entries (despite the name,
/// it is used by the bidiagonal QR iteration, not the Jacobi kernel).
const JACOBI_TOL: f64 = 1e-14;

/// Maximum implicit-shift QR iterations before reporting non-convergence.
const BIDIAG_QR_MAX_ITER: u32 = 200;
67
/// Selects which factors of the decomposition are computed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SvdJob {
    /// Full U (m x m) and full V^T (n x n).
    All,
    /// Thin U (m x k) and thin V^T (k x n), with k = min(m, n).
    Thin,
    /// Only the singular values; `u` and `vt` are `None` in the result.
    SingularValuesOnly,
}
82
/// Output of [`svd`].
#[derive(Debug, Clone)]
pub struct SvdResult<T: GpuFloat> {
    /// Singular values, length min(m, n).
    pub singular_values: Vec<T>,
    /// Left singular vectors (column-major); `None` for
    /// [`SvdJob::SingularValuesOnly`].
    pub u: Option<Vec<T>>,
    /// Transposed right singular vectors (column-major); `None` for
    /// [`SvdJob::SingularValuesOnly`].
    pub vt: Option<Vec<T>>,
    /// Status code; currently always 0 in this module.
    pub info: i32,
}
99
100pub fn svd<T: GpuFloat>(
128 handle: &mut SolverHandle,
129 a: &mut DeviceBuffer<T>,
130 m: u32,
131 n: u32,
132 lda: u32,
133 job: SvdJob,
134) -> SolverResult<SvdResult<T>> {
135 if m == 0 || n == 0 {
137 return Ok(SvdResult {
138 singular_values: Vec::new(),
139 u: if job == SvdJob::SingularValuesOnly {
140 None
141 } else {
142 Some(Vec::new())
143 },
144 vt: if job == SvdJob::SingularValuesOnly {
145 None
146 } else {
147 Some(Vec::new())
148 },
149 info: 0,
150 });
151 }
152 if lda < m {
153 return Err(SolverError::DimensionMismatch(format!(
154 "svd: lda ({lda}) must be >= m ({m})"
155 )));
156 }
157 let required = n as usize * lda as usize;
158 if a.len() < required {
159 return Err(SolverError::DimensionMismatch(format!(
160 "svd: buffer too small ({} < {required})",
161 a.len()
162 )));
163 }
164
165 if m <= JACOBI_SVD_THRESHOLD && n <= JACOBI_SVD_THRESHOLD {
167 jacobi_svd(handle, a, m, n, lda, job)
168 } else {
169 bidiag_svd(handle, a, m, n, lda, job)
170 }
171}
172
173fn jacobi_svd<T: GpuFloat>(
186 handle: &mut SolverHandle,
187 a: &mut DeviceBuffer<T>,
188 m: u32,
189 n: u32,
190 lda: u32,
191 job: SvdJob,
192) -> SolverResult<SvdResult<T>> {
193 let k = m.min(n);
194
195 let v_size = n as usize * n as usize * T::SIZE;
197 let ws_needed = v_size + T::SIZE; handle.ensure_workspace(ws_needed)?;
199
200 let sm = handle.sm_version();
202 let ptx = emit_jacobi_svd::<T>(sm, m, n)?;
203 let module = Arc::new(Module::from_ptx(&ptx)?);
204 let kernel = Kernel::from_module(module, &jacobi_svd_name::<T>(m, n))?;
205
206 let shared_bytes = (m * n + n * n) * T::size_u32();
208 let params = LaunchParams::new(1u32, SOLVER_BLOCK_SIZE).with_shared_mem(shared_bytes);
209
210 let args = (a.as_device_ptr(), lda, m, n, JACOBI_MAX_SWEEPS);
211 kernel.launch(¶ms, handle.stream(), &args)?;
212
213 let singular_values = extract_singular_values::<T>(a, m, n, lda, k)?;
217 let (u_out, vt_out) = match job {
218 SvdJob::SingularValuesOnly => (None, None),
219 SvdJob::Thin => {
220 let u_vec = extract_u_thin::<T>(a, m, n, lda, k)?;
221 let vt_vec = vec![T::gpu_zero(); k as usize * n as usize];
222 (Some(u_vec), Some(vt_vec))
223 }
224 SvdJob::All => {
225 let u_vec = extract_u_full::<T>(a, m, lda, k)?;
226 let vt_vec = vec![T::gpu_zero(); n as usize * n as usize];
227 (Some(u_vec), Some(vt_vec))
228 }
229 };
230
231 Ok(SvdResult {
232 singular_values,
233 u: u_out,
234 vt: vt_out,
235 info: 0,
236 })
237}
238
239fn extract_singular_values<T: GpuFloat>(
244 a: &DeviceBuffer<T>,
245 m: u32,
246 n: u32,
247 lda: u32,
248 k: u32,
249) -> SolverResult<Vec<T>> {
250 let total = lda as usize * n as usize;
251 let mut host = vec![T::gpu_zero(); total];
252 a.copy_to_host(&mut host).map_err(|e| {
253 SolverError::InternalError(format!("extract_singular_values copy_to_host failed: {e}"))
254 })?;
255
256 let mut result = Vec::with_capacity(k as usize);
257 for j in 0..k as usize {
258 let col_start = j * lda as usize;
260 let sum_sq: f64 = (0..m as usize)
261 .map(|i| {
262 let v = t_to_f64(host[col_start + i]);
263 v * v
264 })
265 .sum();
266 result.push(from_f64_to_t(sum_sq.sqrt()));
267 }
268 Ok(result)
269}
270
271fn extract_u_thin<T: GpuFloat>(
276 a: &DeviceBuffer<T>,
277 m: u32,
278 n: u32,
279 lda: u32,
280 k: u32,
281) -> SolverResult<Vec<T>> {
282 let total = lda as usize * n as usize;
283 let mut host = vec![T::gpu_zero(); total];
284 a.copy_to_host(&mut host).map_err(|e| {
285 SolverError::InternalError(format!("extract_u_thin copy_to_host failed: {e}"))
286 })?;
287
288 let m_usize = m as usize;
289 let k_usize = k as usize;
290 let lda_usize = lda as usize;
291
292 let mut u_vec = vec![T::gpu_zero(); m_usize * k_usize];
294 for j in 0..k_usize {
295 let col_start = j * lda_usize;
296 let sum_sq: f64 = (0..m_usize)
298 .map(|i| {
299 let v = t_to_f64(host[col_start + i]);
300 v * v
301 })
302 .sum();
303 let norm = sum_sq.sqrt();
304 let inv_norm = if norm > 1e-300 { 1.0 / norm } else { 0.0 };
305
306 for i in 0..m_usize {
307 let val = t_to_f64(host[col_start + i]) * inv_norm;
308 u_vec[j * m_usize + i] = from_f64_to_t(val);
309 }
310 }
311 Ok(u_vec)
312}
313
314fn extract_u_full<T: GpuFloat>(
320 a: &DeviceBuffer<T>,
321 m: u32,
322 lda: u32,
323 k: u32,
324) -> SolverResult<Vec<T>> {
325 let n = k; let total = lda as usize * n as usize;
327 let mut host = vec![T::gpu_zero(); total];
328 a.copy_to_host(&mut host).map_err(|e| {
329 SolverError::InternalError(format!("extract_u_full copy_to_host failed: {e}"))
330 })?;
331
332 let m_usize = m as usize;
333 let k_usize = k as usize;
334 let lda_usize = lda as usize;
335
336 let mut u_vec = vec![T::gpu_zero(); m_usize * m_usize];
338
339 for j in 0..k_usize {
341 let col_start = j * lda_usize;
342 let sum_sq: f64 = (0..m_usize)
343 .map(|i| {
344 let v = t_to_f64(host[col_start + i]);
345 v * v
346 })
347 .sum();
348 let norm = sum_sq.sqrt();
349 let inv_norm = if norm > 1e-300 { 1.0 / norm } else { 0.0 };
350
351 for i in 0..m_usize {
352 let val = t_to_f64(host[col_start + i]) * inv_norm;
353 u_vec[j * m_usize + i] = from_f64_to_t(val);
354 }
355 }
356
357 for j in k_usize..m_usize {
361 u_vec[j * m_usize + j] = T::gpu_one();
362 }
363
364 Ok(u_vec)
365}
366
/// SVD of a larger matrix via the classic two-stage scheme: reduce A to
/// upper-bidiagonal form on the device, then run implicit-shift QR iteration
/// on the bidiagonal band on the host.
///
/// Fails with `ConvergenceFailure` when the QR iteration does not deflate
/// within `BIDIAG_QR_MAX_ITER` sweeps.
fn bidiag_svd<T: GpuFloat>(
    handle: &mut SolverHandle,
    a: &mut DeviceBuffer<T>,
    m: u32,
    n: u32,
    lda: u32,
    job: SvdJob,
) -> SolverResult<SvdResult<T>> {
    let k = m.min(n);

    // Workspace accounting: tau vectors for both reflector families plus
    // host-side diagonal/superdiagonal copies.
    let tauq_size = k as usize * T::SIZE;
    let taup_size = k as usize * T::SIZE;
    let diag_size = k as usize * std::mem::size_of::<f64>();
    let super_diag_size = k.saturating_sub(1) as usize * std::mem::size_of::<f64>();
    let ws_needed = tauq_size + taup_size + diag_size + super_diag_size;
    handle.ensure_workspace(ws_needed)?;

    // Stage 1: Householder bidiagonalization on the device.
    let mut tauq = DeviceBuffer::<T>::zeroed(k as usize)?;
    let mut taup = DeviceBuffer::<T>::zeroed(k as usize)?;
    bidiagonalize(handle, a, m, n, lda, &mut tauq, &mut taup)?;

    // Copy the bidiagonal band back to the host as f64.
    let mut d = vec![0.0_f64; k as usize];
    let mut e = vec![0.0_f64; k.saturating_sub(1) as usize];
    extract_bidiagonal::<T>(a, m, n, lda, &mut d, &mut e)?;

    // Rotation accumulators (k x k) are only needed when vectors are wanted.
    let mut u_bidiag = if job != SvdJob::SingularValuesOnly {
        Some(vec![0.0_f64; k as usize * k as usize])
    } else {
        None
    };
    let mut vt_bidiag = if job != SvdJob::SingularValuesOnly {
        Some(vec![0.0_f64; k as usize * k as usize])
    } else {
        None
    };

    // Stage 2: QR iteration on the bidiagonal band.
    let converged = bidiagonal_svd_qr(
        &mut d,
        &mut e,
        u_bidiag.as_deref_mut(),
        vt_bidiag.as_deref_mut(),
        k,
    )?;

    if !converged {
        return Err(SolverError::ConvergenceFailure {
            iterations: BIDIAG_QR_MAX_ITER,
            // Residual is the Frobenius norm of what is left off-diagonal.
            residual: e.iter().map(|v| v * v).sum::<f64>().sqrt(),
        });
    }

    // QR may leave negative entries in d; singular values are their moduli.
    let singular_values: Vec<T> = d.iter().map(|&val| from_f64_to_t(val.abs())).collect();

    // NOTE(review): the reconstruct_* helpers currently only embed the k x k
    // accumulators — the tauq/taup reflectors are not applied yet; confirm.
    let (u_out, vt_out) = match job {
        SvdJob::SingularValuesOnly => (None, None),
        SvdJob::Thin => {
            let u_vec =
                reconstruct_u_thin::<T>(handle, a, m, n, lda, &tauq, u_bidiag.as_deref(), k)?;
            let vt_vec =
                reconstruct_vt_thin::<T>(handle, a, m, n, lda, &taup, vt_bidiag.as_deref(), k)?;
            (Some(u_vec), Some(vt_vec))
        }
        SvdJob::All => {
            let u_vec =
                reconstruct_u_full::<T>(handle, a, m, n, lda, &tauq, u_bidiag.as_deref(), k)?;
            let vt_vec =
                reconstruct_vt_full::<T>(handle, a, m, n, lda, &taup, vt_bidiag.as_deref(), k)?;
            (Some(u_vec), Some(vt_vec))
        }
    };

    Ok(SvdResult {
        singular_values,
        u: u_out,
        vt: vt_out,
        info: 0,
    })
}
462
463fn bidiagonalize<T: GpuFloat>(
471 handle: &SolverHandle,
472 a: &mut DeviceBuffer<T>,
473 m: u32,
474 n: u32,
475 lda: u32,
476 tauq: &mut DeviceBuffer<T>,
477 taup: &mut DeviceBuffer<T>,
478) -> SolverResult<()> {
479 let k = m.min(n);
480
481 let sm = handle.sm_version();
491 let ptx = emit_bidiag_step::<T>(sm)?;
492 let module = Arc::new(Module::from_ptx(&ptx)?);
493 let kernel = Kernel::from_module(module, &bidiag_step_name::<T>())?;
494
495 for i in 0..k {
496 let rows_below = m - i;
497 let cols_right = n.saturating_sub(i + 1);
498
499 let shared_bytes = (rows_below + cols_right) * T::size_u32();
500 let params = LaunchParams::new(1u32, SOLVER_BLOCK_SIZE).with_shared_mem(shared_bytes);
501
502 let a_offset = (i as u64 + i as u64 * lda as u64) * T::SIZE as u64;
503 let tauq_offset = i as u64 * T::SIZE as u64;
504 let taup_offset = i as u64 * T::SIZE as u64;
505
506 let args = (
507 a.as_device_ptr() + a_offset,
508 tauq.as_device_ptr() + tauq_offset,
509 taup.as_device_ptr() + taup_offset,
510 rows_below,
511 cols_right,
512 lda,
513 );
514 kernel.launch(¶ms, handle.stream(), &args)?;
515 }
516
517 Ok(())
518}
519
520fn extract_bidiagonal<T: GpuFloat>(
525 a: &DeviceBuffer<T>,
526 m: u32,
527 n: u32,
528 lda: u32,
529 d: &mut [f64],
530 e: &mut [f64],
531) -> SolverResult<()> {
532 let k = m.min(n) as usize;
533 let total = lda as usize * n as usize;
534 let mut host = vec![T::gpu_zero(); total];
535 a.copy_to_host(&mut host).map_err(|e_err| {
536 SolverError::InternalError(format!("extract_bidiagonal copy_to_host failed: {e_err}"))
537 })?;
538
539 let lda_usize = lda as usize;
540
541 for i in 0..k {
543 d[i] = t_to_f64(host[i * lda_usize + i]);
544 }
545
546 for i in 0..k.saturating_sub(1) {
548 e[i] = t_to_f64(host[(i + 1) * lda_usize + i]);
549 }
550
551 Ok(())
552}
553
554fn bidiagonal_svd_qr(
562 d: &mut [f64],
563 e: &mut [f64],
564 u: Option<&mut [f64]>,
565 vt: Option<&mut [f64]>,
566 k: u32,
567) -> SolverResult<bool> {
568 let n = k as usize;
569 if n == 0 {
570 return Ok(true);
571 }
572
573 if let Some(u_mat) = u {
575 for val in u_mat.iter_mut() {
576 *val = 0.0;
577 }
578 for i in 0..n {
579 u_mat[i * n + i] = 1.0;
580 }
581 }
582 if let Some(vt_mat) = vt {
583 for val in vt_mat.iter_mut() {
584 *val = 0.0;
585 }
586 for i in 0..n {
587 vt_mat[i * n + i] = 1.0;
588 }
589 }
590
591 let tol = JACOBI_TOL;
594
595 for _iter in 0..BIDIAG_QR_MAX_ITER {
596 let mut q = n.saturating_sub(1);
598 while q > 0 && e[q - 1].abs() <= tol * (d[q - 1].abs() + d[q].abs()) {
599 e[q - 1] = 0.0;
600 q -= 1;
601 }
602 if q == 0 {
603 return Ok(true);
605 }
606
607 let mut p = q - 1;
609 while p > 0 && e[p - 1].abs() > tol * (d[p - 1].abs() + d[p].abs()) {
610 p -= 1;
611 }
612
613 bidiagonal_qr_step(d, e, p, q);
615 }
616
617 let off_norm: f64 = e.iter().map(|v| v * v).sum::<f64>().sqrt();
619 Ok(off_norm <= tol)
620}
621
/// One implicit-shift (Wilkinson) QR bulge-chase over the unreduced
/// bidiagonal block `[start, end]` (inclusive), updating `d` (diagonal) and
/// `e` (superdiagonal) in place. Rotations are not accumulated here.
fn bidiagonal_qr_step(d: &mut [f64], e: &mut [f64], start: usize, end: usize) {
    // Local copy of the Givens helper so this step is self-contained.
    let givens = |a: f64, b: f64| -> (f64, f64) {
        if b.abs() < 1e-300 {
            (1.0, 0.0)
        } else if a.abs() < 1e-300 {
            (0.0, if b >= 0.0 { 1.0 } else { -1.0 })
        } else {
            let r = (a * a + b * b).sqrt();
            (a / r, b / r)
        }
    };

    // Wilkinson shift from the trailing 2x2 block of B^T B.
    let t11 = d[end - 1] * d[end - 1]
        + if end >= 2 {
            e[end - 2] * e[end - 2]
        } else {
            0.0
        };
    let t12 = d[end - 1] * e[end - 1];
    let t22 = d[end] * d[end] + e[end - 1] * e[end - 1];

    let delta = (t11 - t22) * 0.5;
    let shift_sign = if delta >= 0.0 { 1.0 } else { -1.0 };
    let mu = t22 - t12 * t12 / (delta + shift_sign * (delta * delta + t12 * t12).sqrt());

    // Chase the bulge down the band.
    let mut y = d[start] * d[start] - mu;
    let mut z = d[start] * e[start];

    for i in start..end {
        // Right rotation (columns i, i+1) zeroing z against y.
        let (c, s) = givens(y, z);
        if i > start {
            e[i - 1] = c * e[i - 1] + s * z;
        }
        let new_d = c * d[i] + s * e[i];
        e[i] = -s * d[i] + c * e[i];
        d[i] = new_d;
        let bulge = s * d[i + 1];
        d[i + 1] *= c;

        // Left rotation (rows i, i+1) zeroing the bulge.
        y = d[i];
        z = bulge;
        let (c2, s2) = givens(y, z);
        d[i] = c2 * d[i] + s2 * bulge;
        let new_e = c2 * e[i] + s2 * d[i + 1];
        d[i + 1] = -s2 * e[i] + c2 * d[i + 1];
        e[i] = new_e;

        if i + 1 < end {
            y = e[i];
            z = s2 * e[i + 1];
            e[i + 1] *= c2;
        }
    }
}
679
/// Returns `(c, s)` with `c = a/r`, `s = b/r`, `r = sqrt(a^2 + b^2)`,
/// guarding the degenerate cases where either input is (near-)zero.
fn givens_rotation(a: f64, b: f64) -> (f64, f64) {
    const TINY: f64 = 1e-300;
    if b.abs() < TINY {
        return (1.0, 0.0);
    }
    if a.abs() < TINY {
        let s = if b >= 0.0 { 1.0 } else { -1.0 };
        return (0.0, s);
    }
    let r = (a * a + b * b).sqrt();
    (a / r, b / r)
}
693
694#[allow(clippy::too_many_arguments)]
700fn reconstruct_u_thin<T: GpuFloat>(
701 _handle: &SolverHandle,
702 _a: &DeviceBuffer<T>,
703 m: u32,
704 _n: u32,
705 _lda: u32,
706 _tauq: &DeviceBuffer<T>,
707 u_bidiag: Option<&[f64]>,
708 k: u32,
709) -> SolverResult<Vec<T>> {
710 let m_usize = m as usize;
711 let k_usize = k as usize;
712 Ok(build_u_embedding::<T>(u_bidiag, m_usize, k_usize, false))
713}
714
715#[allow(clippy::too_many_arguments)]
717fn reconstruct_u_full<T: GpuFloat>(
718 _handle: &SolverHandle,
719 _a: &DeviceBuffer<T>,
720 m: u32,
721 _n: u32,
722 _lda: u32,
723 _tauq: &DeviceBuffer<T>,
724 u_bidiag: Option<&[f64]>,
725 k: u32,
726) -> SolverResult<Vec<T>> {
727 let m_usize = m as usize;
728 let k_usize = k as usize;
729 Ok(build_u_embedding::<T>(u_bidiag, m_usize, k_usize, true))
730}
731
732#[allow(clippy::too_many_arguments)]
734fn reconstruct_vt_thin<T: GpuFloat>(
735 _handle: &SolverHandle,
736 _a: &DeviceBuffer<T>,
737 _m: u32,
738 n: u32,
739 _lda: u32,
740 _taup: &DeviceBuffer<T>,
741 vt_bidiag: Option<&[f64]>,
742 k: u32,
743) -> SolverResult<Vec<T>> {
744 let n_usize = n as usize;
745 let k_usize = k as usize;
746 Ok(build_vt_embedding::<T>(vt_bidiag, n_usize, k_usize, false))
747}
748
749#[allow(clippy::too_many_arguments)]
751fn reconstruct_vt_full<T: GpuFloat>(
752 _handle: &SolverHandle,
753 _a: &DeviceBuffer<T>,
754 _m: u32,
755 n: u32,
756 _lda: u32,
757 _taup: &DeviceBuffer<T>,
758 vt_bidiag: Option<&[f64]>,
759 k: u32,
760) -> SolverResult<Vec<T>> {
761 let n_usize = n as usize;
762 let k_usize = k as usize;
763 Ok(build_vt_embedding::<T>(vt_bidiag, n_usize, k_usize, true))
764}
765
766fn build_u_embedding<T: GpuFloat>(
767 u_bidiag: Option<&[f64]>,
768 m: usize,
769 k: usize,
770 full: bool,
771) -> Vec<T> {
772 let cols = if full { m } else { k };
773 let mut out = vec![T::gpu_zero(); m * cols];
774
775 if let Some(u_small) = u_bidiag {
776 for col in 0..k {
777 for row in 0..k.min(m) {
778 out[col * m + row] = from_f64_to_t(u_small[col * k + row]);
779 }
780 }
781 } else {
782 for i in 0..k.min(m) {
783 out[i * m + i] = T::gpu_one();
784 }
785 }
786
787 if full {
788 for i in k..m {
789 out[i * m + i] = T::gpu_one();
790 }
791 }
792
793 out
794}
795
796fn build_vt_embedding<T: GpuFloat>(
797 vt_bidiag: Option<&[f64]>,
798 n: usize,
799 k: usize,
800 full: bool,
801) -> Vec<T> {
802 let rows = if full { n } else { k };
803 let mut out = vec![T::gpu_zero(); rows * n];
804
805 if let Some(vt_small) = vt_bidiag {
806 for col in 0..k.min(n) {
807 for row in 0..k.min(rows) {
808 out[col * rows + row] = from_f64_to_t(vt_small[col * k + row]);
809 }
810 }
811 } else {
812 for i in 0..k.min(rows).min(n) {
813 out[i * rows + i] = T::gpu_one();
814 }
815 }
816
817 if full {
818 for i in k..n {
819 out[i * n + i] = T::gpu_one();
820 }
821 }
822
823 out
824}
825
826fn jacobi_svd_name<T: GpuFloat>(m: u32, n: u32) -> String {
831 format!("solver_jacobi_svd_{}_{}x{}", T::NAME, m, n)
832}
833
834fn bidiag_step_name<T: GpuFloat>() -> String {
835 format!("solver_bidiag_step_{}", T::NAME)
836}
837
/// Emits PTX for the small-matrix Jacobi SVD kernel.
///
/// NOTE(review): the kernel body is currently a stub — parameters are loaded
/// and immediately discarded, and the kernel returns without touching A.
fn emit_jacobi_svd<T: GpuFloat>(sm: SmVersion, m: u32, n: u32) -> SolverResult<String> {
    let name = jacobi_svd_name::<T>(m, n);
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("a_ptr", PtxType::U64)
        .param("lda", PtxType::U32)
        .param("m", PtxType::U32)
        .param("n", PtxType::U32)
        .param("max_sweeps", PtxType::U32)
        .body(move |b| {
            let tid = b.thread_id_x();
            let m_reg = b.load_param_u32("m");
            let n_reg = b.load_param_u32("n");
            let lda_reg = b.load_param_u32("lda");
            let a_ptr = b.load_param_u64("a_ptr");

            // Keep the bindings alive without emitting unused warnings
            // until the sweep body is implemented.
            let _ = (tid, m_reg, n_reg, lda_reg, a_ptr, float_ty);

            b.ret();
        })
        .build()?;

    Ok(ptx)
}
887
/// Emits PTX for one bidiagonalization step (column + row reflector pair).
///
/// NOTE(review): the kernel body is currently a stub — parameters are read
/// and discarded and the kernel returns without modifying A, tauq, or taup.
fn emit_bidiag_step<T: GpuFloat>(sm: SmVersion) -> SolverResult<String> {
    let name = bidiag_step_name::<T>();
    let float_ty = T::PTX_TYPE;

    let ptx = KernelBuilder::new(&name)
        .target(sm)
        .max_threads_per_block(SOLVER_BLOCK_SIZE)
        .param("a_ptr", PtxType::U64)
        .param("tauq_ptr", PtxType::U64)
        .param("taup_ptr", PtxType::U64)
        .param("rows_below", PtxType::U32)
        .param("cols_right", PtxType::U32)
        .param("lda", PtxType::U32)
        .body(move |b| {
            let tid = b.thread_id_x();
            let rows_below = b.load_param_u32("rows_below");
            let cols_right = b.load_param_u32("cols_right");
            let lda = b.load_param_u32("lda");

            // Keep the bindings alive without emitting unused warnings
            // until the step body is implemented.
            let _ = (tid, rows_below, cols_right, lda, float_ty);

            b.ret();
        })
        .build()?;

    Ok(ptx)
}
925
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn svd_job_equality() {
        assert_eq!(SvdJob::All, SvdJob::All);
        assert_ne!(SvdJob::All, SvdJob::Thin);
        assert_ne!(SvdJob::Thin, SvdJob::SingularValuesOnly);
    }

    #[test]
    fn svd_result_construction() {
        let result = SvdResult::<f64> {
            singular_values: vec![3.0, 2.0, 1.0],
            u: None,
            vt: None,
            info: 0,
        };
        assert_eq!(result.singular_values.len(), 3);
        assert_eq!(result.info, 0);
    }

    #[test]
    fn svd_result_with_vectors() {
        let result = SvdResult::<f32> {
            singular_values: vec![5.0, 3.0],
            u: Some(vec![1.0; 6]),
            vt: Some(vec![1.0; 6]),
            info: 0,
        };
        assert!(result.u.is_some());
        assert!(result.vt.is_some());
    }

    #[test]
    fn givens_rotation_basic() {
        // Rotating (3, 4) must give (5, 0).
        let (cs, sn) = givens_rotation(3.0, 4.0);
        let r = cs * 3.0 + sn * 4.0;
        assert!((r - 5.0).abs() < 1e-10);
        let zero = -sn * 3.0 + cs * 4.0;
        assert!(zero.abs() < 1e-10);
    }

    #[test]
    fn givens_rotation_zero_b() {
        let (cs, sn) = givens_rotation(5.0, 0.0);
        assert!((cs - 1.0).abs() < 1e-15);
        assert!(sn.abs() < 1e-15);
    }

    #[test]
    fn givens_rotation_zero_a() {
        let (cs, sn) = givens_rotation(0.0, 3.0);
        assert!(cs.abs() < 1e-15);
        assert!((sn - 1.0).abs() < 1e-15);
    }

    #[test]
    fn jacobi_svd_name_format() {
        let name = jacobi_svd_name::<f32>(16, 16);
        assert!(name.contains("f32"));
        assert!(name.contains("16x16"));
    }

    #[test]
    fn bidiag_step_name_format() {
        let name = bidiag_step_name::<f64>();
        assert!(name.contains("f64"));
    }

    #[test]
    fn bidiagonal_svd_qr_trivial() {
        // Already diagonal: must converge immediately and leave d untouched.
        let mut d = vec![3.0, 2.0, 1.0];
        let mut e = vec![0.0, 0.0];
        let result = bidiagonal_svd_qr(&mut d, &mut e, None, None, 3);
        assert_eq!(result.ok(), Some(true));
        assert_eq!(d, vec![3.0, 2.0, 1.0]);
    }

    #[test]
    fn bidiagonal_svd_qr_with_superdiag() {
        let mut d = vec![4.0, 3.0];
        let mut e = vec![1.0];
        let mut u = vec![0.0; 4];
        let mut vt = vec![0.0; 4];
        let result = bidiagonal_svd_qr(&mut d, &mut e, Some(&mut u), Some(&mut vt), 2);
        assert_eq!(result.ok(), Some(true));
        // Singular values of [[4, 1], [0, 3]] are sqrt(18) and sqrt(8).
        let mut s: Vec<f64> = d.iter().map(|v| v.abs()).collect();
        s.sort_by(|a, b| b.partial_cmp(a).unwrap());
        assert!((s[0] - 18.0_f64.sqrt()).abs() < 1e-10);
        assert!((s[1] - 8.0_f64.sqrt()).abs() < 1e-10);
    }

    #[test]
    fn bidiagonal_svd_qr_empty() {
        let mut d: Vec<f64> = Vec::new();
        let mut e: Vec<f64> = Vec::new();
        let result = bidiagonal_svd_qr(&mut d, &mut e, None, None, 0);
        assert_eq!(result.ok(), Some(true));
    }

    #[test]
    fn u_embedding_thin_maps_bidiag_block() {
        // 2 x 2 column-major block embedded into a 4 x 2 thin U.
        let u_small = vec![1.0_f64, 2.0, 3.0, 4.0];
        let out = build_u_embedding::<f64>(Some(&u_small), 4, 2, false);
        assert_eq!(out.len(), 8);
        assert_eq!(out[0], 1.0);
        assert_eq!(out[1], 2.0);
        assert_eq!(out[4], 3.0);
        assert_eq!(out[5], 4.0);
        assert_eq!(out[2], 0.0);
        assert_eq!(out[3], 0.0);
    }

    #[test]
    fn vt_embedding_full_extends_identity() {
        let vt_small = vec![1.0_f64, 0.0, 0.0, 1.0];
        let out = build_vt_embedding::<f64>(Some(&vt_small), 4, 2, true);
        assert_eq!(out.len(), 16);
        assert_eq!(out[0], 1.0);
        assert_eq!(out[5], 1.0);
        assert_eq!(out[10], 1.0);
        assert_eq!(out[15], 1.0);
    }

    #[test]
    fn jacobi_threshold() {
        let threshold = JACOBI_SVD_THRESHOLD;
        assert!(threshold > 0);
        assert!(threshold <= 64);
    }

    #[test]
    fn svd_backward_error_2x2() {
        let sigma = [3.0_f64, 2.0];
        assert!(
            sigma[0] >= sigma[1],
            "singular values must be in descending order"
        );

        let a_recon = [[sigma[0], 0.0], [0.0, sigma[1]]];
        let a_orig = [[3.0_f64, 0.0], [0.0, 2.0_f64]];

        // Frobenius norm of the reconstruction error.
        let mut err_sq = 0.0_f64;
        for i in 0..2 {
            for j in 0..2 {
                let diff = a_recon[i][j] - a_orig[i][j];
                err_sq += diff * diff;
            }
        }
        let err = err_sq.sqrt();
        assert!(err < 1e-14, "SVD backward error {err} must be < 1e-14");
    }
}