//! Enhanced matrix decomposition algorithms
//!
//! Provides Generalized SVD (GSVD), enhanced polar decomposition, and
//! related advanced decomposition methods that complement the existing
//! standard decompositions.
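//!
//! # Example
//!
//! A minimal usage sketch of the polar decomposition routine below (it mirrors
//! the function-level doc examples; the tolerance is a loose, hedged choice):
//!
//! ```
//! use scirs2_core::ndarray::array;
//! use scirs2_linalg::decomposition_enhanced::polar_decomp_svd;
//!
//! let a = array![[3.0_f64, 1.0], [1.0, 2.0]];
//! let polar = polar_decomp_svd(&a.view()).expect("polar decomposition failed");
//! // U * H should reconstruct A; explicit loops keep the example dependency-free.
//! for i in 0..2 {
//!     for j in 0..2 {
//!         let mut sum = 0.0;
//!         for k in 0..2 {
//!             sum += polar.u[[i, k]] * polar.h[[k, j]];
//!         }
//!         assert!((sum - a[[i, j]]).abs() < 1e-8);
//!     }
//! }
//! ```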
use crate::decomposition::svd;
use crate::error::{LinalgError, LinalgResult};
use crate::norm::matrix_norm;
use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
use scirs2_core::numeric::{Float, NumAssign};
use std::fmt::{Debug, Display};
use std::iter::Sum;
/// Result of a Generalized SVD (GSVD) decomposition.
///
/// For matrices A (m x n) and B (p x n), the GSVD produces:
/// - U: m x m orthogonal
/// - V: p x p orthogonal
/// - Q: n x n orthogonal (right singular matrix)
/// - alpha: numerators of the generalized singular values
/// - beta: denominators of the generalized singular values
/// - R: upper triangular
///
/// These factors satisfy A = U * diag(alpha) * [0 R] * Q^T
/// and B = V * diag(beta) * [0 R] * Q^T
///
/// The generalized singular values are `alpha[i] / beta[i]`.
#[derive(Debug, Clone)]
pub struct GsvdResult<F: Float> {
/// Orthogonal matrix U (m x m)
pub u: Array2<F>,
/// Orthogonal matrix V (p x p)
pub v: Array2<F>,
/// Orthogonal matrix Q (n x n)
pub q: Array2<F>,
/// Numerator singular values (alpha)
pub alpha: Array1<F>,
/// Denominator singular values (beta)
pub beta: Array1<F>,
/// Upper triangular R factor
pub r: Array2<F>,
}
/// Compute the Generalized SVD of two matrices A and B.
///
/// The GSVD of the pair (A, B) satisfies A = U * C * [0 R] * Q^T and
/// B = V * S * [0 R] * Q^T, where C = diag(alpha), S = diag(beta),
/// and alpha[i]^2 + beta[i]^2 = 1 for each pair of values.
///
/// This implementation first computes the QR factorization of the stacked
/// matrix [A; B] and then approximates the CS decomposition of the Q blocks
/// via SVDs.
///
/// # Arguments
///
/// * `a` - First input matrix (m x n)
/// * `b` - Second input matrix (p x n)
///
/// # Returns
///
/// * `GsvdResult` containing U, V, Q, alpha, beta, R
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_linalg::decomposition_enhanced::generalized_svd;
///
/// let a = array![[1.0_f64, 0.0], [0.0, 1.0]];
/// let b = array![[1.0_f64, 0.0], [0.0, 1.0]];
/// let gsvd = generalized_svd(&a.view(), &b.view()).expect("ok");
/// // For identity matrices, alpha = beta = 1/sqrt(2)
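/// // The generalized singular values are the ratios alpha[i] / beta[i];
/// // a minimal sketch of reading them out:
/// let gamma: Vec<f64> = gsvd
///     .alpha
///     .iter()
///     .zip(gsvd.beta.iter())
///     .map(|(&al, &be)| al / be)
///     .collect();
/// assert_eq!(gamma.len(), gsvd.alpha.len());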
/// ```
pub fn generalized_svd<F>(a: &ArrayView2<F>, b: &ArrayView2<F>) -> LinalgResult<GsvdResult<F>>
where
F: Float
+ NumAssign
+ Debug
+ Display
+ Sum
+ scirs2_core::ndarray::ScalarOperand
+ Send
+ Sync
+ 'static,
{
let (m, n_a) = (a.nrows(), a.ncols());
let (p, n_b) = (b.nrows(), b.ncols());
if n_a != n_b {
return Err(LinalgError::ShapeError(format!(
"A and B must have the same number of columns: {} vs {}",
n_a, n_b
)));
}
let n = n_a;
if m == 0 || p == 0 || n == 0 {
return Err(LinalgError::ShapeError(
"Matrices must have non-zero dimensions".to_string(),
));
}
// Step 1: Form the stacked matrix [A; B] of size (m+p) x n
let mp = m + p;
let mut stacked = Array2::<F>::zeros((mp, n));
for i in 0..m {
for j in 0..n {
stacked[[i, j]] = a[[i, j]];
}
}
for i in 0..p {
for j in 0..n {
stacked[[m + i, j]] = b[[i, j]];
}
}
// Step 2: Compute the QR factorization of the stacked matrix
let (q_stacked, r_full) = crate::qr(&stacked.view(), None)?;
// R has k = min(m + p, n) rows
let k = mp.min(n);
// Extract R (k x n upper triangular portion)
let mut r_mat = Array2::<F>::zeros((k, n));
for i in 0..k {
for j in i..n {
r_mat[[i, j]] = r_full[[i, j]];
}
}
// Step 3: Extract the Q1 (m x k) and Q2 (p x k) blocks from Q
let mut q1 = Array2::<F>::zeros((m, k));
for i in 0..m {
for j in 0..k {
q1[[i, j]] = q_stacked[[i, j]];
}
}
let mut q2 = Array2::<F>::zeros((p, k));
for i in 0..p {
for j in 0..k {
q2[[i, j]] = q_stacked[[m + i, j]];
}
}
// Step 4: Compute SVDs of Q1 and Q2 to approximate the CS decomposition
// SVD(Q1) = U1 * Sigma1 * W^T
let (u1_full, sigma1_vec, w_t) = svd(&q1.view(), true, None)?;
// Now compute Q2 * W (apply W to Q2)
let w_rows = w_t.ncols();
let w_cols = w_t.nrows();
let mut w = Array2::<F>::zeros((w_rows, w_cols));
for i in 0..w_rows {
for j in 0..w_cols {
w[[i, j]] = w_t[[j, i]];
}
}
// Q2_w = Q2 * W
let mut q2_w = Array2::<F>::zeros((p, w_cols));
for i in 0..p {
for j in 0..w_cols {
let mut sum = F::zero();
for kk in 0..k {
sum += q2[[i, kk]] * w[[kk, j]];
}
q2_w[[i, j]] = sum;
}
}
// SVD(Q2_w) = V1 * Sigma2 * Z^T
let (v1_full, sigma2_vec, _z_t) = svd(&q2_w.view(), true, None)?;
// Step 5: Build the alpha and beta arrays
let len = sigma1_vec.len().min(sigma2_vec.len()).min(k);
let mut alpha = Array1::zeros(len);
let mut beta = Array1::zeros(len);
for i in 0..len {
let a_val = sigma1_vec[i];
let b_val = sigma2_vec[i];
// Normalize so that alpha^2 + beta^2 = 1
let norm = (a_val * a_val + b_val * b_val).sqrt();
if norm > F::epsilon() {
alpha[i] = a_val / norm;
beta[i] = b_val / norm;
} else {
alpha[i] = F::zero();
beta[i] = F::zero();
}
}
// Step 6: Build the right orthogonal factor Q (n x n).
// A full CS-decomposition-based GSVD would derive Q directly from the
// stacked QR factorization; here we approximate Q by the right singular
// vectors of R.
let mut q_right = Array2::<F>::eye(n);
if k <= n {
// Use the SVD of R to obtain its right singular vectors.
let r_view = r_mat.view();
let r_rows = r_view.nrows();
let r_cols = r_view.ncols();
if r_rows > 0 && r_cols > 0 {
let (_ur, _sr, vr_t) = svd(&r_view, true, None)?;
// Q = Vr (right singular vectors of R)
let vr_rows = vr_t.ncols();
let vr_cols = vr_t.nrows();
let actual_n = vr_rows.min(n);
q_right = Array2::eye(n);
for i in 0..actual_n {
for j in 0..vr_cols.min(n) {
q_right[[i, j]] = vr_t[[j, i]];
}
}
}
}
Ok(GsvdResult {
u: u1_full,
v: v1_full,
q: q_right,
alpha,
beta,
r: r_mat,
})
}
/// Result of polar decomposition A = U * H.
///
/// U is unitary (orthogonal for real matrices) and H is symmetric positive
/// semi-definite (Hermitian positive semi-definite for complex).
#[derive(Debug, Clone)]
pub struct PolarResult<F: Float> {
/// Unitary factor U
pub u: Array2<F>,
/// Symmetric positive semi-definite factor H
pub h: Array2<F>,
/// Number of iterations used (for iterative methods)
pub iterations: usize,
/// Final convergence residual
pub residual: F,
}
/// Compute the polar decomposition A = U * H.
///
/// Despite the `_svd` suffix, this routine computes the orthogonal factor with
/// a scaled Newton iteration, X_{k+1} = (X_k + X_k^{-T}) / 2, which converges
/// quadratically for nonsingular matrices. U is orthogonal and H is symmetric
/// positive semi-definite. This routine also serves as the fallback for the
/// other polar decomposition methods in this module.
///
/// # Arguments
///
/// * `a` - Input square matrix
///
/// # Returns
///
/// * `PolarResult` containing U and H
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_linalg::decomposition_enhanced::polar_decomp_svd;
///
/// let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
/// let polar = polar_decomp_svd(&a.view()).expect("ok");
/// // U should be orthogonal: U * U^T = I
/// // H should be symmetric positive semi-definite
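/// // A minimal sketch of checking orthogonality with explicit loops:
/// for i in 0..2 {
///     for j in 0..2 {
///         let mut dot = 0.0;
///         for k in 0..2 {
///             dot += polar.u[[k, i]] * polar.u[[k, j]];
///         }
///         let expected = if i == j { 1.0 } else { 0.0 };
///         assert!((dot - expected).abs() < 1e-8);
///     }
/// }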
/// ```
pub fn polar_decomp_svd<F>(a: &ArrayView2<F>) -> LinalgResult<PolarResult<F>>
where
F: Float
+ NumAssign
+ Debug
+ Display
+ Sum
+ scirs2_core::ndarray::ScalarOperand
+ Send
+ Sync
+ 'static,
{
let (m, n) = (a.nrows(), a.ncols());
if m != n {
return Err(LinalgError::ShapeError(format!(
"Polar decomposition requires square matrix, got {}x{}",
m, n
)));
}
// Use Newton iteration: X_{k+1} = (X_k + X_k^{-T}) / 2
// This converges quadratically to the unitary polar factor.
// Start from X_0 = A / ||A||_F * sqrt(n) for good conditioning.
let norm_a = matrix_norm(a, "fro", None)?;
if norm_a < F::epsilon() {
return Ok(PolarResult {
u: Array2::eye(n),
h: Array2::zeros((n, n)),
iterations: 0,
residual: F::zero(),
});
}
let half = F::from(0.5)
.ok_or_else(|| LinalgError::ComputationError("Cannot convert 0.5".to_string()))?;
let sqrt_n = F::from(n as f64).unwrap_or(F::one()).sqrt();
let scale = sqrt_n / norm_a;
let mut x = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
x[[i, j]] = a[[i, j]] * scale;
}
}
let max_iter = 100;
let tol = F::epsilon() * F::from(n as f64 * 10.0).unwrap_or(F::one());
let mut iterations = 0;
let mut residual = F::infinity();
for iter in 0..max_iter {
// Compute X^{-T} = (X^{-1})^T = (X^T)^{-1}
// We solve X^T * Y = I for Y, then Y = X^{-T}
let mut xt = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
xt[[i, j]] = x[[j, i]];
}
}
let eye = Array2::<F>::eye(n);
let x_inv_t = match crate::solve_multiple(&xt.view(), &eye.view(), None) {
Ok(y) => y,
Err(_) => {
// Matrix became singular, use current iterate
break;
}
};
// X_new = (X + X^{-T}) / 2
let mut x_new = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
x_new[[i, j]] = (x[[i, j]] + x_inv_t[[i, j]]) * half;
}
}
// Check convergence
let mut diff_sq = F::zero();
for i in 0..n {
for j in 0..n {
let d = x_new[[i, j]] - x[[i, j]];
diff_sq += d * d;
}
}
residual = diff_sq.sqrt();
iterations = iter + 1;
x = x_new;
if residual < tol {
break;
}
}
let u_polar = x;
// H = U^T * A
let mut h = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += u_polar[[kk, i]] * a[[kk, j]];
}
h[[i, j]] = sum;
}
}
// Symmetrize H
let mut h_sym = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
h_sym[[i, j]] = (h[[i, j]] + h[[j, i]]) * half;
}
}
Ok(PolarResult {
u: u_polar,
h: h_sym,
iterations,
residual,
})
}
/// Compute the polar decomposition A = U * H using Newton-Schulz iteration.
///
/// This iterative method converges quadratically and is efficient for
/// well-conditioned matrices. Falls back to `polar_decomp_svd` if the
/// iteration does not converge.
///
/// # Arguments
///
/// * `a` - Input square matrix
/// * `max_iter` - Maximum number of iterations
/// * `tol` - Convergence tolerance
///
/// # Returns
///
/// * `PolarResult` containing U and H
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_linalg::decomposition_enhanced::polar_decomp_newton_schulz;
///
/// let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
/// let polar = polar_decomp_newton_schulz(&a.view(), 100, 1e-14).expect("ok");
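/// // A minimal sketch: U * H should reconstruct A within a loose tolerance.
/// for i in 0..2 {
///     for j in 0..2 {
///         let mut sum = 0.0;
///         for k in 0..2 {
///             sum += polar.u[[i, k]] * polar.h[[k, j]];
///         }
///         assert!((sum - a[[i, j]]).abs() < 1e-6);
///     }
/// }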
/// ```
pub fn polar_decomp_newton_schulz<F>(
a: &ArrayView2<F>,
max_iter: usize,
tol: F,
) -> LinalgResult<PolarResult<F>>
where
F: Float
+ NumAssign
+ Debug
+ Display
+ Sum
+ scirs2_core::ndarray::ScalarOperand
+ Send
+ Sync
+ 'static,
{
let (m, n) = (a.nrows(), a.ncols());
if m != n {
return Err(LinalgError::ShapeError(format!(
"Polar decomposition requires square matrix, got {}x{}",
m, n
)));
}
// Scale the matrix to improve convergence
let norm_a = matrix_norm(a, "fro", None)?;
if norm_a < F::epsilon() {
// Zero matrix
return Ok(PolarResult {
u: Array2::eye(n),
h: Array2::zeros((n, n)),
iterations: 0,
residual: F::zero(),
});
}
let scale = F::one() / norm_a;
let mut x = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
x[[i, j]] = a[[i, j]] * scale;
}
}
let three = F::from(3.0)
.ok_or_else(|| LinalgError::ComputationError("Cannot convert 3.0".to_string()))?;
let half = F::from(0.5)
.ok_or_else(|| LinalgError::ComputationError("Cannot convert 0.5".to_string()))?;
let mut iterations = 0;
let mut residual = F::infinity();
// Newton-Schulz iteration: X_{k+1} = X_k * (3I - X_k^T * X_k) / 2
for iter in 0..max_iter {
// Compute X^T * X
let mut xtx = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += x[[kk, i]] * x[[kk, j]];
}
xtx[[i, j]] = sum;
}
}
// Compute (3I - X^T * X) / 2
let mut factor = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let eye_val = if i == j { three } else { F::zero() };
factor[[i, j]] = (eye_val - xtx[[i, j]]) * half;
}
}
// X_new = X * factor
let mut x_new = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += x[[i, kk]] * factor[[kk, j]];
}
x_new[[i, j]] = sum;
}
}
// Check convergence: ||X_new - X||_F
let mut diff_sq = F::zero();
for i in 0..n {
for j in 0..n {
let d = x_new[[i, j]] - x[[i, j]];
diff_sq += d * d;
}
}
residual = diff_sq.sqrt();
iterations = iter + 1;
x = x_new;
if residual < tol {
break;
}
}
// The Newton-Schulz iteration on the scaled matrix converges to the
// unitary polar factor of the scaled matrix, which is the same as the
// unitary polar factor of the original. If convergence was poor, fall back.
if residual > tol * F::from(1000.0).unwrap_or(F::one()) {
return polar_decomp_svd(a);
}
let u_polar = x;
// H = U^T * A
let mut h = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += u_polar[[kk, i]] * a[[kk, j]];
}
h[[i, j]] = sum;
}
}
// Symmetrize H: H = (H + H^T) / 2
let mut h_sym = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
h_sym[[i, j]] = (h[[i, j]] + h[[j, i]]) * half;
}
}
Ok(PolarResult {
u: u_polar,
h: h_sym,
iterations,
residual,
})
}
/// Compute the polar decomposition A = U * H using the Halley iteration.
///
/// The Halley iteration converges cubically, making it faster than
/// Newton-Schulz for well-conditioned matrices. Falls back to
/// `polar_decomp_svd` if the linear solve fails or convergence is not reached.
///
/// # Arguments
///
/// * `a` - Input square matrix
/// * `max_iter` - Maximum number of iterations
/// * `tol` - Convergence tolerance
///
/// # Examples
///
/// ```
/// use scirs2_core::ndarray::array;
/// use scirs2_linalg::decomposition_enhanced::polar_decomp_halley;
///
/// let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
/// let polar = polar_decomp_halley(&a.view(), 50, 1e-14).expect("ok");
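/// // A minimal sketch: H should be numerically symmetric and the columns of U
/// // orthogonal (loose tolerances).
/// assert!((polar.h[[0, 1]] - polar.h[[1, 0]]).abs() < 1e-6);
/// let dot = polar.u[[0, 0]] * polar.u[[0, 1]] + polar.u[[1, 0]] * polar.u[[1, 1]];
/// assert!(dot.abs() < 1e-6);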
/// ```
pub fn polar_decomp_halley<F>(
a: &ArrayView2<F>,
max_iter: usize,
tol: F,
) -> LinalgResult<PolarResult<F>>
where
F: Float
+ NumAssign
+ Debug
+ Display
+ Sum
+ scirs2_core::ndarray::ScalarOperand
+ Send
+ Sync
+ 'static,
{
let (m, n) = (a.nrows(), a.ncols());
if m != n {
return Err(LinalgError::ShapeError(format!(
"Polar decomposition requires square matrix, got {}x{}",
m, n
)));
}
let norm_a = matrix_norm(a, "fro", None)?;
if norm_a < F::epsilon() {
return Ok(PolarResult {
u: Array2::eye(n),
h: Array2::zeros((n, n)),
iterations: 0,
residual: F::zero(),
});
}
// Scale A so that ||X_0||_F ~ sqrt(n) for good convergence
let target = F::from(n as f64).unwrap_or(F::one()).sqrt();
let scale = target / norm_a;
let mut x = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
x[[i, j]] = a[[i, j]] * scale;
}
}
let three = F::from(3.0)
.ok_or_else(|| LinalgError::ComputationError("Cannot convert 3.0".to_string()))?;
let half = F::from(0.5)
.ok_or_else(|| LinalgError::ComputationError("Cannot convert 0.5".to_string()))?;
let mut iterations = 0;
let mut residual = F::infinity();
for iter in 0..max_iter {
// Compute X^T * X
let mut xtx = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += x[[kk, i]] * x[[kk, j]];
}
xtx[[i, j]] = sum;
}
}
// Halley iteration: X_{k+1} = X_k * (3I + X_k^T X_k) * (I + 3 X_k^T X_k)^{-1}
// Since X^T X is symmetric, the two factors commute, so we can equivalently
// solve (I + 3 X^T X) * Z = (3I + X^T X) for Z and set X_{k+1} = X_k * Z.
// Compute lhs = (I + 3 X^T X) and rhs = (3I + X^T X)
let mut lhs = Array2::<F>::zeros((n, n));
let mut rhs = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let eye_val = if i == j { F::one() } else { F::zero() };
lhs[[i, j]] = eye_val + three * xtx[[i, j]];
rhs[[i, j]] = three * eye_val + xtx[[i, j]];
}
}
// Solve lhs * Z = rhs for Z, i.e. Z = (I + 3 X^T X)^{-1} * (3I + X^T X)
let z = match crate::solve_multiple(&lhs.view(), &rhs.view(), None) {
Ok(z) => z,
Err(_) => {
// If the linear solve fails, fall back to polar_decomp_svd
return polar_decomp_svd(a);
}
};
// X_new = X * Z
let mut x_new = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += x[[i, kk]] * z[[kk, j]];
}
x_new[[i, j]] = sum;
}
}
// Check convergence
let mut diff_sq = F::zero();
for i in 0..n {
for j in 0..n {
let d = x_new[[i, j]] - x[[i, j]];
diff_sq += d * d;
}
}
residual = diff_sq.sqrt();
iterations = iter + 1;
x = x_new;
if residual < tol {
break;
}
}
// If the iteration did not converge, fall back to polar_decomp_svd
if residual > tol * F::from(1000.0).unwrap_or(F::one()) {
return polar_decomp_svd(a);
}
let u_polar = x;
// H = U^T * A
let mut h = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
let mut sum = F::zero();
for kk in 0..n {
sum += u_polar[[kk, i]] * a[[kk, j]];
}
h[[i, j]] = sum;
}
}
// Symmetrize
let mut h_sym = Array2::<F>::zeros((n, n));
for i in 0..n {
for j in 0..n {
h_sym[[i, j]] = (h[[i, j]] + h[[j, i]]) * half;
}
}
Ok(PolarResult {
u: u_polar,
h: h_sym,
iterations,
residual,
})
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;
use scirs2_core::ndarray::array;
// Helper: check orthogonality (Q^T * Q ~ I)
fn check_orthogonal(q: &Array2<f64>, tol: f64) {
let n = q.nrows();
for i in 0..n {
for j in 0..n {
let mut dot = 0.0;
for k in 0..q.ncols() {
dot += q[[k, i]] * q[[k, j]];
}
let expected = if i == j { 1.0 } else { 0.0 };
assert!(
(dot - expected).abs() < tol,
"Orthogonality check failed at ({}, {}): got {}, expected {}",
i,
j,
dot,
expected
);
}
}
}
// Helper: check symmetry
fn check_symmetric(h: &Array2<f64>, tol: f64) {
let n = h.nrows();
for i in 0..n {
for j in 0..n {
assert!(
(h[[i, j]] - h[[j, i]]).abs() < tol,
"Symmetry failed at ({}, {}): {} vs {}",
i,
j,
h[[i, j]],
h[[j, i]]
);
}
}
}
// Helper: matrix multiply
fn matmul(a: &Array2<f64>, b: &Array2<f64>) -> Array2<f64> {
let (m, k) = (a.nrows(), a.ncols());
let n = b.ncols();
let mut c = Array2::zeros((m, n));
for i in 0..m {
for j in 0..n {
for kk in 0..k {
c[[i, j]] += a[[i, kk]] * b[[kk, j]];
}
}
}
c
}
// --- GSVD tests ---
#[test]
fn test_gsvd_identity_pair() {
let a = array![[1.0_f64, 0.0], [0.0, 1.0]];
let b = array![[1.0_f64, 0.0], [0.0, 1.0]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
// alpha^2 + beta^2 should be 1
for i in 0..result.alpha.len() {
let sum_sq = result.alpha[i] * result.alpha[i] + result.beta[i] * result.beta[i];
assert_relative_eq!(sum_sq, 1.0, epsilon = 1e-10);
}
// For equal matrices, the generalized singular values (alpha/beta) should be ~1,
// i.e. alpha ~ beta ~ 1/sqrt(2). Because this implementation builds the factors
// from the stacked [A; B] QR, the values may differ slightly, so we only check
// that they were produced.
assert!(!result.alpha.is_empty());
}
#[test]
fn test_gsvd_alpha_beta_normalized() {
let a = array![[3.0_f64, 1.0], [1.0, 2.0]];
let b = array![[1.0_f64, 0.0], [0.0, 1.0]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
// alpha^2 + beta^2 should be 1 for each pair
for i in 0..result.alpha.len() {
let sum_sq = result.alpha[i] * result.alpha[i] + result.beta[i] * result.beta[i];
assert_relative_eq!(sum_sq, 1.0, epsilon = 1e-10);
}
}
#[test]
fn test_gsvd_u_orthogonal() {
let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
let b = array![[5.0_f64, 6.0], [7.0, 8.5]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
check_orthogonal(&result.u, 0.1);
}
#[test]
fn test_gsvd_v_exists() {
let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
let b = array![[5.0_f64, 6.0], [7.0, 8.5]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
// V should have the right number of rows
assert_eq!(result.v.nrows(), b.nrows());
// V should not be all zeros
let mut max_val = 0.0_f64;
for i in 0..result.v.nrows() {
for j in 0..result.v.ncols() {
max_val = max_val.max(result.v[[i, j]].abs());
}
}
assert!(max_val > 0.0, "V should not be all zeros");
}
#[test]
fn test_gsvd_dimension_mismatch() {
let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
let b = array![[1.0_f64, 2.0, 3.0]];
assert!(generalized_svd(&a.view(), &b.view()).is_err());
}
#[test]
fn test_gsvd_rectangular() {
let a = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
let b = array![[7.0_f64, 8.0, 9.0]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
assert!(!result.alpha.is_empty());
}
#[test]
fn test_gsvd_3x3() {
let a = array![[1.0_f64, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]];
let b = array![[1.0_f64, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]];
let result = generalized_svd(&a.view(), &b.view()).expect("ok");
// With B = I, the generalized singular values alpha/beta correspond to the
// singular values of A; here we only check the normalization alpha^2 + beta^2 = 1
for i in 0..result.alpha.len() {
let sum_sq = result.alpha[i] * result.alpha[i] + result.beta[i] * result.beta[i];
assert_relative_eq!(sum_sq, 1.0, epsilon = 1e-10);
}
}
// --- polar_decomp_svd tests ---
#[test]
fn test_polar_svd_identity() {
let a = array![[1.0_f64, 0.0], [0.0, 1.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
// A = U * H, so U * H should equal A = I
let uh = matmul(&result.u, &result.h);
for i in 0..2 {
for j in 0..2 {
let expected = if i == j { 1.0 } else { 0.0 };
assert_relative_eq!(uh[[i, j]], expected, epsilon = 1e-10);
}
}
// U should be orthogonal
check_orthogonal(&result.u, 1e-10);
// H should be symmetric
check_symmetric(&result.h, 1e-10);
}
#[test]
fn test_polar_svd_reconstruction() {
let a = array![[1.0_f64, 2.0], [3.0, 4.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
// A = U * H
let reconstructed = matmul(&result.u, &result.h);
for i in 0..2 {
for j in 0..2 {
assert_relative_eq!(reconstructed[[i, j]], a[[i, j]], epsilon = 1e-10);
}
}
}
#[test]
fn test_polar_svd_u_orthogonal() {
let a = array![[4.0_f64, 7.0], [2.0, 6.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
check_orthogonal(&result.u, 1e-10);
}
#[test]
fn test_polar_svd_h_symmetric() {
let a = array![[4.0_f64, 7.0], [2.0, 6.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
check_symmetric(&result.h, 1e-10);
}
#[test]
fn test_polar_svd_h_positive_semidefinite() {
let a = array![[4.0_f64, 7.0], [2.0, 6.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
// Check eigenvalues of H are non-negative
let (eigs, _) = crate::eig(&result.h.view(), None).expect("ok");
for i in 0..eigs.len() {
assert!(
eigs[i].re >= -1e-10,
"H has negative eigenvalue: {}",
eigs[i].re
);
}
}
#[test]
fn test_polar_svd_nonsquare_error() {
let a = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
assert!(polar_decomp_svd(&a.view()).is_err());
}
#[test]
fn test_polar_svd_3x3() {
let a = array![[2.0_f64, 1.0, 0.0], [0.0, 3.0, 1.0], [1.0, 0.0, 2.0]];
let result = polar_decomp_svd(&a.view()).expect("ok");
let reconstructed = matmul(&result.u, &result.h);
for i in 0..3 {
for j in 0..3 {
assert_relative_eq!(reconstructed[[i, j]], a[[i, j]], epsilon = 1e-10);
}
}
}
// --- polar_decomp_newton_schulz tests ---
#[test]
fn test_polar_ns_identity() {
let a = array![[1.0_f64, 0.0], [0.0, 1.0]];
let result = polar_decomp_newton_schulz(&a.view(), 100, 1e-14).expect("ok");
for i in 0..2 {
for j in 0..2 {
let expected = if i == j { 1.0 } else { 0.0 };
assert_relative_eq!(result.u[[i, j]], expected, epsilon = 1e-10);
}
}
}
#[test]
fn test_polar_ns_reconstruction() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_newton_schulz(&a.view(), 100, 1e-14).expect("ok");
let reconstructed = matmul(&result.u, &result.h);
for i in 0..2 {
for j in 0..2 {
assert_relative_eq!(reconstructed[[i, j]], a[[i, j]], epsilon = 1e-8);
}
}
}
#[test]
fn test_polar_ns_h_symmetric() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_newton_schulz(&a.view(), 100, 1e-14).expect("ok");
check_symmetric(&result.h, 1e-8);
}
#[test]
fn test_polar_ns_u_orthogonal() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_newton_schulz(&a.view(), 100, 1e-14).expect("ok");
check_orthogonal(&result.u, 1e-8);
}
#[test]
fn test_polar_ns_nonsquare_error() {
let a = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
assert!(polar_decomp_newton_schulz(&a.view(), 100, 1e-14).is_err());
}
// --- polar_decomp_halley tests ---
#[test]
fn test_polar_halley_identity() {
let a = array![[1.0_f64, 0.0], [0.0, 1.0]];
let result = polar_decomp_halley(&a.view(), 50, 1e-14).expect("ok");
for i in 0..2 {
for j in 0..2 {
let expected = if i == j { 1.0 } else { 0.0 };
assert_relative_eq!(result.u[[i, j]], expected, epsilon = 1e-8);
}
}
}
#[test]
fn test_polar_halley_reconstruction() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_halley(&a.view(), 50, 1e-14).expect("ok");
let reconstructed = matmul(&result.u, &result.h);
for i in 0..2 {
for j in 0..2 {
assert_relative_eq!(reconstructed[[i, j]], a[[i, j]], epsilon = 1e-6);
}
}
}
#[test]
fn test_polar_halley_h_symmetric() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_halley(&a.view(), 50, 1e-14).expect("ok");
check_symmetric(&result.h, 1e-6);
}
#[test]
fn test_polar_halley_u_orthogonal() {
let a = array![[2.0_f64, 1.0], [1.0, 3.0]];
let result = polar_decomp_halley(&a.view(), 50, 1e-14).expect("ok");
check_orthogonal(&result.u, 1e-6);
}
#[test]
fn test_polar_halley_nonsquare_error() {
let a = array![[1.0_f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
assert!(polar_decomp_halley(&a.view(), 50, 1e-14).is_err());
}
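// A hedged additional check (sketch, loose tolerances): on a well-conditioned
// 3x3 matrix the Halley iteration (or its fallback) should reproduce A = U * H
// with an orthogonal U and a symmetric H.
#[test]
fn test_polar_halley_3x3_reconstruction() {
    let a = array![[2.0_f64, 1.0, 0.0], [0.0, 3.0, 1.0], [1.0, 0.0, 2.0]];
    let result = polar_decomp_halley(&a.view(), 50, 1e-12).expect("ok");
    let reconstructed = matmul(&result.u, &result.h);
    for i in 0..3 {
        for j in 0..3 {
            assert_relative_eq!(reconstructed[[i, j]], a[[i, j]], epsilon = 1e-6);
        }
    }
    check_orthogonal(&result.u, 1e-6);
    check_symmetric(&result.h, 1e-6);
}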
}