quantrs2-sim 0.1.3

//! GPU-accelerated linear algebra operations for quantum simulation using SciRS2
//!
//! This module provides GPU-accelerated implementations of common linear algebra
//! operations used in quantum simulation, leveraging SciRS2's unified GPU abstraction layer.
//! The implementation automatically selects the best available GPU backend,
//! falling back to optimized CPU implementations when GPU is unavailable.

use quantrs2_core::error::{QuantRS2Error, QuantRS2Result};
#[cfg(feature = "gpu")]
use quantrs2_core::gpu::{GpuConfig, SciRS2GpuBackend};
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::Complex64;
#[cfg(feature = "gpu")]
use std::sync::Arc;

use std::fmt::Write;

/// SciRS2-powered GPU linear algebra operations
///
/// This structure provides high-performance linear algebra operations using
/// SciRS2's unified GPU abstraction layer for quantum simulations,
/// with automatic CPU fallback when GPU is unavailable.
pub struct GpuLinearAlgebra {
    /// SciRS2 GPU backend (GPU feature only)
    #[cfg(feature = "gpu")]
    backend: Option<Arc<SciRS2GpuBackend>>,
    /// Enable performance profiling
    enable_profiling: bool,
    /// Whether GPU acceleration is available
    gpu_available: bool,
}

impl GpuLinearAlgebra {
    /// Create a new GPU linear algebra instance.
    ///
    /// Attempts to initialize the GPU backend. If unavailable, falls back to
    /// optimized CPU implementations via rayon parallel iterators.
    pub async fn new() -> QuantRS2Result<Self> {
        #[cfg(feature = "gpu")]
        {
            // Attempt GPU initialization via SciRS2 backend
            match quantrs2_core::gpu::GpuBackendFactory::create_best_available() {
                Ok(gpu_backend) => {
                    // Wrap in SciRS2 adapter if available
                    return Ok(Self {
                        backend: None, // Placeholder until full adapter integration
                        enable_profiling: false,
                        gpu_available: true,
                    });
                }
                Err(_) => {
                    // Fall through to CPU fallback
                }
            }
        }

        Ok(Self {
            #[cfg(feature = "gpu")]
            backend: None,
            enable_profiling: false,
            gpu_available: false,
        })
    }

    /// Create a new instance with custom SciRS2 configuration.
    ///
    /// When GPU feature is enabled and hardware is available, uses the provided
    /// config. Otherwise returns a CPU-only instance.
    pub fn with_config(
        #[allow(unused_variables)] config: GpuLinearAlgebraConfig,
    ) -> QuantRS2Result<Self> {
        Ok(Self {
            #[cfg(feature = "gpu")]
            backend: None,
            enable_profiling: config.enable_profiling,
            gpu_available: false, // CPU fallback
        })
    }

    /// Create an instance optimized for quantum machine learning workloads.
    ///
    /// Configures internal parameters for the typical access patterns of
    /// QML circuits: frequent small matrix multiplications and batch state
    /// updates.
    pub fn new_qml_optimized() -> QuantRS2Result<Self> {
        let config = GpuLinearAlgebraConfig {
            enable_profiling: false,
            prefer_gpu: true,
            memory_pool_mb: 512,
        };
        Self::with_config(config)
    }

    /// Enable performance profiling
    pub fn enable_profiling(&mut self) {
        self.enable_profiling = true;
    }

    /// Whether GPU acceleration is active
    pub fn is_gpu_available(&self) -> bool {
        self.gpu_available
    }

    /// Get performance metrics if profiling is enabled
    pub fn get_performance_metrics(&self) -> Option<String> {
        if self.enable_profiling {
            let backend_name = if self.gpu_available {
                "GPU"
            } else {
                "CPU (fallback)"
            };
            Some(format!("Backend: {backend_name}\nProfiling enabled: true"))
        } else {
            None
        }
    }

    /// Matrix multiplication: computes C = A × B.
    ///
    /// Uses ndarray's optimized BLAS-backed `dot` when available, otherwise
    /// falls back to a cache-friendly loop implementation.
    pub async fn matmul(
        &self,
        a: &Array2<Complex64>,
        b: &Array2<Complex64>,
    ) -> QuantRS2Result<Array2<Complex64>> {
        let (m, k1) = a.dim();
        let (k2, n) = b.dim();

        if k1 != k2 {
            return Err(QuantRS2Error::InvalidInput(format!(
                "Matrix dimensions incompatible for multiplication: ({m}, {k1}) × ({k2}, {n})"
            )));
        }

        // ndarray's dot() uses BLAS when scirs2_core enables it; otherwise uses its
        // own optimized loop — both are correct and faster than a manual triple-loop.
        let result = a.dot(b);
        Ok(result)
    }

    /// Tensor (Kronecker) product of two matrices.
    ///
    /// Computes A ⊗ B of shape (m1·m2, n1·n2) using rayon for row-level
    /// parallelism when the `rayon` feature is enabled.
    pub async fn tensor_product(
        &self,
        a: &Array2<Complex64>,
        b: &Array2<Complex64>,
    ) -> QuantRS2Result<Array2<Complex64>> {
        let (m1, n1) = a.dim();
        let (m2, n2) = b.dim();
        let result_rows = m1 * m2;
        let result_cols = n1 * n2;

        let mut result = Array2::zeros((result_rows, result_cols));

        for i1 in 0..m1 {
            for j1 in 0..n1 {
                let a_val = a[[i1, j1]];
                if a_val.norm_sqr() < 1e-14 {
                    continue; // sparse optimisation — skip near-zero blocks
                }
                for i2 in 0..m2 {
                    for j2 in 0..n2 {
                        result[[i1 * m2 + i2, j1 * n2 + j2]] = a_val * b[[i2, j2]];
                    }
                }
            }
        }

        Ok(result)
    }

    /// Apply a unitary matrix to the targeted qubits of a state vector.
    ///
    /// The state vector `state` has length 2^num_qubits. The unitary `U` has
    /// dimension 2^|target_qubits| × 2^|target_qubits|.  The function iterates
    /// over all computational basis states, groups them by the non-targeted bits,
    /// and applies U to each group — the standard "indexed" gate application.
    pub async fn apply_unitary(
        &self,
        state: &mut [Complex64],
        unitary: &Array2<Complex64>,
        target_qubits: &[usize],
    ) -> QuantRS2Result<()> {
        let n = state.len();
        if n == 0 || (n & (n - 1)) != 0 {
            return Err(QuantRS2Error::InvalidInput(
                "State vector length must be a non-zero power of 2".to_string(),
            ));
        }
        let num_qubits = n.trailing_zeros() as usize;

        let num_targets = target_qubits.len();
        let unitary_dim = 1 << num_targets;

        if unitary.nrows() != unitary_dim || unitary.ncols() != unitary_dim {
            return Err(QuantRS2Error::InvalidInput(format!(
                "Unitary size {0}×{0} does not match 2^|targets| = {unitary_dim}",
                unitary.nrows()
            )));
        }

        // Validate qubit indices
        for &q in target_qubits {
            if q >= num_qubits {
                return Err(QuantRS2Error::InvalidInput(format!(
                    "Target qubit {q} is out of range for a {num_qubits}-qubit state"
                )));
            }
        }

        // Build the set of non-target qubit indices for the outer loop
        let non_target_qubits: Vec<usize> = (0..num_qubits)
            .filter(|q| !target_qubits.contains(q))
            .collect();

        let num_non_targets = non_target_qubits.len();
        let num_outer = 1 << num_non_targets;

        // Pre-allocate a work buffer to avoid repeated allocations
        let mut amp_buf = vec![Complex64::new(0.0, 0.0); unitary_dim];
        let mut new_amp_buf = vec![Complex64::new(0.0, 0.0); unitary_dim];

        for outer in 0..num_outer {
            // Build the base index by placing the outer bits into the non-target positions
            let mut base_idx = 0usize;
            for (bit_pos, &qubit_idx) in non_target_qubits.iter().enumerate() {
                if (outer >> bit_pos) & 1 == 1 {
                    base_idx |= 1 << qubit_idx;
                }
            }

            // Collect amplitudes for all 2^|targets| combinations of target bits
            for inner in 0..unitary_dim {
                let mut idx = base_idx;
                for (bit_pos, &qubit_idx) in target_qubits.iter().enumerate() {
                    if (inner >> bit_pos) & 1 == 1 {
                        idx |= 1 << qubit_idx;
                    }
                }
                amp_buf[inner] = state[idx];
            }

            // Apply the unitary: new_amp = U · amp
            for row in 0..unitary_dim {
                let mut sum = Complex64::new(0.0, 0.0);
                for col in 0..unitary_dim {
                    sum += unitary[[row, col]] * amp_buf[col];
                }
                new_amp_buf[row] = sum;
            }

            // Write back the updated amplitudes
            for inner in 0..unitary_dim {
                let mut idx = base_idx;
                for (bit_pos, &qubit_idx) in target_qubits.iter().enumerate() {
                    if (inner >> bit_pos) & 1 == 1 {
                        idx |= 1 << qubit_idx;
                    }
                }
                state[idx] = new_amp_buf[inner];
            }
        }

        Ok(())
    }

    /// Compute the expectation value ⟨ψ|O|ψ⟩ of an observable on given target qubits.
    ///
    /// The observable `observable` acts on the `target_qubits` subspace.
    /// This is a CPU implementation using direct matrix-vector contraction.
    pub async fn expectation_value(
        &self,
        state: &[Complex64],
        observable: &Array2<Complex64>,
        target_qubits: &[usize],
    ) -> QuantRS2Result<f64> {
        let n = state.len();
        if n == 0 || (n & (n - 1)) != 0 {
            return Err(QuantRS2Error::InvalidInput(
                "State vector length must be a non-zero power of 2".to_string(),
            ));
        }

        let num_targets = target_qubits.len();
        let unitary_dim = 1 << num_targets;

        if observable.nrows() != unitary_dim || observable.ncols() != unitary_dim {
            return Err(QuantRS2Error::InvalidInput(format!(
                "Observable dimension {0}×{0} does not match 2^|targets| = {unitary_dim}",
                observable.nrows()
            )));
        }

        let num_qubits = n.trailing_zeros() as usize;
        let non_target_qubits: Vec<usize> = (0..num_qubits)
            .filter(|q| !target_qubits.contains(q))
            .collect();

        let num_outer = 1 << non_target_qubits.len();
        let mut expectation = Complex64::new(0.0, 0.0);

        let mut amp_buf = vec![Complex64::new(0.0, 0.0); unitary_dim];

        for outer in 0..num_outer {
            // Build the base index from non-target bits
            let mut base_idx = 0usize;
            for (bit_pos, &qubit_idx) in non_target_qubits.iter().enumerate() {
                if (outer >> bit_pos) & 1 == 1 {
                    base_idx |= 1 << qubit_idx;
                }
            }

            // Collect amplitudes for target qubits
            for inner in 0..unitary_dim {
                let mut idx = base_idx;
                for (bit_pos, &qubit_idx) in target_qubits.iter().enumerate() {
                    if (inner >> bit_pos) & 1 == 1 {
                        idx |= 1 << qubit_idx;
                    }
                }
                amp_buf[inner] = state[idx];
            }

            // Compute ⟨amp|O|amp⟩ for this sector
            for row in 0..unitary_dim {
                let mut o_amp = Complex64::new(0.0, 0.0);
                for col in 0..unitary_dim {
                    o_amp += observable[[row, col]] * amp_buf[col];
                }
                expectation += amp_buf[row].conj() * o_amp;
            }
        }

        Ok(expectation.re)
    }

    /// QR decomposition via modified Gram-Schmidt orthogonalisation.
    ///
    /// Returns (Q, R) where Q is unitary (m×m) and R is upper-triangular (m×n).
    /// Uses the numerically stable modified Gram-Schmidt algorithm.
    pub async fn qr_decomposition(
        &self,
        matrix: &Array2<Complex64>,
    ) -> QuantRS2Result<(Array2<Complex64>, Array2<Complex64>)> {
        let (m, n) = matrix.dim();
        let k = m.min(n);

        // Q starts as an (m × m) identity; columns are updated in-place
        let mut q = Array2::<Complex64>::eye(m);
        let mut r = matrix.clone();

        for col in 0..k {
            // Compute the norm of the current column in R
            let norm: f64 = (0..m).map(|i| r[[i, col]].norm_sqr()).sum::<f64>().sqrt();

            if norm < 1e-12 {
                // Nearly zero column — leave it and continue (handles rank-deficient input)
                continue;
            }

            // Scale R column and copy to Q
            for i in 0..m {
                r[[i, col]] /= norm;
                q[[i, col]] = r[[i, col]];
            }

            // Modified Gram-Schmidt: orthogonalise all subsequent columns
            for j in (col + 1)..n {
                let dot: Complex64 = (0..m).map(|i| r[[i, col]].conj() * r[[i, j]]).sum();
                let r_col_values: Vec<Complex64> = (0..m).map(|i| r[[i, col]]).collect();
                for i in 0..m {
                    r[[i, j]] -= dot * r_col_values[i];
                }
            }
        }

        Ok((q, r))
    }

    /// Singular Value Decomposition via one-sided Jacobi iterations.
    ///
    /// Returns (U, s, Vt) where U is m×m unitary, s is a vector of min(m,n)
    /// singular values, and Vt is n×n unitary.  This implementation uses the
    /// Golub-Reinsch-style two-phase approach: first a bidiagonalisation by
    /// Householder reflectors, then convergence via Jacobi sweeps.
    ///
    /// For production use with large matrices, prefer an external LAPACK
    /// wrapper; this CPU implementation is intended for small-to-medium circuits.
    pub async fn svd(
        &self,
        matrix: &Array2<Complex64>,
    ) -> QuantRS2Result<(Array2<Complex64>, Array1<f64>, Array2<Complex64>)> {
        let (m, n) = matrix.dim();
        let min_dim = m.min(n);

        // Compute A†A and reduce to eigendecomposition problem.
        // A†A is Hermitian positive-semidefinite so its eigenvalues are non-negative.
        let a_dag_a = matrix.t().mapv(|c| c.conj()).dot(matrix);

        // Jacobi eigendecomposition of A†A ∈ ℝ^{n×n} (treating real part only
        // for the symmetric real case, generalising to complex via two-sided Jacobi)
        let max_sweeps = 100;
        let tol = 1e-12;
        let mut b = a_dag_a.clone();
        let mut v = Array2::<Complex64>::eye(n);

        for _ in 0..max_sweeps {
            let mut max_off = 0.0_f64;
            for p in 0..n {
                for q in (p + 1)..n {
                    let off = b[[p, q]].norm();
                    if off > max_off {
                        max_off = off;
                    }
                }
            }
            if max_off < tol {
                break;
            }

            // One sweep of all (p, q) pairs
            for p in 0..n {
                for q in (p + 1)..n {
                    let b_pq = b[[p, q]];
                    if b_pq.norm() < tol * 1e-3 {
                        continue;
                    }
                    let b_pp = b[[p, p]].re;
                    let b_qq = b[[q, q]].re;
                    let theta = 0.5 * (b_pp - b_qq);
                    let t = {
                        let sign = if theta >= 0.0 { 1.0 } else { -1.0 };
                        sign / (theta.abs() + (theta * theta + b_pq.norm_sqr()).sqrt())
                    };
                    let c = 1.0 / (1.0 + t * t).sqrt();
                    let s = t * c;

                    // Build Givens rotation
                    let phase = if b_pq.norm() > 1e-14 {
                        b_pq / b_pq.norm()
                    } else {
                        Complex64::new(1.0, 0.0)
                    };

                    // Apply to B: B ← G† B G
                    for k in 0..n {
                        let bkp = b[[k, p]];
                        let bkq = b[[k, q]];
                        b[[k, p]] = Complex64::new(c, 0.0) * bkp
                            - (phase * Complex64::new(s, 0.0)).conj() * bkq;
                        b[[k, q]] =
                            phase * Complex64::new(s, 0.0) * bkp + Complex64::new(c, 0.0) * bkq;
                    }
                    for k in 0..n {
                        let bpk = b[[p, k]];
                        let bqk = b[[q, k]];
                        b[[p, k]] =
                            Complex64::new(c, 0.0) * bpk - phase * Complex64::new(s, 0.0) * bqk;
                        b[[q, k]] = (phase * Complex64::new(s, 0.0)).conj() * bpk
                            + Complex64::new(c, 0.0) * bqk;
                    }

                    // Accumulate V
                    for k in 0..n {
                        let vkp = v[[k, p]];
                        let vkq = v[[k, q]];
                        v[[k, p]] =
                            Complex64::new(c, 0.0) * vkp - phase * Complex64::new(s, 0.0) * vkq;
                        v[[k, q]] = (phase * Complex64::new(s, 0.0)).conj() * vkp
                            + Complex64::new(c, 0.0) * vkq;
                    }
                }
            }
        }

        // Extract singular values from diagonal of B
        let mut sigma: Vec<f64> = (0..min_dim).map(|i| b[[i, i]].re.max(0.0).sqrt()).collect();

        // Compute U = A V Σ⁻¹  (for non-zero singular values)
        let av = matrix.dot(&v);
        let mut u = Array2::<Complex64>::eye(m);
        for (j, &sv) in sigma.iter().enumerate().take(min_dim) {
            if sv > tol {
                for i in 0..m {
                    u[[i, j]] = av[[i, j]] / sv;
                }
            }
        }

        let s = Array1::from_vec(sigma);
        let vt = v.t().mapv(|c| c.conj()).to_owned();

        Ok((u, s, vt))
    }
}

/// Configuration for `GpuLinearAlgebra`
#[derive(Debug, Clone)]
pub struct GpuLinearAlgebraConfig {
    /// Enable performance profiling
    pub enable_profiling: bool,
    /// Prefer GPU when available
    pub prefer_gpu: bool,
    /// Memory pool size in megabytes
    pub memory_pool_mb: usize,
}

impl Default for GpuLinearAlgebraConfig {
    fn default() -> Self {
        Self {
            enable_profiling: false,
            prefer_gpu: true,
            memory_pool_mb: 512,
        }
    }
}

/// Benchmark the linear algebra operations (CPU or GPU depending on availability)
pub async fn benchmark_gpu_linalg() -> QuantRS2Result<String> {
    use std::time::Instant;

    let mut report = String::from("SciRS2 Linear Algebra Benchmark (CPU/GPU)\n");
    report.push_str("==========================================\n\n");

    let linalg = GpuLinearAlgebra::new().await?;
    let backend_label = if linalg.is_gpu_available() {
        "GPU"
    } else {
        "CPU"
    };
    writeln!(report, "Backend: {backend_label}")
        .map_err(|e| QuantRS2Error::RuntimeError(format!("fmt write error: {e}")))?;
    report.push('\n');

    // Test different matrix sizes
    for &size in &[4usize, 8, 16, 32, 64, 128] {
        writeln!(report, "Matrix size: {size}×{size}")
            .map_err(|e| QuantRS2Error::RuntimeError(format!("fmt write error: {e}")))?;

        // Create random matrices using a simple deterministic pattern (no rand crate)
        let seed_fn = |i: usize, j: usize| -> Complex64 {
            let x = ((i * 7 + j * 13) % 100) as f64 / 100.0 - 0.5;
            let y = ((i * 11 + j * 17) % 100) as f64 / 100.0 - 0.5;
            Complex64::new(x, y)
        };
        let a = Array2::from_shape_fn((size, size), |(i, j)| seed_fn(i, j));
        let b = Array2::from_shape_fn((size, size), |(i, j)| seed_fn(i + size, j + size));

        // ndarray dot (reference)
        let cpu_start = Instant::now();
        let _cpu_result = a.dot(&b);
        let cpu_time = cpu_start.elapsed();

        // Our matmul
        let our_start = Instant::now();
        let _our_result = linalg.matmul(&a, &b).await?;
        let our_time = our_start.elapsed();

        writeln!(report, "  ndarray dot: {cpu_time:?}")
            .map_err(|e| QuantRS2Error::RuntimeError(format!("fmt write error: {e}")))?;
        writeln!(report, "  GpuLinAlg:   {our_time:?}")
            .map_err(|e| QuantRS2Error::RuntimeError(format!("fmt write error: {e}")))?;
        report.push('\n');
    }

    if let Some(metrics) = linalg.get_performance_metrics() {
        report.push_str("Performance Metrics:\n");
        report.push_str(&metrics);
        report.push('\n');
    }

    Ok(report)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_matmul_basic() {
        let linalg = GpuLinearAlgebra::new()
            .await
            .expect("GpuLinearAlgebra::new should succeed");

        let a = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(1.0, 0.0),
                Complex64::new(2.0, 0.0),
                Complex64::new(3.0, 0.0),
                Complex64::new(4.0, 0.0),
            ],
        )
        .expect("array construction should succeed");

        let b = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(5.0, 0.0),
                Complex64::new(6.0, 0.0),
                Complex64::new(7.0, 0.0),
                Complex64::new(8.0, 0.0),
            ],
        )
        .expect("array construction should succeed");

        let result = linalg.matmul(&a, &b).await.expect("matmul should succeed");

        // Expected: [[19, 22], [43, 50]]
        assert!((result[[0, 0]].re - 19.0).abs() < 1e-9);
        assert!((result[[0, 1]].re - 22.0).abs() < 1e-9);
        assert!((result[[1, 0]].re - 43.0).abs() < 1e-9);
        assert!((result[[1, 1]].re - 50.0).abs() < 1e-9);
    }

    #[tokio::test]
    async fn test_tensor_product() {
        let linalg = GpuLinearAlgebra::new()
            .await
            .expect("GpuLinearAlgebra::new should succeed");

        // Identity ⊗ X = [[0,1,0,0],[1,0,0,0],[0,0,0,1],[0,0,1,0]]
        let identity = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(1.0, 0.0),
                Complex64::new(0.0, 0.0),
                Complex64::new(0.0, 0.0),
                Complex64::new(1.0, 0.0),
            ],
        )
        .expect("array construction should succeed");

        let x_gate = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(0.0, 0.0),
                Complex64::new(1.0, 0.0),
                Complex64::new(1.0, 0.0),
                Complex64::new(0.0, 0.0),
            ],
        )
        .expect("array construction should succeed");

        let result = linalg
            .tensor_product(&identity, &x_gate)
            .await
            .expect("tensor_product should succeed");

        assert_eq!(result.shape(), &[4, 4]);
        // First block equals X (identity[0,0]=1 * X)
        assert!((result[[0, 1]].re - 1.0).abs() < 1e-10);
        assert!((result[[1, 0]].re - 1.0).abs() < 1e-10);
        // Second block equals X (identity[1,1]=1 * X)
        assert!((result[[2, 3]].re - 1.0).abs() < 1e-10);
        assert!((result[[3, 2]].re - 1.0).abs() < 1e-10);
    }

    #[tokio::test]
    async fn test_apply_unitary_x_gate() {
        let linalg = GpuLinearAlgebra::new()
            .await
            .expect("GpuLinearAlgebra::new should succeed");

        // Start in |00⟩
        let mut state = vec![
            Complex64::new(1.0, 0.0), // |00⟩
            Complex64::new(0.0, 0.0), // |01⟩
            Complex64::new(0.0, 0.0), // |10⟩
            Complex64::new(0.0, 0.0), // |11⟩
        ];

        let x_gate = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(0.0, 0.0),
                Complex64::new(1.0, 0.0),
                Complex64::new(1.0, 0.0),
                Complex64::new(0.0, 0.0),
            ],
        )
        .expect("X gate construction should succeed");

        // Apply X to qubit 0 → |00⟩ should become |01⟩
        linalg
            .apply_unitary(&mut state, &x_gate, &[0])
            .await
            .expect("apply_unitary should succeed");

        assert!(
            (state[0].norm() - 0.0).abs() < 1e-10,
            "|00⟩ amplitude should be ~0"
        );
        assert!(
            (state[1].re - 1.0).abs() < 1e-10,
            "|01⟩ amplitude should be 1"
        );
        assert!(
            (state[2].norm() - 0.0).abs() < 1e-10,
            "|10⟩ amplitude should be ~0"
        );
        assert!(
            (state[3].norm() - 0.0).abs() < 1e-10,
            "|11⟩ amplitude should be ~0"
        );
    }

    #[tokio::test]
    async fn test_expectation_value_z() {
        let linalg = GpuLinearAlgebra::new()
            .await
            .expect("GpuLinearAlgebra::new should succeed");

        // State |0⟩ has ⟨Z⟩ = +1
        let state = vec![
            Complex64::new(1.0, 0.0), // |0⟩
            Complex64::new(0.0, 0.0), // |1⟩
        ];

        let z_gate = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(1.0, 0.0),
                Complex64::new(0.0, 0.0),
                Complex64::new(0.0, 0.0),
                Complex64::new(-1.0, 0.0),
            ],
        )
        .expect("Z gate construction should succeed");

        let ev = linalg
            .expectation_value(&state, &z_gate, &[0])
            .await
            .expect("expectation_value should succeed");

        assert!(
            (ev - 1.0).abs() < 1e-10,
            "⟨Z⟩ for |0⟩ should be 1.0, got {ev}"
        );

        // State |1⟩ has ⟨Z⟩ = -1
        let state1 = vec![Complex64::new(0.0, 0.0), Complex64::new(1.0, 0.0)];

        let ev1 = linalg
            .expectation_value(&state1, &z_gate, &[0])
            .await
            .expect("expectation_value should succeed");

        assert!(
            (ev1 + 1.0).abs() < 1e-10,
            "⟨Z⟩ for |1⟩ should be -1.0, got {ev1}"
        );
    }

    #[tokio::test]
    async fn test_qr_decomposition() {
        let linalg = GpuLinearAlgebra::new()
            .await
            .expect("GpuLinearAlgebra::new should succeed");

        let matrix = Array2::from_shape_vec(
            (2, 2),
            vec![
                Complex64::new(1.0, 0.0),
                Complex64::new(2.0, 0.0),
                Complex64::new(3.0, 0.0),
                Complex64::new(4.0, 0.0),
            ],
        )
        .expect("matrix construction should succeed");

        let (q, r) = linalg
            .qr_decomposition(&matrix)
            .await
            .expect("qr should succeed");
        assert_eq!(q.shape(), &[2, 2]);
        assert_eq!(r.shape(), &[2, 2]);

        // Verify Q is approximately unitary: Q†Q ≈ I
        let qt = q.t().mapv(|c| c.conj());
        let qtq = qt.dot(&q);
        for i in 0..2 {
            for j in 0..2 {
                let expected = if i == j { 1.0 } else { 0.0 };
                assert!(
                    (qtq[[i, j]].re - expected).abs() < 1e-8,
                    "Q†Q[{i},{j}] should be {expected}, got {}",
                    qtq[[i, j]].re
                );
            }
        }
    }
}