Skip to main content

prism_q/backend/statevector/
mod.rs

1//! Full state-vector simulation backend.
2//!
3//! Stores the complete 2^n amplitude vector and applies gates via direct
4//! index manipulation. This is the reference backend,
5//! but memory-limited to ~25-30 qubits on typical hardware.
6//!
7//! # Memory layout
8//!
9//! State is a contiguous `Vec<Complex64>` of length 2^n, indexed by computational
10//! basis state in standard binary order (qubit 0 = least significant bit).
11//!
12//! # Hot-path design
13//!
14//! - Single-qubit gates: iterate 2^(n-1) pairs with stride 2^target.
15//! - CX/CZ/SWAP: specialized routines avoid materializing a 4×4 matrix.
16//! - All gate kernels are `#[inline(always)]` to enable LTO to inline across
17//!   the dispatch boundary.
18//! - No heap allocation in per-amplitude inner loops. QFT twiddle tables may
19//!   allocate once outside the loop and are bounded by cache policy.
20//!
21//! # Threading strategy
22//!
23//! The pair-iteration loops are embarrassingly parallel. When the `parallel`
24//! feature is enabled and the qubit count meets `PARALLEL_THRESHOLD_QUBITS`
25//! (default: 14), each kernel dispatches to a Rayon-parallelized variant
26//! using `par_chunks_mut` / `split_at_mut`. Most partitioning uses safe
27//! slices; a few hot kernels use raw pointers after proving disjoint access.
28//! The sequential path is unchanged when the feature is off or the circuit is
29//! below threshold.
30//!
31//! # SIMD strategy
32//!
33//! The single-qubit gate kernel uses explicit SIMD intrinsics via the shared
34//! `simd` module. Complex64 (2×f64 = 128 bits) maps to one `__m128d` register.
35//! Matrix entries are precomputed as broadcast pairs `[re, re]` / `[im, im]`.
36//! Runtime dispatch: AVX2+FMA (256-bit) > FMA (128-bit) > SSE2 > scalar.
37//!
38//! Two-qubit gate and measurement parallel inner loops use SIMD bulk helpers
39//! (`negate_slice`, `swap_slices`, `norm_sqr_sum`, `zero_slice`)
40//! that dispatch to AVX2 implementations when available.
41
42pub(crate) mod kernels;
43#[cfg(test)]
44mod tests;
45
46use num_complex::Complex64;
47use rand::SeedableRng;
48use rand_chacha::ChaCha8Rng;
49
50#[cfg(feature = "gpu")]
51use std::sync::Arc;
52
53use crate::backend::simd;
54use crate::backend::Backend;
55use crate::circuit::Instruction;
56#[cfg(feature = "gpu")]
57use crate::circuit::{qft_textbook_steps, QftTextbookStep};
58use crate::error::Result;
59use crate::gates::Gate;
60
61#[cfg(feature = "gpu")]
62use crate::gpu::{GpuContext, GpuState};
63
64#[cfg(feature = "parallel")]
65use rayon::prelude::*;
66
67#[cfg(feature = "parallel")]
68pub(crate) use super::{MIN_PAR_ELEMS, PARALLEL_THRESHOLD_QUBITS};
69
70#[cfg(feature = "gpu")]
71fn reduced_density_matrix_from_state(state: &[Complex64], qubit: usize) -> [[Complex64; 2]; 2] {
72    let half = 1usize << qubit;
73    let block_size = half << 1;
74    let mut p0 = 0.0f64;
75    let mut p1 = 0.0f64;
76    let mut r = Complex64::new(0.0, 0.0);
77
78    for block in state.chunks(block_size) {
79        let (lo, hi) = block.split_at(half);
80        for i in 0..half {
81            let a0 = lo[i];
82            let a1 = hi[i];
83            p0 += a0.norm_sqr();
84            p1 += a1.norm_sqr();
85            r += a1 * a0.conj();
86        }
87    }
88
89    [
90        [Complex64::new(p0, 0.0), r.conj()],
91        [r, Complex64::new(p1, 0.0)],
92    ]
93}
94
95/// Insert a zero bit at `bit_pos`, shifting all higher bits left by one.
96///
97/// Maps a compact iteration index to a state-vector index with a gap at
98/// `bit_pos`. Chaining multiple calls (in ascending `bit_pos` order) creates
99/// gaps at all controlled-gate qubit positions.
100#[inline(always)]
101pub(crate) fn insert_zero_bit(val: usize, bit_pos: usize) -> usize {
102    let lo = val & ((1 << bit_pos) - 1);
103    let hi = val >> bit_pos;
104    (hi << (bit_pos + 1)) | lo
105}
106
107/// Wrapper to send a raw pointer across Rayon threads.
108///
109/// SAFETY: Callers must ensure no data races. Each thread must access
110/// disjoint elements. The mask-based index bijection guarantees this for
111/// controlled-gate kernels.
112#[cfg(feature = "parallel")]
113#[derive(Copy, Clone)]
114pub(crate) struct SendPtr(pub(crate) *mut Complex64);
115
116#[cfg(feature = "parallel")]
117// SAFETY: SendPtr is only used by kernels that partition the state into
118// disjoint mutable indices before entering Rayon work.
119unsafe impl Send for SendPtr {}
120#[cfg(feature = "parallel")]
121// SAFETY: Sharing the pointer wrapper is sound because mutation happens only
122// through disjoint indices established by each caller's index bijection.
123unsafe impl Sync for SendPtr {}
124
125#[cfg(feature = "parallel")]
126impl SendPtr {
127    #[inline(always)]
128    pub(crate) unsafe fn load(self, idx: usize) -> Complex64 {
129        *self.0.add(idx)
130    }
131
132    #[inline(always)]
133    pub(crate) unsafe fn store(self, idx: usize, val: Complex64) {
134        *self.0.add(idx) = val;
135    }
136
137    #[inline(always)]
138    pub(crate) fn as_f64_ptr(self) -> *mut f64 {
139        self.0 as *mut f64
140    }
141}
142
143/// Full state-vector backend.
144pub struct StatevectorBackend {
145    pub(crate) num_qubits: usize,
146    pub(crate) state: Vec<Complex64>,
147    pub(crate) classical_bits: Vec<bool>,
148    pub(crate) rng: ChaCha8Rng,
149    pub(crate) pending_norm: f64,
150    #[cfg(feature = "gpu")]
151    gpu_context: Option<Arc<GpuContext>>,
152    #[cfg(feature = "gpu")]
153    gpu_state: Option<GpuState>,
154}
155
156/// Kill switch for the native whole-state `QftBlock` FFT path.
157///
158/// Set `PRISM_NO_QFT_BLOCK=1` to force textbook expansion for A/B runs.
159#[inline]
160fn qft_block_enabled() -> bool {
161    use std::sync::OnceLock;
162    static ENABLED: OnceLock<bool> = OnceLock::new();
163    *ENABLED.get_or_init(|| std::env::var_os("PRISM_NO_QFT_BLOCK").is_none())
164}
165
166impl StatevectorBackend {
167    /// Create a new statevector backend with the given RNG seed.
168    pub fn new(seed: u64) -> Self {
169        Self {
170            num_qubits: 0,
171            state: Vec::new(),
172            classical_bits: Vec::new(),
173            rng: ChaCha8Rng::seed_from_u64(seed),
174            pending_norm: 1.0,
175            #[cfg(feature = "gpu")]
176            gpu_context: None,
177            #[cfg(feature = "gpu")]
178            gpu_state: None,
179        }
180    }
181
182    /// Opt into GPU acceleration using the given shared execution context.
183    ///
184    /// When set, [`Backend::init`] allocates a device-resident state instead of a host
185    /// `Vec<Complex64>` and gate application routes through GPU kernels.
186    #[cfg(feature = "gpu")]
187    pub fn with_gpu(mut self, context: Arc<GpuContext>) -> Self {
188        self.gpu_context = Some(context);
189        self
190    }
191
192    #[cfg(feature = "gpu")]
193    fn apply_gpu(&mut self, instruction: &Instruction) -> Result<()> {
194        debug_assert!(
195            self.gpu_state.is_some(),
196            "apply_gpu called without gpu_state (callers must check self.gpu_state.is_some() first)"
197        );
198        // Unconditional GPU dispatch: every instruction routes to a kernel once
199        // `gpu_state` is Some. This path is the explicit opt-in surface
200        // (`StatevectorBackend::with_gpu(ctx)`) and intentionally skips the
201        // dispatch-level crossover. Direct `with_gpu` callers request
202        // kernel behavior. For size-based crossover plus decomposition-aware
203        // routing, enter via `sim::run_with(BackendKind::StatevectorGpu
204        // { context })`, which honors `gpu_min_qubits()` per sub-block.
205        //
206        // Caveat: Multi2q launches one kernel for each subgate (rare in practice).
207        match instruction {
208            Instruction::Gate { gate, targets } => self.dispatch_gate_gpu(gate, targets),
209            Instruction::Measure {
210                qubit,
211                classical_bit,
212            } => self.apply_measure_gpu(*qubit, *classical_bit),
213            Instruction::Reset { qubit } => self.apply_reset_gpu(*qubit),
214            Instruction::Barrier { .. } => Ok(()),
215            Instruction::Conditional {
216                condition,
217                gate,
218                targets,
219            } => {
220                if condition.evaluate(&self.classical_bits) {
221                    self.dispatch_gate_gpu(gate, targets)
222                } else {
223                    Ok(())
224                }
225            }
226        }
227    }
228
229    #[cfg(feature = "gpu")]
230    fn dispatch_gate_gpu(&mut self, gate: &Gate, targets: &[usize]) -> Result<()> {
231        use crate::gpu::kernels::dense as k;
232
233        let gpu = self
234            .gpu_state
235            .as_mut()
236            .expect("dispatch_gate_gpu called without gpu_state");
237        let ctx = gpu.context().clone();
238
239        match gate {
240            Gate::Rzz(theta) => k::launch_apply_rzz(&ctx, gpu, targets[0], targets[1], *theta),
241            Gate::Cx => k::launch_apply_cx(&ctx, gpu, targets[0], targets[1]),
242            Gate::Cz => k::launch_apply_cz(&ctx, gpu, targets[0], targets[1]),
243            Gate::Swap => k::launch_apply_swap(&ctx, gpu, targets[0], targets[1]),
244            Gate::Cu(mat) => {
245                if let Some(phase) = gate.controlled_phase() {
246                    k::launch_apply_cu_phase(&ctx, gpu, targets[0], targets[1], phase)
247                } else {
248                    k::launch_apply_cu(&ctx, gpu, targets[0], targets[1], **mat)
249                }
250            }
251            Gate::Mcu(data) => {
252                let num_ctrl = data.num_controls as usize;
253                let controls = &targets[..num_ctrl];
254                let target = targets[num_ctrl];
255                if let Some(phase) = gate.controlled_phase() {
256                    k::launch_apply_mcu_phase(&ctx, gpu, controls, target, phase)
257                } else {
258                    k::launch_apply_mcu(&ctx, gpu, controls, target, data.mat)
259                }
260            }
261            Gate::BatchPhase(data) => {
262                let control = targets[0];
263                k::launch_apply_batch_phase(&ctx, gpu, control, &data.phases)
264            }
265            Gate::BatchRzz(data) => k::launch_apply_batch_rzz(&ctx, gpu, &data.edges),
266            Gate::DiagonalBatch(data) => k::launch_apply_diagonal_batch(&ctx, gpu, &data.entries),
267            Gate::QftBlock { start, num } => {
268                let h = Gate::H.matrix_2x2();
269                for step in qft_textbook_steps(*start as usize, *num as usize) {
270                    match step {
271                        QftTextbookStep::Hadamard(q) => k::launch_apply_gate_1q(&ctx, gpu, q, h)?,
272                        QftTextbookStep::CPhase {
273                            control,
274                            target,
275                            theta,
276                        } => k::launch_apply_cu_phase(
277                            &ctx,
278                            gpu,
279                            control,
280                            target,
281                            Complex64::from_polar(1.0, theta),
282                        )?,
283                        QftTextbookStep::Swap(a, b) => k::launch_apply_swap(&ctx, gpu, a, b)?,
284                    }
285                }
286                Ok(())
287            }
288            Gate::MultiFused(data) => {
289                if data.all_diagonal {
290                    k::launch_apply_multi_fused_diagonal(&ctx, gpu, &data.gates)
291                } else {
292                    k::launch_apply_multi_fused_nondiag(&ctx, gpu, &data.gates)
293                }
294            }
295            Gate::Fused2q(mat) => k::launch_apply_fused_2q(&ctx, gpu, targets[0], targets[1], mat),
296            Gate::Multi2q(data) => {
297                for &(q0, q1, mat) in &data.gates {
298                    k::launch_apply_fused_2q(&ctx, gpu, q0, q1, &mat)?;
299                }
300                Ok(())
301            }
302            _ => {
303                let mat = gate.matrix_2x2();
304                if gate.is_diagonal_1q() {
305                    k::launch_apply_diagonal_1q(&ctx, gpu, targets[0], mat[0][0], mat[1][1])
306                } else {
307                    k::launch_apply_gate_1q(&ctx, gpu, targets[0], mat)
308                }
309            }
310        }
311    }
312
313    #[cfg(feature = "gpu")]
314    fn apply_measure_gpu(&mut self, qubit: usize, classical_bit: usize) -> Result<()> {
315        use crate::gpu::kernels::dense as k;
316        use rand::Rng;
317
318        let gpu = self
319            .gpu_state
320            .as_mut()
321            .expect("apply_measure_gpu called without gpu_state");
322        let ctx = gpu.context().clone();
323
324        let prob_one = k::measure_prob_one(&ctx, gpu, qubit)?;
325        let u: f64 = self.rng.random();
326        let outcome = u < prob_one;
327        self.classical_bits[classical_bit] = outcome;
328
329        k::measure_collapse(&ctx, gpu, qubit, outcome)?;
330
331        let inv_sqrt = crate::backend::measurement_inv_norm(outcome, prob_one);
332        gpu.set_pending_norm(gpu.pending_norm() * inv_sqrt);
333        Ok(())
334    }
335
336    #[cfg(feature = "gpu")]
337    fn apply_reset_gpu(&mut self, qubit: usize) -> Result<()> {
338        use crate::gpu::kernels::dense as k;
339
340        let gpu = self
341            .gpu_state
342            .as_mut()
343            .expect("apply_reset_gpu called without gpu_state");
344        let ctx = gpu.context().clone();
345
346        // Match CPU semantics (src/backend/statevector/kernels.rs::apply_reset):
347        // deterministic projection onto |0⟩ irrespective of the current amplitude on |1⟩.
348        let prob_one = k::measure_prob_one(&ctx, gpu, qubit)?;
349        let prob_zero = (1.0 - prob_one).clamp(0.0, 1.0);
350
351        k::measure_collapse(&ctx, gpu, qubit, false)?;
352
353        if prob_zero > crate::backend::NORM_CLAMP_MIN {
354            let inv_norm = 1.0 / prob_zero.sqrt();
355            gpu.set_pending_norm(gpu.pending_norm() * inv_norm);
356        } else {
357            // |0> half was empty, reinitialize to |0...0> (CPU does the same).
358            k::launch_set_initial_state(&ctx, gpu)?;
359            gpu.set_pending_norm(1.0);
360        }
361        Ok(())
362    }
363
364    #[cfg(feature = "gpu")]
365    fn apply_1q_matrix_gpu(&mut self, qubit: usize, matrix: &[[Complex64; 2]; 2]) -> Result<()> {
366        use crate::gpu::kernels::dense as k;
367
368        let gpu = self
369            .gpu_state
370            .as_mut()
371            .expect("apply_1q_matrix_gpu called without gpu_state");
372        let ctx = gpu.context().clone();
373
374        let is_diagonal = matrix[0][1].norm() < 1e-14 && matrix[1][0].norm() < 1e-14;
375        if is_diagonal {
376            k::launch_apply_diagonal_1q(&ctx, gpu, qubit, matrix[0][0], matrix[1][1])
377        } else {
378            k::launch_apply_gate_1q(&ctx, gpu, qubit, *matrix)
379        }
380    }
381
382    /// Read-only access to the raw amplitude vector.
383    ///
384    /// After measurements, amplitudes may be un-normalized due to deferred
385    /// normalization. Call [`export_statevector`](Self::export_statevector)
386    /// for a properly normalized copy.
387    pub fn state_vector(&self) -> &[Complex64] {
388        &self.state
389    }
390
391    /// Initialize the backend from a pre-computed state vector.
392    ///
393    /// Accepts ownership of the amplitude vector, bypassing the default |0...0⟩
394    /// initialization. The vector length must be a power of 2.
395    pub fn init_from_state(
396        &mut self,
397        state: Vec<Complex64>,
398        num_classical_bits: usize,
399    ) -> crate::error::Result<()> {
400        #[cfg(feature = "parallel")]
401        crate::backend::init_thread_pool();
402
403        let dim = state.len();
404        if !dim.is_power_of_two() || dim < 2 {
405            return Err(crate::error::PrismError::InvalidParameter {
406                message: format!(
407                    "state vector length must be a power of 2 and >= 2, got {}",
408                    dim
409                ),
410            });
411        }
412        self.num_qubits = dim.trailing_zeros() as usize;
413        self.state = state;
414        self.pending_norm = 1.0;
415        crate::backend::init_classical_bits(&mut self.classical_bits, num_classical_bits);
416        Ok(())
417    }
418
419    #[inline(always)]
420    fn dispatch_gate(&mut self, gate: &Gate, targets: &[usize]) {
421        match gate {
422            Gate::Rzz(theta) => self.apply_rzz(targets[0], targets[1], *theta),
423            Gate::Cx => self.apply_cx(targets[0], targets[1]),
424            Gate::Cz => self.apply_cz(targets[0], targets[1]),
425            Gate::Swap => self.apply_swap(targets[0], targets[1]),
426            Gate::Cu(mat) => {
427                if let Some(phase) = gate.controlled_phase() {
428                    self.apply_cu_phase(targets[0], targets[1], phase);
429                } else {
430                    self.apply_cu(targets[0], targets[1], **mat);
431                }
432            }
433            Gate::Mcu(data) => {
434                let num_ctrl = data.num_controls as usize;
435                if let Some(phase) = gate.controlled_phase() {
436                    self.apply_mcu_phase(&targets[..num_ctrl], targets[num_ctrl], phase);
437                } else {
438                    self.apply_mcu(&targets[..num_ctrl], targets[num_ctrl], data.mat);
439                }
440            }
441            Gate::BatchPhase(data) => {
442                self.apply_batch_phase(targets[0], &data.phases);
443            }
444            Gate::QftBlock { start, num } => {
445                let start = *start as usize;
446                let num = *num as usize;
447                if start == 0 && num == self.num_qubits {
448                    self.apply_qft_block(start, num);
449                } else {
450                    self.apply_qft_block_textbook(start, num);
451                }
452            }
453            Gate::BatchRzz(data) => {
454                self.apply_batch_rzz(&data.edges);
455            }
456            Gate::DiagonalBatch(data) => {
457                self.apply_diagonal_batch(&data.entries);
458            }
459            Gate::MultiFused(data) => {
460                if data.all_diagonal {
461                    self.apply_multi_1q_diagonal(&data.gates);
462                } else {
463                    self.apply_multi_1q(&data.gates);
464                }
465            }
466            Gate::Fused2q(mat) => {
467                self.apply_fused_2q(targets[0], targets[1], mat);
468            }
469            Gate::Multi2q(data) => {
470                self.apply_multi_2q(&data.gates);
471            }
472            _ => {
473                let mat = gate.matrix_2x2();
474                if gate.is_diagonal_1q() {
475                    self.apply_diagonal_gate(targets[0], mat[0][0], mat[1][1]);
476                } else {
477                    self.apply_single_gate(targets[0], mat);
478                }
479            }
480        }
481    }
482}
483
484impl Backend for StatevectorBackend {
485    fn name(&self) -> &'static str {
486        "statevector"
487    }
488
489    fn supports_qft_block(&self) -> bool {
490        if !qft_block_enabled() {
491            return false;
492        }
493        #[cfg(feature = "gpu")]
494        {
495            self.gpu_context.is_none()
496        }
497        #[cfg(not(feature = "gpu"))]
498        {
499            true
500        }
501    }
502
503    fn init(&mut self, num_qubits: usize, num_classical_bits: usize) -> Result<()> {
504        #[cfg(feature = "parallel")]
505        crate::backend::init_thread_pool();
506
507        self.num_qubits = num_qubits;
508        self.pending_norm = 1.0;
509        crate::backend::init_classical_bits(&mut self.classical_bits, num_classical_bits);
510
511        #[cfg(feature = "gpu")]
512        if let Some(ctx) = self.gpu_context.clone() {
513            self.state.clear();
514            self.gpu_state = Some(GpuState::new(ctx, num_qubits)?);
515            return Ok(());
516        }
517
518        let dim = 1usize << num_qubits;
519        if self.state.len() == dim {
520            self.state.fill(Complex64::new(0.0, 0.0));
521        } else {
522            self.state = vec![Complex64::new(0.0, 0.0); dim];
523        }
524        self.state[0] = Complex64::new(1.0, 0.0);
525        Ok(())
526    }
527
528    fn apply(&mut self, instruction: &Instruction) -> Result<()> {
529        #[cfg(feature = "gpu")]
530        if self.gpu_state.is_some() {
531            return self.apply_gpu(instruction);
532        }
533        match instruction {
534            Instruction::Gate { gate, targets } => self.dispatch_gate(gate, targets),
535            Instruction::Measure {
536                qubit,
537                classical_bit,
538            } => {
539                self.apply_measure(*qubit, *classical_bit);
540            }
541            Instruction::Reset { qubit } => {
542                self.apply_reset(*qubit);
543            }
544            Instruction::Barrier { .. } => {}
545            Instruction::Conditional {
546                condition,
547                gate,
548                targets,
549            } => {
550                if condition.evaluate(&self.classical_bits) {
551                    self.dispatch_gate(gate, targets);
552                }
553            }
554        }
555        Ok(())
556    }
557
558    fn classical_results(&self) -> &[bool] {
559        &self.classical_bits
560    }
561
562    fn probabilities(&self) -> Result<Vec<f64>> {
563        #[cfg(feature = "gpu")]
564        if let Some(gpu) = self.gpu_state.as_ref() {
565            return gpu.probabilities();
566        }
567        let norm_sq = self.pending_norm * self.pending_norm;
568        let dim = self.state.len();
569        let mut probs = vec![0.0_f64; dim];
570
571        #[cfg(feature = "parallel")]
572        if self.num_qubits >= PARALLEL_THRESHOLD_QUBITS {
573            let src_chunks = self.state.par_chunks(MIN_PAR_ELEMS);
574            let dst_chunks = probs.par_chunks_mut(MIN_PAR_ELEMS);
575            if norm_sq == 1.0 {
576                src_chunks.zip(dst_chunks).for_each(|(s, d)| {
577                    simd::norm_sqr_to_slice(s, d);
578                });
579            } else {
580                src_chunks.zip(dst_chunks).for_each(|(s, d)| {
581                    simd::norm_sqr_to_slice_scaled(s, d, norm_sq);
582                });
583            }
584            return Ok(probs);
585        }
586
587        if norm_sq == 1.0 {
588            simd::norm_sqr_to_slice(&self.state, &mut probs);
589        } else {
590            simd::norm_sqr_to_slice_scaled(&self.state, &mut probs, norm_sq);
591        }
592        Ok(probs)
593    }
594
595    fn num_qubits(&self) -> usize {
596        self.num_qubits
597    }
598
599    fn export_statevector(&self) -> crate::error::Result<Vec<Complex64>> {
600        #[cfg(feature = "gpu")]
601        if let Some(gpu) = self.gpu_state.as_ref() {
602            return gpu.export_statevector();
603        }
604        if self.pending_norm == 1.0 {
605            return Ok(self.state.clone());
606        }
607        let s = Complex64::new(self.pending_norm, 0.0);
608        Ok(self.state.iter().map(|&c| c * s).collect())
609    }
610
611    fn qubit_probability(&self, qubit: usize) -> crate::error::Result<f64> {
612        #[cfg(feature = "gpu")]
613        if let Some(gpu) = self.gpu_state.as_ref() {
614            return crate::gpu::kernels::dense::measure_prob_one(gpu.context(), gpu, qubit);
615        }
616        Ok(self.qubit_probability_one(qubit))
617    }
618
619    fn reduced_density_matrix_1q(&self, qubit: usize) -> Result<[[Complex64; 2]; 2]> {
620        #[cfg(feature = "gpu")]
621        if let Some(gpu) = self.gpu_state.as_ref() {
622            let state = gpu.export_statevector()?;
623            return Ok(reduced_density_matrix_from_state(&state, qubit));
624        }
625        Ok(self.reduced_density_matrix_one(qubit))
626    }
627
628    fn reset(&mut self, qubit: usize) -> Result<()> {
629        #[cfg(feature = "gpu")]
630        if self.gpu_state.is_some() {
631            return self.apply_reset_gpu(qubit);
632        }
633        self.apply_reset(qubit);
634        Ok(())
635    }
636
637    fn apply_1q_matrix(&mut self, qubit: usize, matrix: &[[Complex64; 2]; 2]) -> Result<()> {
638        #[cfg(feature = "gpu")]
639        if self.gpu_state.is_some() {
640            return self.apply_1q_matrix_gpu(qubit, matrix);
641        }
642        let is_diagonal = matrix[0][1].norm() < 1e-14 && matrix[1][0].norm() < 1e-14;
643        if is_diagonal {
644            self.apply_diagonal_gate(qubit, matrix[0][0], matrix[1][1]);
645        } else {
646            self.apply_single_gate(qubit, *matrix);
647        }
648        Ok(())
649    }
650}