1pub(crate) mod kernels;
43#[cfg(test)]
44mod tests;
45
46use num_complex::Complex64;
47use rand::SeedableRng;
48use rand_chacha::ChaCha8Rng;
49
50#[cfg(feature = "gpu")]
51use std::sync::Arc;
52
53use crate::backend::simd;
54use crate::backend::Backend;
55use crate::circuit::Instruction;
56#[cfg(feature = "gpu")]
57use crate::circuit::{qft_textbook_steps, QftTextbookStep};
58use crate::error::Result;
59use crate::gates::Gate;
60
61#[cfg(feature = "gpu")]
62use crate::gpu::{GpuContext, GpuState};
63
64#[cfg(feature = "parallel")]
65use rayon::prelude::*;
66
67#[cfg(feature = "parallel")]
68pub(crate) use super::{MIN_PAR_ELEMS, PARALLEL_THRESHOLD_QUBITS};
69
70#[cfg(feature = "gpu")]
71fn reduced_density_matrix_from_state(state: &[Complex64], qubit: usize) -> [[Complex64; 2]; 2] {
72 let half = 1usize << qubit;
73 let block_size = half << 1;
74 let mut p0 = 0.0f64;
75 let mut p1 = 0.0f64;
76 let mut r = Complex64::new(0.0, 0.0);
77
78 for block in state.chunks(block_size) {
79 let (lo, hi) = block.split_at(half);
80 for i in 0..half {
81 let a0 = lo[i];
82 let a1 = hi[i];
83 p0 += a0.norm_sqr();
84 p1 += a1.norm_sqr();
85 r += a1 * a0.conj();
86 }
87 }
88
89 [
90 [Complex64::new(p0, 0.0), r.conj()],
91 [r, Complex64::new(p1, 0.0)],
92 ]
93}
94
95#[inline(always)]
101pub(crate) fn insert_zero_bit(val: usize, bit_pos: usize) -> usize {
102 let lo = val & ((1 << bit_pos) - 1);
103 let hi = val >> bit_pos;
104 (hi << (bit_pos + 1)) | lo
105}
106
107#[cfg(feature = "parallel")]
113#[derive(Copy, Clone)]
114pub(crate) struct SendPtr(pub(crate) *mut Complex64);
115
116#[cfg(feature = "parallel")]
117unsafe impl Send for SendPtr {}
120#[cfg(feature = "parallel")]
121unsafe impl Sync for SendPtr {}
124
125#[cfg(feature = "parallel")]
126impl SendPtr {
127 #[inline(always)]
128 pub(crate) unsafe fn load(self, idx: usize) -> Complex64 {
129 *self.0.add(idx)
130 }
131
132 #[inline(always)]
133 pub(crate) unsafe fn store(self, idx: usize, val: Complex64) {
134 *self.0.add(idx) = val;
135 }
136
137 #[inline(always)]
138 pub(crate) fn as_f64_ptr(self) -> *mut f64 {
139 self.0 as *mut f64
140 }
141}
142
143pub struct StatevectorBackend {
145 pub(crate) num_qubits: usize,
146 pub(crate) state: Vec<Complex64>,
147 pub(crate) classical_bits: Vec<bool>,
148 pub(crate) rng: ChaCha8Rng,
149 pub(crate) pending_norm: f64,
150 #[cfg(feature = "gpu")]
151 gpu_context: Option<Arc<GpuContext>>,
152 #[cfg(feature = "gpu")]
153 gpu_state: Option<GpuState>,
154}
155
156#[inline]
160fn qft_block_enabled() -> bool {
161 use std::sync::OnceLock;
162 static ENABLED: OnceLock<bool> = OnceLock::new();
163 *ENABLED.get_or_init(|| std::env::var_os("PRISM_NO_QFT_BLOCK").is_none())
164}
165
166impl StatevectorBackend {
167 pub fn new(seed: u64) -> Self {
169 Self {
170 num_qubits: 0,
171 state: Vec::new(),
172 classical_bits: Vec::new(),
173 rng: ChaCha8Rng::seed_from_u64(seed),
174 pending_norm: 1.0,
175 #[cfg(feature = "gpu")]
176 gpu_context: None,
177 #[cfg(feature = "gpu")]
178 gpu_state: None,
179 }
180 }
181
182 #[cfg(feature = "gpu")]
187 pub fn with_gpu(mut self, context: Arc<GpuContext>) -> Self {
188 self.gpu_context = Some(context);
189 self
190 }
191
192 #[cfg(feature = "gpu")]
193 fn apply_gpu(&mut self, instruction: &Instruction) -> Result<()> {
194 debug_assert!(
195 self.gpu_state.is_some(),
196 "apply_gpu called without gpu_state (callers must check self.gpu_state.is_some() first)"
197 );
198 match instruction {
208 Instruction::Gate { gate, targets } => self.dispatch_gate_gpu(gate, targets),
209 Instruction::Measure {
210 qubit,
211 classical_bit,
212 } => self.apply_measure_gpu(*qubit, *classical_bit),
213 Instruction::Reset { qubit } => self.apply_reset_gpu(*qubit),
214 Instruction::Barrier { .. } => Ok(()),
215 Instruction::Conditional {
216 condition,
217 gate,
218 targets,
219 } => {
220 if condition.evaluate(&self.classical_bits) {
221 self.dispatch_gate_gpu(gate, targets)
222 } else {
223 Ok(())
224 }
225 }
226 }
227 }
228
229 #[cfg(feature = "gpu")]
230 fn dispatch_gate_gpu(&mut self, gate: &Gate, targets: &[usize]) -> Result<()> {
231 use crate::gpu::kernels::dense as k;
232
233 let gpu = self
234 .gpu_state
235 .as_mut()
236 .expect("dispatch_gate_gpu called without gpu_state");
237 let ctx = gpu.context().clone();
238
239 match gate {
240 Gate::Rzz(theta) => k::launch_apply_rzz(&ctx, gpu, targets[0], targets[1], *theta),
241 Gate::Cx => k::launch_apply_cx(&ctx, gpu, targets[0], targets[1]),
242 Gate::Cz => k::launch_apply_cz(&ctx, gpu, targets[0], targets[1]),
243 Gate::Swap => k::launch_apply_swap(&ctx, gpu, targets[0], targets[1]),
244 Gate::Cu(mat) => {
245 if let Some(phase) = gate.controlled_phase() {
246 k::launch_apply_cu_phase(&ctx, gpu, targets[0], targets[1], phase)
247 } else {
248 k::launch_apply_cu(&ctx, gpu, targets[0], targets[1], **mat)
249 }
250 }
251 Gate::Mcu(data) => {
252 let num_ctrl = data.num_controls as usize;
253 let controls = &targets[..num_ctrl];
254 let target = targets[num_ctrl];
255 if let Some(phase) = gate.controlled_phase() {
256 k::launch_apply_mcu_phase(&ctx, gpu, controls, target, phase)
257 } else {
258 k::launch_apply_mcu(&ctx, gpu, controls, target, data.mat)
259 }
260 }
261 Gate::BatchPhase(data) => {
262 let control = targets[0];
263 k::launch_apply_batch_phase(&ctx, gpu, control, &data.phases)
264 }
265 Gate::BatchRzz(data) => k::launch_apply_batch_rzz(&ctx, gpu, &data.edges),
266 Gate::DiagonalBatch(data) => k::launch_apply_diagonal_batch(&ctx, gpu, &data.entries),
267 Gate::QftBlock { start, num } => {
268 let h = Gate::H.matrix_2x2();
269 for step in qft_textbook_steps(*start as usize, *num as usize) {
270 match step {
271 QftTextbookStep::Hadamard(q) => k::launch_apply_gate_1q(&ctx, gpu, q, h)?,
272 QftTextbookStep::CPhase {
273 control,
274 target,
275 theta,
276 } => k::launch_apply_cu_phase(
277 &ctx,
278 gpu,
279 control,
280 target,
281 Complex64::from_polar(1.0, theta),
282 )?,
283 QftTextbookStep::Swap(a, b) => k::launch_apply_swap(&ctx, gpu, a, b)?,
284 }
285 }
286 Ok(())
287 }
288 Gate::MultiFused(data) => {
289 if data.all_diagonal {
290 k::launch_apply_multi_fused_diagonal(&ctx, gpu, &data.gates)
291 } else {
292 k::launch_apply_multi_fused_nondiag(&ctx, gpu, &data.gates)
293 }
294 }
295 Gate::Fused2q(mat) => k::launch_apply_fused_2q(&ctx, gpu, targets[0], targets[1], mat),
296 Gate::Multi2q(data) => {
297 for &(q0, q1, mat) in &data.gates {
298 k::launch_apply_fused_2q(&ctx, gpu, q0, q1, &mat)?;
299 }
300 Ok(())
301 }
302 _ => {
303 let mat = gate.matrix_2x2();
304 if gate.is_diagonal_1q() {
305 k::launch_apply_diagonal_1q(&ctx, gpu, targets[0], mat[0][0], mat[1][1])
306 } else {
307 k::launch_apply_gate_1q(&ctx, gpu, targets[0], mat)
308 }
309 }
310 }
311 }
312
313 #[cfg(feature = "gpu")]
314 fn apply_measure_gpu(&mut self, qubit: usize, classical_bit: usize) -> Result<()> {
315 use crate::gpu::kernels::dense as k;
316 use rand::Rng;
317
318 let gpu = self
319 .gpu_state
320 .as_mut()
321 .expect("apply_measure_gpu called without gpu_state");
322 let ctx = gpu.context().clone();
323
324 let prob_one = k::measure_prob_one(&ctx, gpu, qubit)?;
325 let u: f64 = self.rng.random();
326 let outcome = u < prob_one;
327 self.classical_bits[classical_bit] = outcome;
328
329 k::measure_collapse(&ctx, gpu, qubit, outcome)?;
330
331 let inv_sqrt = crate::backend::measurement_inv_norm(outcome, prob_one);
332 gpu.set_pending_norm(gpu.pending_norm() * inv_sqrt);
333 Ok(())
334 }
335
336 #[cfg(feature = "gpu")]
337 fn apply_reset_gpu(&mut self, qubit: usize) -> Result<()> {
338 use crate::gpu::kernels::dense as k;
339
340 let gpu = self
341 .gpu_state
342 .as_mut()
343 .expect("apply_reset_gpu called without gpu_state");
344 let ctx = gpu.context().clone();
345
346 let prob_one = k::measure_prob_one(&ctx, gpu, qubit)?;
349 let prob_zero = (1.0 - prob_one).clamp(0.0, 1.0);
350
351 k::measure_collapse(&ctx, gpu, qubit, false)?;
352
353 if prob_zero > crate::backend::NORM_CLAMP_MIN {
354 let inv_norm = 1.0 / prob_zero.sqrt();
355 gpu.set_pending_norm(gpu.pending_norm() * inv_norm);
356 } else {
357 k::launch_set_initial_state(&ctx, gpu)?;
359 gpu.set_pending_norm(1.0);
360 }
361 Ok(())
362 }
363
364 #[cfg(feature = "gpu")]
365 fn apply_1q_matrix_gpu(&mut self, qubit: usize, matrix: &[[Complex64; 2]; 2]) -> Result<()> {
366 use crate::gpu::kernels::dense as k;
367
368 let gpu = self
369 .gpu_state
370 .as_mut()
371 .expect("apply_1q_matrix_gpu called without gpu_state");
372 let ctx = gpu.context().clone();
373
374 let is_diagonal = matrix[0][1].norm() < 1e-14 && matrix[1][0].norm() < 1e-14;
375 if is_diagonal {
376 k::launch_apply_diagonal_1q(&ctx, gpu, qubit, matrix[0][0], matrix[1][1])
377 } else {
378 k::launch_apply_gate_1q(&ctx, gpu, qubit, *matrix)
379 }
380 }
381
382 pub fn state_vector(&self) -> &[Complex64] {
388 &self.state
389 }
390
391 pub fn init_from_state(
396 &mut self,
397 state: Vec<Complex64>,
398 num_classical_bits: usize,
399 ) -> crate::error::Result<()> {
400 #[cfg(feature = "parallel")]
401 crate::backend::init_thread_pool();
402
403 let dim = state.len();
404 if !dim.is_power_of_two() || dim < 2 {
405 return Err(crate::error::PrismError::InvalidParameter {
406 message: format!(
407 "state vector length must be a power of 2 and >= 2, got {}",
408 dim
409 ),
410 });
411 }
412 self.num_qubits = dim.trailing_zeros() as usize;
413 self.state = state;
414 self.pending_norm = 1.0;
415 crate::backend::init_classical_bits(&mut self.classical_bits, num_classical_bits);
416 Ok(())
417 }
418
419 #[inline(always)]
420 fn dispatch_gate(&mut self, gate: &Gate, targets: &[usize]) {
421 match gate {
422 Gate::Rzz(theta) => self.apply_rzz(targets[0], targets[1], *theta),
423 Gate::Cx => self.apply_cx(targets[0], targets[1]),
424 Gate::Cz => self.apply_cz(targets[0], targets[1]),
425 Gate::Swap => self.apply_swap(targets[0], targets[1]),
426 Gate::Cu(mat) => {
427 if let Some(phase) = gate.controlled_phase() {
428 self.apply_cu_phase(targets[0], targets[1], phase);
429 } else {
430 self.apply_cu(targets[0], targets[1], **mat);
431 }
432 }
433 Gate::Mcu(data) => {
434 let num_ctrl = data.num_controls as usize;
435 if let Some(phase) = gate.controlled_phase() {
436 self.apply_mcu_phase(&targets[..num_ctrl], targets[num_ctrl], phase);
437 } else {
438 self.apply_mcu(&targets[..num_ctrl], targets[num_ctrl], data.mat);
439 }
440 }
441 Gate::BatchPhase(data) => {
442 self.apply_batch_phase(targets[0], &data.phases);
443 }
444 Gate::QftBlock { start, num } => {
445 let start = *start as usize;
446 let num = *num as usize;
447 if start == 0 && num == self.num_qubits {
448 self.apply_qft_block(start, num);
449 } else {
450 self.apply_qft_block_textbook(start, num);
451 }
452 }
453 Gate::BatchRzz(data) => {
454 self.apply_batch_rzz(&data.edges);
455 }
456 Gate::DiagonalBatch(data) => {
457 self.apply_diagonal_batch(&data.entries);
458 }
459 Gate::MultiFused(data) => {
460 if data.all_diagonal {
461 self.apply_multi_1q_diagonal(&data.gates);
462 } else {
463 self.apply_multi_1q(&data.gates);
464 }
465 }
466 Gate::Fused2q(mat) => {
467 self.apply_fused_2q(targets[0], targets[1], mat);
468 }
469 Gate::Multi2q(data) => {
470 self.apply_multi_2q(&data.gates);
471 }
472 _ => {
473 let mat = gate.matrix_2x2();
474 if gate.is_diagonal_1q() {
475 self.apply_diagonal_gate(targets[0], mat[0][0], mat[1][1]);
476 } else {
477 self.apply_single_gate(targets[0], mat);
478 }
479 }
480 }
481 }
482}
483
484impl Backend for StatevectorBackend {
485 fn name(&self) -> &'static str {
486 "statevector"
487 }
488
489 fn supports_qft_block(&self) -> bool {
490 if !qft_block_enabled() {
491 return false;
492 }
493 #[cfg(feature = "gpu")]
494 {
495 self.gpu_context.is_none()
496 }
497 #[cfg(not(feature = "gpu"))]
498 {
499 true
500 }
501 }
502
503 fn init(&mut self, num_qubits: usize, num_classical_bits: usize) -> Result<()> {
504 #[cfg(feature = "parallel")]
505 crate::backend::init_thread_pool();
506
507 self.num_qubits = num_qubits;
508 self.pending_norm = 1.0;
509 crate::backend::init_classical_bits(&mut self.classical_bits, num_classical_bits);
510
511 #[cfg(feature = "gpu")]
512 if let Some(ctx) = self.gpu_context.clone() {
513 self.state.clear();
514 self.gpu_state = Some(GpuState::new(ctx, num_qubits)?);
515 return Ok(());
516 }
517
518 let dim = 1usize << num_qubits;
519 if self.state.len() == dim {
520 self.state.fill(Complex64::new(0.0, 0.0));
521 } else {
522 self.state = vec![Complex64::new(0.0, 0.0); dim];
523 }
524 self.state[0] = Complex64::new(1.0, 0.0);
525 Ok(())
526 }
527
528 fn apply(&mut self, instruction: &Instruction) -> Result<()> {
529 #[cfg(feature = "gpu")]
530 if self.gpu_state.is_some() {
531 return self.apply_gpu(instruction);
532 }
533 match instruction {
534 Instruction::Gate { gate, targets } => self.dispatch_gate(gate, targets),
535 Instruction::Measure {
536 qubit,
537 classical_bit,
538 } => {
539 self.apply_measure(*qubit, *classical_bit);
540 }
541 Instruction::Reset { qubit } => {
542 self.apply_reset(*qubit);
543 }
544 Instruction::Barrier { .. } => {}
545 Instruction::Conditional {
546 condition,
547 gate,
548 targets,
549 } => {
550 if condition.evaluate(&self.classical_bits) {
551 self.dispatch_gate(gate, targets);
552 }
553 }
554 }
555 Ok(())
556 }
557
558 fn classical_results(&self) -> &[bool] {
559 &self.classical_bits
560 }
561
562 fn probabilities(&self) -> Result<Vec<f64>> {
563 #[cfg(feature = "gpu")]
564 if let Some(gpu) = self.gpu_state.as_ref() {
565 return gpu.probabilities();
566 }
567 let norm_sq = self.pending_norm * self.pending_norm;
568 let dim = self.state.len();
569 let mut probs = vec![0.0_f64; dim];
570
571 #[cfg(feature = "parallel")]
572 if self.num_qubits >= PARALLEL_THRESHOLD_QUBITS {
573 let src_chunks = self.state.par_chunks(MIN_PAR_ELEMS);
574 let dst_chunks = probs.par_chunks_mut(MIN_PAR_ELEMS);
575 if norm_sq == 1.0 {
576 src_chunks.zip(dst_chunks).for_each(|(s, d)| {
577 simd::norm_sqr_to_slice(s, d);
578 });
579 } else {
580 src_chunks.zip(dst_chunks).for_each(|(s, d)| {
581 simd::norm_sqr_to_slice_scaled(s, d, norm_sq);
582 });
583 }
584 return Ok(probs);
585 }
586
587 if norm_sq == 1.0 {
588 simd::norm_sqr_to_slice(&self.state, &mut probs);
589 } else {
590 simd::norm_sqr_to_slice_scaled(&self.state, &mut probs, norm_sq);
591 }
592 Ok(probs)
593 }
594
595 fn num_qubits(&self) -> usize {
596 self.num_qubits
597 }
598
599 fn export_statevector(&self) -> crate::error::Result<Vec<Complex64>> {
600 #[cfg(feature = "gpu")]
601 if let Some(gpu) = self.gpu_state.as_ref() {
602 return gpu.export_statevector();
603 }
604 if self.pending_norm == 1.0 {
605 return Ok(self.state.clone());
606 }
607 let s = Complex64::new(self.pending_norm, 0.0);
608 Ok(self.state.iter().map(|&c| c * s).collect())
609 }
610
611 fn qubit_probability(&self, qubit: usize) -> crate::error::Result<f64> {
612 #[cfg(feature = "gpu")]
613 if let Some(gpu) = self.gpu_state.as_ref() {
614 return crate::gpu::kernels::dense::measure_prob_one(gpu.context(), gpu, qubit);
615 }
616 Ok(self.qubit_probability_one(qubit))
617 }
618
619 fn reduced_density_matrix_1q(&self, qubit: usize) -> Result<[[Complex64; 2]; 2]> {
620 #[cfg(feature = "gpu")]
621 if let Some(gpu) = self.gpu_state.as_ref() {
622 let state = gpu.export_statevector()?;
623 return Ok(reduced_density_matrix_from_state(&state, qubit));
624 }
625 Ok(self.reduced_density_matrix_one(qubit))
626 }
627
628 fn reset(&mut self, qubit: usize) -> Result<()> {
629 #[cfg(feature = "gpu")]
630 if self.gpu_state.is_some() {
631 return self.apply_reset_gpu(qubit);
632 }
633 self.apply_reset(qubit);
634 Ok(())
635 }
636
637 fn apply_1q_matrix(&mut self, qubit: usize, matrix: &[[Complex64; 2]; 2]) -> Result<()> {
638 #[cfg(feature = "gpu")]
639 if self.gpu_state.is_some() {
640 return self.apply_1q_matrix_gpu(qubit, matrix);
641 }
642 let is_diagonal = matrix[0][1].norm() < 1e-14 && matrix[1][0].norm() < 1e-14;
643 if is_diagonal {
644 self.apply_diagonal_gate(qubit, matrix[0][0], matrix[1][1]);
645 } else {
646 self.apply_single_gate(qubit, *matrix);
647 }
648 Ok(())
649 }
650}