Skip to main content

prism_q/sim/
noise.rs

1use num_complex::Complex64;
2use rand::SeedableRng;
3use rand_chacha::ChaCha8Rng;
4use smallvec::smallvec;
5
6use crate::backend::stabilizer::StabilizerBackend;
7use crate::backend::Backend;
8use crate::circuit::{Circuit, Instruction, SmallVec};
9use crate::error::Result;
10use crate::gates::Gate;
11use crate::sim::compiled::{
12    batch_propagate_backward, compile_measurements, xor_words, PackedShots,
13};
14use crate::sim::ShotsResult;
15
16/// A single-qubit (or two-qubit, for `TwoQubitDepolarizing`) noise channel.
17///
18/// Channels are applied by the trajectory engine after each gate whose
19/// `NoiseModel::after_gate` entry contains a matching `NoiseEvent`. For
20/// Pauli-only noise on Clifford circuits, the compiled stabilizer sampler
21/// is used instead, see [`NoiseModel::is_pauli_only`].
22#[derive(Debug, Clone)]
23pub enum NoiseChannel {
24    /// Independent Pauli X/Y/Z error with given per-branch probabilities.
25    Pauli { px: f64, py: f64, pz: f64 },
26    /// Symmetric depolarizing: `I` with probability `1-p`, each of `X/Y/Z`
27    /// with probability `p/3`.
28    Depolarizing { p: f64 },
29    /// T1 relaxation: `|1⟩ → |0⟩` with amplitude `sqrt(gamma)`.
30    AmplitudeDamping { gamma: f64 },
31    /// Pure dephasing: phase randomisation with amplitude `sqrt(gamma)` on the
32    /// `|1⟩` state. Populations are preserved.
33    PhaseDamping { gamma: f64 },
34    /// Combined T1 + T2 relaxation over a gate of duration `gate_time`.
35    /// Sampled as a population reset (prob. `1 - exp(-gate_time/t1)`) followed
36    /// by a pure-dephasing branch when `t2 < 2·t1`.
37    ThermalRelaxation { t1: f64, t2: f64, gate_time: f64 },
38    /// Symmetric two-qubit depolarizing: each of the 15 non-identity
39    /// two-qubit Paulis occurs with probability `p/15`.
40    TwoQubitDepolarizing { p: f64 },
41    /// General single-qubit channel described by a set of Kraus operators.
42    ///
43    /// Dense Kraus operators are supported by trajectory backends that expose
44    /// `Backend::reduced_density_matrix_1q`. Branch probabilities use the full
45    /// one-qubit reduced density matrix, so channels may depend on coherence
46    /// as well as populations. Call [`NoiseModel::validate`] in setup code to
47    /// catch empty or non-finite custom channels before sampling.
48    Custom { kraus: Vec<[[Complex64; 2]; 2]> },
49}
50
51impl NoiseChannel {
52    pub fn as_pauli(&self) -> Option<(f64, f64, f64)> {
53        match self {
54            NoiseChannel::Pauli { px, py, pz } => Some((*px, *py, *pz)),
55            NoiseChannel::Depolarizing { p } => {
56                let pp = p / 3.0;
57                Some((pp, pp, pp))
58            }
59            _ => None,
60        }
61    }
62
63    pub fn is_pauli(&self) -> bool {
64        matches!(
65            self,
66            NoiseChannel::Pauli { .. } | NoiseChannel::Depolarizing { .. }
67        )
68    }
69
70    /// Return true when the trajectory engine can sample this channel exactly.
71    pub fn is_exactly_samplable(&self) -> bool {
72        self.validate().is_ok()
73    }
74
75    /// Validate that this channel is usable by the trajectory engine.
76    pub fn validate(&self) -> Result<()> {
77        match self {
78            NoiseChannel::Pauli { px, py, pz } => {
79                validate_probability("px", *px)?;
80                validate_probability("py", *py)?;
81                validate_probability("pz", *pz)?;
82                if *px + *py + *pz > 1.0 + 1e-12 {
83                    return Err(crate::error::PrismError::InvalidParameter {
84                        message: "Pauli channel probabilities must sum to <= 1".into(),
85                    });
86                }
87            }
88            NoiseChannel::Depolarizing { p } | NoiseChannel::TwoQubitDepolarizing { p } => {
89                validate_probability("p", *p)?;
90            }
91            NoiseChannel::AmplitudeDamping { gamma } | NoiseChannel::PhaseDamping { gamma } => {
92                validate_probability("gamma", *gamma)?;
93            }
94            NoiseChannel::ThermalRelaxation { t1, t2, gate_time } => {
95                if !t1.is_finite() || *t1 <= 0.0 {
96                    return Err(crate::error::PrismError::InvalidParameter {
97                        message: "thermal relaxation t1 must be finite and positive".into(),
98                    });
99                }
100                if !t2.is_finite() || *t2 <= 0.0 {
101                    return Err(crate::error::PrismError::InvalidParameter {
102                        message: "thermal relaxation t2 must be finite and positive".into(),
103                    });
104                }
105                if !gate_time.is_finite() || *gate_time < 0.0 {
106                    return Err(crate::error::PrismError::InvalidParameter {
107                        message: "thermal relaxation gate_time must be finite and non-negative"
108                            .into(),
109                    });
110                }
111            }
112            NoiseChannel::Custom { kraus } => {
113                if kraus.is_empty() {
114                    return Err(crate::error::PrismError::InvalidParameter {
115                        message: "Custom Kraus channel must contain at least one operator".into(),
116                    });
117                }
118                for (op_idx, op) in kraus.iter().enumerate() {
119                    for (row_idx, row) in op.iter().enumerate() {
120                        for (col_idx, val) in row.iter().enumerate() {
121                            if !val.re.is_finite() || !val.im.is_finite() {
122                                return Err(crate::error::PrismError::InvalidParameter {
123                                    message: format!(
124                                        "Custom Kraus operator {op_idx} entry ({row_idx},{col_idx}) must be finite"
125                                    ),
126                                });
127                            }
128                        }
129                    }
130                }
131            }
132        }
133        Ok(())
134    }
135}
136
137fn validate_probability(name: &str, value: f64) -> Result<()> {
138    if !value.is_finite() || !(0.0..=1.0).contains(&value) {
139        return Err(crate::error::PrismError::InvalidParameter {
140            message: format!("{name} must be finite and in [0, 1]"),
141        });
142    }
143    Ok(())
144}
145
146#[derive(Debug, Clone)]
147pub struct NoiseEvent {
148    pub channel: NoiseChannel,
149    pub qubits: SmallVec<[usize; 2]>,
150}
151
152impl NoiseEvent {
153    pub fn pauli(qubit: usize, px: f64, py: f64, pz: f64) -> Self {
154        Self {
155            channel: NoiseChannel::Pauli { px, py, pz },
156            qubits: smallvec![qubit],
157        }
158    }
159
160    pub fn qubit(&self) -> usize {
161        self.qubits[0]
162    }
163
164    /// Return the per-Pauli probabilities for a Pauli/Depolarizing channel.
165    ///
166    /// # Panics
167    ///
168    /// Panics if the channel is not Pauli or Depolarizing. Callers on the
169    /// compiled stabilizer sampler path must guard with
170    /// [`NoiseModel::ensure_pauli_only`] before invoking this method.
171    pub fn pauli_probs(&self) -> (f64, f64, f64) {
172        self.channel
173            .as_pauli()
174            .expect("pauli_probs called on non-Pauli channel; caller must use ensure_pauli_only")
175    }
176}
177
178#[derive(Debug, Clone)]
179pub struct ReadoutError {
180    pub p01: f64,
181    pub p10: f64,
182}
183
184pub struct NoiseModel {
185    pub after_gate: Vec<Vec<NoiseEvent>>,
186    pub readout: Vec<Option<ReadoutError>>,
187}
188
189impl NoiseModel {
190    pub fn uniform_depolarizing(circuit: &Circuit, p: f64) -> Self {
191        let px = p / 3.0;
192        let py = p / 3.0;
193        let pz = p / 3.0;
194
195        let mut after_gate = Vec::with_capacity(circuit.instructions.len());
196        for instr in &circuit.instructions {
197            match instr {
198                Instruction::Gate { targets, .. } => {
199                    let ops: Vec<NoiseEvent> = targets
200                        .iter()
201                        .map(|&q| NoiseEvent::pauli(q, px, py, pz))
202                        .collect();
203                    after_gate.push(ops);
204                }
205                _ => {
206                    after_gate.push(Vec::new());
207                }
208            }
209        }
210
211        Self {
212            after_gate,
213            readout: vec![None; circuit.num_classical_bits],
214        }
215    }
216
217    pub fn with_amplitude_damping(circuit: &Circuit, gamma: f64) -> Self {
218        let mut after_gate = Vec::with_capacity(circuit.instructions.len());
219        for instr in &circuit.instructions {
220            match instr {
221                Instruction::Gate { targets, .. } => {
222                    let ops: Vec<NoiseEvent> = targets
223                        .iter()
224                        .map(|&q| NoiseEvent {
225                            channel: NoiseChannel::AmplitudeDamping { gamma },
226                            qubits: smallvec![q],
227                        })
228                        .collect();
229                    after_gate.push(ops);
230                }
231                _ => {
232                    after_gate.push(Vec::new());
233                }
234            }
235        }
236
237        Self {
238            after_gate,
239            readout: vec![None; circuit.num_classical_bits],
240        }
241    }
242
243    pub fn with_readout_error(&mut self, p01: f64, p10: f64) -> &mut Self {
244        self.readout.fill(Some(ReadoutError { p01, p10 }));
245        self
246    }
247
248    pub fn is_pauli_only(&self) -> bool {
249        self.after_gate
250            .iter()
251            .flat_map(|events| events.iter())
252            .all(|e| e.channel.is_pauli())
253            && self.readout.iter().all(|r| r.is_none())
254    }
255
256    pub fn has_noise(&self) -> bool {
257        self.after_gate.iter().any(|events| !events.is_empty())
258            || self.readout.iter().any(|r| r.is_some())
259    }
260
261    /// Return an error if the noise model contains any non-Pauli channels
262    /// or readout errors. Used as a precondition check by the compiled
263    /// sampler path, which can only handle Pauli noise on Clifford circuits.
264    pub fn ensure_pauli_only(&self) -> Result<()> {
265        if !self.is_pauli_only() {
266            return Err(crate::error::PrismError::IncompatibleBackend {
267                backend: "compiled stabilizer sampler".into(),
268                reason: "non-Pauli noise channels (amplitude damping, phase damping, \
269                         thermal relaxation, custom Kraus) or readout errors require \
270                         the trajectory engine; use a non-stabilizer backend or a \
271                         Pauli-only noise model"
272                    .into(),
273            });
274        }
275        Ok(())
276    }
277
278    /// Validate all channels and readout errors in this noise model.
279    pub fn validate(&self) -> Result<()> {
280        for (instr_idx, events) in self.after_gate.iter().enumerate() {
281            for (event_idx, event) in events.iter().enumerate() {
282                event.channel.validate().map_err(|err| {
283                    crate::error::PrismError::InvalidParameter {
284                        message: format!(
285                            "noise event {event_idx} after instruction {instr_idx}: {err}"
286                        ),
287                    }
288                })?;
289
290                let expected_qubits = match &event.channel {
291                    NoiseChannel::TwoQubitDepolarizing { .. } => 2,
292                    _ => 1,
293                };
294                if event.qubits.len() != expected_qubits {
295                    return Err(crate::error::PrismError::InvalidParameter {
296                        message: format!(
297                            "noise event {event_idx} after instruction {instr_idx}: expected {expected_qubits} qubit(s), got {}",
298                            event.qubits.len()
299                        ),
300                    });
301                }
302            }
303        }
304
305        for (bit_idx, readout) in self.readout.iter().enumerate() {
306            if let Some(readout) = readout {
307                validate_probability("readout p01", readout.p01).map_err(|err| {
308                    crate::error::PrismError::InvalidParameter {
309                        message: format!("readout error {bit_idx}: {err}"),
310                    }
311                })?;
312                validate_probability("readout p10", readout.p10).map_err(|err| {
313                    crate::error::PrismError::InvalidParameter {
314                        message: format!("readout error {bit_idx}: {err}"),
315                    }
316                })?;
317            }
318        }
319
320        Ok(())
321    }
322}
323
324struct FlatNoiseSensitivity {
325    x_data: Vec<u64>,
326    z_data: Vec<u64>,
327    probs: Vec<[f64; 3]>,
328    m_words: usize,
329}
330
331impl FlatNoiseSensitivity {
332    fn new(m_words: usize, capacity: usize) -> Self {
333        Self {
334            x_data: Vec::with_capacity(capacity * m_words),
335            z_data: Vec::with_capacity(capacity * m_words),
336            probs: Vec::with_capacity(capacity),
337            m_words,
338        }
339    }
340
341    fn push(&mut self, x_flip: &[u64], z_flip: &[u64], px: f64, py: f64, pz: f64) {
342        self.x_data.extend_from_slice(x_flip);
343        self.z_data.extend_from_slice(z_flip);
344        self.probs.push([px, py, pz]);
345    }
346
347    #[inline(always)]
348    fn len(&self) -> usize {
349        self.probs.len()
350    }
351
352    #[inline(always)]
353    fn is_empty(&self) -> bool {
354        self.probs.is_empty()
355    }
356
357    #[inline(always)]
358    fn x_flip(&self, idx: usize) -> &[u64] {
359        let off = idx * self.m_words;
360        &self.x_data[off..off + self.m_words]
361    }
362
363    #[inline(always)]
364    fn z_flip(&self, idx: usize) -> &[u64] {
365        let off = idx * self.m_words;
366        &self.z_data[off..off + self.m_words]
367    }
368}
369
370#[inline(always)]
371fn geometric_sample(rng: &mut ChaCha8Rng, ln_1mp: f64) -> usize {
372    let u: f64 = 1.0 - rand::Rng::random::<f64>(rng);
373    (u.ln() / ln_1mp) as usize
374}
375
376#[cfg(feature = "gpu")]
377fn geometric_sample_xoshiro(
378    rng: &mut crate::sim::compiled::rng::Xoshiro256PlusPlus,
379    ln_1mp: f64,
380) -> usize {
381    // `next_f64` returns [0, 1); shift to (0, 1] so `ln` stays finite.
382    let u: f64 = 1.0 - rng.next_f64();
383    (u.ln() / ln_1mp) as usize
384}
385
386/// RNG source for [`fill_one_event`]. Bundles uniform and geometric draws
387/// onto a single trait so the shared body can use one `&mut` borrow instead
388/// of two concurrent closures over the underlying generator.
389#[cfg(feature = "gpu")]
390trait EventRng {
391    fn uniform(&mut self) -> f64;
392    fn geom(&mut self, ln_1mp: f64) -> usize;
393}
394
395#[cfg(feature = "gpu")]
396impl EventRng for ChaCha8Rng {
397    #[inline]
398    fn uniform(&mut self) -> f64 {
399        rand::Rng::random::<f64>(self)
400    }
401    #[inline]
402    fn geom(&mut self, ln_1mp: f64) -> usize {
403        geometric_sample(self, ln_1mp)
404    }
405}
406
407#[cfg(feature = "gpu")]
408impl EventRng for crate::sim::compiled::rng::Xoshiro256PlusPlus {
409    #[inline]
410    fn uniform(&mut self) -> f64 {
411        self.next_f64()
412    }
413    #[inline]
414    fn geom(&mut self, ln_1mp: f64) -> usize {
415        geometric_sample_xoshiro(self, ln_1mp)
416    }
417}
418
419/// Fill one event's portion of the noise masks. Shared by the serial master
420/// ChaCha path and the parallel per-event Xoshiro path, so the bernoulli
421/// and geometric branches live in exactly one place.
422#[cfg(feature = "gpu")]
423#[inline]
424fn fill_one_event(
425    x_slot: &mut [u64],
426    z_slot: &mut [u64],
427    probs: [f64; 3],
428    chunk_shots: usize,
429    rng: &mut dyn EventRng,
430) {
431    let [px, py, pz] = probs;
432    let p_event = px + py + pz;
433    if p_event == 0.0 {
434        return;
435    }
436    let pxy = px + py;
437
438    // High-p or tiny chunk: a per-shot bernoulli sweep dominates geometric
439    // skipping overhead because most shots flip anyway.
440    if p_event >= 0.5 || chunk_shots < 32 {
441        for s in 0..chunk_shots {
442            let r = rng.uniform();
443            let slot_idx = s / 64;
444            let bit = 1u64 << (s % 64);
445            if r < px {
446                z_slot[slot_idx] |= bit;
447            } else if r < pxy {
448                z_slot[slot_idx] |= bit;
449                x_slot[slot_idx] |= bit;
450            } else if r < p_event {
451                x_slot[slot_idx] |= bit;
452            }
453        }
454        return;
455    }
456
457    // Low-p: skip inactive shots via geometric sampling. `px_frac` / `pxy_frac`
458    // rescale to the conditional distribution given an event fired.
459    let ln_1mp = (1.0 - p_event).ln();
460    let px_frac = px / p_event;
461    let pxy_frac = pxy / p_event;
462    let mut pos = rng.geom(ln_1mp);
463    while pos < chunk_shots {
464        let r = rng.uniform();
465        let slot_idx = pos / 64;
466        let bit = 1u64 << (pos % 64);
467        if r < px_frac {
468            z_slot[slot_idx] |= bit;
469        } else if r < pxy_frac {
470            z_slot[slot_idx] |= bit;
471            x_slot[slot_idx] |= bit;
472        } else {
473            x_slot[slot_idx] |= bit;
474        }
475        pos += 1 + rng.geom(ln_1mp);
476    }
477}
478
479const NOISE_LUT_K: usize = 8;
480const NOISE_LUT_MIN_EVENTS: usize = 16;
481const NOISE_LUT_TILE: usize = 4096;
482
483fn unpack_and_remap(
484    accum: &[u64],
485    m_words: usize,
486    num_shots: usize,
487    classical_bit_order: &[usize],
488    num_classical: usize,
489) -> Vec<Vec<bool>> {
490    fn unpack_one(src: &[u64], classical_bit_order: &[usize], num_classical: usize) -> Vec<bool> {
491        let mut out = vec![false; num_classical];
492        for (mi, &cbit) in classical_bit_order.iter().enumerate() {
493            if cbit < num_classical {
494                out[cbit] = (src[mi / 64] >> (mi % 64)) & 1 != 0;
495            }
496        }
497        out
498    }
499
500    #[cfg(feature = "parallel")]
501    if num_shots >= 256 {
502        use rayon::prelude::*;
503        return accum
504            .par_chunks(m_words)
505            .map(|src| unpack_one(src, classical_bit_order, num_classical))
506            .collect();
507    }
508
509    #[cfg(not(feature = "parallel"))]
510    let _ = num_shots;
511
512    accum
513        .chunks(m_words)
514        .map(|src| unpack_one(src, classical_bit_order, num_classical))
515        .collect()
516}
517
518fn unpack_and_remap_packed(
519    packed: &PackedShots,
520    num_shots: usize,
521    classical_bit_order: &[usize],
522    num_classical: usize,
523) -> Vec<Vec<bool>> {
524    fn unpack_one_packed(
525        packed: &PackedShots,
526        shot: usize,
527        classical_bit_order: &[usize],
528        num_classical: usize,
529    ) -> Vec<bool> {
530        let mut out = vec![false; num_classical];
531        for (mi, &cbit) in classical_bit_order.iter().enumerate() {
532            if cbit < num_classical {
533                out[cbit] = packed.get_bit(shot, mi);
534            }
535        }
536        out
537    }
538
539    #[cfg(feature = "parallel")]
540    if num_shots >= 256 {
541        use rayon::prelude::*;
542        return (0..num_shots)
543            .into_par_iter()
544            .map(|s| unpack_one_packed(packed, s, classical_bit_order, num_classical))
545            .collect();
546    }
547
548    #[cfg(not(feature = "parallel"))]
549    let _ = num_shots;
550
551    (0..packed.num_shots())
552        .map(|s| unpack_one_packed(packed, s, classical_bit_order, num_classical))
553        .collect()
554}
555
556struct NoiseFlipLut {
557    data: Vec<u64>,
558    m_words: usize,
559    num_full_groups: usize,
560    remainder_size: usize,
561}
562
563impl NoiseFlipLut {
564    fn build_from_flat(flat_data: &[u64], num_rows: usize, m_words: usize) -> Self {
565        let num_full_groups = num_rows / NOISE_LUT_K;
566        let remainder_size = num_rows % NOISE_LUT_K;
567        let total_groups = num_full_groups + usize::from(remainder_size > 0);
568        let entries = 1 << NOISE_LUT_K;
569
570        let mut data = vec![0u64; total_groups * entries * m_words];
571
572        for g in 0..total_groups {
573            let group_start = g * NOISE_LUT_K;
574            let k = if g < num_full_groups {
575                NOISE_LUT_K
576            } else {
577                remainder_size
578            };
579            let lut_off = g * entries * m_words;
580
581            for byte in 1..(1usize << k) {
582                let lowest = byte & byte.wrapping_neg();
583                let row_idx = group_start + lowest.trailing_zeros() as usize;
584                let prev = byte ^ lowest;
585
586                let dst = lut_off + byte * m_words;
587                let src = lut_off + prev * m_words;
588                let row_off = row_idx * m_words;
589
590                for w in 0..m_words {
591                    data[dst + w] = data[src + w] ^ flat_data[row_off + w];
592                }
593            }
594        }
595
596        Self {
597            data,
598            m_words,
599            num_full_groups,
600            remainder_size,
601        }
602    }
603
604    #[inline(always)]
605    fn total_groups(&self) -> usize {
606        self.num_full_groups + usize::from(self.remainder_size > 0)
607    }
608
609    #[inline(always)]
610    fn apply_masked(&self, accum: &mut [u64], mask: &[u64]) {
611        for g in 0..self.total_groups() {
612            let byte = ((mask[g / 8] >> ((g % 8) * 8)) & 0xFF) as usize;
613            if byte != 0 {
614                let offset = (g * (1 << NOISE_LUT_K) + byte) * self.m_words;
615                xor_words(accum, &self.data[offset..offset + self.m_words]);
616            }
617        }
618    }
619}
620
621fn build_noise_luts(events: &FlatNoiseSensitivity) -> (Option<NoiseFlipLut>, Option<NoiseFlipLut>) {
622    let ne = events.len();
623    if ne < NOISE_LUT_MIN_EVENTS {
624        return (None, None);
625    }
626
627    let avg_p: f64 = events
628        .probs
629        .iter()
630        .map(|[px, py, pz]| px + py + pz)
631        .sum::<f64>()
632        / ne as f64;
633    if avg_p < 0.05 {
634        return (None, None);
635    }
636
637    let mw = events.m_words;
638    let z_lut = NoiseFlipLut::build_from_flat(&events.z_data, ne, mw);
639    let x_lut = NoiseFlipLut::build_from_flat(&events.x_data, ne, mw);
640    (Some(z_lut), Some(x_lut))
641}
642
643/// Pack `(px, py, pz)` per event into three `u64` thresholds for the fused
644/// noise kernel. Each threshold is `p * 2^64` rounded down, so a uniform
645/// `u64` random compares below it with probability `p`. Returns the flat
646/// buffer along with `false` when any event's probabilities are out of range
647/// (negative, NaN, or sum greater than 1). The caller then falls back to the
648/// host mask path.
649#[cfg(feature = "gpu")]
650fn flatten_event_thresholds_u64(probs: &[[f64; 3]]) -> (Vec<u64>, bool) {
651    // Nearest f64 below 2^64. Scaling by 2^64 exactly rounds to 2^64 which
652    // doesn't fit in a u64; this preserves `u64_rand < threshold` at the
653    // intended probability.
654    const SCALE: f64 = 18446744073709549568.0_f64;
655    const PROB_SUM_SLACK: f64 = 1e-12;
656
657    let mut out = Vec::with_capacity(probs.len() * 3);
658    let mut valid = true;
659    for &[px, py, pz] in probs {
660        let finite = px.is_finite() && py.is_finite() && pz.is_finite();
661        let sum = px + py + pz;
662        if !finite || px < 0.0 || py < 0.0 || pz < 0.0 || sum > 1.0 + PROB_SUM_SLACK {
663            valid = false;
664        }
665        let to_u64 = |p: f64| -> u64 {
666            if !p.is_finite() || p <= 0.0 {
667                0
668            } else if p >= 1.0 {
669                u64::MAX
670            } else {
671                (p * SCALE) as u64
672            }
673        };
674        out.push(to_u64(px));
675        out.push(to_u64(px + py));
676        out.push(to_u64(sum));
677    }
678    (out, valid)
679}
680
681/// Transpose the event-major `(x_data, z_data)` bitmaps in a
682/// `FlatNoiseSensitivity` into a row-major CSR. Each row's entry list names
683/// the events that touch it, packed as `event << 2 | flag`: flag bit 0 for
684/// an X contribution, bit 1 for Z, both set for a Y contribution. Lets the
685/// per-(row, batch) kernel accumulate XORs in a register without atomics.
686/// Paired with `bts_generate_and_apply_noise_meas_major_by_row`.
687#[cfg(feature = "gpu")]
688fn transpose_events_to_row_major(
689    events: &FlatNoiseSensitivity,
690    num_meas: usize,
691) -> (Vec<u32>, Vec<u32>) {
692    let num_events = events.len();
693    let m_words = events.m_words;
694
695    // First pass: count per-row entries.
696    let mut row_counts = vec![0u32; num_meas];
697    for e in 0..num_events {
698        let x = &events.x_data[e * m_words..(e + 1) * m_words];
699        let z = &events.z_data[e * m_words..(e + 1) * m_words];
700        for w in 0..m_words {
701            let union = x[w] | z[w];
702            let mut bits = union;
703            while bits != 0 {
704                let row = w * 64 + bits.trailing_zeros() as usize;
705                if row < num_meas {
706                    row_counts[row] += 1;
707                }
708                bits &= bits - 1;
709            }
710        }
711    }
712
713    let mut offsets = Vec::with_capacity(num_meas + 1);
714    offsets.push(0u32);
715    let mut running = 0u32;
716    for c in &row_counts {
717        running += c;
718        offsets.push(running);
719    }
720    let total = running as usize;
721
722    // Second pass: write entries. Use a fresh cursor array because
723    // `row_counts` is still needed to validate; cheaper to just reuse it as
724    // a write cursor starting at `offsets[row]`.
725    let mut entries = vec![0u32; total];
726    let mut cursor: Vec<u32> = offsets[..num_meas].to_vec();
727    for e in 0..num_events {
728        let x = &events.x_data[e * m_words..(e + 1) * m_words];
729        let z = &events.z_data[e * m_words..(e + 1) * m_words];
730        for w in 0..m_words {
731            let union = x[w] | z[w];
732            let mut bits = union;
733            while bits != 0 {
734                let b = bits.trailing_zeros() as usize;
735                bits &= bits - 1;
736                let row = w * 64 + b;
737                if row >= num_meas {
738                    continue;
739                }
740                let bit_mask = 1u64 << b;
741                let flag: u32 = (if x[w] & bit_mask != 0 { 1 } else { 0 })
742                    | (if z[w] & bit_mask != 0 { 2 } else { 0 });
743                let slot = cursor[row] as usize;
744                entries[slot] = ((e as u32) << 2) | flag;
745                cursor[row] += 1;
746            }
747        }
748    }
749
750    (offsets, entries)
751}
752
753#[cfg(feature = "gpu")]
754fn flatten_event_rows_to_csr(
755    flat_data: &[u64],
756    num_rows: usize,
757    m_words: usize,
758) -> (Vec<u32>, Vec<u32>) {
759    let mut row_offsets = Vec::with_capacity(num_rows + 1);
760    let mut row_indices = Vec::new();
761
762    for row in 0..num_rows {
763        row_offsets.push(row_indices.len() as u32);
764        let base = row * m_words;
765        for word in 0..m_words {
766            let mut bits = flat_data[base + word];
767            while bits != 0 {
768                let bit = bits.trailing_zeros() as usize;
769                row_indices.push((word * 64 + bit) as u32);
770                bits &= bits - 1;
771            }
772        }
773    }
774    row_offsets.push(row_indices.len() as u32);
775
776    (row_offsets, row_indices)
777}
778
779#[cfg(feature = "gpu")]
780struct GpuNoiseCache {
781    context: std::sync::Arc<crate::gpu::GpuContext>,
782    x_row_offsets_dev: crate::gpu::GpuBuffer<u32>,
783    x_row_indices_dev: crate::gpu::GpuBuffer<u32>,
784    z_row_offsets_dev: crate::gpu::GpuBuffer<u32>,
785    z_row_indices_dev: crate::gpu::GpuBuffer<u32>,
786    x_masks_dev: crate::gpu::GpuBuffer<u64>,
787    z_masks_dev: crate::gpu::GpuBuffer<u64>,
788    x_masks_host: Vec<u64>,
789    z_masks_host: Vec<u64>,
790    /// Precomputed per-event thresholds for the fused generator. Three u64s
791    /// per event hold `[px, px+py, px+py+pz]` scaled to `2^64`, so the
792    /// kernel's inner loop replaces a floating-point multiply and branch per
793    /// shot with a `u64 <` compare. Stays resident across sampling calls.
794    event_thresholds_dev: crate::gpu::GpuBuffer<u64>,
795    /// True when every event's probabilities can be represented as a `u64`
796    /// threshold. A NaN or a sum greater than 1 flips this to false and
797    /// routes the caller to the host mask path.
798    event_thresholds_valid: bool,
799    /// Row-major event CSR for the per-(row, batch) fused kernel. Each entry
800    /// packs `event << 2 | flag`, where bit 0 means the event contributes X
801    /// to this row and bit 1 means Z. The row-major layout lets one thread
802    /// accumulate every XOR for its output word in a register instead of
803    /// broadcasting `(event, batch)` writes across rows with atomics.
804    row_event_offsets_dev: crate::gpu::GpuBuffer<u32>,
805    row_event_entries_dev: crate::gpu::GpuBuffer<u32>,
806}
807
808#[cfg(feature = "gpu")]
809impl GpuNoiseCache {
810    fn new(
811        context: std::sync::Arc<crate::gpu::GpuContext>,
812        events: &FlatNoiseSensitivity,
813    ) -> Result<Self> {
814        let device = context.device();
815        let num_events = events.len();
816        let (x_row_offsets, x_row_indices) =
817            flatten_event_rows_to_csr(&events.x_data, num_events, events.m_words);
818        let (z_row_offsets, z_row_indices) =
819            flatten_event_rows_to_csr(&events.z_data, num_events, events.m_words);
820
821        let mut x_row_offsets_dev =
822            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, x_row_offsets.len().max(1))?;
823        let mut x_row_indices_dev =
824            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, x_row_indices.len().max(1))?;
825        let mut z_row_offsets_dev =
826            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, z_row_offsets.len().max(1))?;
827        let mut z_row_indices_dev =
828            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, z_row_indices.len().max(1))?;
829
830        if !x_row_offsets.is_empty() {
831            x_row_offsets_dev.copy_from_host(device, &x_row_offsets)?;
832        }
833        if !x_row_indices.is_empty() {
834            x_row_indices_dev.copy_from_host(device, &x_row_indices)?;
835        }
836        if !z_row_offsets.is_empty() {
837            z_row_offsets_dev.copy_from_host(device, &z_row_offsets)?;
838        }
839        if !z_row_indices.is_empty() {
840            z_row_indices_dev.copy_from_host(device, &z_row_indices)?;
841        }
842
843        let (thresholds_host, thresholds_valid) = flatten_event_thresholds_u64(&events.probs);
844        let mut event_thresholds_dev =
845            crate::gpu::GpuBuffer::<u64>::alloc_zeros(device, thresholds_host.len().max(1))?;
846        if !thresholds_host.is_empty() {
847            event_thresholds_dev.copy_from_host(device, &thresholds_host)?;
848        }
849
850        // Row-major transpose for the per-(row, batch) fused kernel. Cheap
851        // bit-scan over events.x_data | events.z_data at cache init time.
852        let (row_event_offsets_host, row_event_entries_host) =
853            transpose_events_to_row_major(events, events.m_words * 64);
854        let mut row_event_offsets_dev =
855            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, row_event_offsets_host.len().max(1))?;
856        let mut row_event_entries_dev =
857            crate::gpu::GpuBuffer::<u32>::alloc_zeros(device, row_event_entries_host.len().max(1))?;
858        if !row_event_offsets_host.is_empty() {
859            row_event_offsets_dev.copy_from_host(device, &row_event_offsets_host)?;
860        }
861        if !row_event_entries_host.is_empty() {
862            row_event_entries_dev.copy_from_host(device, &row_event_entries_host)?;
863        }
864
865        Ok(Self {
866            context: context.clone(),
867            x_row_offsets_dev,
868            x_row_indices_dev,
869            z_row_offsets_dev,
870            z_row_indices_dev,
871            x_masks_dev: crate::gpu::GpuBuffer::<u64>::alloc_zeros(device, 1)?,
872            z_masks_dev: crate::gpu::GpuBuffer::<u64>::alloc_zeros(device, 1)?,
873            x_masks_host: Vec::new(),
874            z_masks_host: Vec::new(),
875            event_thresholds_dev,
876            event_thresholds_valid: thresholds_valid,
877            row_event_offsets_dev,
878            row_event_entries_dev,
879        })
880    }
881
882    fn ensure_mask_capacity(&mut self, len: usize) -> Result<()> {
883        let needed = len.max(1);
884        let device = self.context.device();
885        if self.x_masks_dev.len() < needed {
886            self.x_masks_dev = crate::gpu::GpuBuffer::<u64>::alloc_zeros(device, needed)?;
887        }
888        if self.z_masks_dev.len() < needed {
889            self.z_masks_dev = crate::gpu::GpuBuffer::<u64>::alloc_zeros(device, needed)?;
890        }
891        if self.x_masks_host.len() < needed {
892            self.x_masks_host.resize(needed, 0);
893        }
894        if self.z_masks_host.len() < needed {
895            self.z_masks_host.resize(needed, 0);
896        }
897        Ok(())
898    }
899}
900
901/// Compiled sampler for Clifford circuits with Pauli noise.
902///
903/// The sampler reuses the compiled measurement parity map for the noiseless
904/// circuit, then applies the requested Pauli noise model across batches of
905/// shots. When a GPU context is attached via `with_gpu` (available with the
906/// `gpu` feature), the underlying parity sampling stage may use the GPU BTS
907/// path. Materialized
908/// shot output still returns to the CPU in shot-major form, while exact
909/// counts and marginals can keep the packed measurement-major buffer on the
910/// device through noise application and reduction.
911pub struct NoisyCompiledSampler {
912    noiseless: crate::sim::compiled::CompiledSampler,
913    events: FlatNoiseSensitivity,
914    num_measurements: usize,
915    rng: ChaCha8Rng,
916    z_lut: Option<NoiseFlipLut>,
917    x_lut: Option<NoiseFlipLut>,
918    #[cfg(feature = "gpu")]
919    gpu_noise_cache: Option<GpuNoiseCache>,
920}
921
922impl NoisyCompiledSampler {
923    /// Opt in to GPU-accelerated sampling for the underlying compiled
924    /// noiseless sampler.
925    ///
926    /// Small shot batches may continue to use the CPU path when the compiled
927    /// sampler stays below the GPU BTS threshold. Noise masks are still
928    /// generated on the CPU; counts and marginals can then apply them and
929    /// reduce on the device.
930    #[cfg(feature = "gpu")]
931    pub fn with_gpu(mut self, context: std::sync::Arc<crate::gpu::GpuContext>) -> Self {
932        self.noiseless = self.noiseless.with_gpu(context);
933        self.gpu_noise_cache = None;
934        self
935    }
936
937    fn sample_bulk_words_shot_major_cpu(&mut self, num_shots: usize) -> (Vec<u64>, usize) {
938        let (mut accum, m_words) = self.noiseless.sample_bulk_words_shot_major(num_shots);
939        self.apply_noise_bulk(&mut accum, num_shots, m_words);
940
941        let ref_bits_packed = self.noiseless.ref_bits_packed();
942
943        #[cfg(feature = "parallel")]
944        if num_shots >= 256 {
945            use rayon::prelude::*;
946            accum
947                .par_chunks_mut(m_words)
948                .for_each(|shot| xor_words(shot, ref_bits_packed));
949        } else {
950            for s in 0..num_shots {
951                let shot_base = s * m_words;
952                xor_words(&mut accum[shot_base..shot_base + m_words], ref_bits_packed);
953            }
954        }
955
956        #[cfg(not(feature = "parallel"))]
957        for s in 0..num_shots {
958            let shot_base = s * m_words;
959            xor_words(&mut accum[shot_base..shot_base + m_words], ref_bits_packed);
960        }
961
962        (accum, m_words)
963    }
964
965    #[cfg(feature = "gpu")]
966    fn try_sample_bulk_packed_with_gpu_context(&mut self, num_shots: usize) -> Result<PackedShots> {
967        let m_words = self.num_measurements.div_ceil(64);
968        if num_shots == 0 || self.num_measurements == 0 {
969            return Ok(PackedShots::from_shot_major(
970                vec![0u64; num_shots * m_words],
971                num_shots,
972                self.num_measurements,
973            ));
974        }
975
976        // Prefer the GPU shot-major path: BTS plus on-device bit-transpose
977        // into shot-major layout eliminates the host `into_shot_major_data()`
978        // transpose that otherwise dominates noisy sampling at large shot
979        // counts. Falls back to the default sampler (which may itself dispatch
980        // to the GPU meas-major BTS) when the shot-major path does not apply.
981        let mut accum = match self.noiseless.try_sample_bulk_shot_major_gpu(num_shots) {
982            Some(Ok(data)) => data,
983            Some(Err(e)) => return Err(e),
984            None => {
985                let (accum, _m_words) = self.sample_bulk_words_shot_major_cpu(num_shots);
986                return Ok(PackedShots::from_shot_major(
987                    accum,
988                    num_shots,
989                    self.num_measurements,
990                ));
991            }
992        };
993        self.apply_noise_bulk(&mut accum, num_shots, m_words);
994        Ok(PackedShots::from_shot_major(
995            accum,
996            num_shots,
997            self.num_measurements,
998        ))
999    }
1000
1001    #[cfg(feature = "gpu")]
1002    fn take_gpu_noise_cache(&mut self) -> Result<GpuNoiseCache> {
1003        if let Some(cache) = self.gpu_noise_cache.take() {
1004            return Ok(cache);
1005        }
1006        let context = self
1007            .noiseless
1008            .gpu_context()
1009            .expect("gpu noise cache requested without gpu_context");
1010        GpuNoiseCache::new(context, &self.events)
1011    }
1012
1013    #[cfg(feature = "gpu")]
1014    fn fill_noise_masks_event_major(
1015        events: &FlatNoiseSensitivity,
1016        rng: &mut ChaCha8Rng,
1017        x_masks: &mut [u64],
1018        z_masks: &mut [u64],
1019        chunk_shots: usize,
1020        chunk_s_words: usize,
1021    ) {
1022        x_masks.fill(0);
1023        z_masks.fill(0);
1024
1025        // Events touch disjoint per-event slots of `x_masks` / `z_masks`, so
1026        // they parallelise cleanly once each worker has its own RNG stream.
1027        // Seed one Xoshiro per event from the master ChaCha so the draws are
1028        // independent of Rayon's thread count while matching the serial
1029        // reference statistically.
1030        #[cfg(feature = "parallel")]
1031        {
1032            use rand::RngCore;
1033            use rayon::prelude::*;
1034            // Rayon split + per-event seed draw has to be dominated by real
1035            // work; below this threshold the serial master-rng loop wins.
1036            // Tuned against the noisy-sampling bench at 1M shots where
1037            // num_events is small and the tile is also small.
1038            const MIN_PAR_EVENTS: usize = 64;
1039            const MIN_PAR_SLOTS: usize = 32_768;
1040            let work_slots = events.len().saturating_mul(chunk_s_words);
1041            if events.len() >= MIN_PAR_EVENTS
1042                && work_slots >= MIN_PAR_SLOTS
1043                && x_masks.len() == events.len() * chunk_s_words
1044                && z_masks.len() == events.len() * chunk_s_words
1045            {
1046                let thread_seeds: Vec<[u64; 4]> = (0..events.len())
1047                    .map(|_| {
1048                        [
1049                            rng.next_u64(),
1050                            rng.next_u64(),
1051                            rng.next_u64(),
1052                            rng.next_u64(),
1053                        ]
1054                    })
1055                    .collect();
1056
1057                x_masks
1058                    .par_chunks_mut(chunk_s_words)
1059                    .zip(z_masks.par_chunks_mut(chunk_s_words))
1060                    .zip(thread_seeds.par_iter())
1061                    .enumerate()
1062                    .for_each(|(i, ((x_slot, z_slot), seed))| {
1063                        let mut trng =
1064                            crate::sim::compiled::rng::Xoshiro256PlusPlus::from_seeds(*seed);
1065                        fill_one_event(x_slot, z_slot, events.probs[i], chunk_shots, &mut trng);
1066                    });
1067                return;
1068            }
1069        }
1070
1071        for (i, &probs) in events.probs.iter().enumerate() {
1072            let base = i * chunk_s_words;
1073            let end = base + chunk_s_words;
1074            fill_one_event(
1075                &mut x_masks[base..end],
1076                &mut z_masks[base..end],
1077                probs,
1078                chunk_shots,
1079                rng,
1080            );
1081        }
1082    }
1083
1084    #[cfg(feature = "gpu")]
1085    fn apply_noise_device_meas_major(
1086        &mut self,
1087        packed: &mut crate::sim::compiled::DevicePackedShots,
1088    ) -> Result<()> {
1089        if self.events.is_empty() || packed.num_shots() == 0 || self.num_measurements == 0 {
1090            return Ok(());
1091        }
1092
1093        let num_events = self.events.len();
1094        let mut cache = self.take_gpu_noise_cache()?;
1095        let total_s_words = packed.s_words();
1096        let context = packed.context().clone();
1097        let num_shots = packed.num_shots();
1098
1099        // Fused path: the kernel seeds a per-(event, batch) xoshiro stream
1100        // from a master seed drawn on the host, then generates and XORs the
1101        // noise masks in one launch per tile. Drops the per-tile mask fill
1102        // and ~25-50 MB PCIe upload that the fallback branch pays at large
1103        // shot counts. The fallback runs only when compiled event
1104        // probabilities fail the `u64` threshold check.
1105        let use_fused = cache.event_thresholds_valid;
1106        let master_seed: u64 = if use_fused {
1107            rand::RngCore::next_u64(&mut self.rng)
1108        } else {
1109            0
1110        };
1111
1112        let mut shot_start = 0;
1113        while shot_start < num_shots {
1114            let chunk_shots = (shot_start + NOISE_LUT_TILE).min(num_shots) - shot_start;
1115            let chunk_s_words = chunk_shots.div_ceil(64);
1116            let word_offset = shot_start / 64;
1117
1118            if use_fused {
1119                crate::gpu::kernels::bts::generate_and_apply_noise_masks_meas_major_by_row(
1120                    &context,
1121                    crate::gpu::kernels::bts::NoiseDeviceGenApplyByRow {
1122                        meas_major: packed.data_mut(),
1123                        num_meas: self.num_measurements,
1124                        s_words: total_s_words,
1125                        word_offset,
1126                        chunk_s_words,
1127                        row_event_offsets: &cache.row_event_offsets_dev,
1128                        row_event_entries: &cache.row_event_entries_dev,
1129                        event_thresholds: &cache.event_thresholds_dev,
1130                        master_seed,
1131                        batch_offset: word_offset as u64,
1132                    },
1133                )?;
1134            } else {
1135                let mask_len = num_events * chunk_s_words;
1136                cache.ensure_mask_capacity(mask_len)?;
1137                Self::fill_noise_masks_event_major(
1138                    &self.events,
1139                    &mut self.rng,
1140                    &mut cache.x_masks_host[..mask_len],
1141                    &mut cache.z_masks_host[..mask_len],
1142                    chunk_shots,
1143                    chunk_s_words,
1144                );
1145                if mask_len > 0 {
1146                    cache
1147                        .x_masks_dev
1148                        .copy_from_host(cache.context.device(), &cache.x_masks_host[..mask_len])?;
1149                    cache
1150                        .z_masks_dev
1151                        .copy_from_host(cache.context.device(), &cache.z_masks_host[..mask_len])?;
1152                    let base = crate::gpu::kernels::bts::NoiseApplyBase {
1153                        meas_major: packed.data_mut(),
1154                        num_meas: self.num_measurements,
1155                        s_words: total_s_words,
1156                        word_offset,
1157                        chunk_s_words,
1158                        num_events,
1159                        x_row_offsets: &cache.x_row_offsets_dev,
1160                        x_row_indices: &cache.x_row_indices_dev,
1161                        z_row_offsets: &cache.z_row_offsets_dev,
1162                        z_row_indices: &cache.z_row_indices_dev,
1163                    };
1164                    crate::gpu::kernels::bts::apply_noise_masks_meas_major(
1165                        &context,
1166                        crate::gpu::kernels::bts::NoiseMaskApply {
1167                            base,
1168                            x_masks: &cache.x_masks_dev,
1169                            z_masks: &cache.z_masks_dev,
1170                        },
1171                    )?;
1172                }
1173            }
1174
1175            shot_start += chunk_shots;
1176        }
1177
1178        self.gpu_noise_cache = Some(cache);
1179        Ok(())
1180    }
1181
1182    #[cfg(feature = "gpu")]
1183    fn try_sample_marginals_gpu(&mut self, total_shots: usize) -> Option<Result<Vec<f64>>> {
1184        if !self.noiseless.should_use_gpu_bts(total_shots) {
1185            return None;
1186        }
1187
1188        let mut packed = match self.noiseless.sample_bulk_packed_device(total_shots) {
1189            Ok(packed) => packed,
1190            Err(e) => return Some(Err(e)),
1191        };
1192        if let Err(e) = self.apply_noise_device_meas_major(&mut packed) {
1193            return Some(Err(e));
1194        }
1195        Some(packed.marginals())
1196    }
1197
1198    #[cfg(feature = "gpu")]
1199    fn try_sample_counts_gpu(
1200        &mut self,
1201        total_shots: usize,
1202    ) -> Option<Result<std::collections::HashMap<Vec<u64>, u64>>> {
1203        if !self.noiseless.should_use_gpu_bts(total_shots) {
1204            return None;
1205        }
1206
1207        let mut packed = match self.noiseless.sample_bulk_packed_device(total_shots) {
1208            Ok(packed) => packed,
1209            Err(e) => return Some(Err(e)),
1210        };
1211        if let Err(e) = self.apply_noise_device_meas_major(&mut packed) {
1212            return Some(Err(e));
1213        }
1214        Some(packed.counts_with_rank_hint(self.num_measurements))
1215    }
1216
1217    /// Sample `num_shots` noisy outcomes and return them in packed form.
1218    ///
1219    /// The returned [`PackedShots`] value uses shot-major layout. With a GPU
1220    /// context attached, the underlying parity sampling stage may run on the
1221    /// GPU before noise is applied and the packed output is materialized on
1222    /// the CPU.
1223    pub fn sample_bulk_packed(&mut self, num_shots: usize) -> PackedShots {
1224        match self.try_sample_bulk_packed(num_shots) {
1225            Ok(packed) => packed,
1226            Err(_) => self.sample_bulk_packed_cpu(num_shots),
1227        }
1228    }
1229
1230    pub fn try_sample_bulk_packed(&mut self, num_shots: usize) -> Result<PackedShots> {
1231        #[cfg(feature = "gpu")]
1232        if self.noiseless.has_gpu_context() {
1233            return self.try_sample_bulk_packed_with_gpu_context(num_shots);
1234        }
1235
1236        Ok(self.sample_bulk_packed_cpu(num_shots))
1237    }
1238
1239    fn sample_bulk_packed_cpu(&mut self, num_shots: usize) -> PackedShots {
1240        let m_words = self.num_measurements.div_ceil(64);
1241        if num_shots == 0 || self.num_measurements == 0 {
1242            return PackedShots::from_shot_major(
1243                vec![0u64; num_shots * m_words],
1244                num_shots,
1245                self.num_measurements,
1246            );
1247        }
1248
1249        let (accum, _m_words) = self.sample_bulk_words_shot_major_cpu(num_shots);
1250        PackedShots::from_shot_major(accum, num_shots, self.num_measurements)
1251    }
1252
1253    /// Stream noisy shots into `acc` using the default chunk size.
1254    ///
1255    /// This avoids materializing the full shot matrix when the caller only
1256    /// needs derived aggregates such as counts or marginals.
1257    pub fn sample_chunked<A: crate::sim::compiled::ShotAccumulator>(
1258        &mut self,
1259        total_shots: usize,
1260        acc: &mut A,
1261    ) {
1262        let chunk_size = crate::sim::compiled::default_chunk_size(self.num_measurements);
1263        self.sample_chunked_with_size(total_shots, chunk_size, acc);
1264    }
1265
1266    /// Stream noisy shots into `acc` with an explicit chunk size.
1267    ///
1268    /// When a GPU context is attached, each chunk may use GPU BTS sampling for
1269    /// the noiseless parity stage before noise is applied.
1270    pub fn sample_chunked_with_size<A: crate::sim::compiled::ShotAccumulator>(
1271        &mut self,
1272        total_shots: usize,
1273        chunk_size: usize,
1274        acc: &mut A,
1275    ) {
1276        #[cfg(feature = "gpu")]
1277        if self.noiseless.has_gpu_context()
1278            && self
1279                .noiseless
1280                .should_use_gpu_bts(total_shots.min(chunk_size.max(1)))
1281        {
1282            let mut remaining = total_shots;
1283            while remaining > 0 {
1284                let this_batch = remaining.min(chunk_size);
1285                let packed = match self.try_sample_bulk_packed_with_gpu_context(this_batch) {
1286                    Ok(packed) => packed,
1287                    Err(_) => self.sample_bulk_packed_cpu(this_batch),
1288                };
1289                acc.accumulate(&packed);
1290                remaining -= this_batch;
1291            }
1292            return;
1293        }
1294
1295        let m_words = self.num_measurements.div_ceil(64);
1296        let mut accum_buf: Vec<u64> = Vec::new();
1297        let mut rand_buf: Vec<u8> = Vec::new();
1298        let ref_bits_packed = self.noiseless.ref_bits_packed().to_vec();
1299        let mut remaining = total_shots;
1300        while remaining > 0 {
1301            let this_batch = remaining.min(chunk_size);
1302            self.noiseless.sample_bulk_words_shot_major_reuse(
1303                &mut accum_buf,
1304                &mut rand_buf,
1305                this_batch,
1306            );
1307
1308            self.apply_noise_bulk(&mut accum_buf, this_batch, m_words);
1309
1310            #[cfg(feature = "parallel")]
1311            if this_batch >= 256 {
1312                use rayon::prelude::*;
1313                accum_buf[..this_batch * m_words]
1314                    .par_chunks_mut(m_words)
1315                    .for_each(|shot| xor_words(shot, &ref_bits_packed));
1316            } else {
1317                for s in 0..this_batch {
1318                    let shot_base = s * m_words;
1319                    xor_words(
1320                        &mut accum_buf[shot_base..shot_base + m_words],
1321                        &ref_bits_packed,
1322                    );
1323                }
1324            }
1325
1326            #[cfg(not(feature = "parallel"))]
1327            for s in 0..this_batch {
1328                let shot_base = s * m_words;
1329                xor_words(
1330                    &mut accum_buf[shot_base..shot_base + m_words],
1331                    &ref_bits_packed,
1332                );
1333            }
1334
1335            let data = std::mem::take(&mut accum_buf);
1336            let packed = PackedShots::from_shot_major(data, this_batch, self.num_measurements);
1337            acc.accumulate(&packed);
1338            accum_buf = packed.into_data();
1339            remaining -= this_batch;
1340        }
1341    }
1342
1343    /// Sample noisy outcomes and return exact packed counts.
1344    pub fn sample_counts(
1345        &mut self,
1346        total_shots: usize,
1347    ) -> std::collections::HashMap<Vec<u64>, u64> {
1348        match self.try_sample_counts(total_shots) {
1349            Ok(counts) => counts,
1350            Err(_) => self.sample_counts_cpu(total_shots),
1351        }
1352    }
1353
1354    pub fn try_sample_counts(
1355        &mut self,
1356        total_shots: usize,
1357    ) -> Result<std::collections::HashMap<Vec<u64>, u64>> {
1358        #[cfg(feature = "gpu")]
1359        if let Some(counts) = self.try_sample_counts_gpu(total_shots) {
1360            return counts;
1361        }
1362
1363        Ok(self.sample_counts_cpu(total_shots))
1364    }
1365
1366    fn sample_counts_cpu(
1367        &mut self,
1368        total_shots: usize,
1369    ) -> std::collections::HashMap<Vec<u64>, u64> {
1370        let mut acc = crate::sim::compiled::HistogramAccumulator::new();
1371        self.sample_chunked(total_shots, &mut acc);
1372        acc.into_counts()
1373    }
1374
1375    /// Sample noisy outcomes and return per-measurement `P(bit = 1)`.
1376    pub fn sample_marginals(&mut self, total_shots: usize) -> Vec<f64> {
1377        match self.try_sample_marginals(total_shots) {
1378            Ok(marginals) => marginals,
1379            Err(_) => self.sample_marginals_cpu(total_shots),
1380        }
1381    }
1382
1383    pub fn try_sample_marginals(&mut self, total_shots: usize) -> Result<Vec<f64>> {
1384        #[cfg(feature = "gpu")]
1385        if let Some(marginals) = self.try_sample_marginals_gpu(total_shots) {
1386            return marginals;
1387        }
1388
1389        Ok(self.sample_marginals_cpu(total_shots))
1390    }
1391
1392    fn sample_marginals_cpu(&mut self, total_shots: usize) -> Vec<f64> {
1393        let mut acc = crate::sim::compiled::MarginalsAccumulator::new(self.num_measurements);
1394        self.sample_chunked(total_shots, &mut acc);
1395        acc.marginals()
1396    }
1397
1398    #[cfg(test)]
1399    fn sample_bulk(&mut self, num_shots: usize) -> Vec<Vec<bool>> {
1400        self.sample_bulk_packed(num_shots).to_shots()
1401    }
1402
1403    fn apply_noise_bulk(&mut self, accum: &mut [u64], num_shots: usize, m_words: usize) {
1404        if self.events.is_empty() {
1405            return;
1406        }
1407
1408        if self.z_lut.is_some() {
1409            self.apply_noise_bulk_grouped(accum, num_shots, m_words);
1410        } else {
1411            self.apply_noise_bulk_scalar(accum, num_shots, m_words);
1412        }
1413    }
1414
1415    fn apply_noise_bulk_scalar(&mut self, accum: &mut [u64], num_shots: usize, m_words: usize) {
1416        #[cfg(feature = "parallel")]
1417        if num_shots >= 4096 {
1418            self.apply_noise_bulk_scalar_par(accum, num_shots, m_words);
1419            return;
1420        }
1421
1422        self.apply_noise_bulk_scalar_seq(accum, num_shots, m_words);
1423    }
1424
1425    fn apply_noise_bulk_scalar_seq(&mut self, accum: &mut [u64], num_shots: usize, m_words: usize) {
1426        Self::apply_noise_range(&self.events, accum, 0, num_shots, m_words, &mut self.rng);
1427    }
1428
1429    #[cfg(feature = "parallel")]
1430    fn apply_noise_bulk_scalar_par(&mut self, accum: &mut [u64], num_shots: usize, m_words: usize) {
1431        use rayon::prelude::*;
1432
1433        let num_threads = rayon::current_num_threads().max(1);
1434        let shots_per_thread = (num_shots.div_ceil(num_threads) + 63) & !63;
1435
1436        let seeds: Vec<u64> = (0..num_threads)
1437            .map(|_| rand::Rng::random(&mut self.rng))
1438            .collect();
1439
1440        let events = &self.events;
1441
1442        accum
1443            .par_chunks_mut(shots_per_thread * m_words)
1444            .enumerate()
1445            .for_each(|(tid, chunk)| {
1446                let chunk_shots = chunk.len() / m_words;
1447                if chunk_shots == 0 {
1448                    return;
1449                }
1450                let mut rng = ChaCha8Rng::seed_from_u64(seeds[tid]);
1451                Self::apply_noise_range(events, chunk, 0, chunk_shots, m_words, &mut rng);
1452            });
1453    }
1454
1455    fn apply_noise_range(
1456        events: &FlatNoiseSensitivity,
1457        accum: &mut [u64],
1458        start: usize,
1459        end: usize,
1460        m_words: usize,
1461        rng: &mut ChaCha8Rng,
1462    ) {
1463        let ne = events.len();
1464        let num_shots = end - start;
1465
1466        for i in 0..ne {
1467            let [px, py, pz] = events.probs[i];
1468            let p_event = px + py + pz;
1469            if p_event == 0.0 {
1470                continue;
1471            }
1472
1473            if p_event >= 0.5 || num_shots < 32 {
1474                for s in start..end {
1475                    let r: f64 = rand::Rng::random(rng);
1476                    if r < px {
1477                        let b = s * m_words;
1478                        xor_words(&mut accum[b..b + m_words], events.z_flip(i));
1479                    } else if r < px + py {
1480                        let b = s * m_words;
1481                        xor_words(&mut accum[b..b + m_words], events.x_flip(i));
1482                        xor_words(&mut accum[b..b + m_words], events.z_flip(i));
1483                    } else if r < p_event {
1484                        let b = s * m_words;
1485                        xor_words(&mut accum[b..b + m_words], events.x_flip(i));
1486                    }
1487                }
1488            } else {
1489                let ln_1mp = (1.0 - p_event).ln();
1490                let px_frac = px / p_event;
1491                let pxy_frac = (px + py) / p_event;
1492
1493                let mut pos = start + geometric_sample(rng, ln_1mp);
1494                while pos < end {
1495                    let r: f64 = rand::Rng::random(rng);
1496                    let b = pos * m_words;
1497                    if r < px_frac {
1498                        xor_words(&mut accum[b..b + m_words], events.z_flip(i));
1499                    } else if r < pxy_frac {
1500                        xor_words(&mut accum[b..b + m_words], events.x_flip(i));
1501                        xor_words(&mut accum[b..b + m_words], events.z_flip(i));
1502                    } else {
1503                        xor_words(&mut accum[b..b + m_words], events.x_flip(i));
1504                    }
1505                    pos += 1 + geometric_sample(rng, ln_1mp);
1506                }
1507            }
1508        }
1509    }
1510
1511    fn apply_noise_bulk_grouped(&mut self, accum: &mut [u64], num_shots: usize, m_words: usize) {
1512        let num_events = self.events.len();
1513        let e_words = num_events.div_ceil(64);
1514
1515        for tile_start in (0..num_shots).step_by(NOISE_LUT_TILE) {
1516            let tile_end = (tile_start + NOISE_LUT_TILE).min(num_shots);
1517            let tile_n = tile_end - tile_start;
1518
1519            let mut z_mask = vec![0u64; tile_n * e_words];
1520            let mut x_mask = vec![0u64; tile_n * e_words];
1521
1522            for i in 0..num_events {
1523                let [px, py, pz] = self.events.probs[i];
1524                let p_event = px + py + pz;
1525                if p_event == 0.0 {
1526                    continue;
1527                }
1528
1529                let ew = i / 64;
1530                let eb = 1u64 << (i % 64);
1531
1532                if p_event >= 0.5 || tile_n < 32 {
1533                    for s in 0..tile_n {
1534                        let r: f64 = rand::Rng::random(&mut self.rng);
1535                        if r < px {
1536                            z_mask[s * e_words + ew] |= eb;
1537                        } else if r < px + py {
1538                            z_mask[s * e_words + ew] |= eb;
1539                            x_mask[s * e_words + ew] |= eb;
1540                        } else if r < p_event {
1541                            x_mask[s * e_words + ew] |= eb;
1542                        }
1543                    }
1544                } else {
1545                    let ln_1mp = (1.0 - p_event).ln();
1546                    let px_frac = px / p_event;
1547                    let pxy_frac = (px + py) / p_event;
1548
1549                    let mut pos = geometric_sample(&mut self.rng, ln_1mp);
1550                    while pos < tile_n {
1551                        let r: f64 = rand::Rng::random(&mut self.rng);
1552                        if r < px_frac {
1553                            z_mask[pos * e_words + ew] |= eb;
1554                        } else if r < pxy_frac {
1555                            z_mask[pos * e_words + ew] |= eb;
1556                            x_mask[pos * e_words + ew] |= eb;
1557                        } else {
1558                            x_mask[pos * e_words + ew] |= eb;
1559                        }
1560                        pos += 1 + geometric_sample(&mut self.rng, ln_1mp);
1561                    }
1562                }
1563            }
1564
1565            let z_lut = self.z_lut.as_ref().unwrap();
1566            let x_lut = self.x_lut.as_ref().unwrap();
1567
1568            for s in 0..tile_n {
1569                let shot_base = (tile_start + s) * m_words;
1570                let mask_base = s * e_words;
1571                let shot_accum = &mut accum[shot_base..shot_base + m_words];
1572
1573                z_lut.apply_masked(shot_accum, &z_mask[mask_base..mask_base + e_words]);
1574                x_lut.apply_masked(shot_accum, &x_mask[mask_base..mask_base + e_words]);
1575            }
1576        }
1577    }
1578}
1579
1580/// Compile a Clifford circuit with Pauli noise into a reusable noisy sampler.
1581///
1582/// Requires the same circuit subset as [`crate::compile_measurements`],
1583/// namely terminal measurements with no resets or classical conditionals.
1584pub fn compile_noisy(
1585    circuit: &Circuit,
1586    noise: &NoiseModel,
1587    seed: u64,
1588) -> Result<NoisyCompiledSampler> {
1589    noise.ensure_pauli_only()?;
1590    if circuit.num_qubits >= 4 {
1591        let blocks = circuit.independent_subsystems();
1592        if blocks.len() > 1 {
1593            let max_block = blocks.iter().map(|b| b.len()).max().unwrap_or(0);
1594            if max_block < circuit.num_qubits {
1595                return compile_noisy_filtered(circuit, noise, &blocks, seed);
1596            }
1597        }
1598    }
1599
1600    compile_noisy_monolithic(circuit, noise, seed)
1601}
1602
1603fn compile_noisy_filtered(
1604    circuit: &Circuit,
1605    noise: &NoiseModel,
1606    blocks: &[Vec<usize>],
1607    seed: u64,
1608) -> Result<NoisyCompiledSampler> {
1609    let noiseless = compile_measurements(circuit, seed)?;
1610
1611    let measurement_qubits: Vec<usize> = circuit
1612        .instructions
1613        .iter()
1614        .filter_map(|inst| match inst {
1615            Instruction::Measure { qubit, .. } => Some(*qubit),
1616            _ => None,
1617        })
1618        .collect();
1619    let num_measurements = measurement_qubits.len();
1620    let m_words = num_measurements.div_ceil(64);
1621
1622    if num_measurements == 0 {
1623        return Ok(NoisyCompiledSampler {
1624            noiseless,
1625            events: FlatNoiseSensitivity::new(1, 0),
1626
1627            num_measurements: 0,
1628            rng: ChaCha8Rng::seed_from_u64(seed.wrapping_add(0xA01CE)),
1629            z_lut: None,
1630            x_lut: None,
1631            #[cfg(feature = "gpu")]
1632            gpu_noise_cache: None,
1633        });
1634    }
1635
1636    let mut qubit_to_block = vec![0usize; circuit.num_qubits];
1637    let mut qubit_to_local = vec![0usize; circuit.num_qubits];
1638    for (bi, block) in blocks.iter().enumerate() {
1639        for (li, &q) in block.iter().enumerate() {
1640            qubit_to_block[q] = bi;
1641            qubit_to_local[q] = li;
1642        }
1643    }
1644
1645    let mut block_meas: Vec<Vec<usize>> = vec![Vec::new(); blocks.len()];
1646    for (mi, &q) in measurement_qubits.iter().enumerate() {
1647        block_meas[qubit_to_block[q]].push(mi);
1648    }
1649
1650    let total_noise_events: usize = noise.after_gate.iter().map(|ops| ops.len()).sum();
1651    let mut events = FlatNoiseSensitivity::new(m_words, total_noise_events);
1652    let mut global_x_buf = vec![0u64; m_words];
1653    let mut global_z_buf = vec![0u64; m_words];
1654
1655    for (bi, block) in blocks.iter().enumerate() {
1656        let bm_list = &block_meas[bi];
1657        if bm_list.is_empty() {
1658            continue;
1659        }
1660
1661        let bn = block.len();
1662        let bm = bm_list.len();
1663        let bm_words = bm.div_ceil(64);
1664
1665        let mut x_packed: Vec<Vec<u64>> = vec![vec![0u64; bm_words]; bn];
1666        let mut z_packed: Vec<Vec<u64>> = vec![vec![0u64; bm_words]; bn];
1667        let mut sign_packed: Vec<u64> = vec![0u64; bm_words];
1668
1669        for (local_mi, &global_mi) in bm_list.iter().enumerate() {
1670            let q = measurement_qubits[global_mi];
1671            let local_q = qubit_to_local[q];
1672            z_packed[local_q][local_mi / 64] |= 1u64 << (local_mi % 64);
1673        }
1674
1675        let block_gates: Vec<(usize, &Gate, SmallVec<[usize; 4]>)> = circuit
1676            .instructions
1677            .iter()
1678            .enumerate()
1679            .filter_map(|(idx, inst)| match inst {
1680                Instruction::Gate { gate, targets } => {
1681                    if targets.iter().all(|&t| qubit_to_block[t] == bi) {
1682                        let local_targets: SmallVec<[usize; 4]> =
1683                            targets.iter().map(|&t| qubit_to_local[t]).collect();
1684                        Some((idx, gate, local_targets))
1685                    } else {
1686                        None
1687                    }
1688                }
1689                Instruction::Conditional { gate, targets, .. } => {
1690                    if targets.iter().all(|&t| qubit_to_block[t] == bi) {
1691                        let local_targets: SmallVec<[usize; 4]> =
1692                            targets.iter().map(|&t| qubit_to_local[t]).collect();
1693                        Some((idx, gate, local_targets))
1694                    } else {
1695                        None
1696                    }
1697                }
1698                _ => None,
1699            })
1700            .collect();
1701
1702        for (instr_idx, gate, local_targets) in block_gates.iter().rev() {
1703            let noise_ops = &noise.after_gate[*instr_idx];
1704            if !noise_ops.is_empty() {
1705                for event in noise_ops {
1706                    let (px, py, pz) = event.pauli_probs();
1707                    let local_q = qubit_to_local[event.qubit()];
1708
1709                    let has_any = x_packed[local_q].iter().any(|&w| w != 0)
1710                        || z_packed[local_q].iter().any(|&w| w != 0);
1711                    if has_any {
1712                        global_x_buf.fill(0);
1713                        global_z_buf.fill(0);
1714                        for (local_mi, &global_mi) in bm_list.iter().enumerate() {
1715                            if (x_packed[local_q][local_mi / 64] >> (local_mi % 64)) & 1 != 0 {
1716                                global_x_buf[global_mi / 64] |= 1u64 << (global_mi % 64);
1717                            }
1718                            if (z_packed[local_q][local_mi / 64] >> (local_mi % 64)) & 1 != 0 {
1719                                global_z_buf[global_mi / 64] |= 1u64 << (global_mi % 64);
1720                            }
1721                        }
1722                        events.push(&global_x_buf, &global_z_buf, px, py, pz);
1723                    }
1724                }
1725            }
1726
1727            batch_propagate_backward(
1728                &mut x_packed,
1729                &mut z_packed,
1730                &mut sign_packed,
1731                gate,
1732                local_targets.as_slice(),
1733                bm_words,
1734            );
1735        }
1736    }
1737
1738    let (z_lut, x_lut) = build_noise_luts(&events);
1739
1740    Ok(NoisyCompiledSampler {
1741        noiseless,
1742        events,
1743
1744        num_measurements,
1745        rng: ChaCha8Rng::seed_from_u64(seed.wrapping_add(0xCAFE_BABE)),
1746        z_lut,
1747        x_lut,
1748        #[cfg(feature = "gpu")]
1749        gpu_noise_cache: None,
1750    })
1751}
1752
1753fn compile_noisy_monolithic(
1754    circuit: &Circuit,
1755    noise: &NoiseModel,
1756    seed: u64,
1757) -> Result<NoisyCompiledSampler> {
1758    let noiseless = compile_measurements(circuit, seed)?;
1759
1760    let n = circuit.num_qubits;
1761
1762    let gate_indices: Vec<(usize, &Gate, &[usize])> = circuit
1763        .instructions
1764        .iter()
1765        .enumerate()
1766        .filter_map(|(idx, inst)| match inst {
1767            Instruction::Gate { gate, targets } => Some((idx, gate, targets.as_slice())),
1768            Instruction::Conditional { gate, targets, .. } => Some((idx, gate, targets.as_slice())),
1769            _ => None,
1770        })
1771        .collect();
1772
1773    let measurement_qubits: Vec<usize> = circuit
1774        .instructions
1775        .iter()
1776        .filter_map(|inst| match inst {
1777            Instruction::Measure { qubit, .. } => Some(*qubit),
1778            _ => None,
1779        })
1780        .collect();
1781    let num_measurements = measurement_qubits.len();
1782    let m_words = num_measurements.div_ceil(64);
1783
1784    if num_measurements == 0 {
1785        return Ok(NoisyCompiledSampler {
1786            noiseless,
1787            events: FlatNoiseSensitivity::new(m_words, 0),
1788
1789            num_measurements: 0,
1790            rng: ChaCha8Rng::seed_from_u64(seed.wrapping_add(0xA01CE)),
1791            z_lut: None,
1792            x_lut: None,
1793            #[cfg(feature = "gpu")]
1794            gpu_noise_cache: None,
1795        });
1796    }
1797
1798    let mut x_packed: Vec<Vec<u64>> = vec![vec![0u64; m_words]; n];
1799    let mut z_packed: Vec<Vec<u64>> = vec![vec![0u64; m_words]; n];
1800    let mut sign_packed: Vec<u64> = vec![0u64; m_words];
1801
1802    for (mi, &q) in measurement_qubits.iter().enumerate() {
1803        z_packed[q][mi / 64] |= 1u64 << (mi % 64);
1804    }
1805
1806    let total_noise_events: usize = noise.after_gate.iter().map(|ops| ops.len()).sum();
1807    let mut events = FlatNoiseSensitivity::new(m_words, total_noise_events);
1808
1809    for &(instr_idx, gate, targets) in gate_indices.iter().rev() {
1810        let noise_ops = &noise.after_gate[instr_idx];
1811        if !noise_ops.is_empty() {
1812            for event in noise_ops {
1813                let (px, py, pz) = event.pauli_probs();
1814                let q = event.qubit();
1815
1816                let has_any =
1817                    x_packed[q].iter().any(|&w| w != 0) || z_packed[q].iter().any(|&w| w != 0);
1818                if has_any {
1819                    events.push(&x_packed[q], &z_packed[q], px, py, pz);
1820                }
1821            }
1822        }
1823
1824        batch_propagate_backward(
1825            &mut x_packed,
1826            &mut z_packed,
1827            &mut sign_packed,
1828            gate,
1829            targets,
1830            m_words,
1831        );
1832    }
1833
1834    let (z_lut, x_lut) = build_noise_luts(&events);
1835
1836    Ok(NoisyCompiledSampler {
1837        noiseless,
1838        events,
1839
1840        num_measurements,
1841        rng: ChaCha8Rng::seed_from_u64(seed.wrapping_add(0xCAFE_BABE)),
1842        z_lut,
1843        x_lut,
1844        #[cfg(feature = "gpu")]
1845        gpu_noise_cache: None,
1846    })
1847}
1848
1849const FRAME_BATCH_SIZE: usize = 256;
1850
1851struct ReferenceInfo {
1852    outcomes: Vec<bool>,
1853    is_random: Vec<bool>,
1854    random_x_support: Vec<Vec<usize>>,
1855}
1856
1857fn reference_simulation(circuit: &Circuit, seed: u64) -> Result<ReferenceInfo> {
1858    let mut stab = StabilizerBackend::new(seed);
1859    stab.init(circuit.num_qubits, circuit.num_classical_bits)?;
1860
1861    let meas_info: Vec<(usize, usize, usize)> = circuit
1862        .instructions
1863        .iter()
1864        .enumerate()
1865        .filter_map(|(i, inst)| match inst {
1866            Instruction::Measure {
1867                qubit,
1868                classical_bit,
1869            } => Some((i, *qubit, *classical_bit)),
1870            _ => None,
1871        })
1872        .collect();
1873
1874    let num_meas = meas_info.len();
1875    if num_meas == 0 {
1876        return Ok(ReferenceInfo {
1877            outcomes: Vec::new(),
1878            is_random: Vec::new(),
1879            random_x_support: Vec::new(),
1880        });
1881    }
1882
1883    let first_meas_idx = meas_info[0].0;
1884    let all_at_end = meas_info
1885        .iter()
1886        .enumerate()
1887        .all(|(i, &(inst_idx, _, _))| inst_idx == first_meas_idx + i);
1888
1889    if all_at_end {
1890        stab.apply_gates_only(&circuit.instructions[..first_meas_idx])?;
1891
1892        let measurements: Vec<(usize, usize)> = meas_info
1893            .iter()
1894            .map(|&(_, qubit, classical_bit)| (qubit, classical_bit))
1895            .collect();
1896        let (is_random, random_x_support, outcomes) = stab.batch_measure_ref_info(&measurements);
1897
1898        return Ok(ReferenceInfo {
1899            outcomes,
1900            is_random,
1901            random_x_support,
1902        });
1903    }
1904
1905    let mut is_random = Vec::with_capacity(num_meas);
1906    let mut random_x_support: Vec<Vec<usize>> = Vec::with_capacity(num_meas);
1907
1908    let mut seg_start = 0usize;
1909    for &(meas_inst_idx, qubit, classical_bit) in &meas_info {
1910        if seg_start < meas_inst_idx {
1911            stab.apply_gates_only(&circuit.instructions[seg_start..meas_inst_idx])?;
1912        }
1913
1914        let (meas_random, support) = stab.apply_measure_with_info(qubit, classical_bit);
1915        is_random.push(meas_random);
1916        random_x_support.push(support);
1917        seg_start = meas_inst_idx + 1;
1918    }
1919
1920    if seg_start < circuit.instructions.len() {
1921        stab.apply_gates_only(&circuit.instructions[seg_start..])?;
1922    }
1923
1924    let outcomes: Vec<bool> = meas_info
1925        .iter()
1926        .map(|&(_, _, cbit)| stab.classical_results()[cbit])
1927        .collect();
1928
1929    Ok(ReferenceInfo {
1930        outcomes,
1931        is_random,
1932        random_x_support,
1933    })
1934}
1935
1936#[inline(always)]
1937fn apply_gate_to_frame(
1938    gate: &Gate,
1939    targets: &[usize],
1940    x_frame: &mut [Vec<u64>],
1941    z_frame: &mut [Vec<u64>],
1942    bw: usize,
1943) {
1944    match gate {
1945        Gate::H => {
1946            let q = targets[0];
1947            std::mem::swap(&mut x_frame[q], &mut z_frame[q]);
1948        }
1949        Gate::S | Gate::Sdg => {
1950            let q = targets[0];
1951            for w in 0..bw {
1952                z_frame[q][w] ^= x_frame[q][w];
1953            }
1954        }
1955        Gate::SX | Gate::SXdg => {
1956            let q = targets[0];
1957            for w in 0..bw {
1958                x_frame[q][w] ^= z_frame[q][w];
1959            }
1960        }
1961        Gate::X | Gate::Y | Gate::Z | Gate::Id => {}
1962        Gate::Cx => {
1963            let ctrl = targets[0];
1964            let tgt = targets[1];
1965            for w in 0..bw {
1966                x_frame[tgt][w] ^= x_frame[ctrl][w];
1967                z_frame[ctrl][w] ^= z_frame[tgt][w];
1968            }
1969        }
1970        Gate::Cz => {
1971            let q0 = targets[0];
1972            let q1 = targets[1];
1973            for w in 0..bw {
1974                z_frame[q0][w] ^= x_frame[q1][w];
1975                z_frame[q1][w] ^= x_frame[q0][w];
1976            }
1977        }
1978        Gate::Swap => {
1979            let q0 = targets[0];
1980            let q1 = targets[1];
1981            x_frame.swap(q0, q1);
1982            z_frame.swap(q0, q1);
1983        }
1984        _ => {
1985            debug_assert!(
1986                false,
1987                "apply_gate_to_frame: unhandled Clifford gate {:?}",
1988                gate
1989            );
1990        }
1991    }
1992}
1993
1994fn run_shots_noisy_frame(
1995    circuit: &Circuit,
1996    noise: &NoiseModel,
1997    num_shots: usize,
1998    seed: u64,
1999) -> Result<ShotsResult> {
2000    let n = circuit.num_qubits;
2001    let num_classical = circuit.num_classical_bits;
2002
2003    let ref_info = reference_simulation(circuit, seed)?;
2004
2005    let classical_bit_order: Vec<usize> = circuit
2006        .instructions
2007        .iter()
2008        .filter_map(|inst| match inst {
2009            Instruction::Measure { classical_bit, .. } => Some(*classical_bit),
2010            _ => None,
2011        })
2012        .collect();
2013    let num_measurements = classical_bit_order.len();
2014    let m_words = num_measurements.div_ceil(64);
2015
2016    let mut all_packed = vec![0u64; num_shots * m_words];
2017    let mut rng = ChaCha8Rng::seed_from_u64(seed.wrapping_add(0xFAAB_E001));
2018
2019    for batch_start in (0..num_shots).step_by(FRAME_BATCH_SIZE) {
2020        let batch_end = (batch_start + FRAME_BATCH_SIZE).min(num_shots);
2021        let batch_n = batch_end - batch_start;
2022        let bw = batch_n.div_ceil(64);
2023
2024        let mut x_frame: Vec<Vec<u64>> = vec![vec![0u64; bw]; n];
2025        let mut z_frame: Vec<Vec<u64>> = vec![vec![0u64; bw]; n];
2026
2027        let mut meas_idx = 0usize;
2028
2029        for (idx, inst) in circuit.instructions.iter().enumerate() {
2030            match inst {
2031                Instruction::Gate { gate, targets }
2032                | Instruction::Conditional { gate, targets, .. } => {
2033                    apply_gate_to_frame(gate, targets.as_slice(), &mut x_frame, &mut z_frame, bw);
2034
2035                    for event in &noise.after_gate[idx] {
2036                        let (px, py, pz) = event.pauli_probs();
2037                        let q = event.qubit();
2038                        let p_event = px + py + pz;
2039                        if p_event == 0.0 {
2040                            continue;
2041                        }
2042
2043                        let px_frac = px / p_event;
2044                        let pxy_frac = (px + py) / p_event;
2045
2046                        if p_event < 0.5 && batch_n >= 32 {
2047                            let ln_1mp = (1.0 - p_event).ln();
2048                            let mut pos = geometric_sample(&mut rng, ln_1mp);
2049                            while pos < batch_n {
2050                                let r: f64 = rand::Rng::random(&mut rng);
2051                                let bit = 1u64 << (pos % 64);
2052                                let w = pos / 64;
2053                                if r < px_frac {
2054                                    x_frame[q][w] ^= bit;
2055                                } else if r < pxy_frac {
2056                                    x_frame[q][w] ^= bit;
2057                                    z_frame[q][w] ^= bit;
2058                                } else {
2059                                    z_frame[q][w] ^= bit;
2060                                }
2061                                pos += 1 + geometric_sample(&mut rng, ln_1mp);
2062                            }
2063                        } else {
2064                            for s in 0..batch_n {
2065                                let r: f64 = rand::Rng::random(&mut rng);
2066                                if r < px {
2067                                    x_frame[q][s / 64] ^= 1u64 << (s % 64);
2068                                } else if r < px + py {
2069                                    x_frame[q][s / 64] ^= 1u64 << (s % 64);
2070                                    z_frame[q][s / 64] ^= 1u64 << (s % 64);
2071                                } else if r < p_event {
2072                                    z_frame[q][s / 64] ^= 1u64 << (s % 64);
2073                                }
2074                            }
2075                        }
2076                    }
2077                }
2078                Instruction::Measure {
2079                    qubit,
2080                    classical_bit: _,
2081                } => {
2082                    if ref_info.is_random[meas_idx] {
2083                        let support = &ref_info.random_x_support[meas_idx];
2084                        #[allow(clippy::needless_range_loop)]
2085                        for w in 0..bw {
2086                            let random_word: u64 = rand::Rng::random(&mut rng);
2087                            let mask = if w == bw - 1 && batch_n % 64 != 0 {
2088                                random_word & ((1u64 << (batch_n % 64)) - 1)
2089                            } else {
2090                                random_word
2091                            };
2092                            if mask != 0 {
2093                                for &q in support {
2094                                    x_frame[q][w] ^= mask;
2095                                }
2096                            }
2097                        }
2098                    }
2099
2100                    let ref_bit = ref_info.outcomes[meas_idx];
2101                    let mi_word = meas_idx / 64;
2102                    let mi_bit = meas_idx % 64;
2103                    #[allow(clippy::needless_range_loop)]
2104                    for w in 0..bw {
2105                        let frame_word = x_frame[*qubit][w];
2106                        let effective = if ref_bit { !frame_word } else { frame_word };
2107                        let num_bits = if w == bw - 1 && batch_n % 64 != 0 {
2108                            batch_n % 64
2109                        } else {
2110                            64
2111                        };
2112                        let mask = if num_bits == 64 {
2113                            effective
2114                        } else {
2115                            effective & ((1u64 << num_bits) - 1)
2116                        };
2117                        let mut bits = mask;
2118                        while bits != 0 {
2119                            let s = bits.trailing_zeros() as usize;
2120                            let gs = batch_start + w * 64 + s;
2121                            all_packed[gs * m_words + mi_word] |= 1u64 << mi_bit;
2122                            bits &= bits - 1;
2123                        }
2124                    }
2125
2126                    meas_idx += 1;
2127                }
2128                _ => {}
2129            }
2130        }
2131    }
2132
2133    let shots = unpack_and_remap(
2134        &all_packed,
2135        m_words,
2136        num_shots,
2137        &classical_bit_order,
2138        num_classical,
2139    );
2140
2141    Ok(ShotsResult {
2142        shots,
2143        num_classical_bits: circuit.num_classical_bits,
2144    })
2145}
2146
2147/// Sample a Pauli-noisy Clifford circuit through the compiled noisy sampler,
2148/// frame sampler, or homological path as appropriate.
2149///
2150/// Circuits with resets or classical conditionals fall back to brute-force
2151/// stabilizer simulation so the public API stays correct.
2152pub fn run_shots_noisy(
2153    circuit: &Circuit,
2154    noise: &NoiseModel,
2155    num_shots: usize,
2156    seed: u64,
2157) -> Result<ShotsResult> {
2158    noise.ensure_pauli_only()?;
2159    if !circuit.is_clifford_only() {
2160        return run_shots_noisy_brute_with(
2161            |s| Box::new(StabilizerBackend::new(s)),
2162            circuit,
2163            noise,
2164            num_shots,
2165            seed,
2166        );
2167    }
2168
2169    if !super::supports_compiled_measurement_sampling(circuit) {
2170        return run_shots_noisy_brute_with(
2171            |s| Box::new(StabilizerBackend::new(s)),
2172            circuit,
2173            noise,
2174            num_shots,
2175            seed,
2176        );
2177    }
2178
2179    // Try homological sampler for high shot counts, O(r_quantum + 1) per shot
2180    // when syndrome rank ≤ 20. Falls back to compiled/frame if rank too high.
2181    if num_shots >= 1000 {
2182        if let Ok(sampler) = super::homological::HomologicalSampler::compile(circuit, noise, seed) {
2183            return super::homological::run_shots_homological_inner(sampler, circuit, num_shots);
2184        }
2185    }
2186
2187    let n = circuit.num_qubits;
2188    let gate_count = circuit
2189        .instructions
2190        .iter()
2191        .filter(|i| {
2192            matches!(
2193                i,
2194                Instruction::Gate { .. } | Instruction::Conditional { .. }
2195            )
2196        })
2197        .count();
2198
2199    let depth_ratio = gate_count as f64 / n.max(1) as f64;
2200    let use_frame = depth_ratio < 3.0 || (n >= 200 && depth_ratio < 5.0);
2201
2202    if use_frame {
2203        run_shots_noisy_frame(circuit, noise, num_shots, seed)
2204    } else {
2205        run_shots_noisy_compiled(circuit, noise, num_shots, seed)
2206    }
2207}
2208
2209fn finish_noisy_compiled_run(
2210    mut sampler: NoisyCompiledSampler,
2211    circuit: &Circuit,
2212    num_shots: usize,
2213) -> Result<ShotsResult> {
2214    let classical_bit_order: Vec<usize> = circuit
2215        .instructions
2216        .iter()
2217        .filter_map(|inst| match inst {
2218            Instruction::Measure { classical_bit, .. } => Some(*classical_bit),
2219            _ => None,
2220        })
2221        .collect();
2222    let num_classical = circuit.num_classical_bits;
2223
2224    let packed = sampler.try_sample_bulk_packed(num_shots)?;
2225    let shots = unpack_and_remap_packed(&packed, num_shots, &classical_bit_order, num_classical);
2226
2227    Ok(ShotsResult {
2228        shots,
2229        num_classical_bits: circuit.num_classical_bits,
2230    })
2231}
2232
2233fn run_shots_noisy_compiled(
2234    circuit: &Circuit,
2235    noise: &NoiseModel,
2236    num_shots: usize,
2237    seed: u64,
2238) -> Result<ShotsResult> {
2239    let sampler = compile_noisy(circuit, noise, seed)?;
2240    finish_noisy_compiled_run(sampler, circuit, num_shots)
2241}
2242
2243#[cfg(feature = "gpu")]
2244pub(crate) fn run_shots_noisy_with_gpu(
2245    circuit: &Circuit,
2246    noise: &NoiseModel,
2247    num_shots: usize,
2248    seed: u64,
2249    context: std::sync::Arc<crate::gpu::GpuContext>,
2250) -> Result<ShotsResult> {
2251    noise.ensure_pauli_only()?;
2252    if !circuit.is_clifford_only() {
2253        return run_shots_noisy_brute_with(
2254            |s| Box::new(StabilizerBackend::new(s)),
2255            circuit,
2256            noise,
2257            num_shots,
2258            seed,
2259        );
2260    }
2261
2262    if !super::supports_compiled_measurement_sampling(circuit) {
2263        return run_shots_noisy_brute_with(
2264            |s| Box::new(StabilizerBackend::new(s)),
2265            circuit,
2266            noise,
2267            num_shots,
2268            seed,
2269        );
2270    }
2271
2272    if num_shots >= 1000 {
2273        if let Ok(sampler) = super::homological::HomologicalSampler::compile(circuit, noise, seed) {
2274            return super::homological::run_shots_homological_inner(sampler, circuit, num_shots);
2275        }
2276    }
2277
2278    let n = circuit.num_qubits;
2279    let gate_count = circuit
2280        .instructions
2281        .iter()
2282        .filter(|i| {
2283            matches!(
2284                i,
2285                Instruction::Gate { .. } | Instruction::Conditional { .. }
2286            )
2287        })
2288        .count();
2289
2290    let depth_ratio = gate_count as f64 / n.max(1) as f64;
2291    let use_frame = depth_ratio < 3.0 || (n >= 200 && depth_ratio < 5.0);
2292    if use_frame {
2293        return run_shots_noisy_frame(circuit, noise, num_shots, seed);
2294    }
2295
2296    let sampler = compile_noisy(circuit, noise, seed)?.with_gpu(context);
2297    finish_noisy_compiled_run(sampler, circuit, num_shots)
2298}
2299
2300pub(crate) fn run_shots_noisy_brute_with(
2301    backend_factory: impl Fn(u64) -> Box<dyn Backend>,
2302    circuit: &Circuit,
2303    noise: &NoiseModel,
2304    num_shots: usize,
2305    seed: u64,
2306) -> Result<ShotsResult> {
2307    let mut shots = Vec::with_capacity(num_shots);
2308
2309    for i in 0..num_shots {
2310        let shot_seed = seed.wrapping_add(i as u64);
2311        let mut rng = ChaCha8Rng::seed_from_u64(shot_seed);
2312        let mut backend = backend_factory(shot_seed);
2313        backend.init(circuit.num_qubits, circuit.num_classical_bits)?;
2314
2315        for (idx, instr) in circuit.instructions.iter().enumerate() {
2316            backend.apply(instr)?;
2317
2318            let noise_events = &noise.after_gate[idx];
2319            for event in noise_events {
2320                let (px, py, pz) = event.pauli_probs();
2321                let q = event.qubit();
2322                let r: f64 = rand::Rng::random(&mut rng);
2323                if r < px {
2324                    backend.apply(&Instruction::Gate {
2325                        gate: Gate::X,
2326                        targets: SmallVec::from_elem(q, 1),
2327                    })?;
2328                } else if r < px + py {
2329                    backend.apply(&Instruction::Gate {
2330                        gate: Gate::Y,
2331                        targets: SmallVec::from_elem(q, 1),
2332                    })?;
2333                } else if r < px + py + pz {
2334                    backend.apply(&Instruction::Gate {
2335                        gate: Gate::Z,
2336                        targets: SmallVec::from_elem(q, 1),
2337                    })?;
2338                }
2339            }
2340        }
2341
2342        shots.push(backend.classical_results().to_vec());
2343    }
2344
2345    Ok(ShotsResult {
2346        shots,
2347        num_classical_bits: circuit.num_classical_bits,
2348    })
2349}
2350
2351#[cfg(test)]
2352mod tests {
2353    use super::*;
2354    use crate::circuits;
2355
2356    #[test]
2357    fn noisy_ghz_produces_varied_outcomes() {
2358        let n = 10;
2359        let mut circuit = circuits::ghz_circuit(n);
2360        circuit.num_classical_bits = n;
2361        for i in 0..n {
2362            circuit.add_measure(i, i);
2363        }
2364
2365        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2366        let result = run_shots_noisy(&circuit, &noise, 1000, 42).unwrap();
2367
2368        assert_eq!(result.shots.len(), 1000);
2369        assert_eq!(result.shots[0].len(), n);
2370
2371        let all_zero: Vec<bool> = vec![false; n];
2372        let all_one: Vec<bool> = vec![true; n];
2373        let num_00 = result.shots.iter().filter(|s| **s == all_zero).count();
2374        let num_11 = result.shots.iter().filter(|s| **s == all_one).count();
2375        let num_other = 1000 - num_00 - num_11;
2376
2377        assert!(num_other > 0, "noise should produce non-GHZ outcomes");
2378        assert!(num_00 > 100, "should still have many |00...0> outcomes");
2379        assert!(num_11 > 100, "should still have many |11...1> outcomes");
2380    }
2381
2382    #[test]
2383    fn zero_noise_matches_noiseless() {
2384        let n = 5;
2385        let mut circuit = circuits::ghz_circuit(n);
2386        circuit.num_classical_bits = n;
2387        for i in 0..n {
2388            circuit.add_measure(i, i);
2389        }
2390
2391        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.0);
2392        let result = run_shots_noisy(&circuit, &noise, 100, 42).unwrap();
2393
2394        for shot in &result.shots {
2395            let all_same = shot.iter().all(|&b| b == shot[0]);
2396            assert!(all_same, "GHZ with zero noise must be all-0 or all-1");
2397        }
2398    }
2399
2400    #[test]
2401    fn noise_model_length_matches_circuit() {
2402        let n = 10;
2403        let mut circuit = circuits::ghz_circuit(n);
2404        circuit.num_classical_bits = n;
2405        for i in 0..n {
2406            circuit.add_measure(i, i);
2407        }
2408
2409        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.001);
2410        assert_eq!(noise.after_gate.len(), circuit.instructions.len());
2411    }
2412
2413    #[test]
2414    fn compiled_noisy_stats_match_brute_force() {
2415        let n = 10;
2416        let mut circuit = circuits::ghz_circuit(n);
2417        circuit.num_classical_bits = n;
2418        for i in 0..n {
2419            circuit.add_measure(i, i);
2420        }
2421
2422        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2423        let num_shots = 10000;
2424
2425        let brute = run_shots_noisy_brute_with(
2426            |s| Box::new(StabilizerBackend::new(s)),
2427            &circuit,
2428            &noise,
2429            num_shots,
2430            42,
2431        )
2432        .unwrap();
2433        let compiled = run_shots_noisy_compiled(&circuit, &noise, num_shots, 42).unwrap();
2434
2435        let count_all_same = |shots: &[Vec<bool>]| -> usize {
2436            shots
2437                .iter()
2438                .filter(|s| s.iter().all(|&b| b == s[0]))
2439                .count()
2440        };
2441
2442        let brute_coherent = count_all_same(&brute.shots);
2443        let compiled_coherent = count_all_same(&compiled.shots);
2444
2445        let brute_frac = brute_coherent as f64 / num_shots as f64;
2446        let compiled_frac = compiled_coherent as f64 / num_shots as f64;
2447
2448        assert!(
2449            (brute_frac - compiled_frac).abs() < 0.05,
2450            "coherent fraction should be similar: brute={brute_frac:.3}, compiled={compiled_frac:.3}"
2451        );
2452
2453        let count_errors = |shots: &[Vec<bool>]| -> usize {
2454            shots
2455                .iter()
2456                .filter(|s| !s.iter().all(|&b| b == s[0]))
2457                .count()
2458        };
2459
2460        let brute_errors = count_errors(&brute.shots);
2461        let compiled_errors = count_errors(&compiled.shots);
2462
2463        assert!(
2464            brute_errors > 0 && compiled_errors > 0,
2465            "both should produce errors"
2466        );
2467    }
2468
2469    #[test]
2470    fn compiled_noisy_clifford_produces_noise() {
2471        let n = 20;
2472        let circuit_base = circuits::clifford_heavy_circuit(n, 10, 42);
2473        let mut circuit = circuit_base;
2474        circuit.num_classical_bits = n;
2475        for i in 0..n {
2476            circuit.add_measure(i, i);
2477        }
2478
2479        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2480        let result = run_shots_noisy(&circuit, &noise, 100, 42).unwrap();
2481
2482        assert_eq!(result.shots.len(), 100);
2483        assert_eq!(result.shots[0].len(), n);
2484
2485        let unique: std::collections::HashSet<Vec<bool>> = result.shots.iter().cloned().collect();
2486        assert!(unique.len() > 1, "noise should produce varied outcomes");
2487    }
2488
2489    #[test]
2490    fn compile_noisy_rejects_reset_circuits() {
2491        let mut circuit = Circuit::new(1, 1);
2492        circuit.add_reset(0);
2493        circuit.add_measure(0, 0);
2494        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2495        assert!(compile_noisy(&circuit, &noise, 42).is_err());
2496    }
2497
2498    #[test]
2499    fn compile_noisy_rejects_conditionals() {
2500        let mut circuit = Circuit::new(2, 2);
2501        circuit.add_gate(Gate::H, &[0]);
2502        circuit.add_measure(0, 0);
2503        circuit.instructions.push(Instruction::Conditional {
2504            condition: crate::circuit::ClassicalCondition::BitIsOne(0),
2505            gate: Gate::X,
2506            targets: crate::circuit::smallvec![1],
2507        });
2508        circuit.add_measure(1, 1);
2509        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2510        assert!(compile_noisy(&circuit, &noise, 42).is_err());
2511    }
2512
2513    #[test]
2514    fn run_shots_noisy_handles_reset_circuits() {
2515        let mut circuit = Circuit::new(1, 1);
2516        circuit.add_gate(Gate::X, &[0]);
2517        circuit.add_reset(0);
2518        circuit.add_measure(0, 0);
2519        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.0);
2520        let result = run_shots_noisy(&circuit, &noise, 32, 42).unwrap();
2521        assert!(result.shots.iter().all(|shot| !shot[0]));
2522    }
2523
2524    #[test]
2525    fn run_shots_noisy_handles_conditionals() {
2526        let mut circuit = Circuit::new(2, 2);
2527        circuit.add_gate(Gate::H, &[0]);
2528        circuit.add_measure(0, 0);
2529        circuit.instructions.push(Instruction::Conditional {
2530            condition: crate::circuit::ClassicalCondition::BitIsOne(0),
2531            gate: Gate::X,
2532            targets: crate::circuit::smallvec![1],
2533        });
2534        circuit.add_measure(1, 1);
2535        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.0);
2536        let result = run_shots_noisy(&circuit, &noise, 256, 42).unwrap();
2537        assert!(result.shots.iter().all(|shot| shot[0] == shot[1]));
2538    }
2539
2540    #[test]
2541    fn frame_ghz_100q_produces_varied_outcomes() {
2542        let n = 100;
2543        let mut circuit = circuits::ghz_circuit(n);
2544        circuit.num_classical_bits = n;
2545        for i in 0..n {
2546            circuit.add_measure(i, i);
2547        }
2548
2549        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2550        let result = run_shots_noisy_frame(&circuit, &noise, 1000, 42).unwrap();
2551
2552        assert_eq!(result.shots.len(), 1000);
2553        assert_eq!(result.shots[0].len(), n);
2554
2555        let all_zero: Vec<bool> = vec![false; n];
2556        let all_one: Vec<bool> = vec![true; n];
2557        let num_00 = result.shots.iter().filter(|s| **s == all_zero).count();
2558        let num_11 = result.shots.iter().filter(|s| **s == all_one).count();
2559        let num_other = 1000 - num_00 - num_11;
2560
2561        assert!(num_other > 0, "noise should produce non-GHZ outcomes");
2562        assert!(num_00 > 50, "should still have many |00...0> outcomes");
2563        assert!(num_11 > 50, "should still have many |11...1> outcomes");
2564    }
2565
2566    #[test]
2567    fn frame_zero_noise_matches_noiseless_100q() {
2568        let n = 100;
2569        let mut circuit = circuits::ghz_circuit(n);
2570        circuit.num_classical_bits = n;
2571        for i in 0..n {
2572            circuit.add_measure(i, i);
2573        }
2574
2575        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.0);
2576        let result = run_shots_noisy_frame(&circuit, &noise, 100, 42).unwrap();
2577
2578        for shot in &result.shots {
2579            let all_same = shot.iter().all(|&b| b == shot[0]);
2580            assert!(all_same, "GHZ with zero noise must be all-0 or all-1");
2581        }
2582    }
2583
2584    #[test]
2585    fn frame_stats_match_compiled_ghz() {
2586        let n = 100;
2587        let mut circuit = circuits::ghz_circuit(n);
2588        circuit.num_classical_bits = n;
2589        for i in 0..n {
2590            circuit.add_measure(i, i);
2591        }
2592
2593        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2594        let num_shots = 5000;
2595
2596        let frame = run_shots_noisy_frame(&circuit, &noise, num_shots, 42).unwrap();
2597        let compiled = run_shots_noisy_compiled(&circuit, &noise, num_shots, 42).unwrap();
2598
2599        let count_coherent = |shots: &[Vec<bool>]| -> usize {
2600            shots
2601                .iter()
2602                .filter(|s| s.iter().all(|&b| b == s[0]))
2603                .count()
2604        };
2605
2606        let frame_coh = count_coherent(&frame.shots) as f64 / num_shots as f64;
2607        let compiled_coh = count_coherent(&compiled.shots) as f64 / num_shots as f64;
2608
2609        assert!(
2610            (frame_coh - compiled_coh).abs() < 0.05,
2611            "coherent fraction should be similar: frame={frame_coh:.3}, compiled={compiled_coh:.3}"
2612        );
2613    }
2614
2615    #[test]
2616    fn frame_clifford_100q_produces_noise() {
2617        let n = 100;
2618        let circuit_base = circuits::clifford_heavy_circuit(n, 10, 42);
2619        let mut circuit = circuit_base;
2620        circuit.num_classical_bits = n;
2621        for i in 0..n {
2622            circuit.add_measure(i, i);
2623        }
2624
2625        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2626        let result = run_shots_noisy_frame(&circuit, &noise, 100, 42).unwrap();
2627
2628        assert_eq!(result.shots.len(), 100);
2629        assert_eq!(result.shots[0].len(), n);
2630
2631        let unique: std::collections::HashSet<Vec<bool>> = result.shots.iter().cloned().collect();
2632        assert!(unique.len() > 1, "noise should produce varied outcomes");
2633    }
2634
2635    #[test]
2636    fn filtered_noisy_bell_pairs_matches_monolithic() {
2637        let n_pairs = 50;
2638        let n = n_pairs * 2;
2639        let mut circuit = circuits::independent_bell_pairs(n_pairs);
2640        circuit.num_classical_bits = n;
2641        for i in 0..n {
2642            circuit.add_measure(i, i);
2643        }
2644
2645        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2646        let seed = 42u64;
2647
2648        let filtered =
2649            compile_noisy_filtered(&circuit, &noise, &circuit.independent_subsystems(), seed)
2650                .unwrap();
2651        let monolithic = compile_noisy_monolithic(&circuit, &noise, seed).unwrap();
2652
2653        assert_eq!(filtered.num_measurements, monolithic.num_measurements);
2654        assert_eq!(filtered.events.len(), monolithic.events.len());
2655
2656        let mut filtered = filtered;
2657        let mut monolithic = monolithic;
2658        let num_shots = 10_000;
2659        let shots_f = filtered.sample_bulk(num_shots);
2660        let shots_m = monolithic.sample_bulk(num_shots);
2661
2662        assert_eq!(shots_f.len(), num_shots);
2663        assert_eq!(shots_m.len(), num_shots);
2664
2665        let mut agree_f = 0usize;
2666        let mut agree_m = 0usize;
2667        for shot in &shots_f {
2668            for pair in shot.chunks(2) {
2669                if pair[0] == pair[1] {
2670                    agree_f += 1;
2671                }
2672            }
2673        }
2674        for shot in &shots_m {
2675            for pair in shot.chunks(2) {
2676                if pair[0] == pair[1] {
2677                    agree_m += 1;
2678                }
2679            }
2680        }
2681
2682        let total_pairs = num_shots * n_pairs;
2683        let agree_rate_f = agree_f as f64 / total_pairs as f64;
2684        let agree_rate_m = agree_m as f64 / total_pairs as f64;
2685        assert!(
2686            agree_rate_f > 0.95,
2687            "filtered agreement rate {agree_rate_f:.4} should be >0.95 with low noise"
2688        );
2689        assert!(
2690            agree_rate_m > 0.95,
2691            "monolithic agreement rate {agree_rate_m:.4} should be >0.95 with low noise"
2692        );
2693        assert!(
2694            (agree_rate_f - agree_rate_m).abs() < 0.02,
2695            "filtered ({agree_rate_f:.4}) and monolithic ({agree_rate_m:.4}) should have similar agreement rates"
2696        );
2697    }
2698
2699    #[cfg(feature = "gpu")]
2700    #[test]
2701    fn compiled_noisy_with_stub_gpu_matches_cpu_below_threshold() {
2702        let n = 12;
2703        let mut circuit = circuits::ghz_circuit(n);
2704        circuit.num_classical_bits = n;
2705        for i in 0..n {
2706            circuit.add_measure(i, i);
2707        }
2708
2709        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2710        let shots = 20_000;
2711
2712        let mut cpu = compile_noisy(&circuit, &noise, 42).unwrap();
2713        let cpu_marginals = cpu.sample_marginals(shots);
2714
2715        let mut gpu = compile_noisy(&circuit, &noise, 42)
2716            .unwrap()
2717            .with_gpu(crate::gpu::GpuContext::stub_for_tests());
2718        let gpu_marginals = gpu.sample_marginals(shots);
2719
2720        for (idx, (cpu_p1, gpu_p1)) in cpu_marginals.iter().zip(gpu_marginals.iter()).enumerate() {
2721            assert!(
2722                (cpu_p1 - gpu_p1).abs() < 0.03,
2723                "marginal[{idx}] diverged too much: cpu={cpu_p1}, gpu={gpu_p1}"
2724            );
2725        }
2726    }
2727
2728    #[cfg(feature = "gpu")]
2729    #[test]
2730    fn compiled_noisy_with_stub_gpu_low_rank_above_threshold_uses_cpu_fallback() {
2731        let shots = crate::gpu::bts_min_shots().max(1);
2732        let n = 12;
2733        let mut circuit = circuits::ghz_circuit(n);
2734        circuit.num_classical_bits = n;
2735        for i in 0..n {
2736            circuit.add_measure(i, i);
2737        }
2738
2739        let noise = NoiseModel::uniform_depolarizing(&circuit, 0.01);
2740        let mut gpu = compile_noisy(&circuit, &noise, 42)
2741            .unwrap()
2742            .with_gpu(crate::gpu::GpuContext::stub_for_tests());
2743
2744        assert!(!gpu.noiseless.should_use_gpu_bts(shots));
2745        let counts = gpu.sample_counts(shots);
2746        let total: u64 = counts.values().sum();
2747        assert_eq!(total, shots as u64);
2748    }
2749}