gam_gpu/
encode_throughput.rs

1//! Measured device-resident throughput of the SAE/LLM batched-solve COMPONENT —
2//! the resident penalized normal-equations inner solve, NOT the full exact SAE
3//! encode (see the SCOPE section below) (#1412, #988, #1017 Phase-3).
4//!
5//! ## Why this module exists
6//!
7//! The historical throughput "decision gate" (#1412) asserted a `100_000`
8//! rows/sec/GPU deployment target **without ever measuring a device**. Its
9//! successor still keyed the deployment decision on a *CPU* measurement scaled
10//! by a hardcoded `CPU_TO_GPU_SCALING = 100.0` fudge factor — so passing the
11//! gate established nothing about real GPU throughput. #988 closed
12//! `COMPLETED` while the maintainer's own follow-up confirmed the GPU
13//! steady-state encode rate had never been measured.
14//!
15//! This module makes the measurement real and *testable as a library function*
16//! (the prior real benchmark lived only in `examples/throughput_1412.rs`, which
17//! nothing in CI ran or asserted). [`measure_resident_solve_throughput`] runs
18//! the production IRLS inner step — upload `X` once, then repeatedly solve the
19//! penalized normal equations `(XᵀWX + ridge·I)β = rhs` with the `p×p` Gram and
20//! its Cholesky factor kept DEVICE-RESIDENT, downloading only the `p`-vector
21//! `β` — on the real device, and reports the measured design-rows/sec.
22//!
23//! ## SCOPE — this is a COMPONENT benchmark, not the full exact SAE encode
24//!
25//! What is timed here is the resident penalized normal-equations *inner solve*
26//! `(XᵀWX + ridge·I)β = rhs` ONLY. That is one component of the SAE encode, NOT
27//! the full exact per-row SAE encode, and the measured rate is therefore NOT
28//! evidence for a "batched exact per-row GPU encode" title claim. The full exact
29//! encode would additionally require, per row: active-set routing (which atoms
30//! are live), the per-row latent-coordinate Newton refinement on the manifold,
31//! the assignment/gate (softmax/IBP) solve, and the certificate/fallback +
32//! reconstruction-validation path. None of those are exercised or timed by this
33//! function. Establishing the end-to-end encode-throughput claim requires a
34//! separate benchmark that times the *production encode path itself* (routing +
35//! latent-coordinate Newton + assignment/gate solve + fallback/certificate), not
36//! this inner-solve cell. Treat the number below strictly as the resident
37//! normal-equations inner-solve throughput.
38//!
39//! ## Fail-loud, never false-route
40//!
41//! The single recurring failure mode this guards against is *false GPU
42//! routing*: claiming a device measurement while the work silently ran on the
43//! CPU. [`ResidentSolveThroughput::engaged`] is `true` only when
44//! [`ResidentDesignGram::try_new`] actually staged `X` on the device AND every
45//! timed solve returned a device result. If the device path declines or fails
46//! mid-measurement, `engaged` is `false` and `measured_rows_per_sec` is left at
47//! `0.0` — a non-measurement that [`GpuThroughputVerdict`] can never report as
48//! meeting the target. There is no CPU fallback inside the measurement: a
49//! caller that wants the CPU oracle runs it separately for parity.
50
51use std::hint::black_box;
52use std::time::{Duration, Instant};
53
54use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
55
56use super::linalg_dispatch::ResidentDesignGram;
57use super::policy::{GpuThroughputVerdict, GPU_THROUGHPUT_TARGET_ROWS_PER_SEC};
58
/// Dimensions of one batched-solve work cell used by the throughput benchmark.
///
/// A cell is described by its design-row count `n` and its decoder-border
/// width `p`. The per-atom reduced-Schur block size `d` is deliberately absent:
/// it is fixed by the term and plays no part in the resident-solve throughput.
#[derive(Clone, Copy, Debug)]
pub struct EncodeShape {
    /// Short name used when reporting results for this shape.
    pub label: &'static str,
    /// Number of design rows pushed through the device per fit.
    pub n: usize,
    /// Width of the decoder border; the resident Gram matrix is `p×p`.
    pub p: usize,
}
71
72/// The canonical qwen/olmo-scale SAE residual-block shapes (matches the
73/// `examples/throughput_1412.rs` workload so the library measurement and the
74/// example agree).
75pub const CANONICAL_ENCODE_SHAPES: &[EncodeShape] = &[
76    EncodeShape {
77        label: "sae-2k-2048",
78        n: 2_000,
79        p: 2_048,
80    },
81    EncodeShape {
82        label: "sae-4k-4096",
83        n: 4_000,
84        p: 4_096,
85    },
86    EncodeShape {
87        label: "sae-8k-1024",
88        n: 8_000,
89        p: 1_024,
90    },
91];
92
93/// Outcome of measuring the device-resident penalized-solve throughput for one
94/// [`EncodeShape`].
95#[derive(Clone, Copy, Debug)]
96pub struct ResidentSolveThroughput {
97    /// The shape that was measured.
98    pub shape: EncodeShape,
99    /// `true` iff `X` was staged on the device AND every timed solve returned a
100    /// device result. `false` means the device path declined or failed — the
101    /// number below is **not** a device measurement.
102    pub engaged: bool,
103    /// Measured design-rows/sec for the resident solve, or `0.0` when the
104    /// device path did not engage (a non-measurement).
105    pub measured_rows_per_sec: f64,
106    /// The verdict comparing `measured_rows_per_sec` against
107    /// [`GPU_THROUGHPUT_TARGET_ROWS_PER_SEC`].
108    pub verdict: GpuThroughputVerdict,
109}
110
/// Advance a 64-bit linear congruential generator and return a sample in
/// `[-1, 1)`.
///
/// `state` is updated in place with wrapping arithmetic. The top 53 bits of the
/// new state are scaled to `[0, 1)` and then mapped affinely onto `[-1, 1)`.
/// There is no `rand` dependency, so a given seed reproduces the same sequence
/// on every run and the measured fixture stays stable.
fn lcg(state: &mut u64) -> f64 {
    const MULTIPLIER: u64 = 6364136223846793005;
    const INCREMENT: u64 = 1442695040888963407;
    const TWO_POW_53: f64 = (1u64 << 53) as f64;

    let advanced = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
    *state = advanced;

    // Dropping the low 11 bits leaves a 53-bit integer, which an f64 holds
    // exactly; the division therefore yields a value in [0, 1).
    let unit = (advanced >> 11) as f64 / TWO_POW_53;
    unit * 2.0 - 1.0
}
119
120/// Build a deterministic `n×p` design fixture for the throughput measurement.
121fn planted_design(n: usize, p: usize, seed: u64) -> Array2<f64> {
122    let mut s = seed;
123    Array2::from_shape_fn((n, p), |_| lcg(&mut s) * 0.05)
124}
125
126/// Measure the device-resident penalized-normal-equations solve throughput for
127/// one shape: upload `X` once, then time `reps` solves that cross only `w`
128/// (H2D), `rhs` (H2D, fixed), and `β` (D2H) — the production IRLS inner step.
129///
130/// `reps` is the number of timed solves; `w` is perturbed per rep so each solve
131/// is genuine work, mirroring an IRLS weight update. Returns a
132/// [`ResidentSolveThroughput`] whose `engaged` flag is the false-routing guard:
133/// on a CPU-only host (or if the device declines) it is `false` and the rate is
134/// `0.0`.
135#[must_use]
136pub fn measure_resident_solve_throughput(shape: EncodeShape, reps: usize) -> ResidentSolveThroughput {
137    let EncodeShape { n, p, .. } = shape;
138    let not_engaged = |shape| ResidentSolveThroughput {
139        shape,
140        engaged: false,
141        measured_rows_per_sec: 0.0,
142        verdict: GpuThroughputVerdict::from_measurement(0.0),
143    };
144    if n == 0 || p == 0 || reps == 0 {
145        return not_engaged(shape);
146    }
147
148    let x = planted_design(n, p, 0x1412_a100_dead_beef);
149    let w = {
150        let mut s = 0x988_5ae_e0c0_de01u64;
151        Array1::from_shape_fn(n, |_| lcg(&mut s).abs() + 1e-3)
152    };
153    let rhs = Array1::from_shape_fn(p, |j| ((j as f64 + 1.0) * 0.03).cos());
154    let ridge = 1e-3_f64;
155
156    // Stage X once. `None` => no device / shape below the Gram threshold => not
157    // a device measurement.
158    let handle = match ResidentDesignGram::try_new(x.view()) {
159        Some(h) => h,
160        None => return not_engaged(shape),
161    };
162
163    // Warm the resident solve (allocations, kernel handles) outside the timer;
164    // if even the warm solve declines, the device path is not usable here.
165    if handle.solve_normal_equations(w.view(), rhs.view(), ridge).is_none() {
166        return not_engaged(shape);
167    }
168
169    let mut total = Duration::ZERO;
170    for r in 0..reps {
171        let wr = Array1::from_shape_fn(n, |i| (w[i] + 1e-3 * (r as f64)).abs());
172        let start = Instant::now();
173        match handle.solve_normal_equations(wr.view(), rhs.view(), ridge) {
174            Some(beta) => {
175                black_box(beta);
176            }
177            // A mid-measurement decline means the timed region is no longer a
178            // pure device measurement — refuse to report it as one.
179            None => return not_engaged(shape),
180        }
181        total += start.elapsed();
182    }
183
184    let secs = total.as_secs_f64() / reps as f64;
185    let measured_rows_per_sec = if secs > 0.0 { n as f64 / secs } else { 0.0 };
186    ResidentSolveThroughput {
187        shape,
188        engaged: measured_rows_per_sec > 0.0,
189        measured_rows_per_sec,
190        verdict: GpuThroughputVerdict::from_measurement(measured_rows_per_sec),
191    }
192}
193
194/// CPU oracle for the same penalized normal-equations solve, used for parity:
195/// `(XᵀWX + ridge·I)β = rhs` solved by a host Cholesky. This is the definition
196/// of truth the device solve must match (up to IEEE-754 reduction order).
197#[must_use]
198pub fn cpu_oracle_normal_equations_solve(
199    x: ArrayView2<'_, f64>,
200    w: ArrayView1<'_, f64>,
201    rhs: ArrayView1<'_, f64>,
202    ridge: f64,
203) -> Array1<f64> {
204    let (n, p) = x.dim();
205    assert_eq!(w.len(), n, "w must have one entry per design row");
206    assert_eq!(rhs.len(), p, "rhs must have one entry per border column");
207
208    // Gram = Xᵀ diag(w) X + ridge·I, formed in f64 as (√w⊙X)ᵀ(√w⊙X) via the
209    // BLAS-backed `dot` (the scalar triple loop is O(n·p²) and dominates the
210    // oracle at p in the thousands). Folding √w into both factors keeps the
211    // weighting exact: row i contributes wᵢ·xᵢₐ·xᵢᵦ as (√wᵢxᵢₐ)(√wᵢxᵢᵦ).
212    let mut xw = x.to_owned();
213    for i in 0..n {
214        let sw = w[i].sqrt();
215        for a in 0..p {
216            xw[[i, a]] *= sw;
217        }
218    }
219    let mut gram = xw.t().dot(&xw);
220    for j in 0..p {
221        gram[[j, j]] += ridge;
222    }
223
224    // Cholesky: gram = L Lᵀ (lower), then solve L y = rhs, Lᵀ β = y.
225    let mut l = Array2::<f64>::zeros((p, p));
226    for j in 0..p {
227        let mut diag = gram[[j, j]];
228        for s in 0..j {
229            diag -= l[[j, s]] * l[[j, s]];
230        }
231        // The oracle exists to be the truth the device is checked against, so a
232        // non-PD pivot must fail loudly here rather than clamp to 0 and launder
233        // a divide-by-zero into a silent NaN in the back-substitution. For the
234        // ridge·I + XᵀWX systems this is called on (ridge > 0, w > 0) the pivot
235        // is always strictly positive; a non-positive pivot means the caller
236        // passed a degenerate system and parity would be meaningless.
237        assert!(
238            diag > 0.0,
239            "cpu_oracle: non-positive Cholesky pivot {diag:.3e} at index {j} — \
240             the Gram is not positive-definite (need ridge>0 and w>0)"
241        );
242        let ljj = diag.sqrt();
243        l[[j, j]] = ljj;
244        for i in (j + 1)..p {
245            let mut off = gram[[i, j]];
246            for s in 0..j {
247                off -= l[[i, s]] * l[[j, s]];
248            }
249            l[[i, j]] = off / ljj;
250        }
251    }
252    let mut y = rhs.to_owned();
253    for i in 0..p {
254        let mut acc = y[i];
255        for s in 0..i {
256            acc -= l[[i, s]] * y[s];
257        }
258        y[i] = acc / l[[i, i]];
259    }
260    let mut beta = y;
261    for i in (0..p).rev() {
262        let mut acc = beta[i];
263        for s in (i + 1)..p {
264            acc -= l[[s, i]] * beta[s];
265        }
266        beta[i] = acc / l[[i, i]];
267    }
268    beta
269}
270
271/// The deployment target, re-exported so callers measuring throughput do not
272/// have to import the policy module directly.
273pub const DEPLOYMENT_TARGET_ROWS_PER_SEC: f64 = GPU_THROUGHPUT_TARGET_ROWS_PER_SEC;
274
275// ===========================================================================
276// FULL exact per-row encode throughput + correctness (#1412 follow-up).
277//
278// The component benchmark above times ONLY the resident normal-equations inner
279// solve `(XᵀWX+ridge·I)β=rhs` and is explicit (see the SCOPE section) that this
280// is NOT the full exact per-row SAE encode. The pieces below are the reusable,
281// gam-sae-free instrument for benchmarking the *full* production encode path
282// end-to-end — active-set/chart routing + per-row latent-coordinate Newton +
283// gate/assignment (amplitude) + Kantorovich certificate/fallback +
284// reconstruction. They live here (CPU-linkable, no `gam-sae` dependency: this
285// crate is *below* `gam-sae`) so the timing harness and the correctness gate
286// are shared, while the driver that actually calls the production
287// `EncodeAtlas::certified_encode_batch` lives in
288// `crates/gam-gpu/tests/encode_full_path_throughput.rs` (a dev-dependency cycle
289// onto `gam-sae`, allowed by cargo for test-only edges).
290//
291// HONEST DEVICE STATUS. There is currently NO device-resident exact-encode
292// kernel: the production `certified_encode_*` path is per-row host ndarray work
293// (the only SAE GPU kernel, `gam_sae::gpu_kernels::sae_rowjet`, accelerates the
294// *fitting* reconstruction-jet tower, not the encode). So the
295// [`FullEncodeThroughput::device_encode_engaged`] flag is `false` even on a GPU
296// host until such a kernel exists. This benchmark therefore does NOT yet
297// substantiate a device "batched exact per-row GPU encode" number — by design,
298// it refuses to fabricate one (the same fail-loud, never-false-route discipline
299// as the component benchmark). What it DOES establish is the real end-to-end
300// encode throughput (CPU today) and a correctness contract — support agreement,
301// coordinate error, reconstruction explained-variance, and fallback rate
302// against the production CPU encode — that any future device encode must match.
303// ===========================================================================
304
/// Throughput record for one timed batch of the FULL exact per-row encode.
///
/// Unlike [`ResidentSolveThroughput`], which covers only the inner solve, the
/// rate here is `n_rows / encode_secs` for the whole production
/// `certified_encode_batch` call: routing, per-row Newton, certificate,
/// fallback, and the per-row reconstruction selection are all inside the timed
/// region supplied by the caller.
#[derive(Clone, Copy, Debug)]
pub struct FullEncodeThroughput {
    /// Number of rows encoded in the timed batch.
    pub n_rows: usize,
    /// Wall-clock duration of the full encode, in seconds.
    pub encode_secs: f64,
    /// `n_rows / encode_secs`; `0.0` when there are no rows or the elapsed
    /// time is not positive.
    pub rows_per_sec: f64,
    /// False-routing guard: `true` ONLY when a device-resident exact-encode
    /// kernel actually performed the encode. No such kernel exists yet, so the
    /// flag is `false` even on a GPU host, which keeps a CPU encode rate from
    /// being reported as a device measurement.
    pub device_encode_engaged: bool,
}

impl FullEncodeThroughput {
    /// Derive a throughput record from a measured wall-clock duration.
    ///
    /// * `n_rows` — rows encoded during `elapsed`.
    /// * `elapsed` — wall-clock time of the encode.
    /// * `device_encode_engaged` — the caller's honest statement that a
    ///   device-resident encode kernel produced the result; pass `false` for
    ///   the host encode path. The value is stored as given.
    ///
    /// An empty batch or a zero elapsed time is recorded as a rate of `0.0`
    /// (a non-measurement) rather than as an infinite or NaN rate.
    #[must_use]
    pub fn from_elapsed(n_rows: usize, elapsed: Duration, device_encode_engaged: bool) -> Self {
        let encode_secs = elapsed.as_secs_f64();
        let rows_per_sec = match (n_rows, encode_secs > 0.0) {
            (0, _) | (_, false) => 0.0,
            (rows, true) => rows as f64 / encode_secs,
        };
        Self {
            n_rows,
            encode_secs,
            rows_per_sec,
            device_encode_engaged,
        }
    }
}
346
/// Correctness summary for an encode result, judged against the production
/// per-row CPU encode and against the reconstruction the result implies.
///
/// A "batched exact per-row encode" claim rests on three things, and the
/// fields cover each: agreement with the per-row reference (support flags and
/// coordinates), quality of the reconstruction (absolute error and explained
/// variance), and an honest count of rows that could not be certified
/// (fallback rate).
#[derive(Clone, Copy, Debug)]
pub struct EncodeQualityMetrics {
    /// Number of rows that were compared.
    pub n_rows: usize,
    /// Number of rows the encode-under-test certified (`h ≤ ½`,
    /// exact-into-the-ball).
    pub certified_rows: usize,
    /// The "fallback rate": the share of rows the encode-under-test could NOT
    /// certify and therefore flagged for the multi-start fallback, i.e.
    /// `1 - certified_rows/n_rows`.
    pub fallback_rate: f64,
    /// Share of rows whose certificate flag matches the per-row reference
    /// encode. A correct batched encode is the per-row encode fanned out, so
    /// this should be `1.0`.
    pub support_agreement: f64,
    /// Maximum absolute latent-coordinate difference between the
    /// encode-under-test and the per-row reference, taken over every row and
    /// coordinate dimension. Round-off level (≈ `0`) for a correct batched
    /// encode.
    pub max_coord_abs_err: f64,
    /// Maximum absolute element-wise reconstruction residual `|x̂ − x|` across
    /// the batch, in raw output units (the "amplitude"/reconstruction error).
    pub max_reconstruction_abs_err: f64,
    /// Explained variance of the reconstruction,
    /// `1 − ‖X − X̂‖²_F / ‖X − X̄‖²_F`, where each output column is centered by
    /// its own mean `X̄`. `1.0` means a perfect on-manifold reconstruction and
    /// `0.0` means no better than predicting the per-column mean.
    pub reconstruction_ev: f64,
}
380
381/// Compute [`EncodeQualityMetrics`] for an encode result.
382///
383/// * `coords` / `certified` — the encode UNDER TEST (`n×d` coords, `n` flags).
384/// * `coords_ref` / `certified_ref` — the production per-row reference encode
385///   (the definition of truth the batched/accelerated encode must match).
386/// * `reconstruction` — the decoded reconstruction `x̂` implied by `coords`
387///   (`n×p`, i.e. `amplitudeᵢ · Φ(coordsᵢ) · B`).
388/// * `targets` — the encode inputs `x` (`n×p`).
389///
390/// Panics on a shape mismatch: this is a benchmark/correctness helper and a
391/// mismatched comparison would silently launder a wrong number.
392#[must_use]
393pub fn encode_quality_metrics(
394    coords: ArrayView2<'_, f64>,
395    certified: &[bool],
396    coords_ref: ArrayView2<'_, f64>,
397    certified_ref: &[bool],
398    reconstruction: ArrayView2<'_, f64>,
399    targets: ArrayView2<'_, f64>,
400) -> EncodeQualityMetrics {
401    let (n, d) = coords.dim();
402    assert_eq!(
403        coords_ref.dim(),
404        (n, d),
405        "encode_quality_metrics: reference coords shape {:?} != under-test {:?}",
406        coords_ref.dim(),
407        (n, d)
408    );
409    assert_eq!(certified.len(), n, "certified flags must have one entry per row");
410    assert_eq!(
411        certified_ref.len(),
412        n,
413        "reference certified flags must have one entry per row"
414    );
415    let (nt, p) = targets.dim();
416    assert_eq!(nt, n, "targets must have one row per encoded row");
417    assert_eq!(
418        reconstruction.dim(),
419        (n, p),
420        "reconstruction shape {:?} != targets {:?}",
421        reconstruction.dim(),
422        (n, p)
423    );
424
425    let certified_rows = certified.iter().filter(|c| **c).count();
426    let fallback_rate = if n > 0 {
427        1.0 - certified_rows as f64 / n as f64
428    } else {
429        0.0
430    };
431
432    let agree = certified
433        .iter()
434        .zip(certified_ref.iter())
435        .filter(|(a, b)| a == b)
436        .count();
437    let support_agreement = if n > 0 { agree as f64 / n as f64 } else { 1.0 };
438
439    let mut max_coord_abs_err = 0.0_f64;
440    for i in 0..n {
441        for j in 0..d {
442            max_coord_abs_err = max_coord_abs_err.max((coords[[i, j]] - coords_ref[[i, j]]).abs());
443        }
444    }
445
446    // Reconstruction error + explained variance (per-column centering).
447    let mut max_reconstruction_abs_err = 0.0_f64;
448    let mut ss_res = 0.0_f64;
449    let mut ss_tot = 0.0_f64;
450    for c in 0..p {
451        let mut mean = 0.0_f64;
452        for i in 0..n {
453            mean += targets[[i, c]];
454        }
455        if n > 0 {
456            mean /= n as f64;
457        }
458        for i in 0..n {
459            let resid = reconstruction[[i, c]] - targets[[i, c]];
460            max_reconstruction_abs_err = max_reconstruction_abs_err.max(resid.abs());
461            ss_res += resid * resid;
462            let centered = targets[[i, c]] - mean;
463            ss_tot += centered * centered;
464        }
465    }
466    let reconstruction_ev = if ss_tot > 0.0 {
467        1.0 - ss_res / ss_tot
468    } else {
469        // Degenerate (all targets equal their column mean): a perfect
470        // reconstruction is EV 1, anything else is 0 rather than a NaN.
471        if ss_res == 0.0 { 1.0 } else { 0.0 }
472    };
473
474    EncodeQualityMetrics {
475        n_rows: n,
476        certified_rows,
477        fallback_rate,
478        support_agreement,
479        max_coord_abs_err,
480        max_reconstruction_abs_err,
481        reconstruction_ev,
482    }
483}
484
#[cfg(test)]
mod full_encode_metric_tests {
    use super::*;
    use ndarray::array;

    /// The rate is rows divided by seconds, and a zero duration is reported as
    /// a non-measurement instead of an infinite rate.
    #[test]
    fn throughput_is_rows_over_seconds_and_guards_degenerate_time() {
        let timed = FullEncodeThroughput::from_elapsed(8_000, Duration::from_millis(100), false);
        assert_eq!(timed.n_rows, 8_000);
        assert!(!timed.device_encode_engaged);
        // 8000 rows in 0.1 s is 80_000 rows/sec.
        assert!(
            (timed.rows_per_sec - 80_000.0).abs() < 1.0,
            "got {}",
            timed.rows_per_sec
        );

        let instant = FullEncodeThroughput::from_elapsed(8_000, Duration::ZERO, false);
        assert_eq!(instant.rows_per_sec, 0.0);
    }

    /// Comparing an encode with itself, with a reconstruction equal to the
    /// targets, gives full agreement, zero error, and unit explained variance.
    #[test]
    fn perfect_match_scores_full_agreement_and_unit_ev() {
        // Two rows, one latent dimension, two output dimensions.
        let latent = array![[0.10], [0.40]];
        let inputs = array![[1.0, 0.0], [0.0, 1.0]];
        let all_certified = [true, true];

        let metrics = encode_quality_metrics(
            latent.view(),
            &all_certified,
            latent.view(),
            &all_certified,
            inputs.view(),
            inputs.view(),
        );

        assert_eq!(metrics.n_rows, 2);
        assert_eq!(metrics.certified_rows, 2);
        assert_eq!(metrics.fallback_rate, 0.0);
        assert_eq!(metrics.support_agreement, 1.0);
        assert_eq!(metrics.max_coord_abs_err, 0.0);
        assert_eq!(metrics.max_reconstruction_abs_err, 0.0);
        assert!((metrics.reconstruction_ev - 1.0).abs() < 1e-12);
    }

    /// A disagreement in flags, coordinates, or reconstruction shows up in the
    /// corresponding metric.
    #[test]
    fn divergence_is_surfaced_in_every_axis() {
        let latent = array![[0.10], [0.40]];
        // Row 1 of the reference sits 0.10 away from the encode under test.
        let latent_ref = array![[0.10], [0.50]];
        let inputs = array![[1.0, 0.0], [0.0, 1.0]];
        // One reconstructed element misses its target by 0.25.
        let decoded = array![[1.0, 0.0], [0.0, 0.75]];
        // Row 1 is uncertified under test, while the reference certified both.
        let flags_under_test = [true, false];
        let flags_reference = [true, true];

        let metrics = encode_quality_metrics(
            latent.view(),
            &flags_under_test,
            latent_ref.view(),
            &flags_reference,
            decoded.view(),
            inputs.view(),
        );

        assert_eq!(metrics.certified_rows, 1);
        assert!((metrics.fallback_rate - 0.5).abs() < 1e-12);
        // Only row 0's flags agree.
        assert!((metrics.support_agreement - 0.5).abs() < 1e-12);
        assert!((metrics.max_coord_abs_err - 0.10).abs() < 1e-12);
        assert!((metrics.max_reconstruction_abs_err - 0.25).abs() < 1e-12);
        assert!(metrics.reconstruction_ev < 1.0);
    }
}
gam_gpu/encode_throughput.rs

gam_gpu/
encode_throughput.rs