realizar 0.8.4

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
//! CUDA-accelerated quantized model
//!
//! This module provides GPU-accelerated inference for quantized models
//! using NVIDIA CUDA.
//!
//! # Architecture
//!
//! `OwnedQuantizedModelCuda` wraps an `OwnedQuantizedModel` with a CUDA executor
//! for GPU-accelerated matrix operations. Key features:
//!
//! - GPU-resident KV cache (avoids CPU→GPU transfer per token)
//! - Fused attention kernels
//! - Pre-cached quantized weights
//! - Batch generation support
//!
//! # Module Structure
//!
//! - `backend.rs`: CUDA kernel configuration and PTX generation (CudaBackend)
//! - `forward.rs`: Forward pass methods (single token, cached, GPU-resident)
//! - `generation.rs`: Token generation loops (basic, cached, streaming, batch)
//! - `speculative.rs`: Speculative decoding (self-speculative, draft model)
//! - `weights.rs`: Weight management (pre-caching, GPU upload)
//!
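//! # Environment Variables
//!
//! Tuning knobs read by this module:
//!
//! - `CUDA_MAX_BATCH`: maximum decode batch size; auto-sized at init when unset (PMAT-399)
//! - `HGEMM_PREFILL=0`: skip FP16 weight-cache warmup for HGEMM prefill (PMAT-037)
//! - `FORCE_FP16_CACHE=1`: keep the FP16 cache on unified-memory GPUs (PMAT-409)
//! - `REALIZR_NO_FP8_CACHE=1`: skip FP8 weight-cache warmup (GH-286)
//! - `SKIP_PARITY_GATE=1`: skip the load-time GPU/CPU parity gate (debugging only)
//!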
//! # Example
//!
//! ```rust,ignore
//! use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda};
//!
//! let model = OwnedQuantizedModel::from_mapped(&mapped)?;
//! let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)?; // GPU 0
//!
//! // GPU-accelerated forward pass
//! let logits = cuda_model.forward_cuda(&tokens)?;
//! ```

mod backend;
mod forward;
mod generation;
mod speculative;
mod weights;

// Re-export types for public API
pub use backend::CudaBackend;
// PMAT-072: Step-wise batched decode state for lock-releasing scheduler
pub use generation::BatchedDecodeState;

use crate::error::{RealizarError, Result};

// Import types from peer modules (parent of cuda/)
use super::model::OwnedQuantizedModel;
use super::quantized::{OwnedQKVWeights, OwnedQuantizedTensor};
use super::runtime::{OwnedQuantizedKVCache, QuantizedGenerateConfig};
use super::utils::verbose;

// =============================================================================
// IMP-800: CUDA-Accelerated Model Wrapper
// =============================================================================

/// Error from CUDA model initialization that preserves the unconsumed model.
///
/// When `OwnedQuantizedModelCuda::new()` fails, the model is returned inside this error
/// so callers can fall back to CPU without an expensive 1GB clone.
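///
/// # Example
///
/// A minimal fallback sketch (the CPU inference path itself is assumed):
///
/// ```rust,ignore
/// match OwnedQuantizedModelCuda::with_max_seq_len(model, 0, 2048) {
///     Ok(cuda_model) => { /* GPU-accelerated inference */ },
///     Err(init_err) => {
///         eprintln!("CUDA unavailable: {init_err}");
///         let model = init_err.into_model(); // recover without cloning
///         // ... continue with CPU inference on `model`
///     },
/// }
/// ```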
pub struct CudaInitError {
    /// The initialization error
    pub error: RealizarError,
    /// The unconsumed model, returned for CPU fallback
    model: OwnedQuantizedModel,
}

impl CudaInitError {
    /// Extract the unconsumed model for CPU fallback
    #[must_use]
    pub fn into_model(self) -> OwnedQuantizedModel {
        self.model
    }
}

impl std::fmt::Display for CudaInitError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.error)
    }
}

impl std::fmt::Debug for CudaInitError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "CudaInitError({:?})", self.error)
    }
}

/// CUDA-accelerated wrapper for `OwnedQuantizedModel` (IMP-800a)
///
/// Provides GPU-accelerated forward pass using NVIDIA CUDA via trueno-gpu.
/// Caches the CudaExecutor to avoid initialization overhead (~50ms) per call.
///
/// # Example
///
/// ```rust,ignore
/// use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda};
///
/// let model = OwnedQuantizedModel::from_mapped(&mapped)?;
/// let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)?; // GPU 0
///
/// // GPU-accelerated forward pass
/// let logits = cuda_model.forward_cuda(&tokens)?;
/// ```
pub struct OwnedQuantizedModelCuda {
    /// Inner model
    pub(crate) model: OwnedQuantizedModel,
    /// Cached CUDA executor
    pub(crate) executor: crate::cuda::CudaExecutor,
    /// GPU device name
    device_name: String,
    /// GPU memory (free, total) in bytes
    memory_info: (usize, usize),
    /// PAR-083: Pre-allocated embedding buffer to eliminate per-token heap allocation.
    /// Five-Whys root cause: embed() allocates Vec<f32> per token (~14KB for 7B).
    /// Fix: Reuse this buffer with embed_into().
    embed_buf: Vec<f32>,
    /// realizr#199 (PMAT-450): Prefix cache for prompt KV reuse.
    /// Stores GPU KV cache snapshots keyed by prompt tokens.
    /// On cache hit, skip prefill entirely (TTFT ~900ms → ~5ms).
    #[cfg(feature = "gpu")]
    prefix_cache: crate::gguf::batch_scheduler::PrefixCache,
}

impl OwnedQuantizedModelCuda {
    /// Create a new CUDA-accelerated model wrapper
    ///
    /// # Arguments
    ///
    /// * `model` - The quantized model to wrap
    /// * `device_ordinal` - GPU device index (0 for first GPU)
    ///
    /// # Errors
    ///
    /// Returns an error if CUDA is not available or the device doesn't exist.
    /// The model is consumed on failure; use `with_max_seq_len` when you need the
    /// model back for CPU fallback.
    pub fn new(model: OwnedQuantizedModel, device_ordinal: i32) -> Result<Self> {
        Self::with_max_seq_len(model, device_ordinal, 2048).map_err(|e| e.error)
    }

    /// GH-280: Check GPU capability before allocation.
    /// Extracted to reduce cognitive complexity of `with_max_seq_len`.
    fn check_gpu_capability(
        model: OwnedQuantizedModel,
    ) -> std::result::Result<OwnedQuantizedModel, CudaInitError> {
        let required = crate::capability::required_ops(&model.config.constraints);
        let supported = crate::capability::gpu_supported_ops();
        if let Err(missing) = crate::capability::check_capability(&required, &supported) {
            let missing_names: Vec<String> = missing.iter().map(ToString::to_string).collect();
            return Err(CudaInitError {
                error: RealizarError::CapabilityMismatch {
                    architecture: model.config.architecture.clone(),
                    missing_ops: missing_names.join(", "),
                    suggestion: "Model will use CPU inference. To add GPU support, implement the missing kernels in trueno.".to_string(),
                },
                model,
            });
        }
        Ok(model)
    }

    /// GH-199/PARITY-GATE: Preload GPU weights and verify correctness.
    /// Extracted to reduce cognitive complexity of `with_max_seq_len`.
    fn preload_and_verify(mut self) -> std::result::Result<Self, CudaInitError> {
        if !self.supports_gpu_resident() {
            return Ok(self);
        }

        // GH-199 ROOT CAUSE B: Eagerly preload weights for GPU-resident path.
        if let Err(e) = self.preload_weights_gpu() {
            return Err(CudaInitError {
                error: e,
                model: self.into_model(),
            });
        }

        // PMAT-037: Eagerly warm FP16 weight cache for HGEMM prefill.
        // Five-Whys root cause: FP16 cache lazily populated on first inference
        // request (303ms cold-start), inflating TTFT P50 from 25ms to 50.9ms.
        // Pre-populating at model init moves cost to startup (one-time).
        // PMAT-067: Skip FP16 cache when FP8 is active — saves ~1.5 GB VRAM.
        // PMAT-400: Skip FP16 cache on unified memory (cc>=120) — saves 61 GB for 32B model.
        // PMAT-409: Override with FORCE_FP16_CACHE=1 for 7B on GB10 (2.9 GB, restores prefill perf).
        let skip_fp16_unified = self.executor.gpu_profile.cc >= 120
            && std::env::var("FORCE_FP16_CACHE").as_deref() != Ok("1");
        if std::env::var("HGEMM_PREFILL").as_deref() != Ok("0")
            && !self.executor.gpu_profile.fp8_prefill
            && !skip_fp16_unified
        {
            let num_layers = self.model.config.num_layers;
            let hidden_dim = self.model.config.hidden_dim as u32;
            let intermediate_dim = self.model.config.intermediate_dim as u32;
            let vocab_size = self.model.config.vocab_size as u32;
            if let Err(e) = self.executor.ensure_cublas() {
                eprintln!("[PMAT-037] cuBLAS init failed (non-fatal): {e}");
            } else if let Err(e) = self.executor.warmup_hgemm_cache(
                num_layers,
                hidden_dim,
                intermediate_dim,
                vocab_size,
            ) {
                eprintln!("[PMAT-037] FP16 cache warmup failed (non-fatal): {e}");
            }
        }

        // PMAT-053/067: FP8 E4M3 weight cache warmup (auto-enabled on sm_89+)
        // GH-286: Skip if --no-fp8-cache (saves ~1.5 GB RSS)
        let no_fp8 = std::env::var("REALIZR_NO_FP8_CACHE").as_deref() == Ok("1");
        if self.executor.gpu_profile.fp8_prefill && !no_fp8 {
            let num_layers = self.model.config.num_layers;
            let hidden_dim = self.model.config.hidden_dim as u32;
            let intermediate_dim = self.model.config.intermediate_dim as u32;
            let vocab_size = self.model.config.vocab_size as u32;
            if let Err(e) =
                self.executor
                    .warmup_fp8_cache(num_layers, hidden_dim, intermediate_dim, vocab_size)
            {
                eprintln!("[PMAT-053] FP8 cache warmup failed (non-fatal): {e}");
            }
        }

        // PMAT-091: Interleaved Q4K weight cache warmup (W4A16 WMMA GEMM)
        if self.executor.gpu_profile.w4a16_interleaved {
            let num_layers = self.model.config.num_layers;
            let hidden_dim = self.model.config.hidden_dim as u32;
            let intermediate_dim = self.model.config.intermediate_dim as u32;
            let vocab_size = self.model.config.vocab_size as u32;
            if let Err(e) = self.executor.warmup_interleaved_cache(
                num_layers,
                hidden_dim,
                intermediate_dim,
                vocab_size,
            ) {
                eprintln!("[PMAT-091] Interleaved cache warmup failed (non-fatal): {e}");
            }
        }

        // GH-181: Reinitialize workspace after cache warmup (FP8/FP16/interleaved).
        // Cache allocations can relocate workspace buffers, causing the parity
        // gate's forward pass to read stale pointers (cosine -0.28 on RTX 4060).
        {
            let hidden_dim = self.model.config.hidden_dim;
            let intermediate_dim = self.model.config.intermediate_dim;
            self.executor.force_workspace_reinit();
            if let Err(e) = self.executor.init_workspace(hidden_dim, intermediate_dim) {
                eprintln!("[GH-181] Workspace reinit failed (non-fatal): {e}");
            }
        }

        // PARITY-GATE: Jidoka — stop-the-line if GPU diverges from CPU.
        // Run ONE token through both backends and compare logits.
        // If cosine similarity < PARITY_GATE_COSINE_MIN (0.98), refuse to construct.
        // Skip gate if SKIP_PARITY_GATE=1 (for debugging the gate itself)
        let skip_gate = std::env::var("SKIP_PARITY_GATE")
            .map(|v| v == "1")
            .unwrap_or(false);

        if !skip_gate {
            if let Err(e) = parity_gate(&mut self) {
                return Err(CudaInitError {
                    error: e,
                    model: self.into_model(),
                });
            }
        }

        Ok(self)
    }

    /// Create a new CUDA-accelerated model wrapper with a custom maximum sequence length.
    ///
    /// # Arguments
    ///
    /// * `model` - The quantized model to wrap
    /// * `device_ordinal` - GPU device index (0 for first GPU)
    /// * `max_seq_len` - Maximum sequence length for GPU KV cache (PAR-018)
    ///
    /// # Errors
    ///
    /// Returns `CudaInitError` containing both the error and the unconsumed model,
    /// allowing callers to recover the model for CPU fallback without cloning.
    pub fn with_max_seq_len(
        model: OwnedQuantizedModel,
        device_ordinal: i32,
        max_seq_len: usize,
    ) -> std::result::Result<Self, CudaInitError> {
        use crate::cuda::CudaExecutor;

        // GH-279: Contract gate — validate architecture and dimensions before CUDA init
        if let Err(e) = crate::contract_gate::validate_model_load_basic(
            &model.config.architecture,
            model.config.num_layers,
            model.config.hidden_dim,
            model.config.num_heads,
            model.config.num_kv_heads,
            model.config.intermediate_dim,
            model.config.vocab_size,
        ) {
            return Err(CudaInitError {
                error: crate::contract_gate::gate_error(e),
                model,
            });
        }

        // GH-280: Capability gate — refuse GPU if model requires unsupported ops.
        let model = Self::check_gpu_capability(model)?;

        let mut executor = match CudaExecutor::new(device_ordinal) {
            Ok(e) => e,
            Err(e) => {
                return Err(CudaInitError {
                    error: RealizarError::UnsupportedOperation {
                        operation: "CudaExecutor::new".to_string(),
                        reason: format!("CUDA initialization failed: {e}"),
                    },
                    model,
                });
            },
        };

        let device_name = executor
            .device_name()
            .unwrap_or_else(|_| "Unknown GPU".to_string());
        let memory_info = executor.memory_info().unwrap_or((0, 0));

        // PAR-018: Initialize GPU-resident KV cache for attention acceleration
        // This avoids ~66 MB CPU→GPU transfer per token for TinyLlama
        let num_layers = model.layers.len();
        let num_heads = model.config.num_heads;
        let num_kv_heads = model.config.num_kv_heads; // PAR-021 GQA support
        let head_dim = model.config.hidden_dim / num_heads;

        if let Err(e) =
            executor.init_kv_cache_gpu(num_layers, num_heads, num_kv_heads, head_dim, max_seq_len)
        {
            return Err(CudaInitError {
                error: RealizarError::UnsupportedOperation {
                    operation: "init_kv_cache_gpu".to_string(),
                    reason: format!("GPU KV cache initialization failed: {e}"),
                },
                model,
            });
        }

        // PMAT-399: Auto-size max_batch if env var not set
        if std::env::var("CUDA_MAX_BATCH").is_err() {
            let auto_batch = executor.compute_max_batch_for_memory(
                num_layers,
                num_kv_heads,
                head_dim,
                max_seq_len,
            );
            eprintln!("[PMAT-399] Auto-sized CUDA_MAX_BATCH={auto_batch} (no env var set)");
            // Store for scheduler to pick up
            std::env::set_var("CUDA_MAX_BATCH", auto_batch.to_string());
        }

        // PAR-118: Initialize Flash Decoding for split-K attention acceleration.
        // Five-Whys: batched_incremental_attention uses Grid=(num_heads,M,1) Block=(32,1,1)
        // = only 896 threads on RTX 4090 for 7B (28 heads). Flash Decoding splits KV cache
        // into chunks processed in parallel, achieving 1.5-2x decode speedup.
        // NOTE: flash_decode_enabled is used by BOTH the graphed decode path
        // (flash_decoding_graphed in attention.rs) and the batched path (batched.rs).
        // The batched path uses threshold 1024 to avoid triggering during normal prefill.
        if let Err(e) = executor.init_flash_decoding(num_heads, head_dim, max_seq_len, 1) {
            if verbose() {
                eprintln!(
                    "[PAR-118] Flash Decoding init failed: {e}, falling back to sequential attention"
                );
            }
            // Non-fatal: sequential attention still works, just slower
        }

        // PAR-060: Set RoPE theta for position embeddings
        if verbose() {
            eprintln!(
                "[PAR-060] Setting rope_theta = {} for GPU path",
                model.config.rope_theta
            );
        }
        executor.set_rope_theta(model.config.rope_theta);

        // CORRECTNESS-011: Set rope_type for correct RoPE style (NORM vs NEOX)
        if verbose() {
            eprintln!(
                "[CORRECTNESS-011] Setting rope_type = {} for GPU path (0=NORM, 2=NEOX)",
                model.config.rope_type
            );
        }
        executor.set_rope_type(model.config.rope_type);

        // GH-129: Pre-load ALL kernel modules BEFORE heavy GPU allocations.
        // Five-Whys root cause: PTX JIT compilation requires GPU memory for the
        // JIT compiler itself. On Jetson (unified memory), weight upload consumes
        // ~1 GB, leaving less for JIT. Moving kernel preload here (before weight
        // upload) ensures JIT runs with maximum available GPU memory.
        {
            let hidden_dim = model.config.hidden_dim as u32;
            let intermediate_dim = model.config.intermediate_dim as u32;
            let vocab_size = model.config.vocab_size as u32;
            match executor.preload_modules_for_capture(
                num_layers,
                hidden_dim,
                intermediate_dim,
                vocab_size,
            ) {
                Ok(()) => eprintln!(
                    "[GH-129] Early kernel preload: {} modules compiled",
                    executor.module_count()
                ),
                Err(e) => {
                    eprintln!("[GH-129] Early kernel preload failed: {e}");
                },
            }
        }

        // PAR-083: Pre-allocate embedding buffer (hidden_dim f32s) to avoid per-token malloc.
        let embed_buf = vec![0.0f32; model.config.hidden_dim];

        let cuda_model = Self {
            model,
            executor,
            device_name,
            memory_info,
            embed_buf,
            #[cfg(feature = "gpu")]
            prefix_cache: crate::gguf::batch_scheduler::PrefixCache::new(16),
        };

        // GH-199 ROOT CAUSE B + PARITY-GATE: preload weights and verify GPU correctness.
        cuda_model.preload_and_verify()
    }

    /// GH-129: Free CPU projection weight copies after GPU preload.
    ///
    /// On Jetson (unified memory), the CPU `Vec<u8>` copies are redundant after
    /// `preload_weights_gpu()` uploads them to GPU buffers. Frees ~1 GB for 1.5B.
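    ///
    /// # Example
    ///
    /// A minimal usage sketch (model loading assumed):
    ///
    /// ```rust,ignore
    /// let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)?;
    /// // Weights were uploaded to the GPU during construction; drop the CPU copies.
    /// cuda_model.free_cpu_weights();
    /// ```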
    pub fn free_cpu_weights(&mut self) {
        let mut freed = 0usize;
        for layer in &mut self.model.layers {
            freed += layer.qkv_weight.data_bytes();
            freed += layer.attn_output_weight.data.len();
            freed += layer.ffn_up_weight.data.len();
            freed += layer.ffn_down_weight.data.len();
            if let Some(ref gate) = layer.ffn_gate_weight {
                freed += gate.data.len();
            }
            layer.free_projection_weights();
        }
        freed += self.model.lm_head_weight.data.len();
        self.model.lm_head_weight.data = Vec::new();
        eprintln!(
            "[GH-129] Freed {:.1} MB CPU weight copies (GPU-resident path active)",
            freed as f64 / (1024.0 * 1024.0)
        );
    }

    /// Check if CUDA is available
    #[must_use]
    pub fn is_available() -> bool {
        crate::cuda::CudaExecutor::is_available()
    }

    /// Get number of CUDA devices
    #[must_use]
    pub fn num_devices() -> usize {
        crate::cuda::CudaExecutor::num_devices()
    }

    /// Get GPU device name
    #[must_use]
    pub fn device_name(&self) -> &str {
        &self.device_name
    }

    /// Get GPU memory info (free, total) in bytes
    #[must_use]
    pub fn memory_info(&self) -> (usize, usize) {
        self.memory_info
    }

    /// Get total GPU VRAM in MB (from the memory info captured at initialization)
    #[must_use]
    pub fn vram_mb(&self) -> u64 {
        (self.memory_info.1 / (1024 * 1024)) as u64
    }

    // ========================================================================
    // PAR-073: BrickProfiler API for per-brick timing
    // ========================================================================

    /// Enable per-brick profiling for real timing measurements.
    ///
    /// When enabled, each brick operation is timed individually using
    /// `std::time::Instant` with CUDA sync for accurate GPU timing.
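    ///
    /// # Example
    ///
    /// A minimal profiling sketch (token ids assumed):
    ///
    /// ```rust,ignore
    /// cuda_model.enable_profiling();
    /// let _logits = cuda_model.forward_cuda(&tokens)?;
    /// println!("{}", cuda_model.profiler_summary());
    /// cuda_model.disable_profiling();
    /// ```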
    pub fn enable_profiling(&mut self) {
        self.executor.enable_profiling();
    }

    /// Disable per-brick profiling (default state).
    pub fn disable_profiling(&mut self) {
        self.executor.disable_profiling();
    }

    /// Check if profiling is enabled.
    #[must_use]
    pub fn is_profiling_enabled(&self) -> bool {
        self.executor.is_profiling_enabled()
    }

    /// Get the brick profiler for reading statistics.
    #[must_use]
    pub fn profiler(&self) -> &trueno::BrickProfiler {
        self.executor.profiler()
    }

    /// Reset profiler statistics.
    pub fn reset_profiler(&mut self) {
        self.executor.reset_profiler();
    }

    /// Get profiler summary report.
    #[must_use]
    pub fn profiler_summary(&self) -> String {
        self.executor.profiler_summary()
    }

    /// Get reference to inner model
    #[must_use]
    pub fn model(&self) -> &OwnedQuantizedModel {
        &self.model
    }

    /// Consume CUDA wrapper and return inner model (for CPU fallback)
    #[must_use]
    pub fn into_model(self) -> OwnedQuantizedModel {
        self.model
    }

    /// PAR-111: Get mutable reference to CUDA executor
    ///
    /// Allows direct access for batched forward path and workspace initialization.
    #[must_use]
    pub fn executor_mut(&mut self) -> &mut crate::cuda::CudaExecutor {
        &mut self.executor
    }

    /// Synchronize CUDA stream (wait for all GPU operations to complete)
    pub fn synchronize(&self) -> Result<()> {
        self.executor
            .synchronize()
            .map_err(|e| RealizarError::UnsupportedOperation {
                operation: "CudaExecutor::synchronize".to_string(),
                reason: format!("CUDA sync failed: {e}"),
            })
    }
}

// =============================================================================
// PARITY GATE: Load-time mathematical proof of GPU/CPU equivalence
// =============================================================================
//
// Toyota Way: Jidoka (自働化) — stop-the-line on defect.
//
// Just as build.rs refuses to compile if ALG-001 through ALG-009 fail,
// this gate refuses to construct `OwnedQuantizedModelCuda` if GPU and CPU
// compute different functions.
//
// An `OwnedQuantizedModelCuda` that passes this gate is PROVEN to produce
// the same output as CPU. One that fails CANNOT be constructed.
//
// Contract: layer-parity-v1.yaml
// Tolerance: cosine_similarity ≥ 0.98 on first-token logits

/// Minimum cosine similarity for parity gate to pass.
/// 0.98 accommodates DP4A integer dot-product kernels which use int8 arithmetic
/// with different rounding than CPU float dequantization. On Qwen 1.5B Q4K:
///   - Non-DP4A (float GEMV): cosine ~0.9999
///   - DP4A (int8 dot product): cosine ~0.9887
/// Real bugs (corrupted weights, wrong kernel) produce cosine < 0.5.
const PARITY_GATE_COSINE_MIN: f32 = 0.98;
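
// A minimal sketch of the comparison this gate performs. The real check lives in
// mod_parity_gate.rs; the helper below is illustrative only, not the actual implementation:
//
//     fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
//         let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
//         let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
//         let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
//         dot / (norm_a * norm_b).max(f32::EPSILON)
//     }
//
//     // The gate passes iff cosine_similarity(gpu_logits, cpu_logits) >= PARITY_GATE_COSINE_MIN.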

include!("mod_parity_gate.rs");