realizar 0.8.4

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
//! Persistent GPU weight management for CudaExecutor
//!
//! This module implements:
//! - PARITY-037: FP32 weight loading and caching
//! - PAR-005: Quantized weight cache (Q4_K/Q5_K/Q6_K)
//! - PAR-043: Indexed weight access for O(1) lookup
//! - PAR-058: Mixed-quantization type tracking
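//!
//! # Usage sketch
//!
//! A hedged end-to-end sketch of the intended call order (assumes an already
//! constructed `CudaExecutor` named `exec`, raw GGUF tensor bytes, and an
//! `ArchConstraints` value `arch`; requires a CUDA device, so it is not a
//! doctest):
//!
//! ```ignore
//! // 1. Upload each quantized tensor (GGML type 12 = Q4_K).
//! exec.load_quantized_weights_with_type("blk.0.attn_q.weight", &q_bytes, 12)?;
//!
//! // 2. Once all tensors and RMSNorm gammas are cached, build the O(1) index.
//! exec.build_indexed_weights(num_layers, |i| format!("blk.{i}"), &arch)?;
//!
//! // 3. Decode-time lookups are plain pointer reads, no HashMap/string cost.
//! let layer0 = exec.get_indexed_layer(0);
//! ```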

use super::*;

impl CudaExecutor {
    // ========================================================================
    // PARITY-037: Persistent GPU Weight Management
    // ========================================================================

    /// Load weights to GPU and cache them for reuse (PARITY-037)
    ///
    /// Weights are stored in GPU memory and persist until explicitly cleared
    /// or the executor is dropped. This eliminates H2D transfer overhead
    /// for repeated forward passes.
    ///
    /// # Arguments
    ///
    /// * `name` - Unique identifier for the weight tensor (e.g., "layer0.ffn.fc1")
    /// * `weights` - Weight data to upload (row-major)
    ///
    /// # Returns
    ///
    /// Size in bytes of the uploaded weights.
    ///
    /// # Errors
    ///
    /// Returns error if GPU allocation or transfer fails.
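    ///
    /// # Example
    ///
    /// A minimal sketch (assumes a constructed executor `exec`; requires a
    /// CUDA device, so it is marked `ignore`):
    ///
    /// ```ignore
    /// let w = vec![0.0f32; 256 * 256];
    /// let bytes = exec.load_weights("layer0.ffn.fc1", &w)?;
    /// assert_eq!(bytes, w.len() * std::mem::size_of::<f32>());
    /// assert!(exec.has_weights("layer0.ffn.fc1"));
    /// ```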
    pub fn load_weights(&mut self, name: &str, weights: &[f32]) -> Result<usize, GpuError> {
        // PMAT-396: On unified-memory GPUs (cc >= 120), register the mmap'd
        // host pages directly instead of copying to a separate device buffer.
        let buf = if self.gpu_profile.cc >= 120 {
            // SAFETY: the registered host pages are expected to stay mapped
            // for the lifetime of this buffer (PMAT-396 contract).
            unsafe { GpuBuffer::from_host_registered(weights.as_ptr().cast_mut(), weights.len())? }
        } else {
            GpuBuffer::from_host(&self.context, weights)?
        };
        let size_bytes = buf.size_bytes();
        self.weight_cache.insert(name.to_string(), buf);
        Ok(size_bytes)
    }

    /// GH-174: Load FP16 weights to GPU for HGEMM dispatch.
    ///
    /// SafeTensors models with F16/BF16 dtype should use this path
    /// instead of `load_weights()`. The weights stay in FP16 on GPU,
    /// enabling cuBLAS HGEMM (2x bandwidth savings vs FP32 SGEMM).
    ///
    /// # Arguments
    ///
    /// * `name` - Unique identifier for the weight tensor
    /// * `weights_f16` - Weight data as raw u16 (IEEE FP16 bits)
    ///
    /// # Returns
    ///
    /// Size in bytes of the uploaded weights.
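    ///
    /// # Example
    ///
    /// A sketch of converting FP32 weights to raw FP16 bits before upload
    /// (assumes the `half` crate is available and an executor `exec`; requires
    /// a CUDA device):
    ///
    /// ```ignore
    /// let f16_bits: Vec<u16> = f32_weights
    ///     .iter()
    ///     .map(|&x| half::f16::from_f32(x).to_bits())
    ///     .collect();
    /// exec.load_weights_f16("model.layers.0.mlp.up_proj", &f16_bits)?;
    /// assert!(exec.has_weights_f16("model.layers.0.mlp.up_proj"));
    /// ```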
    pub fn load_weights_f16(&mut self, name: &str, weights_f16: &[u16]) -> Result<usize, GpuError> {
        let buf = GpuBuffer::from_host(&self.context, weights_f16)?;
        let size_bytes = buf.size_bytes();
        self.named_fp16_weight_cache.insert(name.to_string(), buf);
        Ok(size_bytes)
    }

    /// GH-174: Check if named FP16 weights exist for a given key.
    #[must_use]
    pub fn has_weights_f16(&self, name: &str) -> bool {
        self.named_fp16_weight_cache.contains_key(name)
    }

    /// Check if weights are cached on GPU
    #[must_use]
    pub fn has_weights(&self, name: &str) -> bool {
        self.weight_cache.contains_key(name)
    }

    /// Get the number of cached weight tensors
    #[must_use]
    pub fn cached_weight_count(&self) -> usize {
        self.weight_cache.len()
    }

    /// Get total size of cached weights in bytes
    #[must_use]
    pub fn cached_weight_bytes(&self) -> usize {
        self.weight_cache.values().map(GpuBuffer::size_bytes).sum()
    }

    /// Clear all cached weights (releases GPU memory)
    pub fn clear_weights(&mut self) {
        self.weight_cache.clear();
    }

    // ========================================================================
    // PAR-005: Quantized Weight Cache (Q4_K/Q5_K/Q6_K)
    // ========================================================================

    /// Load quantized weights onto GPU for persistent caching
    ///
    /// Uploads raw quantized bytes (Q4_K/Q5_K/Q6_K format) to GPU memory.
    /// These weights are reused across all forward passes, eliminating the
    /// roughly 50+ host-to-device transfers otherwise incurred per token.
    ///
    /// # Arguments
    ///
    /// * `name` - Unique identifier for this weight tensor (e.g., "layer_0.attn_q")
    /// * `data` - Raw quantized weight bytes
    ///
    /// # Returns
    ///
    /// Size in bytes of the uploaded weights.
    ///
    /// # Errors
    ///
    /// Returns error if GPU allocation or transfer fails.
    pub fn load_quantized_weights(&mut self, name: &str, data: &[u8]) -> Result<usize, GpuError> {
        // Default to Q4K (type 12) for backwards compatibility
        self.load_quantized_weights_with_type(name, data, 12)
    }

    /// PAR-058: Load quantized weights with explicit quantization type
    ///
    /// Like `load_quantized_weights`, but also records the quantization type for
    /// later kernel dispatch. This is needed for mixed-quantization models such
    /// as Qwen 0.5B, where the Q/K projections use Q5_0.
    ///
    /// # Arguments
    ///
    /// * `name` - Unique identifier for this weight tensor
    /// * `data` - Raw quantized weight bytes
    /// * `qtype` - GGML quantization type (6=Q5_0, 8=Q8_0, 12=Q4K, 13=Q5K, 14=Q6K)
    ///
    /// # Returns
    ///
    /// Size in bytes of the uploaded weights.
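    ///
    /// # Example
    ///
    /// A sketch for a mixed-quant model (assumes an executor `exec` and raw
    /// GGUF tensor bytes; requires a CUDA device):
    ///
    /// ```ignore
    /// // Q/K projections quantized as Q5_0 (qtype 6), FFN as Q4_K (qtype 12).
    /// exec.load_quantized_weights_with_type("blk.0.attn_q.weight", &q_bytes, 6)?;
    /// exec.load_quantized_weights_with_type("blk.0.ffn_up.weight", &up_bytes, 12)?;
    /// assert_eq!(exec.get_quantized_weight_type("blk.0.attn_q.weight"), Some(6));
    /// ```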
    pub fn load_quantized_weights_with_type(
        &mut self,
        name: &str,
        data: &[u8],
        qtype: u32,
    ) -> Result<usize, GpuError> {
        // PMAT-396: On unified-memory GPUs (cc >= 120), register the mmap'd
        // host pages directly instead of copying to a separate device buffer.
        let buf = if self.gpu_profile.cc >= 120 {
            // SAFETY: the registered host pages are expected to stay mapped
            // for the lifetime of this buffer (PMAT-396 contract).
            unsafe { GpuBuffer::from_host_registered(data.as_ptr().cast_mut(), data.len())? }
        } else {
            GpuBuffer::from_host(&self.context, data)?
        };
        let size_bytes = buf.size_bytes();
        self.quantized_weight_cache.insert(name.to_string(), buf);
        self.quantized_weight_types.insert(name.to_string(), qtype);
        Ok(size_bytes)
    }

    /// PAR-058: Get the quantization type for a cached weight
    ///
    /// Returns the GGML type ID (6=Q5_0, 8=Q8_0, 12=Q4K, 13=Q5K, 14=Q6K).
    /// Returns None if the weight is not cached.
    #[must_use]
    pub fn get_quantized_weight_type(&self, name: &str) -> Option<u32> {
        self.quantized_weight_types.get(name).copied()
    }

    /// Check if quantized weights are cached on GPU
    #[must_use]
    pub fn has_quantized_weights(&self, name: &str) -> bool {
        self.quantized_weight_pool_entries.contains_key(name)
            || self.quantized_weight_cache.contains_key(name)
    }

    /// Get raw device pointer for cached quantized weights
    ///
    /// Returns the raw u64 device pointer for the named weight buffer.
    /// ALB-098: Checks pool entries first, then individual cache.
    ///
    /// # Errors
    ///
    /// Returns error if weight is not cached.
    pub fn get_quantized_weight_ptr(&self, name: &str) -> Result<u64, GpuError> {
        // ALB-098: Check pool first (MoE models use pooled allocation)
        if let Some(&(ptr, _size)) = self.quantized_weight_pool_entries.get(name) {
            return Ok(ptr);
        }
        self.quantized_weight_cache
            .get(name)
            .map(trueno_gpu::driver::GpuBuffer::as_ptr)
            .ok_or_else(|| {
                GpuError::InvalidLaunchConfig(format!("Quantized weight '{}' not cached", name))
            })
    }

    /// Get raw device pointer and size for cached quantized weights.
    ///
    /// ALB-098: Checks pool entries first, then individual cache.
    pub fn get_quantized_weight_ptr_and_size(&self, name: &str) -> Result<(u64, usize), GpuError> {
        if let Some(&(ptr, size)) = self.quantized_weight_pool_entries.get(name) {
            return Ok((ptr, size));
        }
        self.quantized_weight_cache
            .get(name)
            .map(|buf| (buf.as_ptr(), buf.size_bytes()))
            .ok_or_else(|| {
                GpuError::InvalidLaunchConfig(format!("Quantized weight '{}' not cached", name))
            })
    }

    /// Get the number of cached quantized weight tensors
    #[must_use]
    pub fn cached_quantized_weight_count(&self) -> usize {
        self.quantized_weight_cache.len() + self.quantized_weight_pool_entries.len()
    }

    /// Get total size of cached quantized weights in bytes
    #[must_use]
    pub fn cached_quantized_weight_bytes(&self) -> usize {
        let cache_bytes: usize = self
            .quantized_weight_cache
            .values()
            .map(GpuBuffer::size_bytes)
            .sum();
        let pool_bytes: usize = self
            .quantized_weight_pool_entries
            .values()
            .map(|&(_ptr, size)| size)
            .sum();
        cache_bytes + pool_bytes
    }

    /// Clear all cached quantized weights (releases GPU memory)
    pub fn clear_quantized_weights(&mut self) {
        self.quantized_weight_cache.clear();
        self.quantized_weight_pool = None;
        self.quantized_weight_pool_entries.clear();
    }

    // ========================================================================
    // ALB-098: Pooled GPU Weight Allocation (MoE models)
    // ========================================================================

    /// Allocate a single contiguous GPU buffer for all quantized weights.
    ///
    /// Call this BEFORE `load_quantized_weights_pooled()`.
    /// Replaces 18,867 individual cuMemAlloc calls with 1.
    ///
    /// # Arguments
    ///
    /// * `total_bytes` - Total bytes for all quantized weights (pre-computed)
    ///
    /// # Errors
    ///
    /// Returns error if GPU allocation fails.
    pub fn allocate_quantized_weight_pool(&mut self, total_bytes: usize) -> Result<(), GpuError> {
        // Allocate single large buffer
        let pool = GpuBuffer::<u8>::new(&self.context, total_bytes)?;
        self.quantized_weight_pool = Some(pool);
        self.quantized_weight_pool_entries.clear();
        Ok(())
    }

    /// Load quantized weights into the pre-allocated pool at a specific offset.
    ///
    /// Does NOT call cuMemAlloc — copies data into the existing pool buffer.
    ///
    /// # Arguments
    ///
    /// * `name` - Tensor name for later lookup
    /// * `data` - Raw quantized bytes
    /// * `qtype` - GGML quantization type
    /// * `offset` - Byte offset within the pool
    ///
    /// # Returns
    ///
    /// Size in bytes of the uploaded data.
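    ///
    /// # Example
    ///
    /// A sketch of the pooled loading pattern, pairing this method with
    /// `allocate_quantized_weight_pool()` (assumes an executor `exec` and a
    /// list of `(name, bytes, qtype)` tensors; requires a CUDA device):
    ///
    /// ```ignore
    /// let total: usize = tensors.iter().map(|(_, bytes, _)| bytes.len()).sum();
    /// exec.allocate_quantized_weight_pool(total)?;
    /// let mut offset = 0;
    /// for (name, bytes, qtype) in &tensors {
    ///     exec.load_quantized_weights_pooled(name, bytes, *qtype, offset)?;
    ///     offset += bytes.len();
    /// }
    /// ```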
    pub fn load_quantized_weights_pooled(
        &mut self,
        name: &str,
        data: &[u8],
        qtype: u32,
        offset: usize,
    ) -> Result<usize, GpuError> {
        let pool = self.quantized_weight_pool.as_mut().ok_or_else(|| {
            GpuError::InvalidLaunchConfig("ALB-098: Weight pool not allocated".to_string())
        })?;

        let pool_ptr = pool.as_ptr();
        let dest_ptr = pool_ptr + offset as u64;

        // Copy data into pool at the specified byte offset
        pool.copy_from_host_at(data, offset)
            .map_err(|e| GpuError::Transfer(format!("Pool H2D copy failed for {name}: {e}")))?;

        // Record entry: store absolute device pointer for direct kernel use
        self.quantized_weight_pool_entries
            .insert(name.to_string(), (dest_ptr, data.len()));
        self.quantized_weight_types.insert(name.to_string(), qtype);

        Ok(data.len())
    }

    // ========================================================================
    // PAR-043: Indexed Weight Access (eliminate HashMap/string overhead)
    // ========================================================================

    /// Build indexed weight lookup table from loaded caches.
    ///
    /// GH-279: Now takes `ArchConstraints` and validates every layer's weights
    /// against the architecture's required roles. If any required weight is
    /// missing (ptr=0, len=0), returns a descriptive error — never silent garbage.
    ///
    /// MUST be called after all weights are loaded via `load_quantized_weights()` and
    /// `load_rmsnorm_gamma()`. This pre-computes device pointers for O(1) access
    /// during decode, eliminating ~10ms constant overhead per token.
    ///
    /// # Arguments
    ///
    /// * `num_layers` - Number of transformer layers in the model
    /// * `layer_prefix_fn` - Function to generate layer prefix from index (e.g., `|i| format!("blk.{}", i)`)
    /// * `arch` - Architecture constraints for weight validation (GH-279)
    ///
    /// # Errors
    ///
    /// Returns error if any required weight is not cached, or if architecture
    /// validation fails (missing required weight for the declared architecture).
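    ///
    /// # Example
    ///
    /// A sketch using the GGML layer naming convention (assumes `exec` has all
    /// layer weights and gammas loaded, plus an `ArchConstraints` value `arch`):
    ///
    /// ```ignore
    /// exec.build_indexed_weights(num_layers, |i| format!("blk.{i}"), &arch)?;
    /// assert!(exec.has_indexed_weights());
    /// let layer0 = exec.get_indexed_layer(0);
    /// ```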
    pub fn build_indexed_weights<F>(
        &mut self,
        num_layers: usize,
        layer_prefix_fn: F,
        arch: &crate::gguf::ArchConstraints,
    ) -> Result<(), GpuError>
    where
        F: Fn(usize) -> String,
    {
        let mut indexed = Vec::with_capacity(num_layers);

        for layer_idx in 0..num_layers {
            let prefix = layer_prefix_fn(layer_idx);

            // Build weight names matching GGML convention
            let q_name = format!("{}.attn_q.weight", prefix);
            let k_name = format!("{}.attn_k.weight", prefix);
            let v_name = format!("{}.attn_v.weight", prefix);
            let o_name = format!("{}.attn_output.weight", prefix);
            let gate_name = format!("{}.ffn_gate.weight", prefix);
            let up_name = format!("{}.ffn_up.weight", prefix);
            let down_name = format!("{}.ffn_down.weight", prefix);
            let attn_norm_name = format!("{}.attn_norm.gamma", prefix);
            let ffn_norm_name = format!("{}.ffn_norm.gamma", prefix);

            // Get pointers from quantized weight cache (ALB-098: pool-aware)
            let get_qweight = |name: &str| -> Result<(u64, usize), GpuError> {
                self.get_quantized_weight_ptr_and_size(name)
            };

            // Get pointers from RMSNorm cache
            let get_rmsnorm = |name: &str| -> Result<(u64, usize), GpuError> {
                let buf = self.rmsnorm_cache.get(name).ok_or_else(|| {
                    GpuError::InvalidLaunchConfig(format!(
                        "PAR-043: RMSNorm gamma '{}' not cached",
                        name
                    ))
                })?;
                Ok((buf.as_ptr(), buf.len()))
            };

            let (attn_q_ptr, attn_q_len) = get_qweight(&q_name)?;
            let (attn_k_ptr, attn_k_len) = get_qweight(&k_name)?;
            let (attn_v_ptr, attn_v_len) = get_qweight(&v_name)?;
            let (attn_output_ptr, attn_output_len) = get_qweight(&o_name)?;
            let (ffn_gate_ptr, ffn_gate_len) = get_qweight(&gate_name)?;
            let (ffn_up_ptr, ffn_up_len) = get_qweight(&up_name)?;
            let (ffn_down_ptr, ffn_down_len) = get_qweight(&down_name)?;
            let (attn_norm_ptr, attn_norm_len) = get_rmsnorm(&attn_norm_name)?;
            let (ffn_norm_ptr, ffn_norm_len) = get_rmsnorm(&ffn_norm_name)?;

            // PAR-058: Resolve quantization types for all weight tensors
            let attn_q_qtype = self.resolve_qtype(&q_name);
            let attn_k_qtype = self.resolve_qtype(&k_name);
            let attn_v_qtype = self.resolve_qtype(&v_name);
            let attn_output_qtype = self.resolve_qtype(&o_name);
            let ffn_gate_qtype = self.resolve_qtype(&gate_name);
            let ffn_up_qtype = self.resolve_qtype(&up_name);
            let ffn_down_qtype = self.resolve_qtype(&down_name);

            // Log if non-Q4K types detected (for debugging mixed-quant models)
            self.log_mixed_quant_types(
                layer_idx,
                attn_q_qtype,
                attn_k_qtype,
                attn_v_qtype,
                attn_output_qtype,
                ffn_gate_qtype,
                ffn_up_qtype,
                ffn_down_qtype,
            );

            // BIAS-FIX: Get QKV bias pointers from bias_cache (optional - 0/0 if not present)
            let q_bias_name = format!("{}.attn_q.bias", prefix);
            let k_bias_name = format!("{}.attn_k.bias", prefix);
            let v_bias_name = format!("{}.attn_v.bias", prefix);

            let (attn_q_bias_ptr, attn_q_bias_len) = self
                .bias_cache
                .get(&q_bias_name)
                .map_or((0, 0), |b| (b.as_ptr(), b.len()));
            let (attn_k_bias_ptr, attn_k_bias_len) = self
                .bias_cache
                .get(&k_bias_name)
                .map_or((0, 0), |b| (b.as_ptr(), b.len()));
            let (attn_v_bias_ptr, attn_v_bias_len) = self
                .bias_cache
                .get(&v_bias_name)
                .map_or((0, 0), |b| (b.as_ptr(), b.len()));

            // GH-279: QkNorm pointers from rmsnorm_cache (optional - 0/0 if not present)
            let q_norm_name = format!("{}.attn_q_norm.gamma", prefix);
            let k_norm_name = format!("{}.attn_k_norm.gamma", prefix);
            let (attn_q_norm_ptr, attn_q_norm_len) = self
                .rmsnorm_cache
                .get(&q_norm_name)
                .map_or((0, 0), |b| (b.as_ptr(), b.len()));
            let (attn_k_norm_ptr, attn_k_norm_len) = self
                .rmsnorm_cache
                .get(&k_norm_name)
                .map_or((0, 0), |b| (b.as_ptr(), b.len()));

            let raw = IndexedLayerWeights {
                attn_q_ptr,
                attn_q_len,
                attn_q_qtype,
                attn_k_ptr,
                attn_k_len,
                attn_k_qtype,
                attn_v_ptr,
                attn_v_len,
                attn_v_qtype,
                attn_output_ptr,
                attn_output_len,
                attn_output_qtype, // PAR-058: was missing
                ffn_gate_ptr,
                ffn_gate_len,
                ffn_gate_qtype, // PAR-058: was missing
                ffn_up_ptr,
                ffn_up_len,
                ffn_up_qtype, // PAR-058: was missing
                ffn_down_ptr,
                ffn_down_len,
                ffn_down_qtype,
                attn_norm_ptr,
                attn_norm_len,
                ffn_norm_ptr,
                ffn_norm_len,
                // BIAS-FIX: QKV bias pointers
                attn_q_bias_ptr,
                attn_q_bias_len,
                attn_k_bias_ptr,
                attn_k_bias_len,
                attn_v_bias_ptr,
                attn_v_bias_len,
                // GH-279: QkNorm pointers (Qwen3 per-head RMSNorm)
                attn_q_norm_ptr,
                attn_q_norm_len,
                attn_k_norm_ptr,
                attn_k_norm_len,
            };

            // GH-279: Validate that all architecture-required fields are non-zero.
            // This is the Poka-Yoke enforcement point — if a loader forgot to
            // populate a required field, we fail HERE (not during inference).
            use crate::cuda::types::ValidatedLayerWeights;
            let validated = ValidatedLayerWeights::validate(raw, arch, layer_idx)
                .map_err(|e| GpuError::InvalidLaunchConfig(e.to_string()))?;

            indexed.push(validated);
        }

        self.indexed_layer_weights = indexed;
        self.index_output_weights();
        Ok(())
    }

    /// Check if indexed weights have been built
    #[must_use]
    pub fn has_indexed_weights(&self) -> bool {
        !self.indexed_layer_weights.is_empty()
    }

    /// Get validated indexed weights for a specific layer.
    ///
    /// GH-279: Returns `&ValidatedLayerWeights` — all architecture-required fields
    /// are guaranteed non-zero by construction.
    ///
    /// # Panics
    ///
    /// Panics if `layer_idx >= num_layers` or if `build_indexed_weights()` hasn't been called.
    #[must_use]
    pub fn get_indexed_layer(&self, layer_idx: usize) -> &ValidatedLayerWeights {
        &self.indexed_layer_weights[layer_idx]
    }

    /// Clear indexed weights (call before reloading model)
    pub fn clear_indexed_weights(&mut self) {
        self.indexed_layer_weights.clear();
        self.output_norm_ptr = 0;
        self.output_norm_len = 0;
        self.lm_head_ptr = 0;
        self.lm_head_len = 0;
        self.lm_head_qtype = WeightQuantType::Q4K;
        // PAR-064-FIX: Also clear LM head bias pointer
        self.lm_head_bias_ptr = 0;
        self.lm_head_bias_len = 0;
    }

    // ========================================================================
    // PAR-058: Helper methods for quantization type resolution
    // ========================================================================

    /// Resolve the quantization type for a named weight tensor.
    ///
    /// Looks up the GGML type stored during `load_quantized_weights_with_type()`,
    /// converts it to `WeightQuantType`, and defaults to Q4K if not found.
    fn resolve_qtype(&self, name: &str) -> WeightQuantType {
        self.quantized_weight_types
            .get(name)
            .and_then(|&t| WeightQuantType::from_ggml_type(t))
            .unwrap_or(WeightQuantType::Q4K)
    }

    /// Index output norm and LM head pointers for zero-allocation forward pass.
    ///
    /// PAR-054: LM head weight for CUDA graph capture.
    /// PAR-058: Detect LM head quantization type (Q6_K in Qwen 1.5B, not Q4_K).
    fn index_output_weights(&mut self) {
        if let Some(buf) = self.rmsnorm_cache.get("output_norm.gamma") {
            self.output_norm_ptr = buf.as_ptr();
            self.output_norm_len = buf.len();
        }

        // ALB-098: Check pool first, then individual cache
        if let Ok((ptr, size)) = self.get_quantized_weight_ptr_and_size("output.weight") {
            self.lm_head_ptr = ptr;
            self.lm_head_len = size;
            self.lm_head_qtype = self.resolve_qtype("output.weight");
            if verbose() {
                eprintln!(
                    "[PAR-058] LM head qtype: {:?}, ptr={:#x}, len={}",
                    self.lm_head_qtype, self.lm_head_ptr, self.lm_head_len
                );
            }
        }
    }

    /// Log non-Q4K quantization types for debugging mixed-quant models (PAR-058).
    ///
    /// Only emits output when `verbose()` is true and at least one weight
    /// uses a quantization type other than Q4K.
    #[allow(clippy::too_many_arguments)]
    fn log_mixed_quant_types(
        &self,
        layer_idx: usize,
        attn_q: WeightQuantType,
        attn_k: WeightQuantType,
        attn_v: WeightQuantType,
        attn_output: WeightQuantType,
        ffn_gate: WeightQuantType,
        ffn_up: WeightQuantType,
        ffn_down: WeightQuantType,
    ) {
        if !verbose() {
            return;
        }
        if attn_q != WeightQuantType::Q4K || attn_k != WeightQuantType::Q4K {
            eprintln!(
                "[PAR-058] Layer {}: Q={:?}, K={:?}, V={:?}",
                layer_idx, attn_q, attn_k, attn_v
            );
        }
        if attn_output != WeightQuantType::Q4K
            || ffn_gate != WeightQuantType::Q4K
            || ffn_up != WeightQuantType::Q4K
            || ffn_down != WeightQuantType::Q4K
        {
            eprintln!(
                "[PAR-058] Layer {}: O={:?}, gate={:?}, up={:?}, down={:?}",
                layer_idx, attn_output, ffn_gate, ffn_up, ffn_down
            );
        }
    }
}

include!("weights_tests.rs");