realizar 0.8.4

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
/// Load a quantized tensor from APR format, trying multiple names.
///
/// Handles APR native q8/q4 formats by dequantizing to f32.
/// For Conv1D architectures, transposes weights to [out, in] layout.
///
/// # Arguments
/// * `names` - Candidate tensor names, tried in order (HF name first, GGUF fallback)
/// * `in_dim` / `out_dim` - Logical matrix dimensions; `in_dim * out_dim` elements expected
/// * `transpose` - When true, dequantized f32 data is transposed to [out, in] layout
///
/// # Errors
/// Returns `FormatError` if no candidate name matches or the tensor's byte
/// range extends past the end of the data region.
fn apr_load_quantized_tensor(
    apr: &crate::apr::MappedAprModel,
    data: &[u8],
    data_offset: usize,
    names: &[&str],
    in_dim: usize,
    out_dim: usize,
    transpose: bool,
) -> Result<OwnedQuantizedTensor> {
    use crate::apr::MappedAprModel;

    // Resolve the first candidate name that exists in the APR tensor index.
    let (tensor, found_name) = names
        .iter()
        .find_map(|name| apr.find_tensor(name).map(|t| (t, *name)))
        .ok_or_else(|| RealizarError::FormatError {
            reason: format!("APR: tensor not found (tried: {})", names.join(", ")),
        })?;
    let start = data_offset + tensor.offset as usize;
    let end = start + tensor.size as usize;
    if end > data.len() {
        return Err(RealizarError::FormatError {
            reason: format!("APR: tensor {found_name} extends past EOF"),
        });
    }
    let raw = &data[start..end];
    let dtype = tensor.dtype.as_str();
    let num_elements = in_dim * out_dim;

    // APR native q8/q4 share an identical slow path, differing only in the
    // dequantization routine — select it once instead of duplicating the arm.
    let dequant: Option<fn(&[u8], usize) -> Vec<f32>> = match dtype {
        "q8" => Some(crate::apr::dequant::dequantize_apr_q8),
        "q4" => Some(crate::apr::dequant::dequantize_apr_q4),
        _ => None,
    };

    match dequant {
        Some(dequantize) => {
            // GH-285: APR native q8/q4 requires CPU dequant → F32 (slow).
            // Re-import with `apr import` for GPU-optimal Q4K loading.
            eprintln!(
                "[GH-285] APR native {} tensor '{}': CPU dequant to F32 \
                 (slow — re-import with `apr import` for GPU-optimal Q4K)",
                dtype, found_name
            );
            let mut f32_data = dequantize(raw, num_elements);
            if transpose {
                f32_data = transpose_f32_matrix(&f32_data, in_dim, out_dim);
            }
            let f32_bytes: Vec<u8> = f32_data.iter().flat_map(|v| v.to_le_bytes()).collect();
            Ok(OwnedQuantizedTensor {
                data: f32_bytes,
                in_dim,
                out_dim,
                // qtype 0 signals the data is now plain F32 after dequantization.
                qtype: 0,
            })
        },
        None => {
            // Already in a GGUF-compatible quantized layout: pass bytes through.
            let qtype = MappedAprModel::dtype_to_qtype(dtype);
            Ok(OwnedQuantizedTensor {
                data: raw.to_vec(),
                in_dim,
                out_dim,
                qtype,
            })
        },
    }
}

/// Load an F32 tensor from APR format, trying multiple names.
fn apr_load_f32_tensor(
    apr: &crate::apr::MappedAprModel,
    data: &[u8],
    data_offset: usize,
    names: &[&str],
) -> Result<Vec<f32>> {
    let (tensor, found_name) = names
        .iter()
        .find_map(|name| apr.find_tensor(name).map(|t| (t, *name)))
        .ok_or_else(|| RealizarError::FormatError {
            reason: format!("APR: tensor not found (tried: {})", names.join(", ")),
        })?;
    let start = data_offset + tensor.offset as usize;
    let end = start + tensor.size as usize;
    if end > data.len() {
        return Err(RealizarError::FormatError {
            reason: format!("APR: tensor {found_name} extends past EOF"),
        });
    }
    Ok(data[start..end]
        .chunks_exact(4)
        .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
        .collect())
}

/// Try loading an optional F32 bias tensor from APR format.
fn apr_try_load_f32(
    apr: &crate::apr::MappedAprModel,
    data: &[u8],
    data_offset: usize,
    name: &str,
) -> Option<Vec<f32>> {
    let tensor = apr.find_tensor(name)?;
    let start = data_offset + tensor.offset as usize;
    let end = start + tensor.size as usize;
    if end > data.len() {
        return None;
    }
    let raw = &data[start..end];
    // GH-180: Dispatch on dtype — FP16 APR models store biases as F16
    match tensor.dtype.as_str() {
        "F16" => Some(
            raw.chunks_exact(2)
                .map(|c| half::f16::from_le_bytes([c[0], c[1]]).to_f32())
                .collect(),
        ),
        _ => Some(
            raw.chunks_exact(4)
                .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
                .collect(),
        ),
    }
}

/// Infer vocab_size from APR metadata or embedding tensor shape.
/// GH-337: Infer vocab size from metadata or embedding tensor shape.
///
/// **Design by Contract**: No hardcoded fallback. Returns 0 on failure
/// (callers validate via contract gate).
fn apr_infer_vocab_size(apr: &crate::apr::MappedAprModel) -> usize {
    // Metadata wins whenever it carries a positive vocab size.
    match apr.metadata.vocab_size {
        Some(v) if v > 0 => v,
        _ => {
            // Fall back to the embedding tensor (first dimension = vocab size),
            // matching HF, Llama, and GGUF naming conventions.
            let looks_like_embedding = |n: &str| {
                n.contains("embed_tokens")
                    || n.contains("tok_embeddings")
                    || n.contains("token_embd")
            };
            apr.tensors
                .iter()
                .find(|t| looks_like_embedding(&t.name))
                .and_then(|t| t.shape.first())
                .copied()
                .unwrap_or(0)
        },
    }
}

impl OwnedQuantizedModel {
    /// Create model from memory-mapped APR file (SHOWCASE-APR-GPU)
    ///
    /// Converts APR Q4K format to GGUF-compatible model for GPU inference.
    /// The raw Q4K tensor data is byte-compatible between formats.
    ///
    /// Pipeline: infer vocab size → validate config via the contract gate →
    /// load embeddings, per-layer weights, output norm, and LM head, trying
    /// HF tensor names first with GGUF names as fallback.
    ///
    /// # Arguments
    /// * `apr` - Memory-mapped APR model
    ///
    /// # Errors
    /// Returns error if APR format is invalid or missing required tensors.
    pub fn from_apr(apr: &crate::apr::MappedAprModel) -> Result<Self> {
        let t0 = std::time::Instant::now();
        // All tensor offsets in the APR index are relative to data_offset.
        let data = apr.data();
        let data_offset = apr.data_offset() as usize;

        // Phase 2: Deduplicated APR config extraction + validated construction.
        let vocab_size = apr_infer_vocab_size(apr);
        let validated = ValidatedModelConfig::from_apr(apr, vocab_size)?;

        // GH-279: Contract gate — validate architecture and dimensions before loading weights
        let _proof = crate::contract_gate::validate_model_load_basic(
            validated.architecture(),
            validated.num_layers(),
            validated.hidden_dim(),
            validated.num_heads(),
            validated.num_kv_heads(),
            validated.intermediate_dim(),
            validated.vocab_size(),
        )
        .map_err(crate::contract_gate::gate_error)?;

        // Extract inner GGUFConfig for storage (struct field is typed GGUFConfig)
        let mut config = validated.into_inner();

        // GH-278: Detect Conv1D layout from contract (not string matching)
        let transpose = config.constraints.needs_transpose();

        // Extract dimensions from validated config for use below
        let hidden_dim = config.hidden_dim;
        let num_layers = config.num_layers;
        let intermediate_dim = config.intermediate_dim;

        // GH-479: Infer explicit head_dim from Q proj tensor shape (Qwen3 head_dim != hidden/heads)
        // NOTE(review): assumes shape[0] is the Q projection's output dim
        // (i.e. [out, in] layout) — confirm this holds for Conv1D checkpoints
        // where weights are stored transposed.
        let q_tensor_name = "model.layers.0.self_attn.q_proj.weight";
        let gguf_q_name = "blk.0.attn_q.weight";
        if let Some(q_tensor) = apr.find_tensor(q_tensor_name).or_else(|| apr.find_tensor(gguf_q_name)) {
            if q_tensor.shape.len() == 2 {
                let q_out_dim = q_tensor.shape[0];
                let inferred_head_dim = if config.num_heads > 0 { q_out_dim / config.num_heads } else { 0 };
                let default_head_dim = if config.num_heads > 0 { hidden_dim / config.num_heads } else { 0 };
                // Only record an explicit head_dim when it deviates from the
                // conventional hidden_dim / num_heads derivation.
                if inferred_head_dim > 0 && inferred_head_dim != default_head_dim {
                    config.explicit_head_dim = Some(inferred_head_dim);
                }
            }
        }

        // Load token embeddings
        let token_embedding =
            Self::load_apr_token_embedding(apr, data, data_offset, vocab_size, hidden_dim)?;

        // Build layers
        // GH-479: q_dim may differ from hidden_dim (Qwen3 head_dim != hidden/heads)
        let q_dim = config.q_dim();
        let kv_dim = config.kv_dim();
        let mut layers = Vec::with_capacity(num_layers);

        for layer_idx in 0..num_layers {
            layers.push(Self::load_apr_layer(
                apr,
                data,
                data_offset,
                layer_idx,
                hidden_dim,
                q_dim,
                kv_dim,
                intermediate_dim,
                transpose,
            )?);
        }

        // Output norm
        let output_norm_weight =
            apr_load_f32_tensor(apr, data, data_offset, &["model.norm.weight", "output_norm.weight"])?;
        let output_norm_bias = apr_try_load_f32(apr, data, data_offset, "model.norm.bias");

        // LM head (try HF name first, then GGUF)
        let lm_head_weight = apr_load_quantized_tensor(
            apr, data, data_offset,
            &["lm_head.weight", "output.weight"],
            hidden_dim, vocab_size, transpose,
        )?;
        let lm_head_bias = apr_try_load_f32(apr, data, data_offset, "lm_head.bias");

        // GH-278: Load learned position embeddings (GPT-2 style)
        let position_embedding =
            apr_try_load_f32(apr, data, data_offset, "model.position_embedding.weight");

        let load_ms = t0.elapsed().as_secs_f64() * 1000.0;
        eprintln!(
            "[GH-175] OwnedQuantizedModel::from_apr: {} layers loaded in {:.1}ms",
            num_layers, load_ms
        );

        Ok(Self {
            config,
            token_embedding,
            position_embedding,
            layers,
            // Decoder-only path: encoder fields are intentionally empty here.
            encoder_layers: vec![],
            encoder_output_norm_weight: None,
            encoder_output_norm_bias: None,
            output_norm_weight,
            output_norm_bias,
            lm_head_weight,
            lm_head_bias,
            #[cfg(feature = "cuda")]
            cuda_executor: None,
            #[cfg(feature = "cuda")]
            cuda_kernel_count: std::sync::atomic::AtomicU64::new(0),
            #[cfg(feature = "cuda")]
            cached_weight_names: std::sync::Mutex::new(std::collections::HashSet::new()),
        })
    }

    /// Load token embeddings from APR format.
    ///
    /// Accepts any tensor whose name contains a known embedding substring
    /// (HF `embed_tokens`, Llama `tok_embeddings`, GGUF `token_embd`) and
    /// dequantizes it to `vocab_size * hidden_dim` f32 values.
    fn load_apr_token_embedding(
        apr: &crate::apr::MappedAprModel,
        data: &[u8],
        data_offset: usize,
        vocab_size: usize,
        hidden_dim: usize,
    ) -> Result<Vec<f32>> {
        let embed_name = apr
            .tensors
            .iter()
            .find(|t| {
                t.name.contains("embed_tokens")
                    || t.name.contains("tok_embeddings")
                    || t.name.contains("token_embd")
            })
            .map(|t| t.name.as_str())
            .ok_or_else(|| RealizarError::FormatError {
                reason: "APR: embedding tensor not found".to_string(),
            })?;

        // NOTE(review): the tensor was just located by name in the scan above,
        // yet is looked up again via find_tensor — if find_tensor does exact
        // name matching this second lookup is redundant; verify and simplify.
        let embed_tensor = apr.find_tensor(embed_name).ok_or_else(|| RealizarError::FormatError {
            reason: "APR: embedding tensor not found".to_string(),
        })?;
        let embed_start = data_offset + embed_tensor.offset as usize;
        let embed_end = embed_start + embed_tensor.size as usize;
        if embed_end > data.len() {
            return Err(RealizarError::FormatError {
                reason: "APR: embedding tensor extends past EOF".to_string(),
            });
        }
        let embed_data = &data[embed_start..embed_end];
        dequantize_embedding(embed_data, embed_tensor.dtype.as_str(), vocab_size * hidden_dim)
    }

    /// Load a single transformer layer from APR format.
    ///
    /// Tries HF tensor names first (SafeTensors→APR pipeline), then GGUF
    /// names (GGUF→APR path). Required weights (Q/K/V/O projections, FFN
    /// up/down, attention norm) error out when missing; optional tensors
    /// (FFN gate, biases, QK norms, FFN norm) are loaded best-effort.
    #[allow(clippy::too_many_arguments)]
    fn load_apr_layer(
        apr: &crate::apr::MappedAprModel,
        data: &[u8],
        data_offset: usize,
        layer_idx: usize,
        hidden_dim: usize,
        q_dim: usize,
        kv_dim: usize,
        intermediate_dim: usize,
        transpose: bool,
    ) -> Result<OwnedQuantizedLayer> {
        // HF names (primary, from SafeTensors->APR pipeline)
        let hf_q = format!("model.layers.{layer_idx}.self_attn.q_proj.weight");
        let hf_k = format!("model.layers.{layer_idx}.self_attn.k_proj.weight");
        let hf_v = format!("model.layers.{layer_idx}.self_attn.v_proj.weight");
        let hf_o = format!("model.layers.{layer_idx}.self_attn.o_proj.weight");
        let hf_gate = format!("model.layers.{layer_idx}.mlp.gate_proj.weight");
        let hf_up = format!("model.layers.{layer_idx}.mlp.up_proj.weight");
        let hf_down = format!("model.layers.{layer_idx}.mlp.down_proj.weight");
        let hf_attn_norm = format!("model.layers.{layer_idx}.input_layernorm.weight");
        let hf_ffn_norm = format!("model.layers.{layer_idx}.post_attention_layernorm.weight");

        // GGUF names (fallback, from GGUF->APR path)
        let gguf_q = format!("blk.{layer_idx}.attn_q.weight");
        let gguf_k = format!("blk.{layer_idx}.attn_k.weight");
        let gguf_v = format!("blk.{layer_idx}.attn_v.weight");
        let gguf_o = format!("blk.{layer_idx}.attn_output.weight");
        let gguf_gate = format!("blk.{layer_idx}.ffn_gate.weight");
        let gguf_up = format!("blk.{layer_idx}.ffn_up.weight");
        let gguf_down = format!("blk.{layer_idx}.ffn_down.weight");
        let gguf_attn_norm = format!("blk.{layer_idx}.attn_norm.weight");
        let gguf_ffn_norm = format!("blk.{layer_idx}.ffn_norm.weight");

        // GH-479: Q dim may differ from hidden_dim (Qwen3 head_dim != hidden/heads)
        let q_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_q, &gguf_q], hidden_dim, q_dim, transpose)?;
        let k_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_k, &gguf_k], hidden_dim, kv_dim, transpose)?;
        let v_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_v, &gguf_v], hidden_dim, kv_dim, transpose)?;

        let qkv_weight = OwnedQKVWeights::Separate {
            q: q_weight,
            k: k_weight,
            v: v_weight,
        };

        // QKV biases (Qwen2 has separate Q, K, V biases — concatenate for CUDA)
        // GH-87: Try both HF names (SafeTensors→APR) and GGUF names (GGUF→APR Q4K)
        let hf_q_bias = format!("model.layers.{layer_idx}.self_attn.q_proj.bias");
        let hf_k_bias = format!("model.layers.{layer_idx}.self_attn.k_proj.bias");
        let hf_v_bias = format!("model.layers.{layer_idx}.self_attn.v_proj.bias");
        let gguf_q_bias = format!("blk.{layer_idx}.attn_q.bias");
        let gguf_k_bias = format!("blk.{layer_idx}.attn_k.bias");
        let gguf_v_bias = format!("blk.{layer_idx}.attn_v.bias");
        // All-or-nothing: a Q bias without matching K and V biases yields None,
        // since the concatenated [q|k|v] layout requires all three.
        let qkv_bias = apr_try_load_f32(apr, data, data_offset, &hf_q_bias)
            .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_q_bias))
            .and_then(|q_b| {
                let k_b = apr_try_load_f32(apr, data, data_offset, &hf_k_bias)
                    .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_k_bias))?;
                let v_b = apr_try_load_f32(apr, data, data_offset, &hf_v_bias)
                    .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_v_bias))?;
                let mut combined = Vec::with_capacity(q_b.len() + k_b.len() + v_b.len());
                combined.extend_from_slice(&q_b);
                combined.extend_from_slice(&k_b);
                combined.extend_from_slice(&v_b);
                Some(combined)
            });

        // GH-479: O proj maps q_dim -> hidden_dim (Qwen3 q_dim != hidden_dim)
        let o_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_o, &gguf_o], q_dim, hidden_dim, transpose)?;

        // FFN weights (gate is optional — GPT-2 has no SwiGLU gate)
        let ffn_gate_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_gate, &gguf_gate], hidden_dim, intermediate_dim, transpose).ok();
        let ffn_up_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_up, &gguf_up], hidden_dim, intermediate_dim, transpose)?;
        let ffn_down_weight = apr_load_quantized_tensor(apr, data, data_offset, &[&hf_down, &gguf_down], intermediate_dim, hidden_dim, transpose)?;

        // Norm weights (F32)
        let attn_norm_weight = apr_load_f32_tensor(apr, data, data_offset, &[&hf_attn_norm, &gguf_attn_norm])?;
        let ffn_norm_weight = apr_load_f32_tensor(apr, data, data_offset, &[&hf_ffn_norm, &gguf_ffn_norm]).ok();

        // GH-278: Load biases (GPT-2/phi-2 style models have biases on all projections)
        // GH-87: Try both HF names and GGUF names for all bias tensors
        let hf_attn_norm_bias = format!("model.layers.{layer_idx}.input_layernorm.bias");
        let hf_ffn_norm_bias = format!("model.layers.{layer_idx}.post_attention_layernorm.bias");
        let hf_o_bias = format!("model.layers.{layer_idx}.self_attn.o_proj.bias");
        let hf_up_bias = format!("model.layers.{layer_idx}.mlp.up_proj.bias");
        let hf_down_bias = format!("model.layers.{layer_idx}.mlp.down_proj.bias");
        let gguf_attn_norm_bias = format!("blk.{layer_idx}.attn_norm.bias");
        let gguf_ffn_norm_bias = format!("blk.{layer_idx}.ffn_norm.bias");
        let gguf_o_bias = format!("blk.{layer_idx}.attn_output.bias");
        let gguf_up_bias = format!("blk.{layer_idx}.ffn_up.bias");
        let gguf_down_bias = format!("blk.{layer_idx}.ffn_down.bias");

        Ok(OwnedQuantizedLayer {
            attn_norm_weight,
            attn_norm_bias: apr_try_load_f32(apr, data, data_offset, &hf_attn_norm_bias)
                .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_attn_norm_bias)),
            qkv_weight,
            qkv_bias,
            attn_output_weight: o_weight,
            attn_output_bias: apr_try_load_f32(apr, data, data_offset, &hf_o_bias)
                .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_o_bias)),
            ffn_norm_weight,
            ffn_norm_bias: apr_try_load_f32(apr, data, data_offset, &hf_ffn_norm_bias)
                .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_ffn_norm_bias)),
            ffn_gate_weight,
            // No known checkpoint provides a gate bias; fixed to None here.
            ffn_gate_bias: None,
            ffn_up_weight,
            ffn_up_bias: apr_try_load_f32(apr, data, data_offset, &hf_up_bias)
                .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_up_bias)),
            ffn_down_weight,
            ffn_down_bias: apr_try_load_f32(apr, data, data_offset, &hf_down_bias)
                .or_else(|| apr_try_load_f32(apr, data, data_offset, &gguf_down_bias)),
            // GH-479: QK norm weights (Qwen3 per-head RMSNorm)
            // Contract: qk-norm-apr-loader-v1 §QKN-LOAD-002
            attn_q_norm_weight: apr_try_load_f32(apr, data, data_offset,
                &format!("model.layers.{layer_idx}.self_attn.q_norm.weight"))
                .or_else(|| apr_try_load_f32(apr, data, data_offset,
                    &format!("blk.{layer_idx}.attn_q_norm.weight"))),
            attn_k_norm_weight: apr_try_load_f32(apr, data, data_offset,
                &format!("model.layers.{layer_idx}.self_attn.k_norm.weight"))
                .or_else(|| apr_try_load_f32(apr, data, data_offset,
                    &format!("blk.{layer_idx}.attn_k_norm.weight"))),
        })
    }
}