realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
impl GpuModel {
    /// Create a new GPU-accelerated model filled with constant placeholder weights
    ///
    /// Nothing is sampled randomly: every embedding/projection weight is the
    /// constant `0.01`, every norm scale is `1.0`, and every bias is `0.0`, so
    /// two models built from the same config are identical. The optional
    /// per-block features (`ffn_gate_weight`, `linear_attn`, `moe_experts`) are
    /// left unset, and no CUDA scheduler, attention buffers, test executor, or
    /// linear-attention state is attached.
    ///
    /// The fused QKV projection is sized from `config.qkv_dim()`, the same
    /// width that `new_with_cuda()` allocates and that `matmul_split()` passes
    /// as the output dimension, so the buffer always matches the shape the
    /// matmul paths ask for.
    ///
    /// # Errors
    ///
    /// Returns error if GPU initialization fails
    pub fn new(config: GpuModelConfig) -> Result<Self> {
        let scheduler = HybridScheduler::new()?;

        // Constant placeholder weights (deterministic, intended for testing)
        let embedding_weights = vec![0.01f32; config.vocab_size * config.hidden_dim];

        let mut block_weights = Vec::with_capacity(config.num_layers);
        for _ in 0..config.num_layers {
            block_weights.push(BlockWeights {
                attn_norm_weight: vec![1.0f32; config.hidden_dim],
                attn_norm_bias: vec![0.0f32; config.hidden_dim],
                // Sized via qkv_dim() rather than a hard-coded 3 * hidden_dim so
                // the buffer agrees with the dimensions used at matmul time.
                qkv_weight: vec![0.01f32; config.hidden_dim * config.qkv_dim()],
                qkv_bias: vec![0.0f32; config.qkv_dim()],
                out_weight: vec![0.01f32; config.hidden_dim * config.hidden_dim],
                out_bias: vec![0.0f32; config.hidden_dim],
                ffn_norm_weight: vec![1.0f32; config.hidden_dim],
                ffn_norm_bias: vec![0.0f32; config.hidden_dim],
                ffn_fc1_weight: vec![0.01f32; config.hidden_dim * config.intermediate_dim],
                ffn_fc1_bias: vec![0.0f32; config.intermediate_dim],
                ffn_fc2_weight: vec![0.01f32; config.intermediate_dim * config.hidden_dim],
                ffn_fc2_bias: vec![0.0f32; config.hidden_dim],
                ffn_gate_weight: None, // No SwiGLU in test models
                // GH-278: No linear attention in test models
                linear_attn: None,
                moe_experts: None,
            });
        }

        let final_norm_weight = vec![1.0f32; config.hidden_dim];
        let final_norm_bias = vec![0.0f32; config.hidden_dim];
        let lm_head_weight = vec![0.01f32; config.hidden_dim * config.vocab_size];
        let lm_head_bias = vec![0.0f32; config.vocab_size];

        // Pre-compute transposed LM head for fast CPU inference
        // Original: [hidden_dim, vocab_size] -> Transposed: [vocab_size, hidden_dim]
        let lm_head_weight_t =
            Self::transpose_weights(&lm_head_weight, config.hidden_dim, config.vocab_size);

        Ok(Self {
            embedding_weights,
            block_weights,
            final_norm_weight,
            final_norm_bias,
            lm_head_weight,
            lm_head_weight_t,
            lm_head_bias,
            scheduler,
            #[cfg(feature = "cuda")]
            cuda_scheduler: None,
            config,
            attention_buffers: None,
            test_executor: None,
            linear_attn_state: None,
        })
    }

    /// IMP-1003: Build a GPU model that also owns a dedicated CUDA scheduler
    ///
    /// In addition to the `HybridScheduler`, this constructor creates a
    /// `CudaScheduler` and stores it in the model. The matmul dispatch methods
    /// that consult `cuda_scheduler` will then route through it, including for
    /// m=1 (single-token generation).
    ///
    /// Weights are deterministic placeholders: `0.01` for embeddings and
    /// projections, `1.0` for norm scales, `0.0` for biases.
    ///
    /// # Errors
    ///
    /// Returns error if GPU or CUDA initialization fails
    #[cfg(feature = "cuda")]
    pub fn new_with_cuda(config: GpuModelConfig) -> Result<Self> {
        // Hybrid scheduler first, then CUDA; either failure aborts construction.
        let scheduler = HybridScheduler::new()?;
        let cuda_scheduler = Some(CudaScheduler::new()?);

        let hidden = config.hidden_dim;
        let inter = config.intermediate_dim;
        let vocab = config.vocab_size;

        let embedding_weights = vec![0.01f32; vocab * hidden];

        // One identical placeholder block per configured layer.
        let block_weights: Vec<BlockWeights> = (0..config.num_layers)
            .map(|_| BlockWeights {
                attn_norm_weight: vec![1.0f32; hidden],
                attn_norm_bias: vec![0.0f32; hidden],
                qkv_weight: vec![0.01f32; hidden * config.qkv_dim()],
                qkv_bias: vec![0.0f32; config.qkv_dim()],
                out_weight: vec![0.01f32; hidden * hidden],
                out_bias: vec![0.0f32; hidden],
                ffn_norm_weight: vec![1.0f32; hidden],
                ffn_norm_bias: vec![0.0f32; hidden],
                ffn_fc1_weight: vec![0.01f32; hidden * inter],
                ffn_fc1_bias: vec![0.0f32; inter],
                ffn_fc2_weight: vec![0.01f32; inter * hidden],
                ffn_fc2_bias: vec![0.0f32; hidden],
                // Test models carry no SwiGLU gate
                ffn_gate_weight: None,
                // GH-278: test models carry no linear attention
                linear_attn: None,
                moe_experts: None,
            })
            .collect();

        let final_norm_weight = vec![1.0f32; hidden];
        let final_norm_bias = vec![0.0f32; hidden];
        let lm_head_bias = vec![0.0f32; vocab];
        let lm_head_weight = vec![0.01f32; hidden * vocab];

        // Keep a [vocab_size, hidden_dim] copy of the LM head next to the original.
        let lm_head_weight_t = Self::transpose_weights(&lm_head_weight, hidden, vocab);

        Ok(Self {
            embedding_weights,
            block_weights,
            final_norm_weight,
            final_norm_bias,
            lm_head_weight,
            lm_head_weight_t,
            lm_head_bias,
            scheduler,
            cuda_scheduler,
            config,
            attention_buffers: None,
            test_executor: None,
            linear_attn_state: None,
        })
    }

    /// IMP-1003: Check if this model has CUDA scheduler enabled
    ///
    /// Returns `true` when a `CudaScheduler` is stored in the model. `new()`
    /// leaves the slot empty; `new_with_cuda()` fills it.
    #[cfg(feature = "cuda")]
    #[must_use]
    pub fn has_cuda_scheduler(&self) -> bool {
        self.cuda_scheduler.is_some()
    }

    /// Phase 43: Inject test executor for dependency injection
    ///
    /// When a test executor is set, it takes priority over all other schedulers
    /// in `do_matmul()`. This enables testing forward pass logic without GPU.
    ///
    /// Calling this again overwrites any executor installed earlier. Despite
    /// the `with_` prefix this is an in-place setter on `&mut self`; it does
    /// not consume or return the model.
    ///
    /// # Arguments
    ///
    /// * `executor` - Boxed executor that will receive matmul calls
    ///
    /// # Example
    ///
    /// ```ignore
    /// use realizar::gpu::executor::MockExecutor;
    ///
    /// let mut model = GpuModel::new(config)?;
    /// let mock = MockExecutor::new("test");
    /// model.with_test_executor(Box::new(mock));
    ///
    /// // Now model.do_matmul() uses the mock
    /// ```
    pub fn with_test_executor(
        &mut self,
        executor: Box<dyn super::super::executor::GpuExecutorTrait + Send + Sync>,
    ) {
        self.test_executor = Some(executor);
    }

    /// Phase 43: Check if test executor is set
    ///
    /// Returns `true` while an injected executor is installed, i.e. after
    /// `with_test_executor()` and before `clear_test_executor()`.
    #[must_use]
    pub fn has_test_executor(&self) -> bool {
        self.test_executor.is_some()
    }

    /// Phase 43: Clear test executor (restore normal operation)
    ///
    /// Drops any injected executor so that `do_matmul()` goes back to the
    /// real schedulers. Calling it when no executor is set is a no-op.
    pub fn clear_test_executor(&mut self) {
        self.test_executor = None;
    }

    /// IMP-1003: Matmul that goes to the CUDA scheduler whenever one is attached
    ///
    /// With a CUDA scheduler present the multiplication is always handed to it,
    /// even for m=1. When the model has no CUDA scheduler the call is NOT
    /// rejected: it is forwarded to the `HybridScheduler` instead. An injected
    /// test executor is not consulted by this method.
    ///
    /// # Arguments
    ///
    /// * `a`, `b` - Operand buffers, passed through unchanged to the backend
    /// * `m`, `k`, `n` - Matmul dimensions, passed through unchanged
    ///
    /// # Errors
    ///
    /// Returns error if the selected backend's matmul fails
    #[cfg(feature = "cuda")]
    #[allow(clippy::many_single_char_names)]
    pub fn cuda_matmul(
        &mut self,
        a: &[f32],
        b: &[f32],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<Vec<f32>> {
        match self.cuda_scheduler.as_mut() {
            Some(cuda) => cuda.matmul(a, b, m, k, n),
            // No CUDA scheduler attached: use the hybrid scheduler instead
            None => self.scheduler.matmul(a, b, m, k, n),
        }
    }

    /// IMP-1005: Unified matmul dispatch that prefers CudaScheduler when available
    ///
    /// This method is used throughout forward_gpu() and forward_block_idx() to
    /// ensure CUDA is used for all matmul operations when cuda_scheduler is present.
    ///
    /// # Phase 43: Test Executor Support
    ///
    /// Priority order:
    /// 1. `test_executor` (if present) - for testing without GPU
    /// 2. `cuda_scheduler` (if present) - for CUDA acceleration
    /// 3. `scheduler` (HybridScheduler) - default fallback
    ///
    /// # Errors
    ///
    /// Returns error if matmul fails
    #[allow(clippy::many_single_char_names)]
    pub fn do_matmul(
        &mut self,
        a: &[f32],
        b: &[f32],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<Vec<f32>> {
        // Phase 43: Test executor takes priority (for testing without GPU)
        if let Some(ref mut test_exec) = self.test_executor {
            return test_exec.matmul(a, b, m, k, n);
        }

        #[cfg(feature = "cuda")]
        if let Some(ref mut cuda_sched) = self.cuda_scheduler {
            return cuda_sched.matmul(a, b, m, k, n);
        }
        // Fallback to HybridScheduler (or always use it when cuda feature disabled)
        self.scheduler.matmul(a, b, m, k, n)
    }

    /// Matmul with transposed B: C[m,n] = A[m,k] @ B[n,k]^T
    ///
    /// Routes through test_executor if present, enabling mock testing of
    /// attention score computation (Q @ K^T).
    #[allow(clippy::many_single_char_names)]
    pub fn do_matmul_transpose_b(
        &mut self,
        a: &[f32],
        b: &[f32],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<Vec<f32>> {
        // Test executor takes priority (for testing without GPU)
        if let Some(ref mut test_exec) = self.test_executor {
            // Transpose B and use standard matmul
            let b_t: Vec<f32> = (0..k)
                .flat_map(|i| (0..n).map(move |j| b[j * k + i]))
                .collect();
            return test_exec.matmul(a, &b_t, m, k, n);
        }

        // Use HybridScheduler which has matmul_transpose_b
        self.scheduler.matmul_transpose_b(a, b, m, k, n)
    }

    /// IMP-1007: Zero-clone matmul using split borrow pattern
    ///
    /// Selects one of the model's own weight matrices by `op` and multiplies
    /// `input` by it as a single-row matmul (`m = 1`). The weight is borrowed
    /// in place and never copied: the weight field and the backend field are
    /// distinct fields of `self`, so they can be borrowed at the same time.
    ///
    /// Backend priority is the same as `do_matmul()`:
    /// 1. `test_executor` (if present)
    /// 2. `cuda_scheduler` (if present, `cuda` feature only)
    /// 3. `scheduler` (HybridScheduler)
    ///
    /// # Arguments
    ///
    /// * `input` - Input tensor
    /// * `block_idx` - Block index for block weights (ignored for LmHead)
    /// * `op` - Which matmul operation/weight to use
    ///
    /// # Errors
    ///
    /// Returns error if matmul fails
    ///
    /// # Panics
    ///
    /// Panics if `block_idx` is out of range for a per-block `op`.
    pub fn matmul_split(
        &mut self,
        input: &[f32],
        block_idx: usize,
        op: WeightType,
    ) -> Result<Vec<f32>> {
        // Read dimensions up front; these are plain values, so no borrow of
        // `self.config` outlives this section.
        let hidden_dim = self.config.hidden_dim;
        let qkv_dim = self.config.qkv_dim();
        let intermediate_dim = self.config.intermediate_dim;
        let vocab_size = self.config.vocab_size;

        // Borrow the weight field directly (shared borrow of one field only).
        let (weight, m, k, n) = match op {
            WeightType::Qkv => (
                &self.block_weights[block_idx].qkv_weight,
                1,
                hidden_dim,
                qkv_dim,
            ),
            WeightType::Output => (
                &self.block_weights[block_idx].out_weight,
                1,
                hidden_dim,
                hidden_dim,
            ),
            WeightType::FfnFc1 => (
                &self.block_weights[block_idx].ffn_fc1_weight,
                1,
                hidden_dim,
                intermediate_dim,
            ),
            WeightType::FfnFc2 => (
                &self.block_weights[block_idx].ffn_fc2_weight,
                1,
                intermediate_dim,
                hidden_dim,
            ),
            WeightType::LmHead => (&self.lm_head_weight, 1, hidden_dim, vocab_size),
        };

        // Dispatch inline rather than through `self.do_matmul(..)`: calling a
        // `&mut self` method would borrow all of `self` and force a copy of
        // `weight`, whereas mutably borrowing just the backend field does not
        // conflict with the shared borrow above.
        if let Some(test_exec) = self.test_executor.as_mut() {
            return test_exec.matmul(input, weight, m, k, n);
        }

        #[cfg(feature = "cuda")]
        if let Some(cuda_sched) = self.cuda_scheduler.as_mut() {
            return cuda_sched.matmul(input, weight, m, k, n);
        }

        self.scheduler.matmul(input, weight, m, k, n)
    }

    /// IMP-1007: Zero-clone matmul helper that prefers the CUDA scheduler
    ///
    /// Selects one of the model's own weight matrices by `op` and multiplies
    /// `input` by it as a single-row matmul (`m = 1`) without copying the
    /// weight. The CUDA scheduler is used when one is attached; otherwise the
    /// call goes to the `HybridScheduler`. An injected test executor is not
    /// consulted by this method.
    ///
    /// The scheduler is borrowed in place. It is never moved out of the model,
    /// so a panic during the call (for example an out-of-range `block_idx`)
    /// cannot leave the model without its CUDA scheduler.
    ///
    /// # Arguments
    ///
    /// * `input` - Input tensor
    /// * `block_idx` - Block index for block weights (ignored for LmHead)
    /// * `op` - Which matmul operation/weight to use
    ///
    /// # Errors
    ///
    /// Returns error if the selected backend's matmul fails
    ///
    /// # Panics
    ///
    /// Panics if `block_idx` is out of range for a per-block `op`.
    #[cfg(feature = "cuda")]
    pub fn matmul_zero_clone(
        &mut self,
        input: &[f32],
        block_idx: usize,
        op: WeightType,
    ) -> Result<Vec<f32>> {
        // Extract dimensions
        let hidden_dim = self.config.hidden_dim;
        let qkv_dim = self.config.qkv_dim();
        let intermediate_dim = self.config.intermediate_dim;
        let vocab_size = self.config.vocab_size;

        // Shared borrow of the weight field only
        let (weight, m, k, n) = match op {
            WeightType::Qkv => (
                &self.block_weights[block_idx].qkv_weight,
                1,
                hidden_dim,
                qkv_dim,
            ),
            WeightType::Output => (
                &self.block_weights[block_idx].out_weight,
                1,
                hidden_dim,
                hidden_dim,
            ),
            WeightType::FfnFc1 => (
                &self.block_weights[block_idx].ffn_fc1_weight,
                1,
                hidden_dim,
                intermediate_dim,
            ),
            WeightType::FfnFc2 => (
                &self.block_weights[block_idx].ffn_fc2_weight,
                1,
                intermediate_dim,
                hidden_dim,
            ),
            WeightType::LmHead => (&self.lm_head_weight, 1, hidden_dim, vocab_size),
        };

        // The scheduler fields are disjoint from the weight fields, so a
        // mutable borrow of either scheduler coexists with `weight`.
        match self.cuda_scheduler.as_mut() {
            Some(sched) => CudaScheduler::matmul(sched, input, weight, m, k, n),
            None => self.scheduler.matmul(input, weight, m, k, n),
        }
    }

    // =========================================================================
    // IMP-1008: `&self` matmul via raw-pointer cast (NOT RefCell-based)
    // =========================================================================

    /// IMP-1008: Matmul callable through a shared reference
    ///
    /// Despite its name, this method does not use `RefCell`. It takes `&self`,
    /// obtains raw pointers to the `cuda_scheduler` / `scheduler` fields, casts
    /// them to `*mut`, and calls the scheduler's `matmul` through the resulting
    /// mutable access. The CUDA scheduler is used when present, otherwise the
    /// `HybridScheduler`. An injected test executor is not consulted.
    ///
    /// NOTE(review): this is unsound as written. The scheduler fields are plain
    /// values (not wrapped in `UnsafeCell`/`RefCell`/`Mutex`), and producing a
    /// `&mut` to data that is only reachable through `&self` is undefined
    /// behaviour in Rust regardless of threading. Nothing here checks for
    /// overlapping borrows either. Prefer `do_matmul()`, or move the schedulers
    /// behind a real interior-mutability type before relying on this method.
    ///
    /// # Errors
    ///
    /// Returns error if the selected scheduler's matmul fails. There is no
    /// "already borrowed" error, because no borrow tracking takes place.
    #[cfg(feature = "cuda")]
    #[allow(clippy::many_single_char_names)]
    pub fn matmul_refcell(
        &self,
        a: &[f32],
        b: &[f32],
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<Vec<f32>> {
        // The intent is to avoid changing the struct layout while still
        // getting mutable scheduler access from `&self`.
        //
        // SAFETY (NOT ESTABLISHED - NOTE(review)): the original justification
        // was "single-threaded, nobody else touches the scheduler during
        // matmul". That argument does not make the cast below sound: the
        // pointer is derived from a shared reference to a field that is not
        // inside an `UnsafeCell`, so writing through it violates Rust's
        // aliasing rules even with a single thread and no other accessors.
        let cuda_sched_ptr = std::ptr::addr_of!(self.cuda_scheduler).cast_mut();

        // The unsafe block dereferences the cast pointers to obtain `&mut`
        // access to the schedulers; see the note above for why this is not
        // actually justified.
        let result: Result<Vec<f32>> = unsafe {
            if let Some(sched) = (*cuda_sched_ptr).as_mut() {
                CudaScheduler::matmul(sched, a, b, m, k, n)
            } else {
                // Fallback to HybridScheduler (same pointer-cast trick, same caveat)
                let sched_ptr = std::ptr::addr_of!(self.scheduler).cast_mut();
                (*sched_ptr).matmul(a, b, m, k, n)
            }
        };
        result
    }
}