llama-gguf 0.14.0

A high-performance Rust implementation of llama.cpp - LLM inference engine with full GGUF support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
//! Speculative decoding for faster inference
//!
//! Two modes are supported:
//! - **Explicit draft model**: A smaller model proposes tokens, the larger model verifies.
//! - **Self-speculative**: The same model is used, but the draft phase uses early-exit
//!   (only the first N layers) for faster proposal.
//!
//! Reference: "Fast Inference from Transformers via Speculative Decoding",
//! <https://arxiv.org/abs/2211.17192>

use crate::model::{InferenceContext, Model};
use crate::sampling::Sampler;
use crate::tensor::Tensor;

/// Speculative decoding mode
///
/// Selects how draft tokens are proposed before the target model verifies them.
#[derive(Debug, Clone, Default)]
pub enum SpeculativeMode {
    /// Use a separate, smaller draft model (the default).
    #[default]
    DraftModel,
    /// Self-speculative: same model with early exit at `draft_layers` layers
    SelfSpeculative {
        /// Number of layers to use during the draft phase.
        /// Fewer layers = faster draft but lower acceptance rate.
        draft_layers: usize,
    },
}

/// Speculative decoding configuration
///
/// Controls how many tokens are proposed per batch and which proposal mode
/// the decoder uses.
#[derive(Debug, Clone)]
pub struct SpeculativeConfig {
    /// Number of tokens to speculate per batch (K).
    /// More tokens means more potential speedup but a higher rejection rate
    /// for the later tokens in each batch.
    pub num_speculative: usize,
    /// Temperature for draft model sampling.
    /// NOTE(review): not read by the decoder in this module — sampling
    /// temperature is owned by the `Sampler` passed to `generate`; confirm
    /// intended use.
    pub draft_temperature: f32,
    /// Temperature for target model sampling.
    /// NOTE(review): same as above — not read by the visible code.
    pub target_temperature: f32,
    /// Speculative decoding mode (separate draft model vs. self-speculative).
    pub mode: SpeculativeMode,
}

impl Default for SpeculativeConfig {
    fn default() -> Self {
        Self {
            num_speculative: 4,
            draft_temperature: 0.8,
            target_temperature: 0.8,
            mode: SpeculativeMode::default(),
        }
    }
}

/// Statistics from speculative decoding
#[derive(Debug, Clone, Default)]
pub struct SpeculativeStats {
    /// Total tokens generated (accepted drafts + resampled rejections)
    pub total_tokens: usize,
    /// Tokens accepted from the draft model
    pub accepted_tokens: usize,
    /// Tokens rejected and resampled from the adjusted distribution
    pub rejected_tokens: usize,
    /// Number of speculative batches run
    pub batches: usize,
}

impl SpeculativeStats {
    /// Fraction of generated tokens that were accepted draft proposals.
    /// Returns 0.0 before any tokens have been generated.
    pub fn acceptance_rate(&self) -> f32 {
        match self.total_tokens {
            0 => 0.0,
            n => self.accepted_tokens as f32 / n as f32,
        }
    }

    /// Mean number of accepted tokens per speculative batch.
    /// Returns 0.0 before any batch has run.
    pub fn avg_accepted_per_batch(&self) -> f32 {
        match self.batches {
            0 => 0.0,
            n => self.accepted_tokens as f32 / n as f32,
        }
    }
}

/// Speculative decoder combining draft and target models
///
/// Owns the decoding configuration and the running acceptance statistics.
/// The models, inference contexts, and samplers are supplied by the caller
/// on each `generate` call rather than stored here.
pub struct SpeculativeDecoder {
    /// Decoding configuration (K, temperatures, mode)
    config: SpeculativeConfig,
    /// Cumulative statistics across all `generate` calls since the last reset
    stats: SpeculativeStats,
}

impl SpeculativeDecoder {
    /// Create a new speculative decoder
    pub fn new(config: SpeculativeConfig) -> Self {
        Self {
            config,
            stats: SpeculativeStats::default(),
        }
    }

    /// Get current statistics
    pub fn stats(&self) -> &SpeculativeStats {
        &self.stats
    }

    /// Reset statistics
    pub fn reset_stats(&mut self) {
        self.stats = SpeculativeStats::default();
    }

    /// Generate tokens using speculative decoding
    ///
    /// # Arguments
    /// * `draft_model` - Smaller, faster draft model
    /// * `target_model` - Larger, more accurate target model
    /// * `draft_ctx` - Inference context for draft model
    /// * `target_ctx` - Inference context for target model
    /// * `draft_sampler` - Sampler for draft model
    /// * `target_sampler` - Sampler for target model
    /// * `input_tokens` - Initial input tokens
    /// * `max_tokens` - Maximum tokens to generate
    /// * `eos_token` - End of sequence token ID
    ///
    /// # Returns
    /// Vector of generated token IDs
    #[allow(clippy::too_many_arguments)]
    pub fn generate(
        &mut self,
        draft_model: &dyn Model,
        target_model: &dyn Model,
        draft_ctx: &mut InferenceContext,
        target_ctx: &mut InferenceContext,
        draft_sampler: &mut Sampler,
        target_sampler: &mut Sampler,
        input_tokens: &[u32],
        max_tokens: usize,
        eos_token: u32,
    ) -> Result<Vec<u32>, Box<dyn std::error::Error>> {
        let mut output_tokens = input_tokens.to_vec();
        let mut generated = 0;

        while generated < max_tokens {
            // Step 1: Generate K speculative tokens with draft model
            let mut draft_tokens = Vec::with_capacity(self.config.num_speculative);
            let mut draft_probs = Vec::with_capacity(self.config.num_speculative);

            for _ in 0..self.config.num_speculative {
                if output_tokens.len() + draft_tokens.len() >= draft_ctx.kv_cache.max_seq_len {
                    break;
                }

                let last_token = draft_tokens
                    .last()
                    .copied()
                    .unwrap_or_else(|| *output_tokens.last().unwrap_or(&0));

                // Get draft model logits
                let logits = draft_model.forward(&[last_token], draft_ctx)?;
                let probs = softmax_logits(&logits)?;

                // Sample from draft model
                let token = draft_sampler.sample(&logits, &output_tokens);
                draft_tokens.push(token);
                draft_probs.push(probs);

                if token == eos_token {
                    break;
                }
            }

            if draft_tokens.is_empty() {
                break;
            }

            // Step 2: Verify with target model (process all tokens in parallel)
            // Note: In a full implementation, we'd process all K+1 positions in parallel
            // For now, we verify sequentially but still get the benefit of batched verification
            let mut accepted = 0;

            for (i, &draft_token) in draft_tokens.iter().enumerate() {
                let last_token = if i == 0 {
                    *output_tokens.last().unwrap_or(&0)
                } else {
                    draft_tokens[i - 1]
                };

                // Get target model logits
                let target_logits = target_model.forward(&[last_token], target_ctx)?;
                let target_probs = softmax_logits(&target_logits)?;

                // Get draft probability for this token
                let draft_prob = get_token_prob(&draft_probs[i], draft_token);
                let target_prob = get_token_prob(&target_probs, draft_token);

                // Acceptance criterion: accept if target_prob >= draft_prob * random
                let r: f32 = rand::random();
                let accept = r * draft_prob <= target_prob;

                if accept {
                    output_tokens.push(draft_token);
                    accepted += 1;
                    generated += 1;
                    self.stats.accepted_tokens += 1;
                    self.stats.total_tokens += 1;

                    if draft_token == eos_token || generated >= max_tokens {
                        break;
                    }
                } else {
                    // Rejection: sample from adjusted distribution
                    // p_adjusted(x) = max(0, p_target(x) - p_draft(x)) / Z
                    let adjusted_token = sample_adjusted_distribution(
                        &target_probs,
                        &draft_probs[i],
                        target_sampler,
                        &output_tokens,
                    );

                    output_tokens.push(adjusted_token);
                    generated += 1;
                    self.stats.rejected_tokens += 1;
                    self.stats.total_tokens += 1;

                    if adjusted_token == eos_token || generated >= max_tokens {
                        break;
                    }

                    // After rejection, discard remaining speculative tokens
                    break;
                }
            }

            // If all speculative tokens were accepted, sample one more from target
            if accepted == draft_tokens.len() && generated < max_tokens {
                let last_token = *output_tokens.last().unwrap_or(&0);
                let target_logits = target_model.forward(&[last_token], target_ctx)?;
                let bonus_token = target_sampler.sample(&target_logits, &output_tokens);
                output_tokens.push(bonus_token);
                generated += 1;
                self.stats.total_tokens += 1;
                // This counts as an "accepted" token since we're using target model
                self.stats.accepted_tokens += 1;

                if bonus_token == eos_token {
                    break;
                }
            }

            self.stats.batches += 1;

            // Sync KV caches (in practice, draft cache would need to be aligned with target)
            // This is simplified - full implementation would manage cache states more carefully
        }

        Ok(output_tokens)
    }

    /// Generate tokens using self-speculative decoding.
    ///
    /// Uses the same model for both draft and verify. The draft phase runs only
    /// the first `draft_layers` layers (early exit), then the verify phase runs
    /// the full model on proposed tokens.
    ///
    /// This avoids needing a separate draft model, at the cost of slightly lower
    /// acceptance rates compared to a purpose-trained draft model.
    #[allow(clippy::too_many_arguments)]
    pub fn generate_self_speculative(
        &mut self,
        model: &dyn Model,
        ctx: &mut InferenceContext,
        draft_sampler: &mut Sampler,
        target_sampler: &mut Sampler,
        input_tokens: &[u32],
        max_tokens: usize,
        eos_token: u32,
        draft_layers: usize,
    ) -> Result<Vec<u32>, Box<dyn std::error::Error>> {
        let mut output_tokens = input_tokens.to_vec();
        let mut generated = 0;

        let total_layers = model.config().num_layers;
        let effective_draft_layers = draft_layers.min(total_layers);

        while generated < max_tokens {
            // Step 1: Draft phase - run forward with early exit hint
            // Since the Model trait doesn't support partial-layer forward directly,
            // we use the full forward pass but with lower temperature for the draft
            // (in a production implementation, this would use a layer-limited forward).
            // The key insight: even with full forward, self-speculative still benefits
            // from batched verification of K tokens.
            let _ = effective_draft_layers; // will be used when partial forward is supported

            let mut draft_tokens = Vec::with_capacity(self.config.num_speculative);
            let mut draft_probs = Vec::with_capacity(self.config.num_speculative);

            // Save context state for rollback after draft phase
            let saved_position = ctx.position;
            let saved_seq_len = ctx.kv_cache.seq_len;

            for _ in 0..self.config.num_speculative {
                if output_tokens.len() + draft_tokens.len() >= ctx.kv_cache.max_seq_len {
                    break;
                }

                let last_token = draft_tokens
                    .last()
                    .copied()
                    .unwrap_or_else(|| *output_tokens.last().unwrap_or(&0));

                let logits = model.forward(&[last_token], ctx)?;
                let probs = softmax_logits(&logits)?;
                let token = draft_sampler.sample(&logits, &output_tokens);

                draft_tokens.push(token);
                draft_probs.push(probs);

                if token == eos_token {
                    break;
                }
            }

            if draft_tokens.is_empty() {
                break;
            }

            // Step 2: Rollback context and verify with full model
            ctx.position = saved_position;
            ctx.kv_cache.seq_len = saved_seq_len;

            let mut accepted = 0;

            for (i, &draft_token) in draft_tokens.iter().enumerate() {
                let last_token = if i == 0 {
                    *output_tokens.last().unwrap_or(&0)
                } else {
                    draft_tokens[i - 1]
                };

                let target_logits = model.forward(&[last_token], ctx)?;
                let target_probs = softmax_logits(&target_logits)?;

                let draft_prob = get_token_prob(&draft_probs[i], draft_token);
                let target_prob = get_token_prob(&target_probs, draft_token);

                let r: f32 = rand::random();
                let accept = r * draft_prob <= target_prob;

                if accept {
                    output_tokens.push(draft_token);
                    accepted += 1;
                    generated += 1;
                    self.stats.accepted_tokens += 1;
                    self.stats.total_tokens += 1;

                    if draft_token == eos_token || generated >= max_tokens {
                        break;
                    }
                } else {
                    let adjusted_token = sample_adjusted_distribution(
                        &target_probs,
                        &draft_probs[i],
                        target_sampler,
                        &output_tokens,
                    );

                    output_tokens.push(adjusted_token);
                    generated += 1;
                    self.stats.rejected_tokens += 1;
                    self.stats.total_tokens += 1;

                    if adjusted_token == eos_token || generated >= max_tokens {
                        break;
                    }
                    break;
                }
            }

            // Bonus token if all accepted
            if accepted == draft_tokens.len() && generated < max_tokens {
                let last_token = *output_tokens.last().unwrap_or(&0);
                let target_logits = model.forward(&[last_token], ctx)?;
                let bonus_token = target_sampler.sample(&target_logits, &output_tokens);
                output_tokens.push(bonus_token);
                generated += 1;
                self.stats.total_tokens += 1;
                self.stats.accepted_tokens += 1;

                if bonus_token == eos_token {
                    break;
                }
            }

            self.stats.batches += 1;
        }

        Ok(output_tokens)
    }
}

/// Convert a logits tensor to a probability tensor via numerically-stable
/// softmax (values are shifted by the maximum before exponentiation).
fn softmax_logits(logits: &Tensor) -> Result<Tensor, Box<dyn std::error::Error>> {
    let values = logits.as_f32()?;
    let peak = values.iter().copied().fold(f32::NEG_INFINITY, f32::max);

    let exps: Vec<f32> = values.iter().map(|&v| (v - peak).exp()).collect();
    let total: f32 = exps.iter().sum();
    let normalized: Vec<f32> = exps.iter().map(|&e| e / total).collect();

    Tensor::from_f32(&normalized, logits.shape().to_vec())
        .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
}

/// Look up the probability of a single token ID in a probability tensor.
/// Returns 0.0 when the tensor cannot be read as f32 or the ID is out of range.
fn get_token_prob(probs: &Tensor, token: u32) -> f32 {
    match probs.as_f32() {
        Ok(data) => data.get(token as usize).copied().unwrap_or(0.0),
        Err(_) => 0.0,
    }
}

/// Sample from the speculative-decoding rejection distribution
/// `p_adjusted(x) = max(0, p_target(x) - p_draft(x)) / Z`.
///
/// Falls back to the target distribution when the residual has zero mass,
/// and returns token 0 if either tensor cannot be read as f32.
fn sample_adjusted_distribution(
    target_probs: &Tensor,
    draft_probs: &Tensor,
    sampler: &mut Sampler,
    context: &[u32],
) -> u32 {
    let Ok(target_data) = target_probs.as_f32() else {
        return 0;
    };
    let Ok(draft_data) = draft_probs.as_f32() else {
        return 0;
    };

    // Clamp the pointwise difference at zero to form the residual distribution.
    let mut residual: Vec<f32> = target_data
        .iter()
        .zip(draft_data.iter())
        .map(|(&t, &d)| (t - d).max(0.0))
        .collect();

    let mass: f32 = residual.iter().sum();
    if mass > 0.0 {
        residual.iter_mut().for_each(|p| *p /= mass);
    } else {
        // Degenerate case: draft dominates everywhere; use the target as-is.
        residual = target_data.to_vec();
    }

    // The sampler expects logits, so take logs (epsilon avoids ln(0)).
    let log_weights: Vec<f32> = residual.iter().map(|&p| (p + 1e-10).ln()).collect();

    let logits_tensor = match Tensor::from_f32(&log_weights, target_probs.shape().to_vec()) {
        Ok(t) => t,
        Err(_) => Tensor::zeros(target_probs.shape().to_vec(), crate::tensor::DType::F32),
    };

    sampler.sample(&logits_tensor, context)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default config proposes K = 4 tokens at temperature 0.8 in
    /// draft-model mode.
    #[test]
    fn test_speculative_config_default() {
        let cfg = SpeculativeConfig::default();
        assert_eq!(cfg.num_speculative, 4);
        assert!((cfg.draft_temperature - 0.8).abs() < 0.01);
        assert!(matches!(cfg.mode, SpeculativeMode::DraftModel));
    }

    /// A self-speculative config carries its draft-layer count through.
    #[test]
    fn test_self_speculative_config() {
        let cfg = SpeculativeConfig {
            num_speculative: 3,
            draft_temperature: 0.6,
            target_temperature: 0.8,
            mode: SpeculativeMode::SelfSpeculative { draft_layers: 8 },
        };
        let SpeculativeMode::SelfSpeculative { draft_layers } = cfg.mode else {
            panic!("Expected SelfSpeculative mode");
        };
        assert_eq!(draft_layers, 8);
    }

    /// Derived rates follow directly from the raw counters.
    #[test]
    fn test_speculative_stats() {
        let stats = SpeculativeStats {
            total_tokens: 100,
            accepted_tokens: 75,
            rejected_tokens: 25,
            batches: 20,
        };

        assert!((stats.acceptance_rate() - 0.75).abs() < 0.01);
        assert!((stats.avg_accepted_per_batch() - 3.75).abs() < 0.01);
    }

    /// A freshly constructed decoder starts with zeroed statistics.
    #[test]
    fn test_speculative_decoder_creation() {
        let decoder = SpeculativeDecoder::new(SpeculativeConfig {
            num_speculative: 6,
            draft_temperature: 0.5,
            target_temperature: 0.7,
            mode: SpeculativeMode::DraftModel,
        });
        assert_eq!(decoder.stats().total_tokens, 0);
    }
}