transformers/models/raw/utils.rs

//! Shared helpers: a repetition penalty for sampling, `repeat_kv` for
//! grouped-query attention, and a tracing-aware `QMatMul` wrapper.

use candle_core::{Result, Tensor};
use candle_nn::Module;

/// Penalizes repeated tokens in a rank-1 logits tensor: every token id that
/// appears in `context` has its logit divided by `penalty` when positive and
/// multiplied by `penalty` when negative, discouraging the model from
/// emitting it again.
pub fn apply_repeat_penalty(logits: &Tensor, penalty: f32, context: &[u32]) -> Result<Tensor> {
    let device = logits.device();
    let mut logits = logits.to_dtype(candle_core::DType::F32)?.to_vec1::<f32>()?;
    // Penalize each distinct token id only once, even if it occurs several
    // times in the context.
    let mut already_seen = std::collections::HashSet::new();
    for token_id in context {
        if already_seen.contains(token_id) {
            continue;
        }
        already_seen.insert(token_id);
        if let Some(logit) = logits.get_mut(*token_id as usize) {
            if *logit >= 0. {
                *logit /= penalty
            } else {
                *logit *= penalty
            }
        }
    }
    let logits_len = logits.len();
    Tensor::from_vec(logits, logits_len, device)
}
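
// A minimal usage sketch, not part of the upstream crate: it builds a small
// rank-1 logits tensor on the CPU and checks that tokens already present in
// the context are dampened while unseen tokens are left untouched. The token
// ids and the penalty value are illustrative assumptions.
#[cfg(test)]
mod apply_repeat_penalty_sketch {
    use super::*;
    use candle_core::Device;

    #[test]
    fn penalizes_previously_seen_tokens() -> Result<()> {
        let device = Device::Cpu;
        let logits = Tensor::new(&[2.0f32, -2.0, 1.0, 0.5], &device)?;
        // Tokens 0 and 1 were already generated: the positive logit is divided
        // by the penalty, the negative one is multiplied by it.
        let penalized = apply_repeat_penalty(&logits, 1.5, &[0, 1])?;
        let values = penalized.to_vec1::<f32>()?;
        assert!((values[0] - 2.0 / 1.5).abs() < 1e-6);
        assert!((values[1] - (-2.0 * 1.5)).abs() < 1e-6);
        // Token 2 never appeared in the context, so its logit is unchanged.
        assert!((values[2] - 1.0).abs() < 1e-6);
        Ok(())
    }
}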

/// Repeats a key or value tensor for grouped query attention.
/// The input tensor should have a shape `(batch, num_kv_heads, seq_len, head_dim)`.
pub fn repeat_kv(xs: Tensor, n_rep: usize) -> Result<Tensor> {
    if n_rep == 1 {
        Ok(xs)
    } else {
        let (b_sz, n_kv_head, seq_len, head_dim) = xs.dims4()?;
        // Using cat is faster than a broadcast as it avoids going through a potentially
        // strided copy.
        // https://github.com/huggingface/candle/pull/2043
        Tensor::cat(&vec![&xs; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))
    }
}
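
// A minimal shape-level sketch, not part of the upstream crate: repeating 2 KV
// heads 4 times grows the head axis to 8 query-compatible heads while the
// batch, seq_len and head_dim axes are untouched. The concrete sizes are
// illustrative assumptions.
#[cfg(test)]
mod repeat_kv_sketch {
    use super::*;
    use candle_core::{DType, Device};

    #[test]
    fn expands_kv_heads_for_gqa() -> Result<()> {
        let device = Device::Cpu;
        // (batch, num_kv_heads, seq_len, head_dim) = (1, 2, 3, 4)
        let kv = Tensor::zeros((1, 2, 3, 4), DType::F32, &device)?;
        let repeated = repeat_kv(kv, 4)?;
        assert_eq!(repeated.dims4()?, (1, 8, 3, 4));
        Ok(())
    }
}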

/// `QMatMul` wrapper adding some tracing.
#[derive(Clone)]
pub struct QMatMul {
    inner: candle_core::quantized::QMatMul,
    span: tracing::Span,
}

impl QMatMul {
    /// Fetches the quantized `weight` tensor from the var builder and wraps it
    /// in a traced quantized matmul.
    pub fn new(
        out_dim: usize,
        in_dim: usize,
        vb: super::quantized_var_builder::VarBuilder,
    ) -> Result<Self> {
        let ws = vb.get((in_dim, out_dim), "weight")?;
        let inner = candle_core::quantized::QMatMul::from_arc(ws)?;
        let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
        Ok(Self { inner, span })
    }

    /// Wraps an already loaded quantized weight tensor in a traced matmul.
    pub fn from_weights(ws: std::sync::Arc<candle_core::quantized::QTensor>) -> Result<Self> {
        let inner = candle_core::quantized::QMatMul::from_arc(ws)?;
        let span = tracing::span!(tracing::Level::TRACE, "qmatmul");
        Ok(Self { inner, span })
    }
}

impl Module for QMatMul {
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let _enter = self.span.enter();
        self.inner.forward(xs)
    }
}

impl std::fmt::Debug for QMatMul {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "QMatMul")
    }
}
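
// A minimal usage sketch, not part of the upstream crate: it quantizes a zero
// weight matrix on the fly and runs a forward pass through the tracing-aware
// wrapper. The `QTensor::quantize(&tensor, GgmlDType::Q4_0)` call and the
// concrete shapes are assumptions about the candle-core API and are only
// illustrative; real weights normally come from a GGUF file via the quantized
// VarBuilder.
#[cfg(test)]
mod qmatmul_sketch {
    use super::*;
    use candle_core::quantized::{GgmlDType, QTensor};
    use candle_core::{DType, Device};

    #[test]
    fn forward_produces_out_dim_columns() -> Result<()> {
        let device = Device::Cpu;
        // An (8, 32) weight; Q4_0 needs the inner dimension to be a multiple
        // of its 32-element block size.
        let weight = Tensor::zeros((8, 32), DType::F32, &device)?;
        let qweight = std::sync::Arc::new(QTensor::quantize(&weight, GgmlDType::Q4_0)?);
        let qmatmul = QMatMul::from_weights(qweight)?;
        let xs = Tensor::zeros((1, 32), DType::F32, &device)?;
        // The quantized matmul multiplies by the transposed weight,
        // so a (1, 32) input maps to (1, 8).
        assert_eq!(qmatmul.forward(&xs)?.dims2()?, (1, 8));
        Ok(())
    }
}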