use super::attention_helpers::ALiBi;
#[allow(clippy::wildcard_imports)]
use super::*;
impl ALiBi {
    /// Create `ALiBi` with the specified number of attention heads.
    ///
    /// For a power-of-two head count `n`, slopes follow the geometric
    /// sequence 2^(-8/n), 2^(-16/n), ..., 2^(-8); other head counts are
    /// interpolated as in the paper.
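    ///
    /// # Examples
    ///
    /// A minimal sketch (marked `ignore` because the import path needed
    /// by doc tests is not shown in this module):
    ///
    /// ```ignore
    /// // For 8 heads the slopes are 2^-1, 2^-2, ..., 2^-8.
    /// let alibi = ALiBi::new(8);
    /// assert_eq!(alibi.num_heads(), 8);
    /// assert!((alibi.slopes()[0] - 0.5).abs() < 1e-6);
    /// ```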
#[must_use]
pub fn new(num_heads: usize) -> Self {
let slopes = Self::compute_slopes(num_heads);
Self { num_heads, slopes }
}
/// Compute slopes using the formula from the paper.
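    ///
    /// For a non-power-of-two head count the paper interleaves slopes from
    /// the two neighbouring powers of two; e.g. 6 heads yield the 4-head
    /// sequence followed by every other slope of the 8-head sequence:
    /// `[2^-2, 2^-4, 2^-6, 2^-8, 2^-1, 2^-3]`.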
    fn compute_slopes(num_heads: usize) -> Vec<f32> {
        fn slopes_for_power_of_two(n: usize) -> Vec<f32> {
            // Geometric sequence 2^(-8/n), 2^(-16/n), ..., 2^(-8).
            let base = 2.0_f32.powf(-8.0 / n as f32);
            (1..=n as i32).map(|i| base.powi(i)).collect()
        }
        if num_heads.is_power_of_two() {
            slopes_for_power_of_two(num_heads)
        } else {
            // Interpolate per the paper: take the slopes of the largest
            // power of two below `num_heads`, then append every other
            // slope of the sequence for twice that power of two.
            let lower_pow2 = num_heads.next_power_of_two() / 2;
            let mut slopes = slopes_for_power_of_two(lower_pow2);
            slopes.extend(
                slopes_for_power_of_two(2 * lower_pow2)
                    .into_iter()
                    .step_by(2)
                    .take(num_heads - lower_pow2),
            );
            slopes
        }
    }
/// Compute `ALiBi` bias matrix for given sequence length.
///
/// # Arguments
///
/// * `seq_len` - Current sequence length
///
/// # Returns
///
/// Bias tensor [`num_heads`, `seq_len`, `seq_len`] to add to attention scores.
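    ///
    /// # Examples
    ///
    /// A minimal sketch (marked `ignore` because the doc-test import path
    /// is not shown in this module):
    ///
    /// ```ignore
    /// let alibi = ALiBi::new(1); // single head, slope 2^-8
    /// let bias = alibi.compute_bias(3);
    /// assert_eq!(bias.shape(), &[1, 3, 3]);
    /// // Row 0 is [0, -s, -2s] with s = 2^-8.
    /// assert!((bias.data()[1] + 2.0_f32.powi(-8)).abs() < 1e-7);
    /// ```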
#[must_use]
pub fn compute_bias(&self, seq_len: usize) -> Tensor {
        let mut bias = vec![0.0; self.num_heads * seq_len * seq_len];
        for h in 0..self.num_heads {
            let slope = self.slopes[h];
            for i in 0..seq_len {
                for j in 0..seq_len {
                    // Symmetric (bidirectional) variant: the penalty grows
                    // with the absolute distance |i - j| rather than the
                    // causal offset i - j used in decoder-only attention.
                    let distance = i.abs_diff(j) as f32;
                    let idx = h * seq_len * seq_len + i * seq_len + j;
                    bias[idx] = -slope * distance;
                }
            }
        }
Tensor::new(&bias, &[self.num_heads, seq_len, seq_len])
}
/// Apply `ALiBi` to attention scores.
///
/// # Arguments
///
/// * `scores` - Attention scores [batch, `num_heads`, `seq_len`, `seq_len`]
///
/// # Returns
///
/// Scores with `ALiBi` bias applied.
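    ///
    /// # Examples
    ///
    /// A minimal usage sketch; the zero-filled scores tensor is only a
    /// stand-in, and the block is marked `ignore` because the doc-test
    /// import path is not shown in this module:
    ///
    /// ```ignore
    /// let alibi = ALiBi::new(4);
    /// let scores = Tensor::new(&vec![0.0; 2 * 4 * 3 * 3], &[2, 4, 3, 3]);
    /// let biased = alibi.apply(&scores);
    /// assert_eq!(biased.shape(), &[2, 4, 3, 3]);
    /// ```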
#[must_use]
pub fn apply(&self, scores: &Tensor) -> Tensor {
        let shape = scores.shape();
        assert_eq!(shape.len(), 4, "Expected 4D tensor");
        assert_eq!(shape[1], self.num_heads, "num_heads mismatch");
        let (batch, seq_len) = (shape[0], shape[2]);
        let bias = self.compute_bias(seq_len);
        let bias_data = bias.data();
        // Add bias elementwise, broadcasting over the batch dimension.
        let mut output = scores.data().to_vec();
for b in 0..batch {
for h in 0..self.num_heads {
for i in 0..seq_len {
for j in 0..seq_len {
let score_idx = b * self.num_heads * seq_len * seq_len
+ h * seq_len * seq_len
+ i * seq_len
+ j;
let bias_idx = h * seq_len * seq_len + i * seq_len + j;
                        output[score_idx] += bias_data[bias_idx];
}
}
}
}
Tensor::from_vec(output, shape)
}
/// Get slopes for each head.
#[must_use]
pub fn slopes(&self) -> &[f32] {
&self.slopes
}
/// Get number of heads.
#[must_use]
pub fn num_heads(&self) -> usize {
self.num_heads
}
}
#[cfg(test)]
mod tests;