1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
impl ALiBi {
    /// Create a new `ALiBi` layer
    ///
    /// # Arguments
    ///
    /// * `num_heads` - Number of attention heads
    ///
    /// # Errors
    ///
    /// Returns error if `num_heads` is zero
    pub fn new(num_heads: usize) -> Result<Self> {
        if num_heads == 0 {
            return Err(RealizarError::InvalidShape {
                reason: "num_heads must be > 0".to_string(),
            });
        }
        // Slopes depend only on the head count, so they are computed once at
        // construction and reused for every sequence length.
        let slopes = Self::compute_slopes(num_heads);
        Ok(Self { num_heads, slopes })
    }
    /// Compute head-specific slopes following the `ALiBi` paper algorithm
    /// (Press et al., "Train Short, Test Long", 2022).
    ///
    /// For a power-of-2 head count `n`: `m[h] = 2^(-8h/n)` for heads
    /// `h = 1..=n` (1-indexed, so the first slope is `2^(-8/n)`, not 1).
    /// For non-powers of 2: use the full slope set of the largest power of 2
    /// below `n`, then fill the remainder with every other slope of the
    /// `2 * closest_power_of_2` geometric series.
    fn compute_slopes(num_heads: usize) -> Vec<f32> {
        // Largest power of 2 that is <= num_heads.
        let closest_power_of_2 = if num_heads.is_power_of_two() {
            num_heads
        } else {
            num_heads.next_power_of_two() / 2
        };
        #[allow(clippy::cast_precision_loss)]
        let ratio = 8.0 / (closest_power_of_2 as f32);
        let mut slopes = Vec::with_capacity(num_heads);
        // Slopes for the power-of-2 portion of the heads.
        // NOTE: heads are 1-indexed in the paper's formula. Using `i`
        // directly (0-indexed) would shift every slope up one step and give
        // the first head a slope of 1.0 (no distance penalty at all), which
        // would also be inconsistent with the interpolation branch below,
        // which already follows the 1-indexed scheme.
        // (`closest_power_of_2 <= num_heads` by construction, so no extra
        // clamp is needed here.)
        for i in 0..closest_power_of_2 {
            #[allow(clippy::cast_precision_loss)]
            let exponent = -((i + 1) as f32) * ratio;
            slopes.push(2_f32.powf(exponent));
        }
        // For non-power-of-2 head counts, append every other slope of the
        // doubled series: exponents -(2i + 1) * 4/m for m = closest power
        // of 2, matching the reference implementation's [0::2] selection
        // from get_slopes(2m).
        if num_heads > closest_power_of_2 {
            #[allow(clippy::cast_precision_loss)]
            let extra_ratio = 4.0 / (closest_power_of_2 as f32);
            for i in 0..(num_heads - closest_power_of_2) {
                #[allow(clippy::cast_precision_loss)]
                let exponent = -((2 * i + 1) as f32) * extra_ratio;
                slopes.push(2_f32.powf(exponent));
            }
        }
        slopes
    }
    /// Get bias matrix for a given sequence length
    ///
    /// Returns a tensor of shape `[seq_len, seq_len, num_heads]` where:
    /// ```text
    /// bias[i, j, h] = -slopes[h] * abs(i - j)
    /// ```
    ///
    /// NOTE(review): this is the symmetric (absolute-distance) variant,
    /// suitable for bidirectional attention; the paper's causal form
    /// penalizes only positions `j <= i` — confirm which one callers expect.
    ///
    /// # Arguments
    ///
    /// * `seq_len` - Sequence length for computing bias
    ///
    /// # Returns
    ///
    /// Tensor of shape `[seq_len, seq_len, num_heads]` containing position biases
    ///
    /// # Errors
    ///
    /// Returns error if `seq_len` is zero
    pub fn get_bias(&self, seq_len: usize) -> Result<Tensor<f32>> {
        if seq_len == 0 {
            return Err(RealizarError::InvalidShape {
                reason: "seq_len must be > 0".to_string(),
            });
        }
        let total_size = seq_len * seq_len * self.num_heads;
        let mut data = Vec::with_capacity(total_size);
        // Layout: heads are the fastest-varying (innermost) dimension.
        for i in 0..seq_len {
            for j in 0..seq_len {
                // Distance is head-independent; compute it once per (i, j)
                // instead of once per (i, j, head).
                #[allow(clippy::cast_precision_loss)]
                let distance = (i as f32 - j as f32).abs();
                for &slope in &self.slopes {
                    data.push(-slope * distance);
                }
            }
        }
        Tensor::from_vec(vec![seq_len, seq_len, self.num_heads], data)
    }
    /// Get number of attention heads
    #[must_use]
    pub fn num_heads(&self) -> usize {
        self.num_heads
    }
    /// Get head-specific slopes
    #[must_use]
    pub fn slopes(&self) -> &[f32] {
        &self.slopes
    }
}