1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
impl ALiBi {
/// Build an `ALiBi` layer for the given number of attention heads.
///
/// The per-head slopes are computed once, here, and stored next to
/// `num_heads` in the returned value.
///
/// # Arguments
///
/// * `num_heads` - Number of attention heads; must be non-zero
///
/// # Errors
///
/// Returns `RealizarError::InvalidShape` when `num_heads` is zero
pub fn new(num_heads: usize) -> Result<Self> {
    match num_heads {
        // A layer without heads has no slopes to compute; reject it up front.
        0 => Err(RealizarError::InvalidShape {
            reason: String::from("num_heads must be > 0"),
        }),
        count => Ok(Self {
            num_heads: count,
            slopes: Self::compute_slopes(count),
        }),
    }
}
/// Build the per-head `ALiBi` slopes for `num_heads` attention heads.
///
/// Let `p` be the largest power of two that does not exceed `num_heads`:
///
/// * heads `0..p` get `2^(-8(h+1)/p)`;
/// * the remaining `num_heads - p` heads get `2^(-4(2i+1)/p)` for
///   `i = 0, 1, ...`, i.e. every other entry of the sequence a layer with
///   `2p` heads would use.
///
/// This is the schedule of Press et al. 2021 (`m0 = 2^(-8/n)`,
/// `slope = m0^(h+1)`), which the original comment also attributes to
/// llama.cpp ggml `soft_max_ext`. For n=8 the slopes are
/// 0.5, 0.25, ..., 2^(-8) = 0.00390625.
///
/// The exponent uses `h + 1`, not `h` (PMAT-858): with `h`, head 0 would
/// get 2^0 = 1.0 and every slope would be too large by a factor of
/// `1/m0 = 2^(8/n)`.
fn compute_slopes(num_heads: usize) -> Vec<f32> {
    // Largest power of two not exceeding `num_heads`.
    let base = match num_heads {
        n if n.is_power_of_two() => n,
        n => n.next_power_of_two() / 2,
    };
    #[allow(clippy::cast_precision_loss)]
    let base_f32 = base as f32;

    let mut slopes = Vec::with_capacity(num_heads);

    // Power-of-two block: rank = h + 1 runs over 1..=base, so the first
    // slope is 2^(-8/base) rather than 2^0.
    let step = 8.0 / base_f32;
    #[allow(clippy::cast_precision_loss)]
    let primary = (1..=base).map(|rank| 2_f32.powf(-(rank as f32) * step));
    slopes.extend(primary);

    // Leftover heads (empty range when `num_heads` is a power of two):
    // odd multiples of the half-sized step.
    let extra_step = 4.0 / base_f32;
    #[allow(clippy::cast_precision_loss)]
    let extras =
        (0..(num_heads - base)).map(|i| 2_f32.powf(-((2 * i + 1) as f32) * extra_step));
    slopes.extend(extras);

    slopes
}
/// Get bias matrix for a given sequence length
///
/// Produces `seq_len * seq_len * num_heads` values, generated with the head
/// index varying fastest, then `j`, then `i`, and hands them to
/// `Tensor::from_vec` with shape `[seq_len, seq_len, num_heads]`:
/// ```text
/// bias[i, j, h] = -slopes[h] * abs(i - j)
/// ```
///
/// # Arguments
///
/// * `seq_len` - Sequence length for computing bias
///
/// # Returns
///
/// Tensor of shape `[seq_len, seq_len, num_heads]` containing position biases
///
/// # Errors
///
/// Returns `RealizarError::InvalidShape` if `seq_len` is zero or if the
/// element count `seq_len * seq_len * num_heads` does not fit in `usize`.
/// Any error from `Tensor::from_vec` is passed through unchanged.
pub fn get_bias(&self, seq_len: usize) -> Result<Tensor<f32>> {
    if seq_len == 0 {
        return Err(RealizarError::InvalidShape {
            reason: "seq_len must be > 0".to_string(),
        });
    }
    // Checked arithmetic: a plain `*` would panic in debug builds and wrap
    // silently in release builds, under-sizing the capacity hint below.
    let total_size = seq_len
        .checked_mul(seq_len)
        .and_then(|pairs| pairs.checked_mul(self.num_heads))
        .ok_or_else(|| RealizarError::InvalidShape {
            reason: format!(
                "bias shape [{seq_len}, {seq_len}, {}] overflows usize",
                self.num_heads
            ),
        })?;
    let mut data = Vec::with_capacity(total_size);
    for i in 0..seq_len {
        for j in 0..seq_len {
            // The distance depends only on (i, j); compute it once per pair
            // and reuse it for every head.
            #[allow(clippy::cast_precision_loss)]
            let distance = (i as f32 - j as f32).abs();
            data.extend(self.slopes.iter().map(|&slope| -slope * distance));
        }
    }
    Tensor::from_vec(vec![seq_len, seq_len, self.num_heads], data)
}
/// Number of attention heads this layer was constructed with.
#[must_use]
pub fn num_heads(&self) -> usize {
    self.num_heads
}
/// Borrow the per-head slopes, indexed by head.
#[must_use]
pub fn slopes(&self) -> &[f32] {
    self.slopes.as_slice()
}
}
// ============================================================================
// PMAT-858: ALiBi slope exponent falsifier
//
// Bug: compute_slopes used exponent = -h * (8/n), giving slope[h] = 2^(-8h/n).
// For h=0 that is 2^0 = 1.0 — every head carried an extra factor of 1/m0 = 2^(8/n)
// vs the reference. The correct ALiBi slope (Press et al. 2021 + llama.cpp ggml
// soft_max_ext: m0 = 2^(-8/n), slope = m0^(h+1)) is slope[h] = 2^(-8(h+1)/n).
//
// RED (buggy code): slopes()[0] == 1.0
// GREEN (fixed code): slopes()[0] == 0.5 for n=8, slopes()[7] == 2^(-8).
// ============================================================================
#[cfg(test)]
mod pmat_858_alibi_slope_falsifier {
use super::ALiBi;
/// Closed-form reference slope for head `h` out of `num_heads` heads,
/// per Press et al. 2021 / llama.cpp ggml:
/// m0 = 2^(-8/n), slope[h] = m0^(h+1) = 2^(-8(h+1)/n).
///
/// This is the power-of-two formula only; it does not model the extra
/// heads of a non-power-of-two layer.
fn reference_slope(h: usize, num_heads: usize) -> f32 {
    #[allow(clippy::cast_precision_loss)]
    let rank = (h + 1) as f32;
    #[allow(clippy::cast_precision_loss)]
    let total = num_heads as f32;
    // Same evaluation order as `-8.0 * rank / total`: multiply, then divide.
    let exponent = (-8.0 * rank) / total;
    f32::powf(2.0, exponent)
}
#[test]
fn falsifier_alibi_slopes_power_of_two_match_reference() {
    // Eight heads: the reference slopes are 2^(-1), 2^(-2), ..., 2^(-8).
    let layer = ALiBi::new(8).expect("8 heads is valid");
    let slopes = layer.slopes();
    assert_eq!(slopes.len(), 8);

    // Headline check: the first head is 0.5; the pre-fix code produced 1.0.
    let first = slopes[0];
    assert!(
        (first - 0.5).abs() < 1e-7,
        "PMAT-858: slopes[0] must be 0.5 (2^(-8/8)), got {} (buggy code yields 1.0)",
        first
    );

    // The last head is 2^(-8) = 0.003_906_25; the pre-fix code produced
    // 2^(-7) = 0.007_812_5.
    let last = slopes[7];
    assert!(
        (last - 0.003_906_25).abs() < 1e-7,
        "PMAT-858: slopes[7] must be 2^(-8) = 0.00390625, got {}",
        last
    );

    // Finally, compare every head against the closed-form reference.
    for (h, s) in slopes.iter().copied().enumerate() {
        let want = reference_slope(h, 8);
        assert!(
            (s - want).abs() < 1e-7,
            "PMAT-858: slopes[{h}] = {s}, reference 2^(-8(h+1)/8) = {want}"
        );
    }
}
#[test]
fn falsifier_alibi_slopes_single_head() {
    // One head: the only slope is 2^(-8), not 2^0 = 1.0.
    let layer = ALiBi::new(1).expect("1 head is valid");
    let slopes = layer.slopes();
    assert_eq!(slopes.len(), 1);

    let only = slopes[0];
    let error = (only - 0.003_906_25).abs();
    assert!(
        error < 1e-7,
        "PMAT-858: single-head slope must be 2^(-8) = 0.00390625, got {}",
        only
    );
}
#[test]
fn falsifier_alibi_slopes_non_power_of_two_match_paper() {
    // Twelve heads is not a power of two, so this covers the extra-heads
    // branch. In the ALiBi paper, get_slopes(12) is
    // get_slopes_power_of_2(8) followed by the even-indexed entries of
    // get_slopes(16):
    // [2^-1, 2^-2, .., 2^-8, 2^-0.5, 2^-1.5, 2^-2.5, 2^-3.5]
    let layer = ALiBi::new(12).expect("12 heads is valid");
    let slopes = layer.slopes();
    assert_eq!(slopes.len(), 12);

    let expected: [f32; 12] = [
        // First eight heads (closest_power_of_2 = 8): 2^(-8(h+1)/8)
        0.5,
        0.25,
        0.125,
        0.062_5,
        0.031_25,
        0.015_625,
        0.007_812_5,
        0.003_906_25,
        // Four extra heads: 2^(-4(2i+1)/8) = 2^(-(2i+1)/2)
        2_f32.powf(-0.5),
        2_f32.powf(-1.5),
        2_f32.powf(-2.5),
        2_f32.powf(-3.5),
    ];
    let pairs = slopes.iter().copied().zip(expected.iter().copied());
    for (h, (got, want)) in pairs.enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "PMAT-858: slopes[{h}] = {got}, paper get_slopes(12)[{h}] = {want}"
        );
    }

    // Re-check head 0 with the tighter tolerance: it must be 0.5, not 1.0.
    let head_zero = slopes[0];
    assert!(
        (head_zero - 0.5).abs() < 1e-7,
        "PMAT-858: non-power-of-2 head 0 must be 0.5, got {}",
        head_zero
    );
}
#[test]
fn falsifier_alibi_slopes_strictly_below_one() {
    // Every slope must lie strictly between 0 and 1; the pre-fix code gave
    // head 0 a slope of exactly 1.0.
    const HEAD_COUNTS: [usize; 7] = [1, 2, 4, 8, 12, 16, 32];
    for n in HEAD_COUNTS {
        let layer = ALiBi::new(n).expect("valid head count");
        for (h, s) in layer.slopes().iter().copied().enumerate() {
            assert!(s > 0.0, "PMAT-858: slope[{h}] (n={n}) must be > 0, got {s}");
            assert!(
                s < 1.0,
                "PMAT-858: slope[{h}] (n={n}) must be < 1.0 (buggy head 0 = 1.0), got {s}"
            );
        }
    }
}
}