aprender-gpu 0.31.2

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
//! CPU-side NF4 quantization and dequantization (Dettmers et al. 2023).
//!
//! NF4 (4-bit NormalFloat) is a quantization scheme optimized for normally-distributed
//! weights. Each value maps to one of 16 codebook entries derived from normal distribution
//! quantiles, achieving near-optimal information-theoretic compression for Gaussian weights.
//!
//! # Block Layout (36 bytes for 64 values)
//!
//! ```text
//! ┌───────────────────────────────────────────────────┐
//! │ Offset 0-3:   scale (f32, absmax normalization)   │
//! │ Offset 4-35:  data  (32 bytes, 64 × 4-bit packed) │
//! └───────────────────────────────────────────────────┘
//! ```
//!
//! # Contracts
//!
//! - C-NF4-001: Codebook fidelity — round-trip normalized error < 0.16 per element
//! - C-NF4-002: Block alignment — K divisible by 64, packed size = (K/64) × 36 × N
//! - C-NF4-004: Compression ratio — ≥7.1x vs fp32

/// NF4 block size: 64 values per quantization block.
///
/// Each block shares one f32 absmax scale, so the block size trades
/// per-value accuracy against scale-storage overhead.
pub const NF4_BLOCK_SIZE: usize = 64;

/// Bytes per NF4 block: 4 (f32 scale) + 32 (packed nibbles) = 36.
///
/// This is the stride of the GPU-ready layout produced by `pack_nf4_for_gpu`.
pub const NF4_BLOCK_BYTES: usize = 36;

/// NF4 codebook: 16 values derived from normal distribution quantiles.
///
/// These are the optimal reconstruction points for a standard normal distribution
/// quantized to 4 bits (Dettmers et al., "QLoRA", NeurIPS 2023).
///
/// Properties this module relies on:
/// - sorted strictly ascending, spanning exactly [-1.0, 1.0];
/// - entry 7 is exactly 0.0, so all-zero blocks round-trip to zero;
/// - entries are denser near zero, where normally-distributed weights cluster.
#[allow(clippy::excessive_precision, clippy::unreadable_literal)]
pub const NF4_LUT: [f32; 16] = [
    -1.0,
    -0.6961928009986877,
    -0.5250730514526367,
    -0.39491748809814453,
    -0.28444138169288635,
    -0.18477343022823334,
    -0.09105003625154495,
    0.0,
    0.07958029955625534,
    0.16093020141124725,
    0.24611230194568634,
    0.33791524171829224,
    0.44070982933044434,
    0.5626170039176941,
    0.7229568362236023,
    1.0,
];

/// NF4-quantized weight matrix.
///
/// Stores per-block f32 scale factors and packed 4-bit codebook indices.
/// Two values are packed per byte (low nibble first).
///
/// Invariants maintained by `quantize_nf4` / `unpack_nf4_from_gpu`:
/// `scales.len() == rows * cols / 64` and `data.len() == rows * cols / 2`
/// (i.e. 32 data bytes per scale entry).
#[derive(Debug, Clone)]
pub struct Nf4Quantized {
    /// Per-block scale factors (absmax of each 64-element block).
    pub scales: Vec<f32>,
    /// Packed 4-bit indices into [`NF4_LUT`] (2 values per byte, low nibble first).
    pub data: Vec<u8>,
    /// Original matrix shape (rows, cols). rows × cols must be divisible by 64.
    pub shape: (usize, usize),
}

impl Nf4Quantized {
    /// Number of 64-value quantization blocks (one scale per block).
    #[must_use]
    pub fn num_blocks(&self) -> usize {
        self.scales.len()
    }

    /// Element count of the original matrix (rows × cols).
    #[must_use]
    pub fn num_values(&self) -> usize {
        let (rows, cols) = self.shape;
        rows * cols
    }

    /// Size of the packed-nibble payload in bytes (scales not included).
    #[must_use]
    pub fn data_bytes(&self) -> usize {
        self.data.len()
    }

    /// Combined footprint in bytes: f32 scales plus packed nibble data.
    #[must_use]
    pub fn total_bytes(&self) -> usize {
        self.data.len() + self.scales.len() * std::mem::size_of::<f32>()
    }
}

/// Return the index of the NF4 codebook entry closest to `normalized`.
///
/// A plain linear scan: the table has only 16 entries, so a binary search
/// over the sorted codebook would buy nothing.
fn nearest_nf4_index(normalized: f32) -> u8 {
    // Track (distance, index) of the closest entry seen so far.
    let mut winner = (f32::MAX, 0u8);
    for (i, &entry) in NF4_LUT.iter().enumerate() {
        let dist = (normalized - entry).abs();
        if dist < winner.0 {
            winner = (dist, i as u8);
        }
    }
    winner.1
}

/// Quantize an f32 slice to NF4 format.
///
/// # Contract: C-NF4-002 (Block Alignment)
///
/// - **Precondition**: `values.len()` must be divisible by [`NF4_BLOCK_SIZE`] (64)
///   and equal `rows * cols`.
/// - **Postcondition**: `result.data.len() == values.len() / 2`,
///   `result.scales.len() == values.len() / 64`.
///
/// # Panics
///
/// Panics if `values.len()` is not divisible by 64.
#[must_use]
pub fn quantize_nf4(values: &[f32], rows: usize, cols: usize) -> Nf4Quantized {
    let n = values.len();
    assert!(
        n % NF4_BLOCK_SIZE == 0,
        "C-NF4-002: value count {n} not divisible by NF4 block size {NF4_BLOCK_SIZE}"
    );
    assert_eq!(
        rows * cols,
        n,
        "C-NF4-002: shape ({rows}, {cols}) does not match value count {n}"
    );

    let mut scales = Vec::with_capacity(n / NF4_BLOCK_SIZE);
    let mut data = Vec::with_capacity(n / 2);

    for block in values.chunks_exact(NF4_BLOCK_SIZE) {
        // Per-block absmax scale; zero means the whole block is zero.
        let absmax = block.iter().map(|v| v.abs()).fold(0.0f32, f32::max);
        scales.push(absmax);

        // Normalize into [-1, 1] and snap each value to its nearest codebook
        // entry. An all-zero block maps every value to index 7 (the 0.0 entry).
        let inv_scale = if absmax > 0.0 { 1.0 / absmax } else { 0.0 };

        for pair in block.chunks_exact(2) {
            let lo = nearest_nf4_index(pair[0] * inv_scale);
            let hi = nearest_nf4_index(pair[1] * inv_scale);
            data.push(lo | (hi << 4));
        }
    }

    Nf4Quantized {
        scales,
        data,
        shape: (rows, cols),
    }
}

/// Dequantize NF4 back to f32.
///
/// # Contract: C-NF4-001 (Codebook Fidelity)
///
/// - **Postcondition**: Max normalized error < 0.16 per element
///   (i.e., `|dequant(quant(x)) / absmax - x / absmax| < 0.16`).
#[must_use]
pub fn dequantize_nf4(q: &Nf4Quantized) -> Vec<f32> {
    let bytes_per_block = NF4_BLOCK_SIZE / 2;
    let mut out = Vec::with_capacity(q.num_values());

    for (block_idx, &scale) in q.scales.iter().enumerate() {
        let base = block_idx * bytes_per_block;

        for offset in 0..bytes_per_block {
            let byte = q.data[base + offset];
            // Low nibble first, matching the packing order in `quantize_nf4`.
            let lo = usize::from(byte & 0x0F);
            let hi = usize::from(byte >> 4);
            out.push(NF4_LUT[lo] * scale);
            out.push(NF4_LUT[hi] * scale);
        }
    }

    out
}

/// Pack NF4 quantized data into contiguous GPU-ready buffer.
///
/// Returns a flat buffer suitable for GPU upload with layout:
/// `[scale_0: f32][data_0: u8 × 32][scale_1: f32][data_1: u8 × 32]...`
///
/// Each block is exactly [`NF4_BLOCK_BYTES`] (36) bytes.
#[must_use]
pub fn pack_nf4_for_gpu(q: &Nf4Quantized) -> Vec<u8> {
    let half_block = NF4_BLOCK_SIZE / 2;
    let mut buf = Vec::with_capacity(q.num_blocks() * NF4_BLOCK_BYTES);

    for (block_idx, scale) in q.scales.iter().enumerate() {
        // Little-endian f32 scale first, then the block's 32 packed bytes.
        buf.extend_from_slice(&scale.to_le_bytes());
        let start = block_idx * half_block;
        buf.extend_from_slice(&q.data[start..start + half_block]);
    }

    buf
}

/// Unpack GPU buffer back to [`Nf4Quantized`] struct.
///
/// Inverse of [`pack_nf4_for_gpu`].
///
/// # Contract: C-NF4-002 (Block Alignment)
///
/// - **Precondition**: `rows * cols` must be divisible by [`NF4_BLOCK_SIZE`] (64)
///   and `packed.len()` must be at least `(rows * cols / 64) * NF4_BLOCK_BYTES`.
///
/// # Panics
///
/// Panics with a C-NF4-002 message if the shape is not block-aligned or if
/// `packed` is shorter than the layout requires.
#[must_use]
pub fn unpack_nf4_from_gpu(packed: &[u8], rows: usize, cols: usize) -> Nf4Quantized {
    let n = rows * cols;
    // Previously a misaligned shape silently floor-truncated the block count;
    // enforce the same alignment contract as `quantize_nf4`.
    assert!(
        n % NF4_BLOCK_SIZE == 0,
        "C-NF4-002: value count {n} not divisible by NF4 block size {NF4_BLOCK_SIZE}"
    );
    let num_blocks = n / NF4_BLOCK_SIZE;
    // Check the buffer length up front so callers get the contract message
    // rather than a generic slice-index panic from the loop below.
    assert!(
        packed.len() >= num_blocks * NF4_BLOCK_BYTES,
        "C-NF4-002: packed buffer too short: {} bytes, need {}",
        packed.len(),
        num_blocks * NF4_BLOCK_BYTES
    );

    let mut scales = Vec::with_capacity(num_blocks);
    let mut data = Vec::with_capacity(n / 2);

    for block_idx in 0..num_blocks {
        let offset = block_idx * NF4_BLOCK_BYTES;

        // Scale is stored first in each 36-byte block (f32, little-endian).
        let scale_bytes: [u8; 4] = packed[offset..offset + 4]
            .try_into()
            .expect("slice of length 4 converts to [u8; 4]");
        scales.push(f32::from_le_bytes(scale_bytes));

        // The remaining 32 bytes are the packed 4-bit codebook indices.
        data.extend_from_slice(&packed[offset + 4..offset + NF4_BLOCK_BYTES]);
    }

    Nf4Quantized {
        scales,
        data,
        shape: (rows, cols),
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// C-NF4-001: Round-trip fidelity — max normalized error < 0.16 for arbitrary weights.
    ///
    /// The NF4 codebook has max gap of 0.304 between entries 0 (-1.0) and 1 (-0.696),
    /// yielding worst-case quantization error of ~0.152 (half-gap). This is by design:
    /// NF4 concentrates precision near zero where normally-distributed weights cluster.
    /// Average error for Gaussian weights is ~0.02.
    #[test]
    fn test_c_nf4_001_codebook_fidelity() {
        // Generate pseudo-normal samples using Box-Muller (deterministic seed)
        let n = 1024; // 16 blocks of 64
        let mut values = Vec::with_capacity(n);
        for i in 0..n {
            // Simple PRNG to approximate normal distribution
            // NOTE(review): `i * 6364136223` assumes a 64-bit usize; on a
            // 32-bit target this would overflow (panic in debug builds) —
            // TODO confirm supported targets.
            let u1 = ((i * 1103515245 + 12345) % 65536) as f32 / 65536.0;
            let u2 = ((i * 6364136223 + 1442695) % 65536) as f32 / 65536.0;
            // Clamp away from 0 so ln() stays finite in the Box-Muller transform.
            let u1_clamped = u1.clamp(1e-6, 1.0 - 1e-6);
            let z = (-2.0 * u1_clamped.ln()).sqrt() * (2.0 * std::f32::consts::PI * u2).cos();
            values.push(z);
        }

        let q = quantize_nf4(&values, 1, n);
        let deq = dequantize_nf4(&q);

        assert_eq!(deq.len(), n);

        // Check per-block normalized error
        // Threshold 0.16 = half of max codebook gap (0.304 between entries 0 and 1)
        let mut max_err = 0.0f32;
        for block_idx in 0..q.num_blocks() {
            let start = block_idx * NF4_BLOCK_SIZE;
            let absmax = q.scales[block_idx];
            // An all-zero block has no meaningful normalized error; skip it.
            if absmax == 0.0 {
                continue;
            }
            for i in 0..NF4_BLOCK_SIZE {
                let orig_norm = values[start + i] / absmax;
                let deq_norm = deq[start + i] / absmax;
                let err = (orig_norm - deq_norm).abs();
                max_err = max_err.max(err);
                assert!(
                    err < 0.16,
                    "C-NF4-001 violated: block {block_idx} element {i}: \
                     orig_norm={orig_norm:.4}, deq_norm={deq_norm:.4}, error={err:.4}"
                );
            }
        }

        // Verify error is reasonable (should be well below threshold for most values)
        // A zero max error would suggest the PRNG produced degenerate input.
        assert!(
            max_err > 0.0,
            "Max error should be non-zero for random data"
        );
    }

    /// C-NF4-002: Block alignment — sizes match expected layout.
    #[test]
    fn test_c_nf4_002_block_alignment() {
        // 896 × 896 matches a typical small-LLM hidden dimension and is
        // divisible by the 64-value block size.
        let rows = 896;
        let cols = 896;
        let n = rows * cols;
        let values = vec![0.1f32; n];

        let q = quantize_nf4(&values, rows, cols);

        let expected_blocks = n / NF4_BLOCK_SIZE;
        assert_eq!(q.num_blocks(), expected_blocks);
        assert_eq!(q.data.len(), n / 2);
        assert_eq!(q.total_bytes(), expected_blocks * 4 + n / 2);

        // GPU packed buffer must be exactly num_blocks * 36
        let packed = pack_nf4_for_gpu(&q);
        assert_eq!(packed.len(), expected_blocks * NF4_BLOCK_BYTES);
    }

    /// C-NF4-004: Compression ratio ≥ 7.1x vs fp32.
    ///
    /// Expected ratio: fp32 is 4 bytes/value; NF4 is 9/16 byte/value
    /// (0.5 data + 1/16 scale overhead), giving 4 / (9/16) ≈ 7.11x.
    #[test]
    fn test_c_nf4_004_compression_ratio() {
        let rows = 896;
        let cols = 896;
        let n = rows * cols;
        let values = vec![0.5f32; n];

        let q = quantize_nf4(&values, rows, cols);

        let fp32_bytes = n * 4;
        let nf4_bytes = q.total_bytes();
        let ratio = fp32_bytes as f64 / nf4_bytes as f64;

        assert!(
            ratio >= 7.1,
            "C-NF4-004 violated: compression ratio {ratio:.2}x < 7.1x \
             (fp32={fp32_bytes}, nf4={nf4_bytes})"
        );
    }

    /// Verify codebook has exactly 16 entries spanning [-1, 1].
    #[test]
    fn test_nf4_codebook_properties() {
        assert_eq!(NF4_LUT.len(), 16);
        assert_eq!(NF4_LUT[0], -1.0);
        assert_eq!(NF4_LUT[15], 1.0);

        // Monotonically increasing
        for i in 1..16 {
            assert!(
                NF4_LUT[i] > NF4_LUT[i - 1],
                "NF4_LUT not monotonic at index {i}"
            );
        }
    }

    /// Verify zero values quantize/dequantize correctly.
    ///
    /// A zero block gets scale 0, so every reconstructed value is exactly 0.0.
    #[test]
    fn test_nf4_zero_block() {
        let values = vec![0.0f32; 64];
        let q = quantize_nf4(&values, 1, 64);
        let deq = dequantize_nf4(&q);

        for (i, &v) in deq.iter().enumerate() {
            assert_eq!(v, 0.0, "zero block element {i} = {v}");
        }
    }

    /// Verify GPU pack/unpack round-trips correctly.
    #[test]
    fn test_nf4_gpu_pack_roundtrip() {
        // 256 values spanning [-1, 1) in even steps — 4 blocks of 64.
        let n = 256;
        let values: Vec<f32> = (0..n).map(|i| (i as f32 - 128.0) / 128.0).collect();

        let q = quantize_nf4(&values, 4, 64);
        let packed = pack_nf4_for_gpu(&q);
        let unpacked = unpack_nf4_from_gpu(&packed, 4, 64);

        assert_eq!(unpacked.scales, q.scales);
        assert_eq!(unpacked.data, q.data);
        assert_eq!(unpacked.shape, q.shape);
    }

    /// Verify nearest_nf4_index maps boundary values correctly.
    #[test]
    fn test_nearest_nf4_index_boundaries() {
        // -1.0 should map to index 0
        assert_eq!(nearest_nf4_index(-1.0), 0);
        // 1.0 should map to index 15
        assert_eq!(nearest_nf4_index(1.0), 15);
        // 0.0 should map to index 7
        assert_eq!(nearest_nf4_index(0.0), 7);
    }

    /// Misaligned input must panic with the C-NF4-002 contract message.
    #[test]
    #[should_panic(expected = "C-NF4-002")]
    fn test_nf4_rejects_misaligned_input() {
        let values = vec![0.0f32; 63]; // Not divisible by 64
        let _ = quantize_nf4(&values, 1, 63);
    }
}
}