realizar 0.8.4

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
//! Parallel dequantization functions (PMAT-802)
//!
//! Implements parallel dequantization using rayon for multi-core acceleration:
//! - `dequantize_q4_k_parallel` - Parallel Q4_K dequantization
//! - `dequantize_q4_k_simd` - SIMD-accelerated Q4_K dequantization
//! - `dequantize_q8_0_parallel` - Parallel Q8_0 dequantization
//! - `dequantize_q8_0_simd` - SIMD-accelerated Q8_0 dequantization

use super::dequant::{f16_to_f32, read_f16};
use super::simd::extract_scale_min;
use super::types::QK_K;
use crate::error::{RealizarError, Result};

// ============================================================================

/// Parallel Q4_K dequantization using rayon
///
/// Processes super-blocks in parallel for faster model loading.
/// Each super-block (256 values) is dequantized independently.
///
/// # Arguments
///
/// * `data` - Raw Q4_K quantized data (144 bytes per super-block)
///
/// # Returns
///
/// Dequantized f32 values
///
/// # Errors
///
/// Returns error if data length is not a multiple of super-block size
///
/// # Performance
///
/// On a 16-core system, achieves ~10x speedup over serial dequantization
/// for large models (1B+ parameters).
pub fn dequantize_q4_k_parallel(data: &[u8]) -> Result<Vec<f32>> {
    use rayon::prelude::*;

    const SUPER_BLOCK_BYTES: usize = 144;

    if !data.len().is_multiple_of(SUPER_BLOCK_BYTES) {
        return Err(RealizarError::InvalidShape {
            reason: format!(
                "Q4_K data length {} is not a multiple of super-block size {}",
                data.len(),
                SUPER_BLOCK_BYTES
            ),
        });
    }

    let num_super_blocks = data.len() / SUPER_BLOCK_BYTES;

    // Process super-blocks in parallel
    let result: Vec<f32> = (0..num_super_blocks)
        .into_par_iter()
        .flat_map(|sb_idx| {
            let sb_start = sb_idx * SUPER_BLOCK_BYTES;
            let sb_data = &data[sb_start..sb_start + SUPER_BLOCK_BYTES];
            dequantize_q4_k_superblock(sb_data)
        })
        .collect();

    Ok(result)
}

/// Dequantize a single Q4_K super-block (256 values)
///
/// Internal helper for parallel processing.
///
/// Exposed as `pub(crate)` for direct testing.
#[inline]
pub(crate) fn dequantize_q4_k_superblock(sb_data: &[u8]) -> Vec<f32> {
    // CB-506: Real bounds check (not debug-only) prevents OOB panic on corrupted data
    if sb_data.len() < 144 {
        return vec![0.0f32; QK_K];
    }
    let mut result = vec![0.0f32; QK_K];

    // Read d (f16 -> f32) — bounds guaranteed by check above
    let d = read_f16(&sb_data[0..2]);

    // Read dmin (f16 -> f32)
    let dmin = read_f16(&sb_data[2..4]);

    // Read scales (12 bytes)
    let mut scales = [0u8; 12];
    scales.copy_from_slice(&sb_data[4..16]);

    // Read qs (128 bytes)
    let qs = &sb_data[16..144];

    // Dequantize following candle's layout:
    // For each 64-value chunk, output 32 low nibbles then 32 high nibbles
    let mut ys_index = 0;

    for j in (0..QK_K).step_by(64) {
        let q = &qs[j / 2..j / 2 + 32];

        // Get scales for the two 32-value halves
        let is = j / 32;
        let (sc1, m1) = extract_scale_min(&scales, is);
        let d1 = d * sc1;
        let dm1 = dmin * m1;

        let (sc2, m2) = extract_scale_min(&scales, is + 1);
        let d2 = d * sc2;
        let dm2 = dmin * m2;

        // First pass: 32 low nibbles
        for &byte in q {
            result[ys_index] = d1 * (byte & 0xF) as f32 - dm1;
            ys_index += 1;
        }

        // Second pass: 32 high nibbles
        for &byte in q {
            result[ys_index] = d2 * (byte >> 4) as f32 - dm2;
            ys_index += 1;
        }
    }

    result
}

/// SIMD-accelerated Q4_K dequantization with runtime feature detection
///
/// Uses AVX2 when available, falls back to parallel scalar otherwise.
///
/// # Arguments
///
/// * `data` - Raw Q4_K quantized data (144 bytes per super-block)
///
/// # Returns
///
/// Dequantized f32 values
///
/// # Errors
///
/// Returns error if data length is not a multiple of super-block size
///
/// # Performance
///
/// AVX2: ~4x speedup over scalar per super-block
/// Combined with rayon: ~40x speedup on 16-core system
pub fn dequantize_q4_k_simd(data: &[u8]) -> Result<Vec<f32>> {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 verified at runtime
            return unsafe { dequantize_q4_k_avx2_parallel(data) };
        }
    }

    // Fallback to parallel scalar
    dequantize_q4_k_parallel(data)
}

/// AVX2-accelerated parallel Q4_K dequantization
///
/// # Safety
///
/// Caller must ensure AVX2 is available (use runtime feature detection)
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
// SAFETY: Caller must satisfy the documented preconditions
unsafe fn dequantize_q4_k_avx2_parallel(data: &[u8]) -> Result<Vec<f32>> {
    use rayon::prelude::*;

    const SUPER_BLOCK_BYTES: usize = 144;
    // Process 64 super-blocks per parallel task to reduce scheduling overhead
    const CHUNK_SIZE: usize = 64;
    const CHUNK_BYTES: usize = SUPER_BLOCK_BYTES * CHUNK_SIZE;

    if !data.len().is_multiple_of(SUPER_BLOCK_BYTES) {
        return Err(RealizarError::InvalidShape {
            reason: format!(
                "Q4_K data length {} is not a multiple of super-block size {}",
                data.len(),
                SUPER_BLOCK_BYTES
            ),
        });
    }

    let num_super_blocks = data.len() / SUPER_BLOCK_BYTES;

    // For small data, skip parallelism overhead
    if num_super_blocks < CHUNK_SIZE * 2 {
        let mut result = Vec::with_capacity(num_super_blocks * QK_K);
        for sb_idx in 0..num_super_blocks {
            let sb_start = sb_idx * SUPER_BLOCK_BYTES;
            let sb_data = &data[sb_start..sb_start + SUPER_BLOCK_BYTES];
            // SAFETY: AVX2 availability verified by caller
            result.extend(unsafe { dequantize_q4_k_superblock_avx2(sb_data) });
        }
        return Ok(result);
    }

    // Process chunks of super-blocks in parallel
    let result: Vec<f32> = data
        .par_chunks(CHUNK_BYTES)
        .flat_map(|chunk| {
            let mut chunk_result = Vec::with_capacity(chunk.len() / SUPER_BLOCK_BYTES * QK_K);
            for sb_data in chunk.chunks_exact(SUPER_BLOCK_BYTES) {
                // SAFETY: AVX2 availability verified by caller
                chunk_result.extend(unsafe { dequantize_q4_k_superblock_avx2(sb_data) });
            }
            chunk_result
        })
        .collect();

    Ok(result)
}

/// AVX2 SIMD dequantization for a single Q4_K super-block
///
/// Uses 256-bit SIMD to process 8 values simultaneously.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
// SAFETY: Caller must satisfy the documented preconditions
unsafe fn dequantize_q4_k_superblock_avx2(sb_data: &[u8]) -> Vec<f32> {
    #[allow(clippy::wildcard_imports)]
    use std::arch::x86_64::*;

    // CB-506: Real bounds check (not debug-only) prevents OOB panic on corrupted data
    if sb_data.len() < 144 {
        return vec![0.0f32; QK_K];
    }
    let mut result = vec![0.0f32; QK_K];

    // Read d and dmin — bounds guaranteed by check above
    let d = read_f16(&sb_data[0..2]);
    let dmin = read_f16(&sb_data[2..4]);

    // SAFETY: AVX2 availability verified by caller's target_feature
    unsafe {
        // Read scales
        let mut scales = [0u8; 12];
        scales.copy_from_slice(&sb_data[4..16]);

        let qs = &sb_data[16..144];

        // Dequantize following candle's layout:
        // For each 64-value chunk, output 32 low nibbles then 32 high nibbles
        let mut ys_index = 0;

        for j in (0..QK_K).step_by(64) {
            let q = &qs[j / 2..j / 2 + 32];

            // Get scales for the two 32-value halves
            let is = j / 32;
            let (sc1, m1) = extract_scale_min(&scales, is);
            let d1 = d * sc1;
            let dm1 = dmin * m1;
            let d1_vec = _mm256_set1_ps(d1);
            let dm1_vec = _mm256_set1_ps(dm1);

            let (sc2, m2) = extract_scale_min(&scales, is + 1);
            let d2 = d * sc2;
            let dm2 = dmin * m2;
            let d2_vec = _mm256_set1_ps(d2);
            let dm2_vec = _mm256_set1_ps(dm2);

            // First pass: 32 low nibbles in 4 iterations of 8
            for chunk in 0..4 {
                let byte_start = chunk * 8;

                // Extract 8 low nibbles from 8 bytes
                let q0 = (q[byte_start] & 0x0F) as i32;
                let q1 = (q[byte_start + 1] & 0x0F) as i32;
                let q2 = (q[byte_start + 2] & 0x0F) as i32;
                let q3 = (q[byte_start + 3] & 0x0F) as i32;
                let q4 = (q[byte_start + 4] & 0x0F) as i32;
                let q5 = (q[byte_start + 5] & 0x0F) as i32;
                let q6 = (q[byte_start + 6] & 0x0F) as i32;
                let q7 = (q[byte_start + 7] & 0x0F) as i32;

                let q_vec = _mm256_setr_epi32(q0, q1, q2, q3, q4, q5, q6, q7);
                let q_f32 = _mm256_cvtepi32_ps(q_vec);
                let dequant = _mm256_fmsub_ps(d1_vec, q_f32, dm1_vec);

                _mm256_storeu_ps(result.as_mut_ptr().add(ys_index), dequant);
                ys_index += 8;
            }

            // Second pass: 32 high nibbles in 4 iterations of 8
            for chunk in 0..4 {
                let byte_start = chunk * 8;

                // Extract 8 high nibbles from 8 bytes
                let q0 = (q[byte_start] >> 4) as i32;
                let q1 = (q[byte_start + 1] >> 4) as i32;
                let q2 = (q[byte_start + 2] >> 4) as i32;
                let q3 = (q[byte_start + 3] >> 4) as i32;
                let q4 = (q[byte_start + 4] >> 4) as i32;
                let q5 = (q[byte_start + 5] >> 4) as i32;
                let q6 = (q[byte_start + 6] >> 4) as i32;
                let q7 = (q[byte_start + 7] >> 4) as i32;

                let q_vec = _mm256_setr_epi32(q0, q1, q2, q3, q4, q5, q6, q7);
                let q_f32 = _mm256_cvtepi32_ps(q_vec);
                let dequant = _mm256_fmsub_ps(d2_vec, q_f32, dm2_vec);

                _mm256_storeu_ps(result.as_mut_ptr().add(ys_index), dequant);
                ys_index += 8;
            }
        }
    }

    result
}

/// Parallel Q8_0 dequantization using rayon
///
/// Q8_0 is simpler than Q4_K (no scale packing), making SIMD even more effective.
///
/// # Arguments
///
/// * `data` - Raw Q8_0 quantized data (36 bytes per block: 4 scale + 32 quants)
///
/// # Returns
///
/// Dequantized f32 values
///
/// # Errors
///
/// Returns error if data length is not a multiple of block size
pub fn dequantize_q8_0_parallel(data: &[u8]) -> Result<Vec<f32>> {
    use rayon::prelude::*;

    const BLOCK_BYTES: usize = 34; // 2 (f16 scale) + 32 (i8 quants)

    if !data.len().is_multiple_of(BLOCK_BYTES) {
        return Err(RealizarError::InvalidShape {
            reason: format!(
                "Q8_0 data length {} is not a multiple of block size {}",
                data.len(),
                BLOCK_BYTES
            ),
        });
    }

    let num_blocks = data.len() / BLOCK_BYTES;

    // Process blocks in parallel
    let result: Vec<f32> = (0..num_blocks)
        .into_par_iter()
        .flat_map(|block_idx| {
            let block_start = block_idx * BLOCK_BYTES;
            let block_data = &data[block_start..block_start + BLOCK_BYTES];
            dequantize_q8_0_block(block_data)
        })
        .collect();

    Ok(result)
}

/// Dequantize a single Q8_0 block (32 values)
///
/// Exposed as `pub(crate)` for direct testing.
#[inline]
pub(crate) fn dequantize_q8_0_block(block_data: &[u8]) -> Vec<f32> {
    // CB-506: Real bounds check (not debug-only) prevents OOB panic on corrupted data
    if block_data.len() < 34 {
        return vec![0.0f32; 32];
    }
    let mut result = Vec::with_capacity(32);

    // Read scale (f16 -> f32) — bounds guaranteed by check above
    let scale_bits = u16::from_le_bytes([block_data[0], block_data[1]]);
    let scale = f16_to_f32(scale_bits);

    // Dequantize 32 int8 values
    for &byte in &block_data[2..34] {
        let value = i8::from_le_bytes([byte]);
        result.push(scale * f32::from(value));
    }

    result
}

/// SIMD-accelerated Q8_0 dequantization
///
/// Uses AVX2 when available for 8x parallel i8→f32 conversion.
///
/// # Errors
///
/// Returns error if data length is not a multiple of block size (36 bytes)
pub fn dequantize_q8_0_simd(data: &[u8]) -> Result<Vec<f32>> {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 verified at runtime
            return unsafe { dequantize_q8_0_avx2_parallel(data) };
        }
    }

    // Fallback to parallel scalar
    dequantize_q8_0_parallel(data)
}

/// AVX2-accelerated parallel Q8_0 dequantization
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
// SAFETY: Caller must satisfy the documented preconditions
unsafe fn dequantize_q8_0_avx2_parallel(data: &[u8]) -> Result<Vec<f32>> {
    use rayon::prelude::*;

    const BLOCK_BYTES: usize = 34; // 2 (f16 scale) + 32 (i8 quants)

    if !data.len().is_multiple_of(BLOCK_BYTES) {
        return Err(RealizarError::InvalidShape {
            reason: format!(
                "Q8_0 data length {} is not a multiple of block size {}",
                data.len(),
                BLOCK_BYTES
            ),
        });
    }

    let num_blocks = data.len() / BLOCK_BYTES;

    let result: Vec<f32> = (0..num_blocks)
        .into_par_iter()
        .flat_map(|block_idx| {
            let block_start = block_idx * BLOCK_BYTES;
            let block_data = &data[block_start..block_start + BLOCK_BYTES];
            // SAFETY: AVX2 availability verified by caller
            unsafe { dequantize_q8_0_block_avx2(block_data) }
        })
        .collect();

    Ok(result)
}

include!("rope.rs");
include!("parallel_dequant_dequantize.rs");