
trueno/backends/q4k/colmajor.rs

//! Column-major Q4_K matrix-vector multiplication.
//!
//! This module implements column-major GEMV for GGML/GGUF format weights,
//! where weights are stored column-first for cache-efficient streaming.

use super::{parse_q4k_header, SUPER_BLOCK_BYTES, SUPER_BLOCK_SIZE};

/// Accumulate one Q4_K superblock into output (column-major layout).
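///
/// Each 144-byte super-block encodes 256 values: a 16-byte header (decoded by
/// `parse_q4k_header` into `d`, `dmin`, and per-sub-block scales/mins) followed
/// by 128 bytes of packed 4-bit quants. Each value dequantizes as
/// `d * scale * q - dmin * min` before being scaled by `x_j` and accumulated.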
#[inline]
fn accumulate_q4k_superblock_colmajor(
    sb_data: &[u8],
    x_j: f32,
    output: &mut [f32],
    output_offset: usize,
    ne0: usize,
) {
    let (d, dmin, scales, mins) = parse_q4k_header(sb_data);
    let qs = sb_data.get(16..144).expect("Q4_K: need ≥144 bytes for qs");

    for chunk in 0..4 {
        let chunk_start = chunk * 64;
        let q_start = chunk * 32;

        let d1 = d * f32::from(scales[chunk * 2]);
        let dm1 = dmin * f32::from(mins[chunk * 2]);
        let d2 = d * f32::from(scales[chunk * 2 + 1]);
        let dm2 = dmin * f32::from(mins[chunk * 2 + 1]);

        // Process low nibbles (first 32 values)
        for i in 0..32 {
            let output_idx = output_offset + chunk_start + i;
            if output_idx < ne0 {
                let dequant = d1 * (qs[q_start + i] & 0x0F) as f32 - dm1;
                output[output_idx] += x_j * dequant;
            }
        }

        // Process high nibbles (next 32 values)
        for i in 0..32 {
            let output_idx = output_offset + chunk_start + 32 + i;
            if output_idx < ne0 {
                let dequant = d2 * (qs[q_start + i] >> 4) as f32 - dm2;
                output[output_idx] += x_j * dequant;
            }
        }
    }
}

/// Fused Q4_K matrix-vector multiply for GGML column-major layout.
///
/// Computes: output = input @ Q4K_weight (GGML convention: y = x @ W),
/// where the weight is stored in Q4_K format with GGML column-major super-block organization.
///
/// # GGML Column-Major Layout
///
/// For a weight tensor with shape [ne0, ne1] in GGML notation:
/// - ne0 is the output dimension (rows)
/// - ne1 is the input/reduction dimension (columns)
/// - Elements are stored column-major: W[i,j] at offset i + j*ne0
/// - Each column j (length ne0) holds the weights connecting input[j] to all outputs
///
/// # Arguments
/// * `q4k_data` - Raw Q4_K bytes in GGML column-major layout [ne0, ne1]
/// * `input` - F32 input vector [ne1] (input/reduction dimension)
/// * `ne0` - Size of output dimension (rows in GGML, output size)
/// * `ne1` - Size of input/reduction dimension (columns in GGML, input size)
///
/// # Returns
/// F32 output vector [ne0]
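///
/// # Example
///
/// A minimal usage sketch for this legacy column-major path (illustrative only;
/// the buffer below is zero-filled placeholder data, so the result is all zeros):
///
/// ```ignore
/// let (ne0, ne1) = (256usize, 2usize);
/// // Each column occupies ceil(ne0 / 256) super-blocks of 144 bytes.
/// let q4k_data = vec![0u8; ne1 * 144];
/// let input = vec![1.0f32; ne1];
/// #[allow(deprecated)]
/// let output = matmul_q4k_f32_colmajor(&q4k_data, &input, ne0, ne1);
/// assert_eq!(output.len(), ne0);
/// ```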
#[deprecated(
    since = "0.15.0",
    note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
)]
pub fn matmul_q4k_f32_colmajor(
    q4k_data: &[u8],
    input: &[f32],
    ne0: usize, // output dimension (rows)
    ne1: usize, // input/reduction dimension (columns)
) -> Vec<f32> {
    assert_eq!(input.len(), ne1, "Input length must match ne1 (input dimension)");

    // Each column is padded to a whole number of 144-byte super-blocks.
    let blocks_per_col = (ne0 + SUPER_BLOCK_SIZE - 1) / SUPER_BLOCK_SIZE;
    let col_bytes = blocks_per_col * SUPER_BLOCK_BYTES;

    let mut output = vec![0.0f32; ne0];

    for col_idx in 0..ne1 {
        let col_start = col_idx * col_bytes;
        let x_j = input[col_idx];

        // Zero inputs contribute nothing; skip the whole column.
        if x_j == 0.0 {
            continue;
        }

        for sb_idx in 0..blocks_per_col {
            let sb_start = col_start + sb_idx * SUPER_BLOCK_BYTES;
            // Stop if the buffer is shorter than the expected column length.
            if sb_start + SUPER_BLOCK_BYTES > q4k_data.len() {
                break;
            }
            let sb_data = &q4k_data[sb_start..sb_start + SUPER_BLOCK_BYTES];
            let output_offset = sb_idx * SUPER_BLOCK_SIZE;
            accumulate_q4k_superblock_colmajor(sb_data, x_j, &mut output, output_offset, ne0);
        }
    }

    output
}

/// Runtime dispatch for column-major Q4_K matmul.
///
/// Currently falls through to the scalar implementation for correctness;
/// it matches the GGUF tensor layout without requiring transposition.
#[deprecated(
    since = "0.15.0",
    note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
)]
#[inline]
pub fn matmul_q4k_f32_colmajor_dispatch(
    q4k_data: &[u8],
    input: &[f32],
    ne0: usize,
    ne1: usize,
) -> Vec<f32> {
    #[allow(deprecated)]
    matmul_q4k_f32_colmajor(q4k_data, input, ne0, ne1)
}
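
// A minimal sanity-check sketch: with zero-filled super-blocks, `d` and `dmin`
// decode to 0.0, so every dequantized weight is 0 and the GEMV output stays
// zero regardless of the input.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[allow(deprecated)]
    fn zeroed_superblocks_yield_zero_output() {
        let (ne0, ne1) = (SUPER_BLOCK_SIZE, 3);
        // One zero-filled super-block per column.
        let q4k_data = vec![0u8; ne1 * SUPER_BLOCK_BYTES];
        let input = vec![1.0f32; ne1];
        let output = matmul_q4k_f32_colmajor(&q4k_data, &input, ne0, ne1);
        assert_eq!(output.len(), ne0);
        assert!(output.iter().all(|&v| v == 0.0));
    }
}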