// trueno/backends/q4k/colmajor.rs
use super::{parse_q4k_header, SUPER_BLOCK_BYTES, SUPER_BLOCK_SIZE};

8#[inline]
10fn accumulate_q4k_superblock_colmajor(
11 sb_data: &[u8],
12 x_j: f32,
13 output: &mut [f32],
14 output_offset: usize,
15 ne0: usize,
16) {
17 let (d, dmin, scales, mins) = parse_q4k_header(sb_data);
18 let qs = sb_data.get(16..144).expect("Q4_K: need ≥144 bytes for qs");
19
20 for chunk in 0..4 {
21 let chunk_start = chunk * 64;
22 let q_start = chunk * 32;
23
24 let d1 = d * f32::from(scales[chunk * 2]);
25 let dm1 = dmin * f32::from(mins[chunk * 2]);
26 let d2 = d * f32::from(scales[chunk * 2 + 1]);
27 let dm2 = dmin * f32::from(mins[chunk * 2 + 1]);
28
29 for i in 0..32 {
31 let output_idx = output_offset + chunk_start + i;
32 if output_idx < ne0 {
33 let dequant = d1 * (qs[q_start + i] & 0x0F) as f32 - dm1;
34 output[output_idx] += x_j * dequant;
35 }
36 }
37
38 for i in 0..32 {
40 let output_idx = output_offset + chunk_start + 32 + i;
41 if output_idx < ne0 {
42 let dequant = d2 * (qs[q_start + i] >> 4) as f32 - dm2;
43 output[output_idx] += x_j * dequant;
44 }
45 }
46 }
47}
48
49#[deprecated(
71 since = "0.15.0",
72 note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
73)]
74pub fn matmul_q4k_f32_colmajor(
75 q4k_data: &[u8],
76 input: &[f32],
77 ne0: usize, ne1: usize, ) -> Vec<f32> {
80 assert_eq!(input.len(), ne1, "Input length must match ne1 (input dimension)");
81
82 let blocks_per_col = (ne0 + SUPER_BLOCK_SIZE - 1) / SUPER_BLOCK_SIZE;
83 let col_bytes = blocks_per_col * SUPER_BLOCK_BYTES;
84
85 let mut output = vec![0.0f32; ne0];
86
87 for col_idx in 0..ne1 {
88 let col_start = col_idx * col_bytes;
89 let x_j = input[col_idx];
90
91 if x_j == 0.0 {
92 continue;
93 }
94
95 for sb_idx in 0..blocks_per_col {
96 let sb_start = col_start + sb_idx * SUPER_BLOCK_BYTES;
97 if sb_start + SUPER_BLOCK_BYTES > q4k_data.len() {
98 break;
99 }
100 let sb_data = &q4k_data[sb_start..sb_start + SUPER_BLOCK_BYTES];
101 let output_offset = sb_idx * SUPER_BLOCK_SIZE;
102 accumulate_q4k_superblock_colmajor(sb_data, x_j, &mut output, output_offset, ne0);
103 }
104 }
105
106 output
107}
108
109#[deprecated(
114 since = "0.15.0",
115 note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
116)]
117#[inline]
118pub fn matmul_q4k_f32_colmajor_dispatch(
119 q4k_data: &[u8],
120 input: &[f32],
121 ne0: usize,
122 ne1: usize,
123) -> Vec<f32> {
124 #[allow(deprecated)]
125 matmul_q4k_f32_colmajor(q4k_data, input, ne0, ne1)
126}