avx_arrow/compression/
bitpack.rs1use crate::error::{ArrowError, Result};
6
7pub fn pack(data: &[i64], bit_width: u8) -> Result<Vec<u8>> {
9 if bit_width == 0 || bit_width > 64 {
10 return Err(ArrowError::InvalidData(
11 format!("Invalid bit width: {}", bit_width)
12 ));
13 }
14
15 if data.is_empty() {
16 return Ok(Vec::new());
17 }
18
19 let mask = if bit_width == 64 {
20 u64::MAX
21 } else {
22 (1u64 << bit_width) - 1
23 };
24
25 let total_bits = data.len() * bit_width as usize;
26 let total_bytes = (total_bits + 7) / 8;
27 let mut output = vec![0u8; total_bytes];
28
29 let mut bit_offset = 0usize;
30 for &value in data {
31 let packed = (value as u64) & mask;
32 write_bits(&mut output, bit_offset, packed, bit_width);
33 bit_offset += bit_width as usize;
34 }
35
36 Ok(output)
37}
38
39pub fn unpack(data: &[u8], bit_width: u8, count: usize) -> Result<Vec<i64>> {
41 if bit_width == 0 || bit_width > 64 {
42 return Err(ArrowError::InvalidData(
43 format!("Invalid bit width: {}", bit_width)
44 ));
45 }
46
47 let mask = if bit_width == 64 {
48 u64::MAX
49 } else {
50 (1u64 << bit_width) - 1
51 };
52
53 let mut output = Vec::with_capacity(count);
54 let mut bit_offset = 0usize;
55
56 for _ in 0..count {
57 let value = read_bits(data, bit_offset, bit_width);
58 output.push((value & mask) as i64);
59 bit_offset += bit_width as usize;
60 }
61
62 Ok(output)
63}
64
65fn write_bits(data: &mut [u8], bit_offset: usize, value: u64, bit_width: u8) {
67 let byte_offset = bit_offset / 8;
68 let bit_shift = bit_offset % 8;
69 let bits_remaining = bit_width as usize;
70
71 let mut value = value;
72 let mut bits_written = 0;
73 let mut current_byte = byte_offset;
74
75 while bits_written < bits_remaining {
76 if current_byte >= data.len() {
77 break;
78 }
79
80 let bits_in_current = 8 - if bits_written == 0 { bit_shift } else { 0 };
81 let bits_to_write = bits_remaining - bits_written;
82 let bits_this_round = bits_in_current.min(bits_to_write);
83
84 let shift = if bits_written == 0 { bit_shift } else { 0 };
85 let mask = ((1u64 << bits_this_round) - 1) << shift;
86
87 data[current_byte] &= !(mask as u8);
88 data[current_byte] |= ((value << shift) & mask) as u8;
89
90 value >>= bits_this_round;
91 bits_written += bits_this_round;
92 current_byte += 1;
93 }
94}
95
96fn read_bits(data: &[u8], bit_offset: usize, bit_width: u8) -> u64 {
98 let byte_offset = bit_offset / 8;
99 let bit_shift = bit_offset % 8;
100 let bits_remaining = bit_width as usize;
101
102 let mut result = 0u64;
103 let mut bits_read = 0;
104 let mut current_byte = byte_offset;
105
106 while bits_read < bits_remaining && current_byte < data.len() {
107 let bits_in_current = 8 - if bits_read == 0 { bit_shift } else { 0 };
108 let bits_to_read = bits_remaining - bits_read;
109 let bits_this_round = bits_in_current.min(bits_to_read);
110
111 let shift = if bits_read == 0 { bit_shift } else { 0 };
112 let mask = (1u64 << bits_this_round) - 1;
113
114 let bits = ((data[current_byte] as u64) >> shift) & mask;
115 result |= bits << bits_read;
116
117 bits_read += bits_this_round;
118 current_byte += 1;
119 }
120
121 result
122}
123
124pub fn detect_bit_width(data: &[i64]) -> u8 {
126 if data.is_empty() {
127 return 0;
128 }
129
130 let max_value = data.iter()
131 .map(|&v| v.abs() as u64)
132 .max()
133 .unwrap_or(0);
134
135 if max_value == 0 {
136 return 1;
137 }
138
139 64 - max_value.leading_zeros() as u8
140}
141
142pub struct BitPackEncoder {
144 values: Vec<i64>,
145}
146
147impl BitPackEncoder {
148 pub fn new() -> Self {
150 Self {
151 values: Vec::new(),
152 }
153 }
154
155 pub fn encode(&mut self, value: i64) {
157 self.values.push(value);
158 }
159
160 pub fn finish(self) -> Result<(Vec<u8>, u8, usize)> {
162 let bit_width = detect_bit_width(&self.values);
163 let count = self.values.len();
164 let packed = pack(&self.values, bit_width)?;
165 Ok((packed, bit_width, count))
166 }
167}
168
169impl Default for BitPackEncoder {
170 fn default() -> Self {
171 Self::new()
172 }
173}
174
175#[cfg(target_arch = "x86_64")]
177pub mod simd {
178 use super::*;
179
180 #[cfg(target_arch = "x86_64")]
181 use std::arch::x86_64::*;
182
183 #[target_feature(enable = "avx2")]
185 pub unsafe fn pack_i32_x8(data: &[i32], bit_width: u8) -> Vec<u8> {
186 if data.len() < 8 || bit_width > 32 {
187 return vec![];
188 }
189
190 let mask = if bit_width == 32 {
191 u32::MAX
192 } else {
193 (1u32 << bit_width) - 1
194 };
195
196 let mut output = Vec::new();
197 let mut i = 0;
198
199 while i + 8 <= data.len() {
200 let values = _mm256_loadu_si256(data[i..].as_ptr() as *const __m256i);
202
203 let mask_vec = _mm256_set1_epi32(mask as i32);
205 let masked = _mm256_and_si256(values, mask_vec);
206
207 let mut buffer = [0i32; 8];
209 _mm256_storeu_si256(buffer.as_mut_ptr() as *mut __m256i, masked);
210
211 for &v in &buffer {
212 output.extend_from_slice(&v.to_le_bytes());
213 }
214
215 i += 8;
216 }
217
218 output
219 }
220}
221
222#[cfg(test)]
223mod tests {
224 use super::*;
225
226 #[test]
227 fn test_bitpack_small_values() {
228 let data: Vec<i64> = vec![0, 1, 2, 3, 4, 5, 6, 7];
229 let bit_width = detect_bit_width(&data);
230 assert_eq!(bit_width, 3); let packed = pack(&data, bit_width).unwrap();
233 assert!(packed.len() < data.len() * 8);
234
235 let unpacked = unpack(&packed, bit_width, data.len()).unwrap();
236 assert_eq!(unpacked, data);
237 }
238
239 #[test]
240 fn test_bitpack_powers_of_two() {
241 let data: Vec<i64> = (0..16).map(|i| 1i64 << i).collect();
242 let bit_width = detect_bit_width(&data);
243
244 let packed = pack(&data, bit_width).unwrap();
245 let unpacked = unpack(&packed, bit_width, data.len()).unwrap();
246 assert_eq!(unpacked, data);
247 }
248
249 #[test]
250 fn test_detect_bit_width() {
251 assert_eq!(detect_bit_width(&[0, 1]), 1);
252 assert_eq!(detect_bit_width(&[0, 3]), 2);
253 assert_eq!(detect_bit_width(&[0, 7]), 3);
254 assert_eq!(detect_bit_width(&[0, 15]), 4);
255 assert_eq!(detect_bit_width(&[0, 255]), 8);
256 }
257
258 #[test]
259 fn test_bitpack_encoder() {
260 let mut encoder = BitPackEncoder::new();
261 for i in 0..100 {
262 encoder.encode(i % 16); }
264
265 let (packed, bit_width, count) = encoder.finish().unwrap();
266 assert_eq!(bit_width, 4);
267 assert_eq!(count, 100);
268
269 let unpacked = unpack(&packed, bit_width, count).unwrap();
270 let expected: Vec<i64> = (0..100).map(|i| i % 16).collect();
271 assert_eq!(unpacked, expected);
272 }
273
274 #[test]
275 fn test_bitpack_roundtrip() {
276 for bit_width in 1..=16 {
277 let max_val = (1i64 << bit_width) - 1;
278 let data: Vec<i64> = (0..100).map(|i| i % max_val).collect();
279
280 let packed = pack(&data, bit_width).unwrap();
281 let unpacked = unpack(&packed, bit_width, data.len()).unwrap();
282 assert_eq!(unpacked, data, "Failed for bit_width={}", bit_width);
283 }
284 }
285}
286
287
288
289
290