avx_arrow/compression/
mod.rs

1//! Native compression algorithms without external dependencies
2//!
3//! High-performance compression optimized for columnar data with SIMD acceleration.
4
5pub mod rle;
6pub mod delta;
7pub mod dictionary;
8pub mod bitpack;
9
10pub use rle::{RleEncoder, RleDecoder};
11pub use delta::DeltaEncoder;
12pub use dictionary::{DictionaryEncoder, DictionaryEncoderI64, DictionaryEncoderF64};
13pub use bitpack::BitPackEncoder;
14
15use crate::error::{ArrowError, Result};
16
17/// Compression codec type
18#[derive(Debug, Clone, Copy, PartialEq, Eq)]
19pub enum Codec {
20    /// No compression
21    None,
22    /// Run-Length Encoding (best for repeated values)
23    Rle,
24    /// Delta encoding (best for sorted/sequential data)
25    Delta,
26    /// Dictionary encoding (best for low cardinality)
27    Dictionary,
28    /// Bit-packing (best for small integers)
29    BitPack,
30}
31
32/// Compression level
33#[derive(Debug, Clone, Copy, PartialEq, Eq)]
34pub enum Level {
35    /// Fastest compression, lower ratio
36    Fast = 1,
37    /// Balanced compression
38    Default = 5,
39    /// Best compression, slower
40    Max = 9,
41}
42
43impl Default for Level {
44    fn default() -> Self {
45        Level::Default
46    }
47}
48
49/// Compress data using specified codec
50pub fn compress(data: &[u8], codec: Codec, _level: Level) -> Result<Vec<u8>> {
51    match codec {
52        Codec::None => Ok(data.to_vec()),
53        Codec::Rle => rle::encode(data),
54        Codec::Delta => {
55            // Convert bytes to i64 for delta encoding
56            if data.len() % 8 != 0 {
57                return Err(ArrowError::InvalidData(
58                    "Delta codec requires 8-byte aligned data".to_string()
59                ));
60            }
61            let values: Vec<i64> = data.chunks_exact(8)
62                .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
63                .collect();
64            delta::encode_i64(&values)
65        }
66        Codec::Dictionary => dictionary::encode(data),
67        Codec::BitPack => {
68            // Convert bytes to i64 for bit-packing
69            if data.len() % 8 != 0 {
70                return Err(ArrowError::InvalidData(
71                    "BitPack codec requires 8-byte aligned data".to_string()
72                ));
73            }
74            let values: Vec<i64> = data.chunks_exact(8)
75                .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
76                .collect();
77            let bit_width = bitpack::detect_bit_width(&values);
78            let mut packed = bitpack::pack(&values, bit_width)?;
79            // Prepend bit_width and count
80            let mut output = Vec::new();
81            output.push(bit_width);
82            output.extend_from_slice(&(values.len() as u32).to_le_bytes());
83            output.append(&mut packed);
84            Ok(output)
85        }
86    }
87}
88
89/// Decompress data using specified codec
90pub fn decompress(data: &[u8], codec: Codec) -> Result<Vec<u8>> {
91    match codec {
92        Codec::None => Ok(data.to_vec()),
93        Codec::Rle => rle::decode(data),
94        Codec::Delta => {
95            let values = delta::decode_i64(data)?;
96            let mut output = Vec::with_capacity(values.len() * 8);
97            for v in values {
98                output.extend_from_slice(&v.to_le_bytes());
99            }
100            Ok(output)
101        }
102        Codec::Dictionary => dictionary::decode(data),
103        Codec::BitPack => {
104            if data.len() < 5 {
105                return Err(ArrowError::InvalidData(
106                    "BitPack data too short".to_string()
107                ));
108            }
109            let bit_width = data[0];
110            let count = u32::from_le_bytes(data[1..5].try_into().unwrap()) as usize;
111            let packed = &data[5..];
112            let values = bitpack::unpack(packed, bit_width, count)?;
113            let mut output = Vec::with_capacity(values.len() * 8);
114            for v in values {
115                output.extend_from_slice(&v.to_le_bytes());
116            }
117            Ok(output)
118        }
119    }
120}
121
122/// Auto-select best codec for data
123pub fn auto_codec(data: &[u8]) -> Codec {
124    if data.len() < 64 {
125        return Codec::None;
126    }
127
128    // Sample data to determine best codec
129    let sample_size = data.len().min(1024);
130    let sample = &data[..sample_size];
131
132    let unique_count = count_unique(sample);
133    let unique_ratio = unique_count as f64 / sample_size as f64;
134
135    // Check for very repeated values (RLE) - less than 10% unique
136    if unique_ratio < 0.1 {
137        return Codec::Rle;
138    }
139
140    // Check for sequential/sorted data (Delta)
141    if is_mostly_sequential(sample) {
142        return Codec::Delta;
143    }
144
145    // Check for low cardinality (Dictionary) - 10-25% unique
146    if unique_ratio < 0.25 {
147        return Codec::Dictionary;
148    }
149
150    // Default to bit-packing for integers
151    Codec::BitPack
152}
153
154fn count_unique(data: &[u8]) -> usize {
155    let mut seen = [false; 256];
156    let mut count = 0;
157    for &byte in data {
158        if !seen[byte as usize] {
159            seen[byte as usize] = true;
160            count += 1;
161        }
162    }
163    count
164}
165
166fn is_mostly_sequential(data: &[u8]) -> bool {
167    if data.len() < 2 {
168        return false;
169    }
170
171    let mut sequential = 0;
172    for i in 1..data.len() {
173        let diff = (data[i] as i16 - data[i-1] as i16).abs();
174        if diff <= 2 {
175            sequential += 1;
176        }
177    }
178
179    sequential > data.len() * 3 / 4
180}
181
182#[cfg(test)]
183mod tests {
184    use super::*;
185
186    #[test]
187    fn test_rle_codec_selection() {
188        // Repeated values -> RLE
189        let repeated = vec![1u8; 1000];
190        assert_eq!(auto_codec(&repeated), Codec::Rle);
191    }
192
193    #[test]
194    fn test_delta_codec_selection() {
195        // Sequential -> Delta
196        let sequential: Vec<u8> = (0..=255).cycle().take(1000).collect();
197        assert_eq!(auto_codec(&sequential), Codec::Delta);
198    }
199
200    #[test]
201    fn test_dictionary_codec_selection() {
202        // Low cardinality non-sequential (12% unique) -> Dictionary
203        let values = [7u8, 23, 41, 59, 77, 93, 111, 127, 143, 161, 179, 197, 213, 231, 249,
204                      11, 29, 47, 63, 81, 97, 113, 131, 149, 167, 183, 199, 217, 233, 251,
205                      13, 31, 53, 71, 89, 107, 123, 139, 157, 173, 191, 209, 227, 241,
206                      17, 37, 57, 73, 91, 109, 137, 151, 169, 187, 203, 221, 239,
207                      19, 43, 61, 79, 103, 121, 141, 163, 181, 193, 211, 229,
208                      3, 33, 49, 67, 83, 101, 117, 133, 147, 165, 177, 195, 207, 223, 237,
209                      5, 27, 39, 51, 69, 87, 99, 115, 129, 145, 159, 171, 189, 201, 219, 235,
210                      9, 21, 35, 55, 75, 85, 95, 105, 125, 135, 153, 175, 185, 205, 215, 225, 245];
211        let low_card: Vec<u8> = (0..1000).map(|i| values[i % values.len()]).collect();
212        assert_eq!(auto_codec(&low_card), Codec::Dictionary);
213    }
214}
215
216
217
218
219