1pub mod rle;
6pub mod delta;
7pub mod dictionary;
8pub mod bitpack;
9
10pub use rle::{RleEncoder, RleDecoder};
11pub use delta::DeltaEncoder;
12pub use dictionary::{DictionaryEncoder, DictionaryEncoderI64, DictionaryEncoderF64};
13pub use bitpack::BitPackEncoder;
14
15use crate::error::{ArrowError, Result};
16
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
19pub enum Codec {
20 None,
22 Rle,
24 Delta,
26 Dictionary,
28 BitPack,
30}
31
32#[derive(Debug, Clone, Copy, PartialEq, Eq)]
34pub enum Level {
35 Fast = 1,
37 Default = 5,
39 Max = 9,
41}
42
43impl Default for Level {
44 fn default() -> Self {
45 Level::Default
46 }
47}
48
49pub fn compress(data: &[u8], codec: Codec, _level: Level) -> Result<Vec<u8>> {
51 match codec {
52 Codec::None => Ok(data.to_vec()),
53 Codec::Rle => rle::encode(data),
54 Codec::Delta => {
55 if data.len() % 8 != 0 {
57 return Err(ArrowError::InvalidData(
58 "Delta codec requires 8-byte aligned data".to_string()
59 ));
60 }
61 let values: Vec<i64> = data.chunks_exact(8)
62 .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
63 .collect();
64 delta::encode_i64(&values)
65 }
66 Codec::Dictionary => dictionary::encode(data),
67 Codec::BitPack => {
68 if data.len() % 8 != 0 {
70 return Err(ArrowError::InvalidData(
71 "BitPack codec requires 8-byte aligned data".to_string()
72 ));
73 }
74 let values: Vec<i64> = data.chunks_exact(8)
75 .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap()))
76 .collect();
77 let bit_width = bitpack::detect_bit_width(&values);
78 let mut packed = bitpack::pack(&values, bit_width)?;
79 let mut output = Vec::new();
81 output.push(bit_width);
82 output.extend_from_slice(&(values.len() as u32).to_le_bytes());
83 output.append(&mut packed);
84 Ok(output)
85 }
86 }
87}
88
89pub fn decompress(data: &[u8], codec: Codec) -> Result<Vec<u8>> {
91 match codec {
92 Codec::None => Ok(data.to_vec()),
93 Codec::Rle => rle::decode(data),
94 Codec::Delta => {
95 let values = delta::decode_i64(data)?;
96 let mut output = Vec::with_capacity(values.len() * 8);
97 for v in values {
98 output.extend_from_slice(&v.to_le_bytes());
99 }
100 Ok(output)
101 }
102 Codec::Dictionary => dictionary::decode(data),
103 Codec::BitPack => {
104 if data.len() < 5 {
105 return Err(ArrowError::InvalidData(
106 "BitPack data too short".to_string()
107 ));
108 }
109 let bit_width = data[0];
110 let count = u32::from_le_bytes(data[1..5].try_into().unwrap()) as usize;
111 let packed = &data[5..];
112 let values = bitpack::unpack(packed, bit_width, count)?;
113 let mut output = Vec::with_capacity(values.len() * 8);
114 for v in values {
115 output.extend_from_slice(&v.to_le_bytes());
116 }
117 Ok(output)
118 }
119 }
120}
121
122pub fn auto_codec(data: &[u8]) -> Codec {
124 if data.len() < 64 {
125 return Codec::None;
126 }
127
128 let sample_size = data.len().min(1024);
130 let sample = &data[..sample_size];
131
132 let unique_count = count_unique(sample);
133 let unique_ratio = unique_count as f64 / sample_size as f64;
134
135 if unique_ratio < 0.1 {
137 return Codec::Rle;
138 }
139
140 if is_mostly_sequential(sample) {
142 return Codec::Delta;
143 }
144
145 if unique_ratio < 0.25 {
147 return Codec::Dictionary;
148 }
149
150 Codec::BitPack
152}
153
154fn count_unique(data: &[u8]) -> usize {
155 let mut seen = [false; 256];
156 let mut count = 0;
157 for &byte in data {
158 if !seen[byte as usize] {
159 seen[byte as usize] = true;
160 count += 1;
161 }
162 }
163 count
164}
165
166fn is_mostly_sequential(data: &[u8]) -> bool {
167 if data.len() < 2 {
168 return false;
169 }
170
171 let mut sequential = 0;
172 for i in 1..data.len() {
173 let diff = (data[i] as i16 - data[i-1] as i16).abs();
174 if diff <= 2 {
175 sequential += 1;
176 }
177 }
178
179 sequential > data.len() * 3 / 4
180}
181
182#[cfg(test)]
183mod tests {
184 use super::*;
185
186 #[test]
187 fn test_rle_codec_selection() {
188 let repeated = vec![1u8; 1000];
190 assert_eq!(auto_codec(&repeated), Codec::Rle);
191 }
192
193 #[test]
194 fn test_delta_codec_selection() {
195 let sequential: Vec<u8> = (0..=255).cycle().take(1000).collect();
197 assert_eq!(auto_codec(&sequential), Codec::Delta);
198 }
199
200 #[test]
201 fn test_dictionary_codec_selection() {
202 let values = [7u8, 23, 41, 59, 77, 93, 111, 127, 143, 161, 179, 197, 213, 231, 249,
204 11, 29, 47, 63, 81, 97, 113, 131, 149, 167, 183, 199, 217, 233, 251,
205 13, 31, 53, 71, 89, 107, 123, 139, 157, 173, 191, 209, 227, 241,
206 17, 37, 57, 73, 91, 109, 137, 151, 169, 187, 203, 221, 239,
207 19, 43, 61, 79, 103, 121, 141, 163, 181, 193, 211, 229,
208 3, 33, 49, 67, 83, 101, 117, 133, 147, 165, 177, 195, 207, 223, 237,
209 5, 27, 39, 51, 69, 87, 99, 115, 129, 145, 159, 171, 189, 201, 219, 235,
210 9, 21, 35, 55, 75, 85, 95, 105, 125, 135, 153, 175, 185, 205, 215, 225, 245];
211 let low_card: Vec<u8> = (0..1000).map(|i| values[i % values.len()]).collect();
212 assert_eq!(auto_codec(&low_card), Codec::Dictionary);
213 }
214}
215
216
217
218
219