huffc/
lib.rs

1//! Huffc Core - Huffman Encoding and Decoding
2//!
3//! This module implements the Huffman encoding and decoding logic,
4//! including frequency analysis, tree construction, and bitwise encoding.
5//!
6//! ## Features
7//! - Computes symbol frequencies efficiently
8//! - Builds a Huffman tree to encode data optimally
9//! - Encodes and decodes data using bitwise representations
10//! - Supports serialization and deserialization of Huffman-encoded data
11pub mod cli;
12pub mod fs;
13
14use std::collections::{HashMap, VecDeque};
15
16use bitvec::{order::Msb0, vec::BitVec};
17
18#[derive(Debug)]
19pub struct FrequencyBuffer(pub [u64; 256]);
20
21pub fn tally_frequency(bytes: &[u8]) -> FrequencyBuffer {
22    let mut fb = FrequencyBuffer([0; 256]);
23    bytes.iter().for_each(|byte| unsafe {
24        *fb.0.as_mut_ptr().add(*byte as usize) += 1;
25    });
26    fb
27}
28
29// Returns the index of the minimum value, Option because when all values are zero there is nothing
30// to pop
31type Idx = u8;
32type Freq = u64;
33pub fn find_and_pop_min(freq_buf: &mut [u64]) -> Option<(Idx, Freq)> {
34    let mut min_value_idx = None;
35    let mut min_value = None;
36    freq_buf.iter_mut().enumerate().for_each(|(idx, byte)| {
37        if *byte == 0 {
38            return;
39        }
40
41        match min_value {
42            None => {
43                min_value = Some(*byte);
44                min_value_idx = Some(idx as u8);
45            }
46            Some(v) if *byte < v => {
47                min_value = Some(*byte);
48                min_value_idx = Some(idx as u8);
49            }
50            _ => (),
51        }
52    });
53
54    if let Some(idx) = min_value_idx {
55        let byte = unsafe { freq_buf.get_unchecked_mut(idx as usize) };
56        *byte = 0;
57    }
58
59    match (min_value_idx, min_value) {
60        (Some(idx), Some(value)) => Some((idx, value)),
61        _ => None,
62    }
63}
64
65pub fn huff_encode_bitvec(bytes: &[u8], encoded_map: &HashMap<u8, Encoded>) -> (Vec<u8>, u64) {
66    let mut final_bits: BitVec<u8, Msb0> = BitVec::with_capacity(bytes.len() / 2);
67    for byte in bytes {
68        let encoded = encoded_map.get(byte).unwrap();
69        final_bits.extend(encoded.bits.iter());
70    }
71
72    let total_bits = final_bits.len();
73    (final_bits.into(), total_bits as u64)
74}
75
76#[derive(Debug, PartialEq, Eq)]
77pub struct Encoded {
78    // bits: Vec<u8>,
79    bits: BitVec<u8, Msb0>,
80    /// Number of bits in the sequence
81    num_bits_sequence: u8,
82    original_value: u8,
83}
84
85fn u64_to_u8(value: u64) -> [u8; 8] {
86    [
87        (value >> 56) as u8,
88        (value >> 48) as u8,
89        (value >> 40) as u8,
90        (value >> 32) as u8,
91        (value >> 24) as u8,
92        (value >> 16) as u8,
93        (value >> 8) as u8,
94        value as u8,
95    ]
96}
97
98pub fn serialize_huffman(
99    encoded_map: &HashMap<u8, Encoded>,
100    bit_buffer: Vec<u8>,
101    total_bits: u64,
102) -> Vec<u8> {
103    let mut serialized_buffer = u64_to_u8(total_bits).to_vec();
104
105    let mut tmp_buffer = Vec::new();
106
107    for encoded in encoded_map.values() {
108        tmp_buffer.push(encoded.original_value);
109        tmp_buffer.push(encoded.num_bits_sequence);
110        tmp_buffer.push(*encoded.bits.last().unwrap() as u8);
111    }
112
113    let size_of_header_bytes = tmp_buffer.len() as u64;
114    let size_of_header_arr = u64_to_u8(size_of_header_bytes);
115    serialized_buffer.extend_from_slice(&size_of_header_arr);
116    serialized_buffer.extend(tmp_buffer);
117    serialized_buffer.extend(bit_buffer);
118
119    serialized_buffer
120}
121
122#[derive(Debug, Clone, Copy)]
123struct ValueBitMap {
124    value: u8,
125    ends_in_1: bool,
126}
127
128pub fn deserialze_huffman(huff_bytes: &[u8]) -> Vec<u8> {
129    let total_bits = u8_to_u64(&huff_bytes[0..8]);
130    let header_end_byte = 16;
131    let header_num_bytes = u8_to_u64(&huff_bytes[8..header_end_byte]);
132
133    let mut map_values = HashMap::new();
134    let mut idx = header_end_byte;
135    let mut max_bits = 0;
136    while idx - header_end_byte < header_num_bytes as usize {
137        let value = huff_bytes[idx];
138        let encoding_number_of_bits = huff_bytes[idx + 1];
139        let ends_in_1 = huff_bytes[idx + 2] != 0;
140
141        max_bits = max_bits.max(encoding_number_of_bits);
142        let value_bit_map = ValueBitMap { value, ends_in_1 };
143
144        map_values
145            .entry(encoding_number_of_bits)
146            .and_modify(|v: &mut Vec<ValueBitMap>| v.push(value_bit_map))
147            .or_insert(vec![value_bit_map]);
148
149        idx += 3
150    }
151
152    let mut decoded_buffer: Vec<u8> = Vec::new();
153    let bit_vec: BitVec<u8, Msb0> = BitVec::from_slice(&huff_bytes[idx..]);
154    let mut bit_vec_iter = bit_vec.iter();
155
156    let mut bits_to_target = 0;
157
158    let mut read_bits = 0;
159    while read_bits < total_bits {
160        let bit = bit_vec_iter.next().unwrap();
161        if *bit {
162            bits_to_target += 1;
163            // Here I need to do logic to find the number of bits
164            let value_bit_map = map_values.get(&bits_to_target).unwrap();
165            decoded_buffer.push(value_bit_map.iter().find(|v| v.ends_in_1).unwrap().value);
166            read_bits += bits_to_target as u64;
167            bits_to_target = 0;
168        } else {
169            bits_to_target += 1;
170            if bits_to_target >= max_bits {
171                // We hit the least occuring character, now we need to find it
172                let value_bit_map = map_values.get(&bits_to_target).unwrap();
173                decoded_buffer.push(value_bit_map.iter().find(|v| !v.ends_in_1).unwrap().value);
174                read_bits += bits_to_target as u64;
175                bits_to_target = 0
176            }
177        }
178    }
179
180    decoded_buffer
181}
182
183fn u8_to_u64(bytes: &[u8]) -> u64 {
184    assert!(bytes.len() >= 8);
185    (bytes[0] as u64) << 56
186        | (bytes[1] as u64) << 48
187        | (bytes[2] as u64) << 40
188        | (bytes[3] as u64) << 32
189        | (bytes[4] as u64) << 24
190        | (bytes[5] as u64) << 16
191        | (bytes[6] as u64) << 8
192        | bytes[7] as u64
193}
194
195/// Build huffman array, this represents the huffman tree, the first index is encoded 1, the next
196/// is 01, the next 001 and so on... until the last one which is encoded 0 (repeated n) where n is
197/// the length of the vector, the least frequent is at the back the most frequent is at the front,
198/// the actual frequency does not matter, only their relative frequency, which is represented by
199/// their position in the buffer
200pub fn build_huffman_array(mut freq_buffer: FrequencyBuffer) -> Vec<u8> {
201    let mut buffer = VecDeque::new();
202    while let Some((idx, _)) = find_and_pop_min(&mut freq_buffer.0) {
203        buffer.push_front(idx);
204    }
205    buffer.into()
206}
207
208pub fn encode_huffman_array(huffman_array: &[u8]) -> HashMap<u8, Encoded> {
209    huffman_array
210        .iter()
211        .enumerate()
212        .map(|(idx, value)| {
213            if idx == huffman_array.len() - 1 {
214                let bv: BitVec<u8, Msb0> = (0..idx as u8).map(|_| false).collect();
215                let num_bits_sequence = bv.len() as u8;
216                return (
217                    *value,
218                    Encoded {
219                        bits: bv,
220                        num_bits_sequence,
221                        original_value: *value,
222                    },
223                );
224            }
225
226            let bv: BitVec<u8, Msb0> = (0..idx as u8 + 1)
227                .map(|i| {
228                    if i == idx as u8 {
229                        return true;
230                    }
231                    false
232                })
233                .collect();
234            let num_bits_sequence = bv.len() as u8;
235            (
236                *value,
237                Encoded {
238                    bits: bv,
239                    num_bits_sequence,
240                    original_value: *value,
241                },
242            )
243        })
244        .collect()
245}
246
247#[cfg(test)]
248mod tests {
249    use super::*;
250    use bitvec::prelude::*;
251
252    fn encoded_buffer_to_string(buffer: &[u8]) -> String {
253        buffer
254            .iter()
255            .map(|b| format!("{:08b}", b))
256            .collect::<String>()
257    }
258
259    #[test]
260    fn find_and_pop_min_not_empty() {
261        let mut bytes = [4, 8, 1];
262        let result = find_and_pop_min(&mut bytes);
263
264        assert!(result == Some((2, 1)));
265        assert!(bytes == [4, 8, 0]);
266    }
267
268    #[test]
269    fn find_and_pop_min_empty() {
270        let mut bytes = [0, 0, 0];
271        let result = find_and_pop_min(&mut bytes);
272        assert!(result.is_none());
273    }
274
275    #[test]
276    fn build_small_huffman_array() {
277        let bytes = [1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1];
278        let freq_buff = tally_frequency(&bytes);
279        let actual = build_huffman_array(freq_buff);
280        let expected = vec![1, 3, 2];
281        assert_eq!(actual, expected)
282    }
283
284    #[test]
285    fn build_encoded_map_from_huffman_array() {
286        let huff_arr = vec![1, 3, 2];
287        let actual = encode_huffman_array(&huff_arr);
288
289        let mut expected = HashMap::new();
290        let mut bv = bitvec![u8, Msb0;];
291        bv.push(true);
292        expected.insert(
293            1,
294            Encoded {
295                bits: bv,
296                num_bits_sequence: 1,
297                original_value: 1,
298            },
299        );
300        let mut bv = bitvec![u8, Msb0;];
301        bv.push(false);
302        bv.push(true);
303        expected.insert(
304            3,
305            Encoded {
306                bits: bv,
307                num_bits_sequence: 2,
308                original_value: 3,
309            },
310        );
311        let mut bv = bitvec![u8, Msb0;];
312        bv.push(false);
313        bv.push(false);
314        expected.insert(
315            2,
316            Encoded {
317                bits: bv,
318                num_bits_sequence: 2,
319                original_value: 2,
320            },
321        );
322
323        assert_eq!(actual, expected)
324    }
325
326    #[test]
327    fn build_huffman_tree_test_simple() {
328        let bytes = [1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1];
329        let freq_buff = tally_frequency(&bytes);
330        let huffnode = build_huffman_array(freq_buff);
331        let encode_map = encode_huffman_array(&huffnode);
332        let (encoded_buffer, total_bits) = huff_encode_bitvec(&bytes, &encode_map);
333        let expected_buffer = "1001111111011000";
334        assert_eq!(encoded_buffer_to_string(&encoded_buffer), expected_buffer);
335        let expected_total_bits = 13;
336        assert_eq!(total_bits, expected_total_bits);
337    }
338
339    #[test]
340    fn serialize_huffman_test() {
341        let bytes = [1, 3, 1, 2];
342        let freq_buff = tally_frequency(&bytes);
343        let huffnode = build_huffman_array(freq_buff);
344        let encode_map = encode_huffman_array(&huffnode);
345        let (encoded_buffer, total_bits) = huff_encode_bitvec(&bytes, &encode_map);
346        let mut serialized_buffer = serialize_huffman(&encode_map, encoded_buffer, total_bits);
347        serialized_buffer.sort();
348        let mut expected = [
349            0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 9, 1, 1, 1, 3, 2, 1, 2, 2, 0, 176,
350        ];
351        expected.sort();
352
353        assert_eq!(serialized_buffer, expected);
354    }
355
356    #[test]
357    fn test_deserialize_huffman() {
358        let target = [1, 3, 1, 2];
359
360        let serialized_bytes = [
361            0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 9, 1, 1, 1, 2, 2, 0, 3, 2, 1, 176,
362        ];
363
364        let actual = deserialze_huffman(&serialized_bytes);
365
366        assert_eq!(actual, target);
367    }
368}