huffman_coding/
lib.rs

1//! **huffman_coding** is a small library for reading and writing huffman encoded data
2//!
3//! There are only 3 things you need to know:
4//! * Use HuffmanTree to change the coding and decoding based on your data
5//! * Use HuffmanWriter to encode data
6//! * use HuffmanReader to decode data
7
8extern crate bitstream;
9extern crate bit_vec;
10
11use std::io::{Write, Read};
12use std::io::Result as IOResult;
13use std::io::Error as IOError;
14use std::io::ErrorKind as IOErrorKind;
15use std::cmp;
16use std::collections::{HashMap, BinaryHeap};
17use bit_vec::BitVec;
18
19/// *HuffmanTree* is a simple tree structure used convert encoded words to decoded words and
20/// vice versa.
21///
22/// Each leaf of the tree represents a single code word. Their probability is saved as single byte
23/// where 255 represents the highest probability, and 0 means the value does not appear.
24///
25/// You most likely don't want to construct this tree yourself, so look for the 2 methods
26/// for constructing the tree for you.
27///
28/// # Examples
29/// ```
30/// extern crate huffman_coding;
31///
32/// let fake_data = vec![1, 1, 0, 0, 2];
33/// let tree = huffman_coding::HuffmanTree::from_data(&fake_data[..]);
34/// let probability = tree.get_byte_prob(1);
35/// assert!(probability.is_some());
36/// assert_eq!(probability.unwrap(), 255);
37/// ```
38#[derive(Eq, Debug)]
39pub enum HuffmanTree {
40    Leaf(u8, u8),
41    Node(Box<HuffmanTree>, Box<HuffmanTree>),
42}
43
44impl Ord for HuffmanTree {
45    fn cmp(&self, other: &Self) -> cmp::Ordering {
46        let own_prob = self.get_probability();
47        let other_prob = other.get_probability();
48
49        // We want to use the std heap, which is a max heap. However, we want to have
50        // the minimum probability on top
51        if own_prob > other_prob {
52            cmp::Ordering::Less
53        } else if own_prob == other_prob {
54            cmp::Ordering::Equal
55        } else {
56            cmp::Ordering::Greater
57        }
58    }
59}
60
61impl PartialOrd for HuffmanTree {
62    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
63        Some(self.cmp(other))
64    }
65}
66
67impl PartialEq for HuffmanTree {
68    fn eq(&self, other: &HuffmanTree) -> bool {
69        match (self, other) {
70            (&HuffmanTree::Leaf(ref x1, ref prob1), &HuffmanTree::Leaf(ref x2, ref prob2)) => {
71                x1 == x2 && prob1 == prob2
72            },
73            (&HuffmanTree::Node(ref zero1, ref one1), &HuffmanTree::Node(ref zero2, ref one2)) => {
74                zero1 == zero2 && one1 == one2
75            },
76            _ => false
77        }
78    }
79}
80
81impl HuffmanTree {
82    /// Method to read the probability of all 256 possible u8 values from a slice containing 256
83    /// elements.
84    ///
85    /// This method can be used to construct a new tree from a list of probabilities. The first
86    /// element in the slice will be interpreted as the probability of the `0` value appearing, the
87    /// second as the probability of the `1` value, etc.
88    ///
89    /// # Examples
90    /// ```
91    /// extern crate huffman_coding;
92    ///
93    /// let mut table_data: [u8; 256] = [0; 256];
94    /// table_data[0] = 255;
95    /// table_data[1] = 128;
96    /// table_data[2] = 128;
97    /// let tree = huffman_coding::HuffmanTree::from_table(&table_data[..]);
98    ///
99    /// let test_query = tree.get_byte_prob(1);
100    /// assert!(test_query.is_some());
101    /// assert_eq!(test_query.unwrap(), 128);
102    /// ```
103    /// # Panics
104    /// If data contains less than 256 elements
105    pub fn from_table(data: &[u8]) -> Self {
106        let mut heap: BinaryHeap<_> = data
107            .iter()
108            .enumerate()
109            .filter(|x| *x.1 > 0)
110            .map(|x| HuffmanTree::Leaf(x.0 as u8, *x.1))
111            .collect();
112
113        while heap.len() > 2 {
114            let a = heap.pop().unwrap();
115            let b = heap.pop().unwrap();
116            let insert = if a < b {
117                HuffmanTree::Node(Box::new(a), Box::new(b))
118            } else {
119                HuffmanTree::Node(Box::new(b), Box::new(a))
120            };
121            heap.push(insert);
122        }
123        let a = heap.pop().unwrap();
124        let b = heap.pop().unwrap();
125        HuffmanTree::Node(Box::new(a), Box::new(b))
126    }
127
128    /// Reads all of data and constructs a huffman tree according to the provided sample data
129    ///
130    /// # Examples
131    /// ```
132    /// extern crate huffman_coding;
133    /// let pseudo_data = vec![0, 0, 1, 2, 2];
134    /// let tree = huffman_coding::HuffmanTree::from_data(&pseudo_data[..]);
135    ///
136    /// let test_query = tree.get_byte_prob(0);
137    /// assert!(test_query.is_some());
138    /// assert_eq!(test_query.unwrap(), 255);
139    /// ```
140    pub fn from_data(data: &[u8]) -> Self {
141        let mut probability: [usize; 256] = [0; 256];
142        let mut max = 0;
143        for item in data {
144            probability[*item as usize] += 1;
145
146            if probability[*item as usize] > max {
147                max = probability[*item as usize];
148            }
149        }
150
151        let norm = HuffmanTree::normalize(&probability, max);
152        HuffmanTree::from_table(&norm[..])
153    }
154
155    /// Convert an existing huffman tree into an array where each element represents the probability
156    /// of the index byte to appear according to the huffman tree
157    ///
158    /// This can be used to transmit the encoding scheme via byte buffer
159    pub fn to_table(&self) -> [u8; 256] {
160        let mut table: [u8; 256] = [0; 256];
161        for i in 0..256 {
162            table[i] = self.get_byte_prob(i as u8).unwrap_or(0);
163        }
164        table
165    }
166
167    /// Return the probability of the given byte to appear according to the tree
168    ///
169    /// If this returns None, then the byte should not appear according to the huffman tree
170    /// If this returns Some, it will be between 255 meaning highest probability, and 1, meaning
171    /// lowest probability
172    pub fn get_byte_prob(&self, byte: u8) -> Option<u8> {
173        match self {
174            &HuffmanTree::Leaf(item, prob) if item == byte => Some(prob),
175            &HuffmanTree::Node(ref zero, ref one) => {
176                zero.get_byte_prob(byte).or(one.get_byte_prob(byte))
177            },
178            _ => None
179        }
180    }
181
182    fn normalize(data: &[usize], max_elem: usize) -> [u8; 256] {
183        let mut normalized_data: [u8; 256] = [0; 256];
184
185        for i in 0..data.len() {
186            if data[i] > 0 {
187                normalized_data[i] = cmp::max((data[i] * 255 / max_elem) as u8, 1);
188            }
189        }
190        normalized_data
191    }
192
193    fn get_probability(&self) -> u16 {
194        match self {
195            &HuffmanTree::Leaf(_, prob) => prob as u16,
196            &HuffmanTree::Node(ref zero, ref one) => {
197                zero.get_probability() + one.get_probability()
198            }
199        }
200    }
201
202    fn to_lookup_table(&self) -> HashMap<u8, BitVec> {
203        let mut table = HashMap::new();
204        self.to_lookup_table_inner(&mut table, BitVec::new());
205        table
206    }
207
208    fn to_lookup_table_inner(&self, data: &mut HashMap<u8, BitVec>, prev: BitVec) {
209        match self {
210            &HuffmanTree::Leaf(ref elem, _) => {
211                data.insert(*elem, prev);
212            },
213            &HuffmanTree::Node(ref zero, ref one) => {
214                let mut zero_branch = prev.clone();
215                zero_branch.push(false);
216                zero.to_lookup_table_inner(data, zero_branch);
217                let mut one_branch = prev;
218                one_branch.push(true);
219                one.to_lookup_table_inner(data, one_branch);
220            }
221        }
222    }
223}
224
225/// *HuffmanWriter* is a Write implementation that writes encoded words to the
226/// inner writer.
227///
228/// # Examples
229/// ```
230/// extern crate huffman_coding;
231/// let pseudo_data = vec![0, 0, 1, 2, 2];
232/// let tree = huffman_coding::HuffmanTree::from_data(&pseudo_data[..]);
233///
234/// let mut output = Vec::new();
235/// {
236///     use std::io::Write;
237///     let mut writer = huffman_coding::HuffmanWriter::new(&mut output, &tree);
238///     assert!(writer.write(&[2, 2, 0, 0, 1]).is_ok());
239/// }
240/// assert_eq!(&output[..], [43, 8]);
241/// ```
242pub struct HuffmanWriter<W> where W: Write {
243    inner: bitstream::BitWriter<W>,
244    table: HashMap<u8, BitVec>,
245}
246
247impl<W> HuffmanWriter<W> where W: Write {
248    /// Construct a new HuffmanWriter using the provided HuffmanTree
249    pub fn new(writer: W, tree: &HuffmanTree) -> Self {
250        HuffmanWriter {
251            inner: bitstream::BitWriter::new(writer),
252            table: tree.to_lookup_table()
253        }
254    }
255}
256
257impl<W> Write for HuffmanWriter<W> where W: Write {
258    fn write(&mut self, buf: &[u8]) -> IOResult<usize> {
259        for item in buf {
260            let bits = self.table.get(item).ok_or(IOError::from(IOErrorKind::InvalidData))?;
261            for bit in bits {
262                self.inner.write_bit(bit)?;
263            }
264        }
265        Ok(buf.len())
266    }
267
268    fn flush(&mut self) -> IOResult<()> {
269        Ok(())
270    }
271}
272
273/// *HuffmanReader* is a Read implementation that can read encoded words from the inner reader
274///
275/// # Examples
276/// ```
277/// extern crate huffman_coding;
278/// let pseudo_data = vec![0, 0, 1, 2, 2];
279/// let tree = huffman_coding::HuffmanTree::from_data(&pseudo_data[..]);
280///
281/// use std::io::{Read, Cursor};
282/// let cursor = Cursor::new([43, 8]);
283/// let mut buffer: [u8; 5] = [0; 5];
284///
285/// let mut reader = huffman_coding::HuffmanReader::new(cursor, tree);
286/// assert!(reader.read_exact(&mut buffer[..]).is_ok());
287/// assert_eq!(&buffer[..], &[2, 2, 0, 0, 1]);
288/// ```
289pub struct HuffmanReader<R> where R: Read {
290    inner: bitstream::BitReader<R>,
291    tree: HuffmanTree,
292}
293
294impl<R> HuffmanReader<R> where R: Read {
295    /// Construct a new reader, using the provided HuffmanTree for decoding
296    pub fn new(reader: R, tree: HuffmanTree) -> Self {
297        HuffmanReader {
298            inner: bitstream::BitReader::new(reader),
299            tree: tree,
300        }
301    }
302}
303
304impl<R> Read for HuffmanReader<R> where R: Read {
305    fn read(&mut self, buf: &mut [u8]) -> IOResult<usize> {
306        let mut pos = 0;
307        let mut state = &self.tree;
308        while pos < buf.len() {
309            let bit_opt = self.inner.read_bit()?;
310            if let Some(bit) = bit_opt {
311                match state {
312                    &HuffmanTree::Leaf(x, _) => {
313                        buf[pos] = x;
314                        pos += 1;
315                        state = &self.tree;
316                    },
317                    &HuffmanTree::Node(ref zero, ref one) => {
318                        state = if bit { one } else { zero };
319                        if let &HuffmanTree::Leaf(x, _) = state {
320                            buf[pos] = x;
321                            pos += 1;
322                            state = &self.tree;
323                        }
324                    }
325                }
326            } else {
327                if &self.tree != state {
328                    return Err(IOError::from(IOErrorKind::InvalidData))
329                } else {
330                    break;
331                }
332            }
333        }
334        Ok((pos))
335    }
336}
337
338#[cfg(test)]
339mod tests {
340    use super::*;
341    use bit_vec::BitVec;
342
343    #[test]
344    fn test_tree_builder() {
345        let vec = vec![1, 2, 3, 1, 1, 2];
346        let tree = HuffmanTree::from_data(&vec[..]);
347        let table = tree.to_lookup_table();
348
349        use std::iter::FromIterator;
350        assert_eq!(table[&1u8], BitVec::from_iter(vec![false].into_iter()));
351        assert_eq!(table[&2u8], BitVec::from_iter(vec![true, false].into_iter()));
352        assert_eq!(table[&3u8], BitVec::from_iter(vec![true, true].into_iter()));
353    }
354
355    #[test]
356    fn test_writer() {
357        use std::io::Write;
358        let pseudo_data = vec![0, 0, 1, 2, 2];
359        let tree = HuffmanTree::from_data(&pseudo_data[..]);
360
361        let mut vec = Vec::new();
362        {
363            let mut writer = HuffmanWriter::new(&mut vec, &tree);
364            assert!(writer.write(&[0, 0, 1, 1, 2, 2, 2, 2]).is_ok())
365        }
366        assert_eq!(&vec[..], &[175, 0 , 4]);
367    }
368
369    #[test]
370    fn test_reader() {
371        let mut table: [u8; 256] = [0; 256];
372        table[0] = 255;
373        table[1] = 128;
374        table[2] = 255;
375        let tree = HuffmanTree::from_table(&table[..]);
376
377        let mut input: [u8; 3] = [0; 3];
378        input[0] = 175;
379        input[1] = 0;
380        input[2] = 4;
381        use std::io::Cursor;
382        let mut buf = vec![0; 8];
383        let mut read = HuffmanReader::new(Cursor::new(input), tree);
384        use std::io::Read;
385        assert!(read.read_exact(&mut buf[..]).is_ok());
386        assert_eq!(&buf[..], &[0, 0, 1, 1, 2, 2, 2, 2]);
387        let read_end = read.read(&mut buf[..]);
388        assert!(read_end.is_ok());
389        assert_eq!(read_end.unwrap(), 0);
390    }
391}
392