huffman_compress/
lib.rs

1//! [Huffman compression](https://en.wikipedia.org/wiki/Huffman_coding)
2//! given a probability distribution over arbitrary symbols.
3//!
4//! # Examples
5//!
6//! ```rust
7//! extern crate bit_vec;
8//! extern crate huffman_compress;
9//!
10//! # use std::error::Error;
11//! #
12//! # fn try_main() -> Result<(), Box<dyn Error>> {
13//! use std::iter::FromIterator;
14//! use std::collections::HashMap;
15//! use bit_vec::BitVec;
16//! use huffman_compress::{CodeBuilder, Book, Tree};
17//!
18//! let mut weights = HashMap::new();
19//! weights.insert("CG", 293);
20//! weights.insert("AG", 34);
21//! weights.insert("AT", 4);
22//! weights.insert("CT", 4);
23//! weights.insert("TG", 1);
24//!
25//! // Construct a Huffman code based on the weights (e.g. counts or relative
26//! // frequencies).
27//! let (book, tree) = CodeBuilder::from_iter(weights).finish();
28//!
29//! // More frequent symbols will be encoded with fewer bits.
30//! assert!(book.get("CG").map_or(0, |cg| cg.len()) <
31//!         book.get("AG").map_or(0, |ag| ag.len()));
32//!
33//! // Encode some symbols using the book.
34//! let mut buffer = BitVec::new();
35//! let example = vec!["AT", "CG", "AT", "TG", "AG", "CT", "CT", "AG", "CG"];
36//! for symbol in &example {
37//!     book.encode(&mut buffer, symbol);
38//! }
39//!
40//! // Decode the symbols using the tree.
41//! let decoded: Vec<&str> = tree.decoder(&buffer, example.len()).collect();
42//! assert_eq!(decoded, example);
43//! #     Ok(())
44//! # }
45//! #
46//! # fn main() {
47//! #     try_main().unwrap();
48//! # }
49//! ```
50
51#![doc(html_root_url = "https://docs.rs/huffman-compress/0.7.0")]
52#![forbid(unsafe_code)]
53#![deny(missing_docs)]
54#![deny(missing_debug_implementations)]
55
56extern crate bit_vec;
57extern crate num_traits;
58
59#[cfg(test)]
60#[macro_use]
61extern crate quickcheck;
62
63use std::{
64    borrow::Borrow,
65    cmp,
66    cmp::Reverse,
67    collections::{btree_map, BTreeMap, BinaryHeap},
68    error::Error,
69    fmt,
70    iter::{FromIterator, Take},
71};
72
73use bit_vec::BitVec;
74use num_traits::ops::saturating::Saturating;
75
76/// A trie used for decoding.
77#[derive(Debug, Clone)]
78pub struct Tree<K> {
79    root: usize,
80    arena: Vec<Node<K>>,
81}
82
83#[derive(Debug, Clone)]
84struct Node<K> {
85    parent: Option<usize>,
86    data: NodeData<K>,
87}
88
89#[derive(Debug, Clone)]
90enum NodeData<K> {
91    Leaf { symbol: K },
92    Branch { left: usize, right: usize },
93}
94
95impl<K: Clone> Tree<K> {
96    /// An iterator decoding symbols from a source of bits.
97    ///
98    /// In pathologic cases the iterator is unbounded: If there is only one
99    /// symbol the iterator will yield that symbol **infinitely** often without
100    /// consuming any bits.
101    ///
102    /// If there are no symbols the decoded sequence is empty without consuming
103    /// any bits.
104    ///
105    /// If the source is exhausted no further symbols will be decoded
106    /// (not even incomplete ones).
107    pub fn unbounded_decoder<I>(&self, iterable: I) -> UnboundedDecoder<'_, K, I>
108    where
109        I: IntoIterator<Item = bool>,
110    {
111        UnboundedDecoder {
112            tree: self,
113            iter: iterable.into_iter(),
114        }
115    }
116
117    /// An iterator decoding up to `num_symbols` symbols from a source of bits.
118    ///
119    /// Also see [`unbounded_decoder()`](#method.unbounded_decoder).
120    ///
121    /// If there are no symbols the decoded sequence is empty without consuming
122    /// any bits.
123    ///
124    /// If the source is exhausted no further symbols will be decoded
125    /// (not even incomplete ones).
126    pub fn decoder<I>(&self, iterable: I, num_symbols: usize) -> Decoder<'_, K, I>
127    where
128        I: IntoIterator<Item = bool>,
129    {
130        self.unbounded_decoder(iterable).take(num_symbols)
131    }
132}
133
134/// A bounded [decoder](struct.UnboundedDecoder.html), decoding symbols from
135/// a source of bits.
136pub type Decoder<'a, K, I> = Take<UnboundedDecoder<'a, K, I>>;
137
138/// Decodes symbols from a source of bits.
139#[derive(Debug)]
140pub struct UnboundedDecoder<'a, K: 'a, I: IntoIterator<Item = bool>> {
141    tree: &'a Tree<K>,
142    iter: I::IntoIter,
143}
144
145impl<'a, K: Clone, I: IntoIterator<Item = bool>> Iterator for UnboundedDecoder<'a, K, I> {
146    type Item = K;
147
148    fn next(&mut self) -> Option<K> {
149        let mut node = self.tree.arena.get(self.tree.root)?;
150
151        loop {
152            match node.data {
153                NodeData::Leaf { ref symbol } => return Some(symbol.clone()),
154                NodeData::Branch { left, right } => {
155                    node = match self.iter.next() {
156                        Some(true) => &self.tree.arena[left],
157                        Some(false) => &self.tree.arena[right],
158                        None => return None,
159                    };
160                }
161            }
162        }
163    }
164}
165
166/// A codebook used for encoding.
167#[derive(Clone, Debug)]
168pub struct Book<K> {
169    book: BTreeMap<K, BitVec>,
170}
171
172impl<K: Ord + Clone> Book<K> {
173    /// Returns the underlying B-Tree.
174    pub fn into_inner(self) -> BTreeMap<K, BitVec> {
175        self.book
176    }
177
178    /// An iterator over all symbols in sorted order.
179    pub fn symbols(&self) -> btree_map::Keys<'_, K, BitVec> {
180        self.book.keys()
181    }
182
183    /// An iterator over all symbol and code word pairs, sorted by symbol.
184    pub fn iter(&self) -> btree_map::Iter<'_, K, BitVec> {
185        self.book.iter()
186    }
187
188    /// Returns the number of symbols in the book.
189    pub fn len(&self) -> usize {
190        self.book.len()
191    }
192
193    /// Returns true if the map has no symbols.
194    pub fn is_empty(&self) -> bool {
195        self.book.is_empty()
196    }
197
198    /// Returns the code word for a given symbol.
199    pub fn get<Q>(&self, k: &Q) -> Option<&BitVec>
200    where
201        K: Borrow<Q>,
202        Q: ?Sized + Ord,
203    {
204        self.book.get(k)
205    }
206
207    /// Returns true if the book contains the specified symbol.
208    pub fn contains_symbol<Q>(&self, k: &Q) -> bool
209    where
210        K: Borrow<Q>,
211        Q: ?Sized + Ord,
212    {
213        self.book.contains_key(k)
214    }
215
216    /// Writes the code word for the given key to a bit vector.
217    ///
218    /// # Errors
219    ///
220    /// Returns [`EncodeError`] if `k` is not in the codebook.
221    ///
222    /// [`EncodeError`]: struct.EncodeError.html
223    pub fn encode<Q>(&self, buffer: &mut BitVec, k: &Q) -> Result<(), EncodeError>
224    where
225        K: Borrow<Q>,
226        Q: ?Sized + Ord,
227    {
228        match self.book.get(k) {
229            Some(code) => buffer.extend(code),
230            None => return Err(EncodeError {}),
231        }
232
233        Ok(())
234    }
235
236    fn new() -> Book<K> {
237        Book {
238            book: BTreeMap::new(),
239        }
240    }
241
242    fn build(&mut self, arena: &[Node<K>], node: &Node<K>, word: BitVec) {
243        match node.data {
244            NodeData::Leaf { ref symbol } => {
245                self.book.insert(symbol.clone(), word);
246            }
247            NodeData::Branch { left, right } => {
248                let mut left_word = word.clone();
249                left_word.push(true);
250                self.build(arena, &arena[left], left_word);
251
252                let mut right_word = word;
253                right_word.push(false);
254                self.build(arena, &arena[right], right_word);
255            }
256        }
257    }
258}
259
260/// Tried to encode an unknown symbol.
261#[derive(Debug, Clone)]
262pub struct EncodeError;
263
264impl fmt::Display for EncodeError {
265    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
266        "encode error: tried to encode an unknown symbol".fmt(f)
267    }
268}
269
270impl Error for EncodeError {
271    fn description(&self) -> &str {
272        "encode error: tried to encode an unknown symbol"
273    }
274}
275
276/// Collects information about symbols and their weights used to construct
277/// a Huffman code.
278///
279/// # Stability
280///
281/// The constructed code is guaranteed to be deterministic and stable across
282/// semver compatible releases if:
283///
284/// * There is a strict order on the symbols `K`.
285/// * No duplicate symbols are added.
286///
287/// The ordering of symbols will be used to break ties when weights are equal.
288#[derive(Debug, Clone)]
289pub struct CodeBuilder<K: Ord + Clone, W: Saturating + Ord> {
290    heap: BinaryHeap<HeapData<K, W>>,
291    arena: Vec<Node<K>>,
292}
293
294impl<K: Ord + Clone, W: Saturating + Ord> CodeBuilder<K, W> {
295    /// Creates a new, empty `CodeBuilder<K, W>`.
296    pub fn new() -> CodeBuilder<K, W> {
297        CodeBuilder {
298            heap: BinaryHeap::new(),
299            arena: Vec::new(),
300        }
301    }
302
303    /// Creates a new, empty `CodeBuilder<K, W>` and preallocates space
304    /// for `capacity` symbols.
305    pub fn with_capacity(capacity: usize) -> CodeBuilder<K, W> {
306        CodeBuilder {
307            heap: BinaryHeap::with_capacity(capacity),
308            arena: Vec::with_capacity(2 * capacity),
309        }
310    }
311
312    /// Adds a symbol and weight pair.
313    pub fn push(&mut self, symbol: K, weight: W) {
314        self.heap.push(HeapData {
315            weight: Reverse(weight),
316            symbol: symbol.clone(),
317            id: self.arena.len(),
318        });
319
320        self.arena.push(Node {
321            parent: None,
322            data: NodeData::Leaf { symbol },
323        });
324    }
325
326    /// Constructs a [book](struct.Book.html) and [tree](struct.Tree.html) pair
327    /// for encoding and decoding.
328    pub fn finish(mut self) -> (Book<K>, Tree<K>) {
329        let mut book = Book::new();
330
331        let root = loop {
332            let left = match self.heap.pop() {
333                Some(left) => left,
334                None => {
335                    return (
336                        book,
337                        Tree {
338                            root: 0,
339                            arena: self.arena,
340                        },
341                    )
342                }
343            };
344
345            let right = match self.heap.pop() {
346                Some(right) => right,
347                None => break left,
348            };
349
350            let id = self.arena.len();
351
352            self.arena[left.id].parent = Some(id);
353            self.arena[right.id].parent = Some(id);
354
355            self.heap.push(HeapData {
356                weight: Reverse(left.weight.0.saturating_add(right.weight.0)),
357                symbol: cmp::min(left.symbol, right.symbol),
358                id,
359            });
360
361            self.arena.push(Node {
362                parent: None,
363                data: NodeData::Branch {
364                    left: left.id,
365                    right: right.id,
366                },
367            });
368        };
369
370        book.build(&self.arena, &self.arena[root.id], BitVec::new());
371
372        (
373            book,
374            Tree {
375                root: root.id,
376                arena: self.arena,
377            },
378        )
379    }
380}
381
382impl<K: Ord + Clone, W: Saturating + Ord> Default for CodeBuilder<K, W> {
383    fn default() -> CodeBuilder<K, W> {
384        CodeBuilder::new()
385    }
386}
387
388impl<K: Ord + Clone, W: Saturating + Ord> FromIterator<(K, W)> for CodeBuilder<K, W> {
389    fn from_iter<T>(weights: T) -> CodeBuilder<K, W>
390    where
391        T: IntoIterator<Item = (K, W)>,
392    {
393        let iter = weights.into_iter();
394        let (size_hint, _) = iter.size_hint();
395        let mut code = CodeBuilder::with_capacity(size_hint);
396        code.extend(iter);
397        code
398    }
399}
400
401impl<K: Ord + Clone, W: Saturating + Ord> Extend<(K, W)> for CodeBuilder<K, W> {
402    fn extend<T>(&mut self, weights: T)
403    where
404        T: IntoIterator<Item = (K, W)>,
405    {
406        for (symbol, weight) in weights {
407            self.push(symbol, weight);
408        }
409    }
410}
411
412impl<'a, K: Ord + Clone, W: Saturating + Ord + Clone> FromIterator<(&'a K, &'a W)>
413    for CodeBuilder<K, W>
414{
415    fn from_iter<T>(weights: T) -> CodeBuilder<K, W>
416    where
417        T: IntoIterator<Item = (&'a K, &'a W)>,
418    {
419        CodeBuilder::from_iter(weights.into_iter().map(|(k, v)| (k.clone(), v.clone())))
420    }
421}
422
423impl<'a, K: Ord + Clone, W: Saturating + Ord + Clone> Extend<(&'a K, &'a W)> for CodeBuilder<K, W> {
424    fn extend<T>(&mut self, weights: T)
425    where
426        T: IntoIterator<Item = (&'a K, &'a W)>,
427    {
428        self.extend(weights.into_iter().map(|(k, v)| (k.clone(), v.clone())));
429    }
430}
431
432#[derive(Eq, PartialEq, Ord, PartialOrd, Debug)]
433struct HeapData<K, W> {
434    weight: Reverse<W>,
435    symbol: K, // tie breaker
436    id: usize,
437}
438
439impl<K: Clone, W: Clone> Clone for HeapData<K, W> {
440    fn clone(&self) -> HeapData<K, W> {
441        HeapData {
442            weight: Reverse(self.weight.0.clone()),
443            symbol: self.symbol.clone(),
444            id: self.id,
445        }
446    }
447}
448
449/// Shortcut for
450/// [`CodeBuilder::from_iter(weights).finish()`](struct.CodeBuilder.html).
451pub fn codebook<'a, I, K, W>(weights: I) -> (Book<K>, Tree<K>)
452where
453    I: IntoIterator<Item = (&'a K, &'a W)>,
454    K: 'a + Ord + Clone,
455    W: 'a + Saturating + Ord + Clone,
456{
457    CodeBuilder::from_iter(weights).finish()
458}
459
460#[cfg(test)]
461mod tests {
462    use std::collections::HashMap;
463
464    use super::*;
465
466    #[test]
467    fn test_uniform() {
468        let mut sample = HashMap::new();
469        sample.insert(1, 1);
470        sample.insert(2, 1);
471        sample.insert(3, 1);
472        sample.insert(4, 1);
473        sample.insert(5, 1);
474        let (book, tree) = CodeBuilder::from_iter(sample).finish();
475
476        let mut buffer = BitVec::new();
477        book.encode(&mut buffer, &1).unwrap();
478        book.encode(&mut buffer, &2).unwrap();
479        book.encode(&mut buffer, &3).unwrap();
480        book.encode(&mut buffer, &4).unwrap();
481        book.encode(&mut buffer, &5).unwrap();
482
483        let mut decoder = tree.unbounded_decoder(buffer);
484        assert_eq!(decoder.next(), Some(1));
485        assert_eq!(decoder.next(), Some(2));
486        assert_eq!(decoder.next(), Some(3));
487        assert_eq!(decoder.next(), Some(4));
488        assert_eq!(decoder.next(), Some(5));
489        assert_eq!(decoder.next(), None);
490    }
491
492    #[test]
493    fn test_uniform_from_static() {
494        const WEIGHTS: &[(&char, &usize)] = &[(&'a', &1), (&'b', &1), (&'c', &1), (&'d', &1)];
495        let (book, tree) = codebook(WEIGHTS.iter().cloned());
496
497        let mut buffer = BitVec::new();
498        book.encode(&mut buffer, &'a').unwrap();
499        book.encode(&mut buffer, &'b').unwrap();
500        book.encode(&mut buffer, &'c').unwrap();
501        book.encode(&mut buffer, &'d').unwrap();
502
503        let mut decoder = tree.unbounded_decoder(buffer);
504        assert_eq!(decoder.next(), Some('a'));
505        assert_eq!(decoder.next(), Some('b'));
506        assert_eq!(decoder.next(), Some('c'));
507        assert_eq!(decoder.next(), Some('d'));
508        assert_eq!(decoder.next(), None);
509    }
510
511    #[test]
512    fn test_empty() {
513        let (book, tree) = CodeBuilder::<&str, i32>::new().finish();
514
515        let mut buffer = BitVec::new();
516        assert!(book.encode(&mut buffer, "hello").is_err());
517
518        let mut decoder = tree.unbounded_decoder(buffer);
519        assert_eq!(decoder.next(), None);
520    }
521
522    #[test]
523    fn test_single() {
524        let mut builder = CodeBuilder::new();
525        builder.push("hello", 1);
526        let (book, tree) = builder.finish();
527
528        let mut buffer = BitVec::new();
529        book.encode(&mut buffer, "hello").unwrap();
530
531        let mut decoder = tree.unbounded_decoder(buffer);
532        assert_eq!(decoder.next(), Some("hello"));
533        assert_eq!(decoder.next(), Some("hello")); // repeats
534    }
535
536    quickcheck! {
537        fn efficient_order(ag: u32, at: u32, cg: u32, ct: u32, tg: u32) -> bool {
538            let mut builder = CodeBuilder::new();
539            builder.push("CG", cg);
540            builder.push("AG", ag);
541            builder.push("AT", at);
542            builder.push("CT", ct);
543            builder.push("TG", tg);
544            let (book, _) = builder.finish();
545
546            let len = |symbol| {
547                book.get(symbol).map_or(0, |code| code.len())
548            };
549
550            at >= ct || len("CT") <= len("AT") ||
551            ag.saturating_add(at).saturating_add(cg).saturating_add(ct).saturating_add(tg) == u32::MAX
552        }
553
554        fn encode_decode_bytes(symbols: Vec<u8>) -> bool {
555            let mut counts = [0; 256];
556            for symbol in &symbols {
557                counts[usize::from(*symbol)] += 1;
558            }
559
560            let (book, tree) = counts.iter()
561                .enumerate()
562                .map(|(k, v)| (k as u8, *v))
563                .collect::<CodeBuilder<_, _>>()
564                .finish();
565
566            let mut buffer = BitVec::new();
567            for symbol in &symbols {
568                book.encode(&mut buffer, symbol).unwrap();
569            }
570
571            tree.unbounded_decoder(&buffer).eq(symbols)
572        }
573    }
574}