huffman_compress2/
lib.rs

1// Copyright 2018 Niklas Fiekas <niklas.fiekas@backscattering.de>
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! [Huffman compression](https://en.wikipedia.org/wiki/Huffman_coding)
10//! given a probability distribution over arbitrary symbols.
11//!
12//! # Examples
13//!
14//! ```rust
15//!
16//! # use std::error::Error;
17//! #
18//! # fn try_main() -> Result<(), Box<dyn Error>> {
19//! use std::iter::FromIterator;
20//! use std::collections::HashMap;
21//! use bit_vec::BitVec;
22//! use huffman_compress2::{CodeBuilder, Book, Tree};
23//!
24//! let mut weights = HashMap::new();
25//! weights.insert("CG", 293);
26//! weights.insert("AG", 34);
27//! weights.insert("AT", 4);
28//! weights.insert("CT", 4);
29//! weights.insert("TG", 1);
30//!
31//! // Construct a Huffman code based on the weights (e.g. counts or relative
32//! // frequencies).
33//! let (book, tree) = CodeBuilder::from_iter(weights).finish();
34//!
35//! // More frequent symbols will be encoded with fewer bits.
36//! assert!(book.get("CG").map_or(0, |cg| cg.len()) <
37//!         book.get("AG").map_or(0, |ag| ag.len()));
38//!
39//! // Encode some symbols using the book.
40//! let mut buffer = BitVec::new();
41//! let example = vec!["AT", "CG", "AT", "TG", "AG", "CT", "CT", "AG", "CG"];
42//! for symbol in &example {
43//!     book.encode(&mut buffer, symbol);
44//! }
45//!
46//! // Decode the symbols using the tree.
47//! let decoded: Vec<&str> = tree.decoder(&buffer, example.len()).collect();
48//! assert_eq!(decoded, example);
49//! #     Ok(())
50//! # }
51//! #
52//! # fn main() {
53//! #     try_main().unwrap();
54//! # }
55//! ```
56
57#![doc(html_root_url = "https://docs.rs/huffman-compress2/0.7.0")]
58#![forbid(unsafe_code)]
59#![deny(missing_docs)]
60#![deny(missing_debug_implementations)]
61#![warn(clippy::pedantic)]
62#![allow(clippy::manual_let_else)]
63
64use std::borrow::Borrow;
65use std::cmp;
66use std::cmp::Reverse;
67use std::collections::{btree_map, BTreeMap, BinaryHeap};
68use std::error::Error;
69use std::fmt;
70use std::iter::{FromIterator, Take};
71
72use bit_vec::BitVec;
73
74#[cfg(feature = "serde")]
75use serde::{Deserialize, Serialize};
76
77use num_traits::ops::saturating::Saturating;
78
79/// A tree used for decoding.
80#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
81#[derive(Debug, Clone)]
82pub struct Tree<K> {
83    root: usize,
84    arena: Vec<Node<K>>,
85}
86
87#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
88#[derive(Debug, Clone)]
89struct Node<K> {
90    parent: Option<usize>,
91    data: NodeData<K>,
92}
93
94#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
95#[derive(Debug, Clone)]
96enum NodeData<K> {
97    Leaf { symbol: K },
98    Branch { left: usize, right: usize },
99}
100
101impl<K: Clone> Tree<K> {
102    /// An iterator decoding symbols from a source of bits.
103    ///
104    /// In pathologic cases the iterator is unbounded: If there is only one
105    /// symbol the iterator will yield that symbol **infinitely** often without
106    /// consuming any bits.
107    ///
108    /// If there are no symbols the decoded sequence is empty without consuming
109    /// any bits.
110    ///
111    /// If the source is exhausted no further symbols will be decoded
112    /// (not even incomplete ones).
113    pub fn unbounded_decoder<I>(&self, iterable: I) -> UnboundedDecoder<K, I>
114    where
115        I: IntoIterator<Item = bool>,
116    {
117        UnboundedDecoder {
118            tree: self,
119            iter: iterable.into_iter(),
120        }
121    }
122
123    /// An iterator decoding up to `num_symbols` symbols from a source of bits.
124    ///
125    /// Also see [`unbounded_decoder()`](#method.unbounded_decoder).
126    ///
127    /// If there are no symbols the decoded sequence is empty without consuming
128    /// any bits.
129    ///
130    /// If the source is exhausted no further symbols will be decoded
131    /// (not even incomplete ones).
132    pub fn decoder<I>(&self, iterable: I, num_symbols: usize) -> Decoder<K, I>
133    where
134        I: IntoIterator<Item = bool>,
135    {
136        self.unbounded_decoder(iterable).take(num_symbols)
137    }
138}
139
140/// A bounded [decoder](struct.UnboundedDecoder.html), decoding symbols from
141/// a source of bits.
142pub type Decoder<'a, K, I> = Take<UnboundedDecoder<'a, K, I>>;
143
144/// Decodes symbols from a source of bits.
145#[derive(Debug)]
146pub struct UnboundedDecoder<'a, K: 'a, I: IntoIterator<Item = bool>> {
147    tree: &'a Tree<K>,
148    iter: I::IntoIter,
149}
150
151impl<'a, K: Clone, I: IntoIterator<Item = bool>> Iterator for UnboundedDecoder<'a, K, I> {
152    type Item = K;
153
154    fn next(&mut self) -> Option<K> {
155        let mut node = self.tree.arena.get(self.tree.root)?;
156
157        loop {
158            match node.data {
159                NodeData::Leaf { ref symbol } => return Some(symbol.clone()),
160                NodeData::Branch { left, right } => {
161                    node = if self.iter.next()? {
162                        &self.tree.arena[left]
163                    } else {
164                        &self.tree.arena[right]
165                    };
166                }
167            }
168        }
169    }
170}
171
172/// A codebook used for encoding.
173#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
174#[derive(Clone, Debug)]
175pub struct Book<K: Ord + Clone> {
176    book: BTreeMap<K, BitVec>,
177}
178
179impl<K: Ord + Clone> Book<K> {
180    /// Returns the underlying B-Tree.
181    #[must_use]
182    pub fn into_inner(self) -> BTreeMap<K, BitVec> {
183        self.book
184    }
185
186    /// An iterator over all symbols in sorted order.
187    pub fn symbols(&self) -> btree_map::Keys<K, BitVec> {
188        self.book.keys()
189    }
190
191    /// An iterator over all symbol and code word pairs, sorted by symbol.
192    pub fn iter(&self) -> btree_map::Iter<K, BitVec> {
193        self.book.iter()
194    }
195
196    /// Returns the number of symbols in the book.
197    #[must_use]
198    pub fn len(&self) -> usize {
199        self.book.len()
200    }
201
202    /// Returns true if the map has no symbols.
203    #[must_use]
204    pub fn is_empty(&self) -> bool {
205        self.book.is_empty()
206    }
207
208    /// Returns the code word for a given symbol.
209    pub fn get<Q: ?Sized>(&self, k: &Q) -> Option<&BitVec>
210    where
211        K: Borrow<Q>,
212        Q: Ord,
213    {
214        self.book.get(k)
215    }
216
217    /// Returns true if the book contains the specified symbol.
218    pub fn contains_symbol<Q: ?Sized>(&self, k: &Q) -> bool
219    where
220        K: Borrow<Q>,
221        Q: Ord,
222    {
223        self.book.contains_key(k)
224    }
225
226    /// Writes the code word for the given key to a bit vector.
227    ///
228    /// # Errors
229    ///
230    /// Returns [`EncodeError`] if `k` is not in the codebook.
231    ///
232    /// [`EncodeError`]: struct.EncodeError.html
233    pub fn encode<Q: ?Sized>(&self, buffer: &mut BitVec, k: &Q) -> Result<(), EncodeError>
234    where
235        K: Borrow<Q>,
236        Q: Ord,
237    {
238        self.book
239            .get(k)
240            .map(|code| buffer.extend(code))
241            .ok_or(EncodeError {})
242    }
243
244    fn new() -> Book<K> {
245        Book {
246            book: BTreeMap::new(),
247        }
248    }
249
250    fn build(&mut self, arena: &[Node<K>], node: &Node<K>, word: BitVec) {
251        match node.data {
252            NodeData::Leaf { ref symbol } => {
253                self.book.insert(symbol.clone(), word);
254            }
255            NodeData::Branch { left, right } => {
256                let mut left_word = word.clone();
257                left_word.push(true);
258                self.build(arena, &arena[left], left_word);
259
260                let mut right_word = word;
261                right_word.push(false);
262                self.build(arena, &arena[right], right_word);
263            }
264        }
265    }
266}
267
268/// Tried to encode an unknown symbol.
269#[derive(Debug, Clone)]
270pub struct EncodeError;
271
272impl fmt::Display for EncodeError {
273    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
274        "encode error: tried to encode an unknown symbol".fmt(f)
275    }
276}
277
278impl Error for EncodeError {
279    fn description(&self) -> &str {
280        "encode error: tried to encode an unknown symbol"
281    }
282}
283
284/// Collects information about symbols and their weights used to construct
285/// a Huffman code.
286///
287/// # Stability
288///
289/// The constructed code is guaranteed to be deterministic and stable across
290/// semver compatible releases if:
291///
292/// * There is a strict order on the symbols `K`.
293/// * No duplicate symbols are added.
294///
295/// The ordering of symbols will be used to break ties when weights are equal.
296#[derive(Debug, Clone)]
297pub struct CodeBuilder<K: Ord + Clone, W: Saturating + Ord> {
298    heap: BinaryHeap<HeapData<K, W>>,
299    arena: Vec<Node<K>>,
300}
301
302impl<K: Ord + Clone, W: Saturating + Ord> CodeBuilder<K, W> {
303    /// Creates a new, empty `CodeBuilder<K, W>`.
304    #[must_use]
305    pub fn new() -> CodeBuilder<K, W> {
306        CodeBuilder {
307            heap: BinaryHeap::new(),
308            arena: Vec::new(),
309        }
310    }
311
312    /// Creates a new, empty `CodeBuilder<K, W>` and preallocates space
313    /// for `capacity` symbols.
314    #[must_use]
315    pub fn with_capacity(capacity: usize) -> CodeBuilder<K, W> {
316        CodeBuilder {
317            heap: BinaryHeap::with_capacity(capacity),
318            arena: Vec::with_capacity(2 * capacity),
319        }
320    }
321
322    /// Adds a symbol and weight pair.
323    pub fn push(&mut self, symbol: K, weight: W) {
324        self.heap.push(HeapData {
325            weight: Reverse(weight),
326            symbol: symbol.clone(),
327            id: self.arena.len(),
328        });
329
330        self.arena.push(Node {
331            parent: None,
332            data: NodeData::Leaf { symbol },
333        });
334    }
335
336    /// Constructs a [book](struct.Book.html) and [tree](struct.Tree.html) pair
337    /// for encoding and decoding.
338    #[must_use]
339    pub fn finish(mut self) -> (Book<K>, Tree<K>) {
340        let mut book = Book::new();
341
342        let root = loop {
343            let left = match self.heap.pop() {
344                Some(left) => left,
345                None => {
346                    return (
347                        book,
348                        Tree {
349                            root: 0,
350                            arena: self.arena,
351                        },
352                    )
353                }
354            };
355
356            let right = match self.heap.pop() {
357                Some(right) => right,
358                None => break left,
359            };
360
361            let id = self.arena.len();
362
363            self.arena[left.id].parent = Some(id);
364            self.arena[right.id].parent = Some(id);
365
366            self.heap.push(HeapData {
367                weight: Reverse(left.weight.0.saturating_add(right.weight.0)),
368                symbol: cmp::min(left.symbol, right.symbol),
369                id,
370            });
371
372            self.arena.push(Node {
373                parent: None,
374                data: NodeData::Branch {
375                    left: left.id,
376                    right: right.id,
377                },
378            });
379        };
380
381        book.build(&self.arena, &self.arena[root.id], BitVec::new());
382        (
383            book,
384            Tree {
385                root: root.id,
386                arena: self.arena,
387            },
388        )
389    }
390}
391
392impl<K: Ord + Clone, W: Saturating + Ord> Default for CodeBuilder<K, W> {
393    fn default() -> CodeBuilder<K, W> {
394        CodeBuilder::new()
395    }
396}
397
398impl<K: Ord + Clone, W: Saturating + Ord> FromIterator<(K, W)> for CodeBuilder<K, W> {
399    fn from_iter<T>(weights: T) -> CodeBuilder<K, W>
400    where
401        T: IntoIterator<Item = (K, W)>,
402    {
403        let iter = weights.into_iter();
404        let (size_hint, _) = iter.size_hint();
405        let mut code = CodeBuilder::with_capacity(size_hint);
406        code.extend(iter);
407        code
408    }
409}
410
411impl<K: Ord + Clone, W: Saturating + Ord> Extend<(K, W)> for CodeBuilder<K, W> {
412    fn extend<T>(&mut self, weights: T)
413    where
414        T: IntoIterator<Item = (K, W)>,
415    {
416        for (symbol, weight) in weights {
417            self.push(symbol, weight);
418        }
419    }
420}
421
422impl<'a, K: Ord + Clone, W: Saturating + Ord + Clone> FromIterator<(&'a K, &'a W)>
423    for CodeBuilder<K, W>
424{
425    fn from_iter<T>(weights: T) -> CodeBuilder<K, W>
426    where
427        T: IntoIterator<Item = (&'a K, &'a W)>,
428    {
429        weights
430            .into_iter()
431            .map(|(k, v)| (k.clone(), v.clone()))
432            .collect()
433    }
434}
435
436impl<'a, K: Ord + Clone, W: Saturating + Ord + Clone> Extend<(&'a K, &'a W)> for CodeBuilder<K, W> {
437    fn extend<T>(&mut self, weights: T)
438    where
439        T: IntoIterator<Item = (&'a K, &'a W)>,
440    {
441        self.extend(weights.into_iter().map(|(k, v)| (k.clone(), v.clone())));
442    }
443}
444
445#[derive(Eq, PartialEq, Ord, PartialOrd, Debug)]
446struct HeapData<K, W> {
447    weight: Reverse<W>,
448    symbol: K, // tie breaker
449    id: usize,
450}
451
452impl<K: Clone, W: Clone> Clone for HeapData<K, W> {
453    fn clone(&self) -> HeapData<K, W> {
454        HeapData {
455            weight: Reverse(self.weight.0.clone()),
456            symbol: self.symbol.clone(),
457            id: self.id,
458        }
459    }
460}
461
462/// Shortcut for
463/// [`CodeBuilder::from_iter(weights).finish()`](struct.CodeBuilder.html).
464pub fn codebook<'a, I, K, W>(weights: I) -> (Book<K>, Tree<K>)
465where
466    I: IntoIterator<Item = (&'a K, &'a W)>,
467    K: 'a + Ord + Clone,
468    W: 'a + Saturating + Ord + Clone,
469{
470    CodeBuilder::from_iter(weights).finish()
471}
472
473#[cfg(test)]
474mod tests {
475    use super::*;
476    use quickcheck::quickcheck;
477    use std::collections::HashMap;
478
479    #[test]
480    fn test_uniform() {
481        let mut sample = HashMap::new();
482        sample.insert(1, 1);
483        sample.insert(2, 1);
484        sample.insert(3, 1);
485        sample.insert(4, 1);
486        sample.insert(5, 1);
487        let (book, tree) = CodeBuilder::from_iter(sample).finish();
488
489        let mut buffer = BitVec::new();
490        book.encode(&mut buffer, &1).unwrap();
491        book.encode(&mut buffer, &2).unwrap();
492        book.encode(&mut buffer, &3).unwrap();
493        book.encode(&mut buffer, &4).unwrap();
494        book.encode(&mut buffer, &5).unwrap();
495
496        let mut decoder = tree.unbounded_decoder(buffer);
497        assert_eq!(decoder.next(), Some(1));
498        assert_eq!(decoder.next(), Some(2));
499        assert_eq!(decoder.next(), Some(3));
500        assert_eq!(decoder.next(), Some(4));
501        assert_eq!(decoder.next(), Some(5));
502        assert_eq!(decoder.next(), None);
503    }
504
505    #[test]
506    fn test_uniform_from_static() {
507        const WEIGHTS: &[(&char, &usize)] = &[(&'a', &1), (&'b', &1), (&'c', &1), (&'d', &1)];
508        let (book, tree) = codebook(WEIGHTS.iter().copied());
509
510        let mut buffer = BitVec::new();
511        book.encode(&mut buffer, &'a').unwrap();
512        book.encode(&mut buffer, &'b').unwrap();
513        book.encode(&mut buffer, &'c').unwrap();
514        book.encode(&mut buffer, &'d').unwrap();
515
516        let mut decoder = tree.unbounded_decoder(buffer);
517        assert_eq!(decoder.next(), Some('a'));
518        assert_eq!(decoder.next(), Some('b'));
519        assert_eq!(decoder.next(), Some('c'));
520        assert_eq!(decoder.next(), Some('d'));
521        assert_eq!(decoder.next(), None);
522    }
523
524    #[test]
525    fn test_empty() {
526        let (book, tree) = CodeBuilder::<&str, i32>::new().finish();
527
528        let mut buffer = BitVec::new();
529        assert!(book.encode(&mut buffer, "hello").is_err());
530
531        let mut decoder = tree.unbounded_decoder(buffer);
532        assert_eq!(decoder.next(), None);
533    }
534
535    #[test]
536    fn test_single() {
537        let mut builder = CodeBuilder::new();
538        builder.push("hello", 1);
539        let (book, tree) = builder.finish();
540
541        let mut buffer = BitVec::new();
542        book.encode(&mut buffer, "hello").unwrap();
543
544        let mut decoder = tree.unbounded_decoder(buffer);
545        assert_eq!(decoder.next(), Some("hello"));
546        assert_eq!(decoder.next(), Some("hello")); // repeats
547    }
548
549    quickcheck! {
550        fn efficient_order(ag: u32, at: u32, cg: u32, ct: u32, tg: u32) -> bool {
551            let mut builder = CodeBuilder::new();
552            builder.push("CG", cg);
553            builder.push("AG", ag);
554            builder.push("AT", at);
555            builder.push("CT", ct);
556            builder.push("TG", tg);
557            let (book, _) = builder.finish();
558
559            let len = |symbol| {
560                book.get(symbol).map_or(0, bit_vec::BitVec::len)
561            };
562
563            at >= ct || len("CT") <= len("AT") ||
564            ag.saturating_add(at).saturating_add(cg).saturating_add(ct).saturating_add(tg) == u32::MAX
565        }
566
567        fn encode_decode_bytes(symbols: Vec<u8>) -> bool {
568            let mut counts = [0; 256];
569            for symbol in &symbols {
570                counts[usize::from(*symbol)] += 1;
571            }
572
573            let (book, tree) = counts.iter()
574                .enumerate()
575                .map(|(k, v)| (k as u8, *v))
576                .collect::<CodeBuilder<_, _>>()
577                .finish();
578
579            let mut buffer = BitVec::new();
580            for symbol in &symbols {
581                book.encode(&mut buffer, symbol).unwrap();
582            }
583
584            tree.unbounded_decoder(&buffer).eq(symbols)
585        }
586    }
587}