bitstream_io/
huffman.rs

1// Copyright 2017 Brian Langenberger
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9//! Traits and implementations for reading or writing Huffman codes
10//! from or to a stream.
11
12#![warn(missing_docs)]
13
14use super::BitQueue;
15use super::Endianness;
16#[cfg(feature = "alloc")]
17use alloc::boxed::Box;
18#[cfg(feature = "alloc")]
19use alloc::collections::BTreeMap;
20#[cfg(feature = "alloc")]
21use alloc::vec::Vec;
22#[cfg(feature = "alloc")]
23use core::fmt;
24#[cfg(feature = "alloc")]
25use core::marker::PhantomData;
26#[cfg(feature = "alloc")]
27use core2::error::Error;
28
29#[cfg(not(feature = "alloc"))]
30use std::collections::BTreeMap;
31#[cfg(not(feature = "alloc"))]
32use std::error::Error;
33#[cfg(not(feature = "alloc"))]
34use std::fmt;
35#[cfg(not(feature = "alloc"))]
36use std::marker::PhantomData;
37
38/// A compiled Huffman tree element for use with the `read_huffman` method.
39/// Returned by `compile_read_tree`.
40///
41/// Compiled read trees are optimized for faster lookup
42/// and are therefore endian-specific.
43///
44/// In addition, each symbol in the source tree may occur many times
45/// in the compiled tree.  If symbols require a nontrivial amount of space,
46/// consider using reference counting so that they may be cloned
47/// more efficiently.
48pub enum ReadHuffmanTree<E: Endianness, T: Clone> {
49    /// The final value and new reader state
50    Done(T, u8, u32, PhantomData<E>),
51    /// Another byte is necessary to determine final value
52    Continue(Box<[ReadHuffmanTree<E, T>]>),
53    /// An invalid reader state has been used
54    InvalidState,
55}
56
57/// Given a vector of symbol/code pairs, compiles a Huffman tree
58/// for reading.
59///
60/// Code must be 0 or 1 bits and are always read from the stream
61/// from least-significant in the list to most signficant
62/// (which makes them easier to read for humans).
63///
64/// All possible codes must be assigned some symbol,
65/// and it is acceptable for the same symbol to occur multiple times.
66///
67/// ## Examples
68/// ```
69/// use bitstream_io::huffman::compile_read_tree;
70/// use bitstream_io::BigEndian;
71/// assert!(compile_read_tree::<BigEndian,i32>(
72///     vec![(1, vec![0]),
73///          (2, vec![1, 0]),
74///          (3, vec![1, 1])]).is_ok());
75/// ```
76///
77/// ```
78/// use std::io::{Read, Cursor};
79/// use bitstream_io::{BigEndian, BitReader, HuffmanRead};
80/// use bitstream_io::huffman::compile_read_tree;
81/// let tree = compile_read_tree(
82///     vec![('a', vec![0]),
83///          ('b', vec![1, 0]),
84///          ('c', vec![1, 1, 0]),
85///          ('d', vec![1, 1, 1])]).unwrap();
86/// let data = [0b10110111];
87/// let mut cursor = Cursor::new(&data);
88/// let mut reader = BitReader::endian(&mut cursor, BigEndian);
89/// assert_eq!(reader.read_huffman(&tree).unwrap(), 'b');
90/// assert_eq!(reader.read_huffman(&tree).unwrap(), 'c');
91/// assert_eq!(reader.read_huffman(&tree).unwrap(), 'd');
92/// ```
93pub fn compile_read_tree<E, T>(
94    values: Vec<(T, Vec<u8>)>,
95) -> Result<Box<[ReadHuffmanTree<E, T>]>, HuffmanTreeError>
96where
97    E: Endianness,
98    T: Clone,
99{
100    let tree = FinalHuffmanTree::new(values)?;
101
102    let mut result = Vec::with_capacity(256);
103    result.extend((0..256).map(|_| ReadHuffmanTree::InvalidState));
104    let queue = BitQueue::from_value(0, 0);
105    let i = queue.to_state();
106    result[i] = compile_queue(queue, &tree);
107    for bits in 1..8 {
108        for value in 0..(1 << bits) {
109            let queue = BitQueue::from_value(value, bits);
110            let i = queue.to_state();
111            result[i] = compile_queue(queue, &tree);
112        }
113    }
114    assert_eq!(result.len(), 256);
115    Ok(result.into_boxed_slice())
116}
117
118fn compile_queue<E, T>(
119    mut queue: BitQueue<E, u8>,
120    tree: &FinalHuffmanTree<T>,
121) -> ReadHuffmanTree<E, T>
122where
123    E: Endianness,
124    T: Clone,
125{
126    match tree {
127        FinalHuffmanTree::Leaf(ref value) => {
128            let len = queue.len();
129            ReadHuffmanTree::Done(value.clone(), queue.value(), len, PhantomData)
130        }
131        FinalHuffmanTree::Tree(ref bit0, ref bit1) => {
132            if queue.is_empty() {
133                ReadHuffmanTree::Continue(
134                    (0..256)
135                        .map(|byte| compile_queue(BitQueue::from_value(byte as u8, 8), tree))
136                        .collect::<Vec<ReadHuffmanTree<E, T>>>()
137                        .into_boxed_slice(),
138                )
139            } else if queue.pop(1) == 0 {
140                compile_queue(queue, bit0)
141            } else {
142                compile_queue(queue, bit1)
143            }
144        }
145    }
146}
147
148// A complete Huffman tree with no empty nodes
149enum FinalHuffmanTree<T: Clone> {
150    Leaf(T),
151    Tree(Box<FinalHuffmanTree<T>>, Box<FinalHuffmanTree<T>>),
152}
153
154impl<T: Clone> FinalHuffmanTree<T> {
155    fn new(values: Vec<(T, Vec<u8>)>) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> {
156        let mut tree = WipHuffmanTree::new_empty();
157
158        for (symbol, code) in values {
159            tree.add(code.as_slice(), symbol)?;
160        }
161
162        tree.into_read_tree()
163    }
164}
165
166// Work-in-progress trees may have empty nodes during construction
167// but those are not allowed in a finalized tree.
168// If the user wants some codes to be None or an error symbol of some sort,
169// those will need to be specified explicitly.
170enum WipHuffmanTree<T: Clone> {
171    Empty,
172    Leaf(T),
173    Tree(Box<WipHuffmanTree<T>>, Box<WipHuffmanTree<T>>),
174}
175
176impl<T: Clone> WipHuffmanTree<T> {
177    fn new_empty() -> WipHuffmanTree<T> {
178        WipHuffmanTree::Empty
179    }
180
181    fn new_leaf(value: T) -> WipHuffmanTree<T> {
182        WipHuffmanTree::Leaf(value)
183    }
184
185    fn new_tree() -> WipHuffmanTree<T> {
186        WipHuffmanTree::Tree(Box::new(Self::new_empty()), Box::new(Self::new_empty()))
187    }
188
189    fn into_read_tree(self) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> {
190        match self {
191            WipHuffmanTree::Empty => Err(HuffmanTreeError::MissingLeaf),
192            WipHuffmanTree::Leaf(v) => Ok(FinalHuffmanTree::Leaf(v)),
193            WipHuffmanTree::Tree(zero, one) => {
194                let zero = zero.into_read_tree()?;
195                let one = one.into_read_tree()?;
196                Ok(FinalHuffmanTree::Tree(Box::new(zero), Box::new(one)))
197            }
198        }
199    }
200
201    fn add(&mut self, code: &[u8], symbol: T) -> Result<(), HuffmanTreeError> {
202        match self {
203            WipHuffmanTree::Empty => {
204                if code.is_empty() {
205                    *self = WipHuffmanTree::new_leaf(symbol);
206                    Ok(())
207                } else {
208                    *self = WipHuffmanTree::new_tree();
209                    self.add(code, symbol)
210                }
211            }
212            WipHuffmanTree::Leaf(_) => Err(if code.is_empty() {
213                HuffmanTreeError::DuplicateLeaf
214            } else {
215                HuffmanTreeError::OrphanedLeaf
216            }),
217            WipHuffmanTree::Tree(ref mut zero, ref mut one) => {
218                if code.is_empty() {
219                    Err(HuffmanTreeError::DuplicateLeaf)
220                } else {
221                    match code[0] {
222                        0 => zero.add(&code[1..], symbol),
223                        1 => one.add(&code[1..], symbol),
224                        _ => Err(HuffmanTreeError::InvalidBit),
225                    }
226                }
227            }
228        }
229    }
230}
231
232/// An error type during Huffman tree compilation.
233#[derive(PartialEq, Eq, Copy, Clone, Debug)]
234pub enum HuffmanTreeError {
235    /// One of the bits in a Huffman code is not 0 or 1
236    InvalidBit,
237    /// A Huffman code in the specification has no defined symbol
238    MissingLeaf,
239    /// The same Huffman code specifies multiple symbols
240    DuplicateLeaf,
241    /// A Huffman code is the prefix of some longer code
242    OrphanedLeaf,
243}
244
245impl fmt::Display for HuffmanTreeError {
246    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
247        match *self {
248            HuffmanTreeError::InvalidBit => write!(f, "invalid bit in code"),
249            HuffmanTreeError::MissingLeaf => write!(f, "missing leaf node in specification"),
250            HuffmanTreeError::DuplicateLeaf => write!(f, "duplicate leaf node in specification"),
251            HuffmanTreeError::OrphanedLeaf => write!(f, "orphaned leaf node in specification"),
252        }
253    }
254}
255
256impl Error for HuffmanTreeError {}
257
258/// Given a vector of symbol/code pairs, compiles a Huffman tree
259/// for writing.
260///
261/// Code must be 0 or 1 bits and are always written to the stream
262/// from least-significant in the list to most signficant
263/// (which makes them easier to read for humans).
264///
265/// If the same symbol occurs multiple times, the first code is used.
266/// Unlike in read trees, not all possible codes need to be
267/// assigned a symbol.
268///
269/// ## Examples
270/// ```
271/// use bitstream_io::huffman::compile_write_tree;
272/// use bitstream_io::BigEndian;
273/// assert!(compile_write_tree::<BigEndian,i32>(
274///     vec![(1, vec![0]),
275///          (2, vec![1, 0]),
276///          (3, vec![1, 1])]).is_ok());
277/// ```
278///
279/// ```
280/// use std::io::Write;
281/// use bitstream_io::{BigEndian, BitWriter, HuffmanWrite};
282/// use bitstream_io::huffman::compile_write_tree;
283/// let tree = compile_write_tree(
284///     vec![('a', vec![0]),
285///          ('b', vec![1, 0]),
286///          ('c', vec![1, 1, 0]),
287///          ('d', vec![1, 1, 1])]).unwrap();
288/// let mut data = Vec::new();
289/// {
290///     let mut writer = BitWriter::endian(&mut data, BigEndian);
291///     writer.write_huffman(&tree, 'b').unwrap();
292///     writer.write_huffman(&tree, 'c').unwrap();
293///     writer.write_huffman(&tree, 'd').unwrap();
294/// }
295/// assert_eq!(data, [0b10110111]);
296/// ```
297pub fn compile_write_tree<E, T>(
298    values: Vec<(T, Vec<u8>)>,
299) -> Result<WriteHuffmanTree<E, T>, HuffmanTreeError>
300where
301    E: Endianness,
302    T: Ord + Clone,
303{
304    let mut map = BTreeMap::new();
305
306    for (symbol, code) in values {
307        let mut encoded = Vec::new();
308        for bits in code.chunks(32) {
309            let mut acc = BitQueue::<E, u32>::new();
310            for bit in bits {
311                match *bit {
312                    0 => acc.push(1, 0),
313                    1 => acc.push(1, 1),
314                    _ => return Err(HuffmanTreeError::InvalidBit),
315                }
316            }
317            let len = acc.len();
318            encoded.push((len, acc.value()))
319        }
320        map.entry(symbol)
321            .or_insert_with(|| encoded.into_boxed_slice());
322    }
323
324    Ok(WriteHuffmanTree {
325        map,
326        phantom: PhantomData,
327    })
328}
329
330/// A compiled Huffman tree for use with the `write_huffman` method.
331/// Returned by `compiled_write_tree`.
332pub struct WriteHuffmanTree<E: Endianness, T: Ord> {
333    map: BTreeMap<T, Box<[(u32, u32)]>>,
334    phantom: PhantomData<E>,
335}
336
337impl<E: Endianness, T: Ord + Clone> WriteHuffmanTree<E, T> {
338    /// Returns true if symbol is in tree.
339    #[inline]
340    pub fn has_symbol(&self, symbol: &T) -> bool {
341        self.map.contains_key(symbol)
342    }
343
344    /// Given symbol, returns iterator of
345    /// (bits, value) pairs for writing code.
346    /// Panics if symbol is not found.
347    #[inline]
348    pub fn get(&self, symbol: &T) -> impl Iterator<Item = &(u32, u32)> {
349        self.map[symbol].iter()
350    }
351}