cshannon/encoding/
balanced_tree.rs

1// Copyright 2020 Prathmesh Prabhu
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Create a new balanced tree [`Encoding`].
16//!
17//! All letters in this encoding are fixed width bit strings. The smallest width
18//! necessary to generate the required number of [`Letter`]s is used.
19//!
20//! The generated [`Encoding`] is stable: calling `new` on a [`Model`]
21//! repeatedly yields the same [`Encoding`].
22
23use super::Encoding;
24use crate::model::Model;
25use crate::{code::Letter, tokens::Token};
26use anyhow::{anyhow, Result};
27use std::collections::HashMap;
28
29/// Create a new balanced tree encoding.
30///
31/// See [package documentation] for details.
32///
33/// [package documentation]: index.html
34pub fn new<T: Token>(m: Model<T>) -> Result<Encoding<T>> {
35    let mut map = HashMap::new();
36    let mut letter_generator = LetterGenerator::new(log2(m.len() as u64))?;
37    for t in m.tokens_sorted() {
38        match letter_generator.next() {
39            // Programming error, since bit_count should guarantee we never run
40            // out of letters.
41            None => panic!("Ran out of letters"),
42            Some(l) => {
43                map.insert(t, l);
44            }
45        }
46    }
47    Ok(Encoding::new(map)?)
48}
49
50struct LetterGenerator {
51    bit_count: u64,
52    current: u64,
53    max: u64,
54}
55
56impl LetterGenerator {
57    pub fn new(bit_count: u64) -> Result<Self> {
58        if bit_count > 64 {
59            return Err(anyhow!("model has too many keys"));
60        }
61        Ok(Self {
62            bit_count,
63            // Do not use the code all-0s as it is indistinguishable from
64            // trailing 0s when decompressing text.
65            current: 1,
66            max: 1 << bit_count,
67        })
68    }
69}
70
71impl Iterator for LetterGenerator {
72    type Item = Letter;
73
74    fn next(&mut self) -> Option<Self::Item> {
75        if self.current >= self.max {
76            return None;
77        }
78        let l = Letter::new(
79            &(self.current << (64 - self.bit_count))
80                .to_be_bytes()
81                .to_vec(),
82            self.bit_count as u64,
83        );
84        self.current += 1;
85        Some(l)
86    }
87}
88
89fn log2(n: u64) -> u64 {
90    let max = std::mem::size_of::<u64>() as u64 * 8;
91    let zeroes = n.leading_zeros() as u64;
92    max - zeroes
93}
94
95#[cfg(test)]
96mod tests {
97    use super::*;
98    use crate::code::Letter;
99    use crate::model;
100    use crate::tokens::test_utils::I32Token;
101    use std::collections::HashMap;
102
103    #[test]
104    fn empty() {
105        let m: Model<I32Token> = model::with_frequencies(&[]);
106        let t = new(m).unwrap();
107        assert!(t.alphabet().is_empty());
108        assert!(t.map().is_empty());
109    }
110
111    #[test]
112    fn one_token() {
113        let m = model::with_frequencies(&[(I32Token(1), 2)]);
114        let t = new(m).unwrap();
115        assert_eq!(t.alphabet().len(), 1);
116        let want: HashMap<I32Token, Letter> = [(I32Token(1), Letter::new(&[0b1000_0000], 1))]
117            .iter()
118            .cloned()
119            .collect();
120        assert_eq!(t.map(), &want);
121    }
122
123    #[test]
124    fn max_tokens_for_some_tree_height() {
125        let m = model::with_frequencies(&[(I32Token(1), 1), (I32Token(2), 2), (I32Token(3), 3)]);
126        let t = new(m).unwrap();
127        assert_eq!(t.alphabet().len(), 3);
128        let want: HashMap<I32Token, Letter> = [
129            (I32Token(3), Letter::new(&[0b0100_0000], 2)),
130            (I32Token(2), Letter::new(&[0b1000_0000], 2)),
131            (I32Token(1), Letter::new(&[0b1100_0000], 2)),
132        ]
133        .iter()
134        .cloned()
135        .collect();
136        assert_eq!(t.map(), &want);
137    }
138
139    #[test]
140    fn max_tokens_for_some_tree_height_then_one() {
141        let m = model::with_frequencies(&[
142            (I32Token(1), 1),
143            (I32Token(2), 2),
144            (I32Token(3), 3),
145            (I32Token(4), 4),
146        ]);
147        let t = new(m).unwrap();
148        assert_eq!(t.alphabet().len(), 4);
149        let want: HashMap<I32Token, Letter> = [
150            (I32Token(4), Letter::new(&[0b0010_0000], 3)),
151            (I32Token(3), Letter::new(&[0b0100_0000], 3)),
152            (I32Token(2), Letter::new(&[0b0110_0000], 3)),
153            (I32Token(1), Letter::new(&[0b1000_0000], 3)),
154        ]
155        .iter()
156        .cloned()
157        .collect();
158        assert_eq!(t.map(), &want);
159    }
160}