cshannon/encoding/
balanced_tree.rs

1// Copyright 2020 Prathmesh Prabhu
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Create a new balanced tree [`Encoding`].
16//!
17//! All letters in this encoding are fixed width bit strings. The smallest width
18//! necessary to generate the required number of [`Letter`]s is used.
19//!
20//! The generated [`Encoding`] is stable: calling `new` on a [`Model`]
21//! repeatedly yields the same [`Encoding`].
22
23use super::Encoding;
24use crate::code::Letter;
25use crate::model::Model;
26use crate::tokens::Token;
27use anyhow::{anyhow, Result};
28use std::collections::HashMap;
29
30/// Create a new balanced tree encoding.
31///
32/// See [package documentation] for details.
33///
34/// [package documentation]: index.html
35pub fn new<T>(m: Model<T>) -> Result<Encoding<T>>
36where
37    T: Token,
38{
39    let mut map = HashMap::new();
40    let mut letter_generator = LetterGenerator::new(log2(m.len() as u64))?;
41    for t in m.tokens_sorted() {
42        match letter_generator.next() {
43            // Programming error, since bit_count should guarantee we never run
44            // out of letters.
45            None => panic!("Ran out of letters".to_owned()),
46            Some(l) => {
47                map.insert(t, l);
48            }
49        }
50    }
51    Ok(Encoding::new(map)?)
52}
53
54struct LetterGenerator {
55    bit_count: u64,
56    current: u64,
57    max: u64,
58}
59
60impl LetterGenerator {
61    pub fn new(bit_count: u64) -> Result<Self> {
62        if bit_count > 64 {
63            return Err(anyhow!("model has too many keys"));
64        }
65        Ok(Self {
66            bit_count,
67            // Do not use the code all-0s as it is indistinguishable from
68            // trailing 0s when decompressing text.
69            current: 1,
70            max: 1 << bit_count,
71        })
72    }
73}
74
75impl Iterator for LetterGenerator {
76    type Item = Letter;
77
78    fn next(&mut self) -> Option<Self::Item> {
79        if self.current >= self.max {
80            return None;
81        }
82        let l = Letter::new(
83            &(self.current << (64 - self.bit_count))
84                .to_be_bytes()
85                .to_vec(),
86            self.bit_count as u64,
87        );
88        self.current += 1;
89        Some(l)
90    }
91}
92
93// TODO: rename
94fn log2(n: u64) -> u64 {
95    let max = std::mem::size_of::<u64>() as u64 * 8;
96    let zeroes = n.leading_zeros() as u64;
97    max - zeroes
98}
99
100#[cfg(test)]
101mod tests {
102    use super::*;
103    use crate::code::Letter;
104    use crate::model;
105    use crate::tokens::test_utils::I32Token;
106    use std::collections::HashMap;
107
108    #[test]
109    fn empty() {
110        let m: Model<I32Token> = model::with_frequencies(&[]);
111        let t = new(m).unwrap();
112        assert!(t.alphabet().is_empty());
113        assert!(t.map().is_empty());
114    }
115
116    #[test]
117    fn one_token() {
118        let m = model::with_frequencies(&[(I32Token(1), 2)]);
119        let t = new(m).unwrap();
120        assert_eq!(t.alphabet().len(), 1);
121        let want: HashMap<I32Token, Letter> = [(I32Token(1), Letter::new(&[0b1000_0000], 1))]
122            .iter()
123            .cloned()
124            .collect();
125        assert_eq!(t.map(), &want);
126    }
127
128    #[test]
129    fn max_tokens_for_some_tree_height() {
130        let m = model::with_frequencies(&[(I32Token(1), 1), (I32Token(2), 2), (I32Token(3), 3)]);
131        let t = new(m).unwrap();
132        assert_eq!(t.alphabet().len(), 3);
133        let want: HashMap<I32Token, Letter> = [
134            (I32Token(3), Letter::new(&[0b0100_0000], 2)),
135            (I32Token(2), Letter::new(&[0b1000_0000], 2)),
136            (I32Token(1), Letter::new(&[0b1100_0000], 2)),
137        ]
138        .iter()
139        .cloned()
140        .collect();
141        assert_eq!(t.map(), &want);
142    }
143
144    #[test]
145    fn max_tokens_for_some_tree_height_then_one() {
146        let m = model::with_frequencies(&[
147            (I32Token(1), 1),
148            (I32Token(2), 2),
149            (I32Token(3), 3),
150            (I32Token(4), 4),
151        ]);
152        let t = new(m).unwrap();
153        assert_eq!(t.alphabet().len(), 4);
154        let want: HashMap<I32Token, Letter> = [
155            (I32Token(4), Letter::new(&[0b0010_0000], 3)),
156            (I32Token(3), Letter::new(&[0b0100_0000], 3)),
157            (I32Token(2), Letter::new(&[0b0110_0000], 3)),
158            (I32Token(1), Letter::new(&[0b1000_0000], 3)),
159        ]
160        .iter()
161        .cloned()
162        .collect();
163        assert_eq!(t.map(), &want);
164    }
165}