// bzip2_os/huffman_coding/huffman.rs

//! The main huffman function generates the bitstream for the Rust version of the standard BZIP2 library.
//!
//! BZIP2 is a block-oriented approach to compress data.
//!
//! The huffman coding algorithm is rather complex. The source code is generously commented to explain the algorithm.
//! It may be helpful to know that the process is iterative. Each chunk of 50 bytes is analyzed four times to maximize
//! the compression ratio. Each iteration seeks to improve the codes generated by the previous iteration.
//!
//! The process of encoding each block is inherently sequential and does not benefit from multithreading.

15
16use log::{error, info, trace};
17
18use crate::bitstream::bitpacker::BitPacker;
19
20use super::huffman_code_from_weights::improve_code_len_from_weights;
21use std::cmp::Ordering;
22
23
/// Nodes are either terminal (leaf) or nonterminal (have two child nodes). The depth of the node
/// in the tree is what determines the weights (code lengths) of the huffman codes.
#[derive(Eq, PartialEq, PartialOrd, Ord, Debug, Clone)]
pub enum NodeData {
    /// Internal node: exactly two children, boxed so the enum has a fixed size.
    Kids(Box<Node>, Box<Node>),
    /// Terminal node: carries the symbol value it encodes.
    Leaf(u16),
}
31
/// Huffman codes are built from weights derived from a tree structure of these nodes.
#[derive(Debug, Clone)]
pub struct Node {
    /// The weight of the node, based on the frequency of the symbols under this node
    pub weight: u32,
    /// The depth of the node in the tree
    pub depth: u8,
    /// A unique identifier for this node based on the symbols of the child nodes, used in sorting
    pub syms: u32,
    /// The data associated with this node (either a leaf or a node that (eventually) has leaves)
    pub node_data: NodeData,
}
44impl Node {
45    /// Create a new node
46    pub fn new(weight: u32, depth: u8, syms: u32, node_data: NodeData) -> Node {
47        Node {
48            weight,
49            depth,
50            syms,
51            node_data,
52        }
53    }
54}
55impl PartialOrd for Node {
56    /// Sort Nodes by decreasing weight and decreasing symbol value
57    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
58        Some((other.weight, other.syms).cmp(&(self.weight, self.syms)))
59    }
60}
61impl Ord for Node {
62    /// Sort Nodes by decreasing weight and decreasing symbol value
63    fn cmp(&self, other: &Self) -> Ordering {
64        (other.weight, other.syms).cmp(&(self.weight, self.syms))
65    }
66}
67impl PartialEq for Node {
68    fn eq(&self, other: &Self) -> bool {
69        (self.weight, self.syms) == (other.weight, other.syms)
70    }
71}
72
73impl Eq for Node {}
74
#[allow(clippy::unusual_byte_groupings)]
/// Encode MTF/RLE2 data using Julian Seward's multi-table system.
/// We need a BitPacker, block data, frequency array for the data, end of block symbol, and the symbol
/// map that will be encoded with the data. Data is returned via the BitPacker.
pub fn huf_encode(
    bp: &mut BitPacker,
    rle2: &[u16],
    freq: &[u32; 256],
    eob: u16,
    symbol_map: &[u16],
) {
    // We can have 2-6 coding tables depending on how much data we have coming in.
    let table_count: usize = match rle2.len() {
        0..=199 => 2,
        200..=599 => 3,
        600..=1199 => 4,
        1200..=2399 => 5,
        _ => 6,
    };

    // Now we can initialize the coding tables based on our frequency counts.
    let mut tables = init_tables(freq, table_count, eob);

    // One selector per 50-symbol chunk (rounding up for a partial final chunk).
    let selector_count = rle2.len() / 50 + usize::from(rle2.len() % 50 != 0);
    let mut selectors = vec![0_usize; selector_count];

    /*
     So now we have our tables divided out by frequency ratios. Each symbol in each table
     is either a 0 or a 15. At the end of this next loop, those will be adjusted as we test
     against real data. These adjusted numbers are used to build a huffman tree, and
     thereby the huffman codes.

     We will iterate four times to improve the tables.
     ds: Nov 2022: Looking at trial runs, it seems that three trial runs gains almost the same
     value as four times. Each larger size gains slightly more in ascii texts. Therefore I added
     an option to let the user change the number of trial runs -- mostly for more testing purposes.
     NOTE(review): this version hard-codes four iterations (0..4) -- confirm against the
     options handling elsewhere in the crate.
    */

    for iter in 0..4 {
        // Initialize favorites[] to 0 for each table/group, for reporting only.
        let mut favorites = [0; 6];

        // Initialize the total cost for this iteration (used only in reporting).
        let mut total_cost = 0;

        // Initialize "recalculated" frequency array for each table/group (for adjusting the tables).
        let mut rfreq = [[0u32; 258]; 6];

        // Resetting the selectors on each iteration is not needed because we record them
        // only on the last iteration.
        //selectors.clear();

        /*
        Time to move through the input 50 bytes at a time. For each group of 50, we
        compute the best table to use based on the one that has the lowest "weight" cost.

        NOTE: Julian did a trick with rolling all six 16 bit arrays into 3 32 bit arrays.
        I'm not doing that here. If we do in the future, I believe we could use 1 128 bit array
        for the same purpose (or possibly 2 64 bit arrays, or a struct).

        Our goal is to find the coding table which has the lowest cost for this chunk
        of data, and record that in the selector table.
        */

        rle2.chunks(50).enumerate().for_each(|(i, chunk)| {
            // The cost array helps us find which table is best for each 50 byte chunk.
            let mut cost = [0; 6];

            // Find the best table for the next chunk:
            // for each symbol, add every table's weight for that symbol into that table's cost.
            chunk.iter().for_each(|symbol| {
                (0..table_count).for_each(|t| cost[t] += tables[t][*symbol as usize])
            });

            // Get the position of the lowest cost among the tables in use (first one wins ties).
            // The scan starts at index 0, so an active table is always found before the
            // unused trailing entries.
            let bt = cost
                .iter()
                .position(|n| n == cost[0..table_count as usize].iter().min().unwrap())
                .unwrap() as usize;

            // Add that lowest cost to total_cost for the entire input data set (reporting only).
            total_cost += cost[bt];

            // For reporting only, count how many times this table was chosen as "best".
            favorites[bt] += 1;

            // Now that we know the best table, accumulate the frequency counts for the
            // symbols in this group of 50 bytes into rfreq for that table. Across the
            // whole pass this is cumulative, and is used to improve the tables for the
            // next iteration.
            chunk
                .iter()
                .for_each(|&symbol| rfreq[bt as usize][symbol as usize] += 1);

            // On the last iteration, record the winning table in the selector list.
            if iter == 3 {
                selectors[i] = bt;
            } // End of the for_each loop, we've gone through the entire input (again).
        });

        info!(
            " pass {}: best cost is {}, grp uses are {:?}",
            iter + 1,
            total_cost / 8,
            favorites
        );

        // On the final iteration, dump the finished weight tables for tracing.
        if iter == 3 {
            trace!("Final tables:",);
            for (i, table) in tables.iter().enumerate().take(table_count) {
                trace!(
                    "\n      {}: {:?}",
                    i,
                    table.iter().take(eob as usize + 1).collect::<Vec<_>>()
                );
            }
        }

        // Next we call improve_code_len_from_weights on each of the tables we made.
        // This builds actual node trees from our weighting and writes the improved
        // lengths back into the weight arrays. As mentioned, we do this 4 times.
        (0..table_count).for_each(|t| {
            improve_code_len_from_weights(&mut tables[t], &rfreq[t], eob);
        });
    }
    /*
      All iterations are now done, and we have good tables and selectors.
      Time to make actual binary codes for each table. Since we have good lengths,
      we can generate sequential codes per length quickly (see below).
    */

    // Write out the symbol maps: a 16 bit L1 map followed by 0-16 words of 16 bit L2 maps.
    trace!("\r\x1b[43mSymbol maps written at {}.     \x1b[0m", bp.loc());
    for word in symbol_map {
        bp.out16(*word);
        trace!("\r\x1b[43m{:0>16b}     \x1b[0m", word);
    }

    // Symbol maps are followed by a 3 bit count of the Huffman tables that exist.
    // (out24 takes the bit length in the high byte and the value in the low bits.)
    trace!("\r\x1b[43mTable count written at {}.     \x1b[0m", bp.loc());
    bp.out24((3 << 24) | table_count as u32);

    // Then a 15 bit number indicating how many selectors are used
    // (how many 50 byte groups are in this block of data).
    trace!(
        "\r\x1b[43mSelector count written at {}.     \x1b[0m",
        bp.loc()
    );
    bp.out24((15 << 24) | selector_count as u32);

    /*
    Selectors tell us which table is to be used for each 50 symbol chunk of input
    data in this block.

    Given a list of selectors such as [0,2,0,2,1,0], we can see that bytes
    1-50 are decoded by table 0, 51-100 are decoded by table 2, etc.

    HOWEVER, the selectors are written after a Move-To-Front transform, to save space.
    */

    // Initialize an index to the selector tables in preparation for the MTF transform.
    let mut table_idx = vec![0, 1, 2, 3, 4, 5];

    // Prepare the output selector vec.
    let mut mtf_selectors = vec![0_usize; selector_count];

    // ...then do the MTF transform of the selector vector.
    for i in 0..selector_count {
        // Get the current MTF position of the next selector.
        let mut idx = table_idx.iter().position(|c| c == &selectors[i]).unwrap();
        // Write it to the MTF version of the selector vector.
        mtf_selectors[i as usize] = idx;
        // Adjust the table index.
        // For speed, don't take time to adjust the index if we don't need to.
        match idx {
            0 => {}
            1 => {
                table_idx.swap(0, 1);
            }
            _ => {
                // Shift each index in front of the selector "forward" one. Do blocks of 3 for speed.
                let temp_sel = table_idx[idx as usize];

                while idx > 2 {
                    table_idx[idx as usize] = table_idx[idx as usize - 1];
                    table_idx[idx as usize - 1] = table_idx[idx as usize - 2];
                    table_idx[idx as usize - 2] = table_idx[idx as usize - 3];
                    idx -= 3;
                }
                // ...then clean up any odd ones.
                while idx > 0 {
                    table_idx[idx as usize] = table_idx[idx as usize - 1];
                    idx -= 1;
                }
                // ...and finally put the "new" symbol index at the front of the index.
                table_idx[0] = temp_sel;
            }
        }
    }

    // Now write out all the mtf'ed selectors. Each selector value n is written as
    // n one-bits followed by a zero bit (unary coding), pre-packed as length|bits.
    trace!(
        "\r\x1b[43m{} Selectors written at {}.     \x1b[0m",
        mtf_selectors.len(),
        bp.loc()
    );
    for selector in &mtf_selectors {
        match selector {
            0 => bp.out24(0x01000000),
            1 => bp.out24(0x02000002),
            2 => bp.out24(0x03000006),
            3 => bp.out24(0x0400000e),
            4 => bp.out24(0x0500001e),
            5 => bp.out24(0x0600003e),
            _ => error!("Bad selector value of {}", selector),
        };
    }
    // All done with mtf_selectors.
    drop(mtf_selectors);

    /*
    Now create the huffman codes. We need to convert our weights to huffman codes.
    (And later we will want to use the BitPacker with those codes also.)
    We will need both a vec of all output code tables, and a temporary place
    to build each output-style table.

    Remember, our tables are full 258 size arrays. We've done indexing and move-to-
    front transforms, so we are using only the bottom portion of that array.

    We will shift from an array format to a vec, which allows us to use Rust's
    optimized sorting functions.
    */

    // Create the vec for the output-style code tables.
    let mut out_code_tables = vec![];

    // For as many tables as we have, we have quite a few steps to do.
    #[allow(clippy::needless_range_loop)]
    for i in 0..table_count as usize {
        // Because we use a fixed array of tables for speed, we must index to get only the ones we want.
        let table = tables[i];
        // Calculate the symbol count once (symbols 0..=eob are in use).
        let sym_size = eob as usize + 1;
        // Now create an output-style table of (symbol, packed code) pairs.
        let mut out_codes = vec![(0_u16, 0_u32); sym_size];
        // ... and create a vec of (length, symbol) for the symbols actually used.
        let mut len_sym: Vec<(u32, u16)> = table.iter().enumerate().take(sym_size).fold(
            Vec::with_capacity(sym_size),
            |mut vec, (i, t)| {
                vec.push((*t, i as u16));
                vec
            },
        );
        // ... and sort that vec ascending by length (symbol breaks ties).
        len_sym.sort_unstable();

        /*
        Get the minimum length in use so we can create the "next code".
        Next_code is a tuple of length from len_sym and a 32 bit code we build.

        Codes are sequential within each length range. For example, for a length
        of 3, the codes would be 000, 001, 010, 011, etc.
        */
        // Initialize next_code to the first (smallest) length, and code 0.
        let mut next_code: (u32, u32) = (len_sym[0].0, 0);

        // Create a vec where we can store the codes.
        // NOTE(review): sym_codes is populated below but never read again in this
        // function -- confirm whether it can be removed.
        let mut sym_codes = vec![(0_u16, 0_u32); sym_size];

        /*
        When the length changes, do a shift left for each increment and continue. So
        for example, if the length is now 5 and the last code had a length of 3 and
        was 010, we would now start with 01000, 01001, 01010, etc.

        We store a version for the BitPacker, a format I also use in my hashmap
        in the decompression side. This is in addition to the format we need below.

        The length is in the most significant 8 bits, the code in the least.
        For example if the length is 5 and the code is 11111, we'd see
            01234567_XXXXXX_0123456780123456
            00000101_000000_0000000000011111
        X indicates space we never use. Excuse the odd _ marking. It is why I use
        #[allow(clippy::unusual_byte_groupings)]
        */
        // For each symbol...
        for (i, (len, sym)) in len_sym.iter().enumerate() {
            // When the code length steps up, left-shift the running code to the new width.
            if *len != next_code.0 {
                next_code.1 <<= len - next_code.0;
                next_code.0 = *len;
            }
            // ...save a version of the code in the BitPacker length|bits format.
            out_codes[i] = (*sym, len << 24 | next_code.1);

            // ...and also save the bare code (see NOTE above about sym_codes).
            sym_codes[i] = (*sym, next_code.1);

            // Increment the next_code.1 counter to generate the next sequential code.
            next_code.1 += 1;
        }

        /*
        Next we write out the symbol lengths that will be used in the decompression.
        They start with an "origin" length of five bits taken from the first symbol.

        Each symbol's length (INCLUDING THE FIRST SYMBOL) will be output as the delta
        (difference) from the last symbol. Each delta step is exactly 2 bits long, a
        binary 11 (decrement) or 10 (increment). The end of the delta is indicated
        with a single zero bit. It seems odd to me that we write the first symbol,
        which will ALWAYS have a delta of zero.
        */

        // The len_sym vec now needs to be sorted by symbol, not length.
        len_sym.sort_unstable_by(|a, b| a.1.cmp(&b.1));

        // We write the origin as a five bit int.
        let mut origin = len_sym[0].0;
        trace!(
            "\r\x1b[43mWriting origin {} for huffman map {} at {}.   \x1b[0m",
            origin,
            i,
            bp.loc()
        );
        bp.out24((5 << 24) | origin as u32);

        // ... and iterate through the entire symbol list writing the deltas.
        for entry in len_sym.iter() {
            trace!(
                "\r\x1b[43mIndex for {} is {} (Delta = {}), written at {}.     \x1b[0m",
                entry.1,
                entry.0,
                origin as i32 - entry.0 as i32,
                bp.loc(),
            );
            // Get the next length.
            let (l, _) = entry;
            // Create the delta from the last length.
            let mut delta = *l as i32 - origin as i32;
            // Put this new length into origin for the next iteration of this loop.
            origin = *l;
            // Write out the length delta as +/-1 steps repeatedly.
            loop {
                match delta.cmp(&0) {
                    // If the delta is greater than 0, write the 2 bits 10...
                    Ordering::Greater => {
                        bp.out24(0x02_000002);
                        // ...subtract one from the delta and loop again.
                        delta -= 1;
                    }
                    // If the delta is less than 0, write the 2 bits 11...
                    Ordering::Less => {
                        bp.out24(0x02_000003);
                        // ...add one to the delta and loop again.
                        delta += 1;
                    }
                    // If there is no delta left, break out of this loop.
                    Ordering::Equal => {
                        break;
                    }
                }
            }
            // Write a single 0 bit to indicate we are done with this symbol's length code.
            bp.out24(0x01_000000);
        }
        // Sort the output codes by symbol before saving them in the output tables,
        // so the encoding loop below can index them directly by symbol value.
        out_codes.sort_unstable();
        out_code_tables.push(out_codes);
    }

    /*
    Now encode and write the data.
    Each symbol in the input is basically an index to the code.
    We do this using the 50 byte table selectors, so we have to switch tables regularly.
    */

    // For debugging
    let mut index = 0;
    for (idx, chunk) in rle2.chunks(50).enumerate() {
        // Each 50-symbol chunk is encoded with the table its selector chose.
        let table_idx = selectors[idx];
        chunk.iter().for_each(|symbol| {
            bp.out24(out_code_tables[table_idx][*symbol as usize].1);
            // For debugging
            trace!(
                "\r\x1b[43mBP: RLE2 {} ({}) written at {}.   \x1b[0m",
                index,
                *symbol,
                bp.loc()
            );
            // For debugging
            index += 1;
        })
    }
    // All done
}
469
#[allow(clippy::unusual_byte_groupings)]
/// Initialize 2-6 weight tables based on the frequencies of the symbols in the data.
/// Returns all six tables; only the first `table_count` are meaningful to the caller.
fn init_tables(freqs: &[u32], table_count: usize, eob: u16) -> [[u32; 258]; 6] {
    // Initialize the tables to weights of 15. Since Rust requires compile time array
    // sizing, let's just make 6 even though we might need less.
    let mut tables = [[15_u32; 258]; 6];

    // Then set the soft limits to divide the data out to the tables.
    let portion_limit: u32 = freqs.iter().sum::<u32>() / table_count as u32;
    /* How this works is a bit weird.
    We initially make tables based on the frequency of the symbols. For example, say we
    have enough data for six tables. Some symbols will have greater frequency than other
    symbols - and because of our MTF, symbols like RUNA and RUNB will be very frequent in
    many cases.

    We will build the tables based on symbol frequency. We assign a weight of zero to each
    possible symbol for those symbols that are in this table, and a weight of 15 for any
    symbol that doesn't get into this table. If we have lots of RUNA symbols, it is very
    possible that over 1/6 of the frequency will be RUNA symbols. So this table would have
    a weight of 0 given to RUNA and a weight of 15 given to every other symbol. The next
    table gets as many symbols as needed to get to the next 1/6th of the frequency, with
    weights similarly apportioned.

    After making these initial tables, we run through the data 50 bytes at a time and see
    which table results in the lowest "cost" for those 50 bytes. We adjust costs/weights
    and repeat three more times. Julian must have found that this works better than just
    doing a straight-up huffman tree based on frequencies of the entire block.
    */

    // Update our coding tables. Note: the tables at 0-based indices 2 and 4 (the 3rd
    // and 5th tables) are unique in that they get just shy of the limit rather than
    // just over the limit. If we did not do this, we may not get enough symbols in
    // the last tables.

    // First set our table index to the last table we need, and the portion sum to 0.
    // Tables are filled from the highest index down to table 0.
    let mut table_index = table_count - 1;
    let mut portion = 0;
    // For each symbol add the frequency to portion and set the weight value for this
    // symbol in this table to 0. If the current portion meets the portion limit
    // (based on how many groups we have, and remembering the special handling for
    // indices 2 and 4) decrement the table index to point to the next table and
    // reset the portion sum. Keep going through all the symbols (0..=eob).
    for (i, freq) in freqs.iter().enumerate().take(eob as usize + 1) {
        let f = freq;
        if portion + f > portion_limit && (table_index == 2 || table_index == 4) {
            // Special case: close this table *before* adding the symbol, so the
            // symbol starts the next table instead of overshooting this one.
            table_index = table_index.saturating_sub(1);
            tables[table_index][i] = 0;
            portion = *f;
            // A single very frequent symbol can exceed the limit all by itself;
            // give it its own table and move on. (saturating_sub keeps us at
            // table 0 rather than underflowing.)
            if portion > portion_limit {
                tables[table_index][i] = 0;
                table_index = table_index.saturating_sub(1);
                portion = 0;
            }
        } else {
            // Normal case: the symbol joins the current table, and the table is
            // closed once the portion passes the limit.
            portion += f;
            tables[table_index][i] = 0;
            if portion > portion_limit {
                table_index = table_index.saturating_sub(1);
                portion = 0;
            }
        };
    }
    tables
}