bzip2_os/huffman_coding/huffman.rs
//! The main huffman function generates the bitstream for the Rust version of the standard BZIP2 library.
2//!
3//! BZIP2 is a block-oriented approach to compress data.
4//!
5//!
6//! The huffman coding algorithm is rather complex. The source code is generously commented to explain the algorithm.
//! It may be helpful to know that the process is iterative. Each chunk of 50 bytes is analyzed four times to maximize
8//! the compression ratio. Each iteration seeks to improve the codes generated by the previous iteration.
9//!
10//!
11//! The process of encoding each block is inherently sequential and does not benefit from multithreading.
12//!
13//!
14
15
16use log::{error, info, trace};
17
18use crate::bitstream::bitpacker::BitPacker;
19
20use super::huffman_code_from_weights::improve_code_len_from_weights;
21use std::cmp::Ordering;
22
23
/// Nodes are either terminal (leaf) or nonterminal (have two child nodes). The depth of the
/// node in the tree is what determines the weights of the huffman codes.
#[derive(Eq, PartialEq, PartialOrd, Ord, Debug, Clone)]
pub enum NodeData {
    /// Internal node: owns its two children (boxed because the type is recursive).
    Kids(Box<Node>, Box<Node>),
    /// Terminal node: holds the symbol value it represents.
    Leaf(u16),
}
31
/// Huffman codes are built from weights derived from a tree structure of these nodes.
#[derive(Debug, Clone)]
pub struct Node {
    /// The weight of the node is based on the frequency of the symbols under this node
    pub weight: u32,
    /// The depth of the node in the tree
    pub depth: u8,
    /// A unique identifier for this node based on the symbols of the child nodes, used in sorting
    pub syms: u32,
    /// The data associated with this node (either a leaf or a node that (eventually) has leaves)
    pub node_data: NodeData,
}
44impl Node {
45 /// Create a new node
46 pub fn new(weight: u32, depth: u8, syms: u32, node_data: NodeData) -> Node {
47 Node {
48 weight,
49 depth,
50 syms,
51 node_data,
52 }
53 }
54}
55impl PartialOrd for Node {
56 /// Sort Nodes by decreasing weight and decreasing symbol value
57 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
58 Some((other.weight, other.syms).cmp(&(self.weight, self.syms)))
59 }
60}
61impl Ord for Node {
62 /// Sort Nodes by decreasing weight and decreasing symbol value
63 fn cmp(&self, other: &Self) -> Ordering {
64 (other.weight, other.syms).cmp(&(self.weight, self.syms))
65 }
66}
67impl PartialEq for Node {
68 fn eq(&self, other: &Self) -> bool {
69 (self.weight, self.syms) == (other.weight, other.syms)
70 }
71}
72
73impl Eq for Node {}
74
75#[allow(clippy::unusual_byte_groupings)]
76/// Encode MTF/RLE2 data using Julian Seward's multi-table system.
77/// We need a BitPacker, block data, frequency array for the data, end of block symbol, and the symbol
78/// map that will be encoded with the data. Data is returned via the BitPacker.
79pub fn huf_encode(
80 bp: &mut BitPacker,
81 rle2: &[u16],
82 freq: &[u32; 256],
83 eob: u16,
84 symbol_map: &[u16],
85) {
86 // We can have 2-6 coding tables depending on how much data we have coming in.
87 let table_count: usize = match rle2.len() {
88 0..=199 => 2,
89 200..=599 => 3,
90 600..=1199 => 4,
91 1200..=2399 => 5,
92 _ => 6,
93 };
94
95 // Now we can initialize the coding tables based on our frequency counts
96 let mut tables = init_tables(freq, table_count, eob);
97
98 // And initialize a count of how many selectors we need, a vec to store them,
99 let selector_count = rle2.len() / 50 + usize::from(rle2.len() % 50 != 0);
100 let mut selectors = vec![0_usize; selector_count];
101
102 /*
103 So now we have our tables divided out by frequency ratios. Each symbol in each table
104 is either a 0 or a 15. At the end of this next loop, those will be adjusted as we test
105 against real data. These adjusted numbers are used to build a huffman tree, and
106 thereby the huffman codes.
107
108 We will iterate four times (default value, can be changed in options) to improve the tables.
109 ds: Nov 2022: Looking at trial runs, it seems that three trial runs gains almost the same
110 value as four times. Each larger size gains slightly more in ascii texts. Therefor I added
111 an option to let the user change the number of trial runs -- mostly for more testing purposes.
112 */
113
114 for iter in 0..4 {
115 // initialize favorites[] to 0 for each table/group, for reporting only
116 let mut favorites = [0; 6];
117
118 // Initilalize the total cost for this iteration (used only in reporting)
119 let mut total_cost = 0;
120
121 // Initialize "recalculated" frequency array for each table/group (for adjusting the tables)
122 let mut rfreq = [[0u32; 258]; 6];
123
124 // Reset the selectors on each iteration - not needed because we record them only on the last iteration
125 //selectors.clear();
126
127 /*
128 Time to move through the input 50 bytes at a time. For each group of 50, we
129 compute the best table to use based on the one that has the lowest "weight" cost.
130
131 NOTE: Julian did a trick with rolling all six 16 bit arrays into 3 32 bit arrays.
132 I'm not doing that here. If we do in the future, I believe we could use 1 128 bit array
133 for the same purpose (or possibly 2 64 bit arrays, or a struct).
134
135 Our goal is to find the coding table which has the lowest cost for this chunk
136 of data, and record that in the selector table.
137 */
138
139 rle2.chunks(50).enumerate().for_each(|(i, chunk)| {
140 // the cost array helps us find which table is best for each 50 byte chunk
141 let mut cost = [0; 6];
142
143 // Find the best table for the next chunk
144 // For each symbol, iterate through each table and calculate that tables cost for the symbol
145 chunk.iter().for_each(|symbol| {
146 (0..table_count).for_each(|t| cost[t] += tables[t][*symbol as usize])
147 });
148
149 // Get the position of the lowest non-zero cost (or first if several have the same low cost)
150 let bt = cost
151 .iter()
152 .position(|n| n == cost[0..table_count as usize].iter().min().unwrap())
153 .unwrap() as usize;
154
155 // Add that lowest cost to total_cost for the entire input data set
156 total_cost += cost[bt];
157
158 // For reporting only, increment the appropriate fave array with the index
159 // so we know how many times this table was chosen as "best"
160 favorites[bt] += 1;
161
162 // Now that we know the best table, go get the frequency counts for
163 // the symbols in this group of 50 bytes and store the freq counts into rfreq.
164 // As we go through an iteration, this becomes cumulative.
165 // This is used to improve the tables for the next iteration.
166 chunk
167 .iter()
168 .for_each(|&symbol| rfreq[bt as usize][symbol as usize] += 1);
169
170 // On the last iteration, collect the selector list
171 if iter == 3 {
172 selectors[i] = bt;
173 } // End of the for_each loop, we've gone through the entire input (again).
174 });
175
176 info!(
177 " pass {}: best cost is {}, grp uses are {:?}",
178 iter + 1,
179 total_cost / 8,
180 favorites
181 );
182
183 if iter == 3 {
184 trace!("Final tables:",);
185 for (i, table) in tables.iter().enumerate().take(table_count) {
186 trace!(
187 "\n {}: {:?}",
188 i,
189 table.iter().take(eob as usize + 1).collect::<Vec<_>>()
190 );
191 }
192 }
193
194 // Next we will call improve_code_len_from_weights on each of the tables we made.
195 // This makes actual node trees based off our weighting. This will put the
196 // improved weights into the weight arrays. As mentioned, we do this 4 times.
197 (0..table_count).for_each(|t| {
198 improve_code_len_from_weights(&mut tables[t], &rfreq[t], eob);
199 });
200 }
201 /*
202 All iterations are now done, and we have good tables and selectors.
203 Time to make actual binary codes for reach table. Since we have good lengths,
204 we can use the code_from_length function to quickly generate codes.
205 */
206
207 // Write out the the symbol maps, 16 bit L1 + 0-16 words of 16 bit L2 maps.
208 trace!("\r\x1b[43mSymbol maps written at {}. \x1b[0m", bp.loc());
209 for word in symbol_map {
210 bp.out16(*word);
211 trace!("\r\x1b[43m{:0>16b} \x1b[0m", word);
212 }
213
214 // Symbol maps are followed by a 3 bit number of Huffman trees that exist
215 trace!("\r\x1b[43mTable count written at {}. \x1b[0m", bp.loc());
216 bp.out24((3 << 24) | table_count as u32);
217
218 // Then a 15 bit number indicating the how many selectors are used
219 // (how many 50 byte groups are in this block of data)
220 trace!(
221 "\r\x1b[43mSelector count written at {}. \x1b[0m",
222 bp.loc()
223 );
224 bp.out24((15 << 24) | selector_count as u32);
225
226 /*
227 Selectors tell us which table is to be used for each 50 symbol chunk of input
228 data in this block.
229
230 Given a list of selectors such as [0,2,0,2,1,0], we can see that bytes
231 1-50 are decoded by table 0, 51-100 are decoded by table 2, etc.
232
233 HOWEVER, the selectors are written after a Move-To-Front transform, to save space.
234 */
235
236 // Initialize an index to the selector tables in preparation for the MTF transform for the selectors
237 let mut table_idx = vec![0, 1, 2, 3, 4, 5];
238
239 // Prepare the output selector vec
240 let mut mtf_selectors = vec![0_usize; selector_count];
241
242 // ...then do the MTF transform of the selector vector
243 for i in 0..selector_count {
244 // Get the index of the next selector
245 let mut idx = table_idx.iter().position(|c| c == &selectors[i]).unwrap();
246 // Write it to the MTF version of the selector vector
247 mtf_selectors[i as usize] = idx;
248 // Adjust the table index
249 // For speed, don't take time to adjust the index if we don't need to.
250 match idx {
251 0 => {}
252 1 => {
253 table_idx.swap(0, 1);
254 }
255 _ => {
256 // Shift each index at the front of selector "forward" one. Do blocks of 3 for speed.
257 let temp_sel = table_idx[idx as usize];
258
259 while idx > 2 {
260 table_idx[idx as usize] = table_idx[idx as usize - 1];
261 table_idx[idx as usize - 1] = table_idx[idx as usize - 2];
262 table_idx[idx as usize - 2] = table_idx[idx as usize - 3];
263 idx -= 3;
264 }
265 // ...then clean up any odd ones
266 while idx > 0 {
267 table_idx[idx as usize] = table_idx[idx as usize - 1];
268 idx -= 1;
269 }
270 // ...and finally put the "new" symbol index at the front of the index.
271 table_idx[0] = temp_sel;
272 }
273 }
274 }
275
276 // Now write out all the mtf'ed selectors
277 trace!(
278 "\r\x1b[43m{} Selectors written at {}. \x1b[0m",
279 mtf_selectors.len(),
280 bp.loc()
281 );
282 for selector in &mtf_selectors {
283 match selector {
284 0 => bp.out24(0x01000000),
285 1 => bp.out24(0x02000002),
286 2 => bp.out24(0x03000006),
287 3 => bp.out24(0x0400000e),
288 4 => bp.out24(0x0500001e),
289 5 => bp.out24(0x0600003e),
290 _ => error!("Bad selector value of {}", selector),
291 };
292 }
293 // All done with mtf_selectors.
294 drop(mtf_selectors);
295
296 /*
297 Now create the huffman codes. We need to convert our weights to huffman codes.
298 (And later we will want to use the BitPacker with those codes also.)
299 We will need both a vec of all output code tables, and a temporary place
300 to build each output-style table.
301
302 Remember, our tables are full 258 size arrays. We've done indexing and move-to-
303 front transforms, so we are using only the bottom portion of that array.
304
305 We will shift from an array format to a vec, which allows us to use Rust's
306 optimized sorting functions.
307 */
308
309 // Create the vec for the output-style code tables
310 let mut out_code_tables = vec![];
311
312 // For as many tables as we have, we have quite few steps to do
313 #[allow(clippy::needless_range_loop)]
314 for i in 0..table_count as usize {
315 // Because we use a fixed array of tables for speed, we must use an index to get only the ones we want
316 let table = tables[i];
317 // Calculate the size once
318 let sym_size = eob as usize + 1;
319 // Now create a output-style table
320 let mut out_codes = vec![(0_u16, 0_u32); sym_size];
321 // ... and create a vec of the symbols actually used
322 let mut len_sym: Vec<(u32, u16)> = table.iter().enumerate().take(sym_size).fold(
323 Vec::with_capacity(sym_size),
324 |mut vec, (i, t)| {
325 vec.push((*t, i as u16));
326 vec
327 },
328 );
329 // ... and sort that vec ascending by length
330 len_sym.sort_unstable();
331
332 /*
333 Get the minimum length in use so we can create the "next code".
334 Next_code is a tuple of length from len_sym and a 32 bit code we build.
335
336 Codes are sequential within each length range. For example, for a length
337 of 3, the codes would be 000, 001, 010, 011, etc.
338 */
339 // Initialize next_code to the length of the first (smallest) length, and 0.
340 let mut next_code: (u32, u32) = (len_sym[0].0, 0);
341
342 // Create a vec where we can store the codes.
343 let mut sym_codes = vec![(0_u16, 0_u32); sym_size];
344
345 /*
346 When the length changes, do a shift left for each increment and continue. So
347 for example, if the length is now 5 and the last code had a length of 3 and
348 was 010, we would now start with 01000, 01001, 01010, etc.
349
350 We store a version for the BitPacker, a format I also use in my hashmap
351 in the decompression side. This is in addition to the format we need below.
352
353 The length is in the most significant 8 bits, the code in the least.
354 For example if the length is 5 and the code is 11111, we'd see
355 01234567_XXXXXX_0123456780123456
356 00000101_000000_0000000000011111
357 X indicates space we never use. Excuse the odd _ marking. It is why I use
358 #[allow(clippy::unusual_byte_groupings)]
359 */
360 // For each symbol...
361 for (i, (len, sym)) in len_sym.iter().enumerate() {
362 if *len != next_code.0 {
363 next_code.1 <<= len - next_code.0;
364 next_code.0 = *len;
365 }
366 // ...save a version of the code in the BitPacker format
367 out_codes[i] = (*sym, len << 24 | next_code.1);
368
369 // ...and also save it for encoding below.
370 sym_codes[i] = (*sym, next_code.1);
371
372 // Increment the next_code.1 counter to generate the next code
373 next_code.1 += 1;
374 }
375
376 /*
377 Next we write out the symbol lengths that will be used in the decompression.
378 They start with an "origin" length of five bits taken from the first symbol.
379
380 Each symbol's length (INCLUDING THE FIRST SYMBOL) will be output as the delta
381 (difference) from the last symbol. Each delta is exactly 2 bits long, a 11 or
382 a 10. The end of the delta is indicated with a single zero bit.
383 It seems odd to me that we write the first symbol, which will ALWAYS have a
384 delta of zero.
385 */
386
387 // The len_sym vec now needs to be sorted by symbol, not length
388 len_sym.sort_unstable_by(|a, b| a.1.cmp(&b.1));
389
390 // We write the origin as a five bit int
391 let mut origin = len_sym[0].0;
392 trace!(
393 "\r\x1b[43mWriting origin {} for huffman map {} at {}. \x1b[0m",
394 origin,
395 i,
396 bp.loc()
397 );
398 bp.out24((5 << 24) | origin as u32);
399
400 // ... and iterate through the entire symbol list writing the deltas
401 for entry in len_sym.iter() {
402 trace!(
403 "\r\x1b[43mIndex for {} is {} (Delta = {}), written at {}. \x1b[0m",
404 entry.1,
405 entry.0,
406 origin as i32 - entry.0 as i32,
407 bp.loc(),
408 );
409 // get the next length
410 let (l, _) = entry;
411 // create the delta from the last length
412 let mut delta = *l as i32 - origin as i32;
413 // put this new length into origin for the next iteration of this loop
414 origin = *l;
415 // write out the length delta as ±1 repeatedly
416 loop {
417 match delta.cmp(&0) {
418 // if the delta is greater than 0, write 0x10
419 Ordering::Greater => {
420 bp.out24(0x02_000002);
421 // subtract one from the delta and loop again
422 delta -= 1;
423 }
424 // if the delta is less than 0, write 0x11
425 Ordering::Less => {
426 bp.out24(0x02_000003);
427 // add one t the delta and loop again
428 delta += 1;
429 }
430 // if there is no delta, break out of this loop
431 Ordering::Equal => {
432 break;
433 }
434 }
435 }
436 // write a single 0 bit to indicate we are done with this symbol's length code
437 bp.out24(0x01_000000);
438 }
439 // Sort the output codes by symbol before saving them in the output tables
440 out_codes.sort_unstable();
441 out_code_tables.push(out_codes);
442 }
443
444 /*
445 Now encode and write the data.
446 Each symbol in the input is basically an index to the code.
447 We do this using the 50 byte table selectors, so we have to switch that up regularly.
448 */
449
450 // For debugging
451 let mut index = 0;
452 for (idx, chunk) in rle2.chunks(50).enumerate() {
453 let table_idx = selectors[idx];
454 chunk.iter().for_each(|symbol| {
455 bp.out24(out_code_tables[table_idx][*symbol as usize].1);
456 // For debugging
457 trace!(
458 "\r\x1b[43mBP: RLE2 {} ({}) written at {}. \x1b[0m",
459 index,
460 *symbol,
461 bp.loc()
462 );
463 // For debugging
464 index += 1;
465 })
466 }
467 // All done
468}
469
#[allow(clippy::unusual_byte_groupings)]
/// Initialize 2-6 frequency tables based on the frequencies of the symbols in the data.
///
/// A weight of 0 marks "this symbol belongs to this table"; 15 marks "it does not".
/// Rust needs compile-time array sizes, so all six tables are always allocated even
/// when fewer are in use.
fn init_tables(freqs: &[u32], table_count: usize, eob: u16) -> [[u32; 258]; 6] {
    let mut tables = [[15_u32; 258]; 6];

    // Soft limit: each table should cover roughly an equal share of the total frequency.
    let portion_limit: u32 = freqs.iter().sum::<u32>() / table_count as u32;

    /* How this works is a bit weird.
    We initially make tables based on the frequency of the symbols. For example, say we
    have enough data for six tables. Some symbols will have greater frequency than other
    symbols - and because of our MTF, symbols like RUNA and RUNB will be very frequent in
    many cases.

    We assign a weight of 0 to each symbol that falls in a table's share of the total
    frequency, and a weight of 15 to every symbol that doesn't. If we have lots of RUNA
    symbols, over 1/6 of the frequency may be RUNA alone, so one table would give RUNA a
    0 and everything else a 15. The next table gets as many symbols as needed to reach
    the next 1/6th of the frequency, and so on.

    After making these initial tables, the caller runs through the data 50 bytes at a
    time to see which table has the lowest "cost" for each group, adjusts the weights,
    and repeats. Julian must have found this works better than a straight huffman tree
    built from the frequencies of the whole block.

    Note: tables 2 and 4 (0-indexed) are special-cased to stop just shy of the limit
    rather than just over it; otherwise the last tables might not get enough symbols.
    */

    // Fill from the highest-numbered table in use down to table 0, accumulating
    // frequency into `running` until a table's share of the total is consumed.
    let mut tbl = table_count - 1;
    let mut running: u32 = 0;

    for (sym, &f) in freqs.iter().take(eob as usize + 1).enumerate() {
        let over_limit = running + f > portion_limit;
        if over_limit && (tbl == 2 || tbl == 4) {
            // Special case: close this table early and start the symbol in the next one.
            tbl = tbl.saturating_sub(1);
            tables[tbl][sym] = 0;
            running = f;
            if running > portion_limit {
                // This single symbol already exceeds a full share; give it this
                // table and move on to the next.
                tables[tbl][sym] = 0;
                tbl = tbl.saturating_sub(1);
                running = 0;
            }
        } else {
            // Normal case: the symbol joins the current table; advance to the next
            // table once this one's share is exceeded.
            running += f;
            tables[tbl][sym] = 0;
            if running > portion_limit {
                tbl = tbl.saturating_sub(1);
                running = 0;
            }
        }
    }
    tables
}