Skip to main content

sit_algos/
huffman_fixed.rs

1use std::io::{self, Seek as _};
2
3use bitstream_io::Endianness;
4use bitstream_io::{BigEndian, BitRead, BitReader};
5
6const FIXED_CODE_LENGTHS: [usize; 257] = [
7    3, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9    8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
10    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 10, 10, 10, 10, 10, 10,
11    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
12    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
13    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
14    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
15    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
16    11, 11, 11, 11, 11, 11, 11, 11, 11, 13, 13, 12,
17];
18
19#[derive(thiserror::Error, Debug)]
20pub enum Error {
21    #[error("Invalid huffman tree")]
22    InvalidTree,
23
24    #[error("The input stream ended too soon")]
25    InsufficentSize,
26
27    #[error("Invalid stream")]
28    InvalidStream,
29
30    #[error(transparent)]
31    Io(#[from] io::Error),
32}
33
34impl From<Error> for io::Error {
35    fn from(val: Error) -> Self {
36        match val {
37            Error::Io(error) => error,
38            me => io::Error::other(me),
39        }
40    }
41}
42
43pub struct HuffmanReader<R: io::Read + io::Seek> {
44    inner: bitstream_io::BitReader<R, BigEndian>,
45    uncompressed_size: u64,
46    offset: u64,
47
48    tree: FixedHuffmanDecoder,
49    translations: [u8; 257],
50
51    current_block_size: usize,
52    current_block_offset: usize,
53
54    packbits_operation: packbits_rle::Operation,
55    packbits_cursor: io::Cursor<Vec<u8>>,
56}
57
58impl<R: io::Read + io::Seek> HuffmanReader<R> {
59    pub fn try_from(inner: R, uncompressed_size: u64) -> Result<Self, Error> {
60        let inner = bitstream_io::BitReader::new(inner);
61        let tree = Self::build_decoder();
62
63        Ok(Self {
64            inner,
65            uncompressed_size,
66            offset: 0,
67            tree,
68            translations: [0u8; 257],
69
70            current_block_size: 0,
71            current_block_offset: 0,
72
73            packbits_operation: Default::default(),
74            packbits_cursor: io::Cursor::new(Vec::new()),
75        })
76    }
77
78    fn build_decoder() -> FixedHuffmanDecoder {
79        let mut tree = FixedHuffmanDecoder::new();
80
81        let mut last_length = 0;
82        let mut last_code = 0;
83
84        FIXED_CODE_LENGTHS
85            .iter()
86            .enumerate()
87            .for_each(|(i, length)| {
88                let mut thiscode;
89
90                if last_length == 0 {
91                    thiscode = 0;
92                } else if *length < last_length {
93                    thiscode = (last_code >> (last_length - length)) + 1;
94                } else {
95                    thiscode = last_code + 1;
96                    if *length > last_length {
97                        thiscode <<= length - last_length;
98                    }
99                }
100
101                last_length = *length;
102                last_code = thiscode;
103
104                tree.add_value(i as i32, thiscode as u32, *length, *length)
105                    .unwrap();
106            });
107
108        tree.make_table(false);
109        tree
110    }
111
112    pub fn open_block(&mut self) -> io::Result<()> {
113        assert!(
114            self.inner.position_in_bits()? % 8 == 0,
115            "Should be opening at byte boundary",
116        );
117        let mut block_size: i32 = self.inner.read_var(32)?;
118        if block_size > 0 {
119            if block_size < 4 {
120                return Err(Error::InvalidStream)?;
121            }
122            block_size -= 4;
123
124            let uncompressed_block_size: u32 = self.inner.read_var(32)?;
125            self.current_block_offset = 0;
126            self.current_block_size = uncompressed_block_size as usize;
127
128            let bytes_read = self.read_huffman_symbols_for_block()?;
129            let mut chunk = vec![0u8; block_size as usize - bytes_read - 4];
130            self.inner.read_bytes(&mut chunk)?;
131            let chunk_reader = io::Cursor::new(chunk);
132            let mut bitstream: bitstream_io::BitReader<_, BigEndian> =
133                bitstream_io::BitReader::new(chunk_reader);
134            let mut buffer = vec![0u8; self.current_block_size + 1];
135            for (idx, byte) in buffer.iter_mut().enumerate() {
136                match self.tree.next_symbol(&mut bitstream)? {
137                    v if (0..256).contains(&v) => {
138                        self.current_block_offset += 1;
139                        *byte = self.translations[v as usize];
140                    }
141                    _ => {
142                        log::info!(
143                            "Stop code at byte {idx} (expected {})",
144                            self.current_block_size
145                        );
146                        buffer.truncate(idx);
147                        break;
148                    }
149                }
150            }
151            self.packbits_cursor = io::Cursor::new(buffer);
152
153            Ok(())
154        } else {
155            // packbits
156            self.current_block_offset = 0;
157            self.current_block_size = (-block_size) as usize;
158            if self.current_block_size < 4 {
159                return Err(Error::InvalidStream)?;
160            }
161            self.current_block_size -= 4;
162
163            let mut buffer = vec![0u8; self.current_block_size];
164            self.inner.read_bytes(&mut buffer)?;
165            self.packbits_cursor = io::Cursor::new(buffer);
166
167            Ok(())
168        }
169    }
170
171    fn next_byte(&mut self) -> io::Result<Option<u8>> {
172        if self.stream_position()? >= self.stream_len()? {
173            return Ok(None);
174        }
175
176        match self.packbits_operation.advance(&mut self.packbits_cursor) {
177            Ok((byte, next_state)) => {
178                self.packbits_operation = next_state;
179                Ok(Some(byte))
180            }
181            Err(packbits_rle::OperationError::InsufficientInput(command)) => {
182                self.open_block()?;
183                let (byte, next_state) = command.execute(&mut self.packbits_cursor)?;
184                self.packbits_operation = next_state;
185                Ok(Some(byte))
186            }
187            Err(packbits_rle::OperationError::UnexpectedEof) => {
188                self.open_block()?;
189                let (byte, next_state) =
190                    packbits_rle::Operation::default().advance(&mut self.packbits_cursor)?;
191                self.packbits_operation = next_state;
192                Ok(Some(byte))
193            }
194            Err(e) => Err(e)?,
195        }
196    }
197
198    fn read_huffman_symbols_for_block(&mut self) -> io::Result<usize> {
199        assert!(
200            self.inner.position_in_bits()? % 8 == 0,
201            "Should be reading symbols at byte boundary",
202        );
203        let symbol_count: u16 = self.inner.read_var(16)?;
204        if symbol_count >= 256 {
205            return Err(Error::InvalidStream)?;
206        }
207
208        //for i in 0..(symbol_count as usize) {
209        //self.translations[i] = self.inner.read_var(8)?;
210        //}
211        self.inner
212            .read_bytes(&mut self.translations[0..(symbol_count as usize)])?;
213
214        Ok(symbol_count as usize + 2)
215    }
216
217    pub fn into_inner(self) -> R {
218        self.inner.into_reader()
219    }
220}
221
222impl<R: io::Read + io::Seek> io::Read for HuffmanReader<R> {
223    #[inline]
224    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
225        for (idx, byte) in buf.iter_mut().enumerate() {
226            match self.next_byte()? {
227                Some(value) => {
228                    self.offset += 1;
229                    *byte = value;
230                }
231                None => return Ok(idx),
232            }
233        }
234
235        Ok(buf.len())
236    }
237}
238
239impl<R: io::Read + io::Seek> io::Seek for HuffmanReader<R> {
240    fn seek(&mut self, _: io::SeekFrom) -> io::Result<u64> {
241        todo!()
242    }
243
244    #[inline]
245    fn stream_len(&mut self) -> io::Result<u64> {
246        Ok(self.uncompressed_size)
247    }
248
249    #[inline]
250    fn stream_position(&mut self) -> io::Result<u64> {
251        Ok(self.offset)
252    }
253}
254
255#[derive(thiserror::Error, Debug)]
256pub enum InitializationError {
257    #[error("Prefix already exists")]
258    DuplicatedPrefix,
259    #[error("Invalid repeat position")]
260    InvalidRepeatPosition,
261    #[error("Invalid repeating code")]
262    InvaildRepeatingCode,
263}
264
265impl From<InitializationError> for io::Error {
266    fn from(value: InitializationError) -> Self {
267        io::Error::other(value)
268    }
269}
270
271#[derive(thiserror::Error, Debug)]
272pub enum DecompressionError {
273    #[error("Invalid prefix code when getting next symbol [length]")]
274    InvalidPrefixCodeLength,
275    #[error("Invalid prefix code when getting next symbol [code]")]
276    InvalidPrefixCodeCode,
277}
278
279impl From<DecompressionError> for io::Error {
280    fn from(value: DecompressionError) -> Self {
281        io::Error::other(value)
282    }
283}
284
285#[derive(Clone, Debug)]
286struct HuffmanTreeNode {
287    left: i32,
288    right: i32,
289}
290
291#[derive(Default, Clone)]
292struct HuffmanTableRow {
293    length: i32,
294    value: i32,
295}
296
297#[derive(Clone)]
298pub struct FixedHuffmanDecoder {
299    min_length: usize,
300    max_length: usize,
301
302    tree: Vec<HuffmanTreeNode>,
303
304    table: Option<Vec<HuffmanTableRow>>,
305    table_size: usize,
306}
307
308impl Default for FixedHuffmanDecoder {
309    fn default() -> Self {
310        Self::new()
311    }
312}
313
314impl FixedHuffmanDecoder {
315    pub fn new() -> Self {
316        let mut me = Self {
317            min_length: usize::MAX,
318            max_length: usize::MIN,
319            tree: Vec::new(),
320            table: None,
321            table_size: 0,
322        };
323        me.new_node();
324        me
325    }
326
327    pub fn initialize(
328        lengths: &[isize],
329        max_code_length: isize,
330        zeros: bool,
331    ) -> Result<Self, InitializationError> {
332        let mut me = Self::new();
333        let mut code = 0;
334        let mut unhandled_symbols = lengths.len();
335
336        for length in 1..=max_code_length {
337            for (i, cur_len) in lengths.iter().enumerate() {
338                if *cur_len != length {
339                    continue;
340                }
341
342                me.add_value(
343                    i as i32,
344                    if zeros { code } else { !code },
345                    length as usize,
346                    length as usize,
347                )?;
348
349                code += 1;
350
351                unhandled_symbols -= 1;
352                if unhandled_symbols == 0 {
353                    break;
354                }
355            }
356
357            code <<= 1;
358        }
359
360        Ok(me)
361    }
362
363    pub fn add_value(
364        &mut self,
365        value: i32,
366        code: u32,
367        length: usize,
368        repeat_pos: usize,
369    ) -> Result<(), InitializationError> {
370        self.max_length = self.max_length.max(length);
371        self.min_length = self.min_length.min(length);
372
373        let repeat_pos = length as isize - 1 - repeat_pos as isize;
374        let mut last_node = 0;
375
376        let codest = (((repeat_pos - 1) as u32) >> 1) & 3;
377        if repeat_pos == 0 || (repeat_pos >= 0 && (codest == 0 || codest == 3)) {
378            return Err(InitializationError::InvalidRepeatPosition);
379        }
380
381        let mut bitpos = length as isize - 1;
382        loop {
383            if bitpos < 0 {
384                break;
385            }
386
387            let bit = ((code >> bitpos) & 1) != 0;
388            if self.is_leaf_node(last_node) {
389                return Err(InitializationError::DuplicatedPrefix);
390            };
391
392            if bitpos == repeat_pos {
393                if !self.is_open_branch(last_node, bit) {
394                    return Err(InitializationError::InvaildRepeatingCode);
395                };
396
397                let repeat_node = self.new_node();
398                let next_node = self.new_node();
399
400                self.set_branch(last_node, bit, repeat_node);
401                self.set_branch(repeat_node, bit, repeat_node);
402                self.set_branch(repeat_node, !bit, next_node);
403
404                last_node = next_node;
405
406                bitpos += 1;
407            } else {
408                if self.is_open_branch(last_node, bit) {
409                    let new_node = self.new_node();
410                    self.set_branch(last_node, bit, new_node);
411                }
412                last_node = self.branch(last_node, bit);
413            }
414
415            bitpos -= 1;
416        }
417
418        if !self.is_empty_node(last_node) {
419            return Err(InitializationError::DuplicatedPrefix);
420        }
421
422        self.set_leaf_value(last_node, value);
423
424        Ok(())
425    }
426
427    pub(crate) fn new_node(&mut self) -> i32 {
428        self.tree.push(HuffmanTreeNode {
429            left: -1,
430            right: -2,
431        });
432        self.tree.len() as i32 - 1
433    }
434
435    pub(crate) fn set_leaf_value(&mut self, node: i32, value: i32) {
436        self.set_branch(node, false, value);
437        self.set_branch(node, true, value);
438    }
439
440    fn leaf_value(&self, node: i32) -> i32 {
441        assert!(self.branch(node, false) == self.branch(node, true));
442        self.branch(node, false)
443    }
444
445    fn is_empty_node(&self, node: i32) -> bool {
446        self.branch(node, false) == -1 && self.branch(node, true) == -2
447    }
448
449    fn is_leaf_node(&self, node: i32) -> bool {
450        self.branch(node, false) == self.branch(node, true)
451    }
452
453    fn is_open_branch(&self, node: i32, right_branch: bool) -> bool {
454        if right_branch {
455            self.tree[node as usize].right < 0
456        } else {
457            self.tree[node as usize].left < 0
458        }
459    }
460
461    pub(crate) fn set_branch(&mut self, node: i32, right_branch: bool, value: i32) {
462        if right_branch {
463            self.tree[node as usize].right = value;
464        } else {
465            self.tree[node as usize].left = value;
466        }
467    }
468
469    fn branch(&self, node: i32, right_branch: bool) -> i32 {
470        if right_branch {
471            self.tree[node as usize].right
472        } else {
473            self.tree[node as usize].left
474        }
475    }
476
477    fn is_invalid_node(&self, node: i32) -> bool {
478        node < 0
479    }
480
481    pub fn add_value_lf(
482        &mut self,
483        value: i32,
484        code: u32,
485        length: usize,
486        repeat_pos: usize,
487    ) -> Result<(), InitializationError> {
488        self.add_value(value, reverse_n(code, length), length, repeat_pos)
489    }
490
491    #[inline]
492    pub fn next_symbol<R: io::Read + io::Seek, E: Endianness>(
493        &mut self,
494        input: &mut bitstream_io::BitReader<R, E>,
495    ) -> io::Result<i32> {
496        let Some(_) = self.table.as_ref() else {
497            panic!("Huffman: search table not built");
498        };
499
500        let mut node = 0;
501        loop {
502            if self.is_leaf_node(node) {
503                break;
504            }
505            let bit = input.read_bit()?;
506            if self.is_open_branch(node, bit) {
507                return Err(DecompressionError::InvalidPrefixCodeCode)?;
508            }
509            node = self.branch(node, bit);
510        }
511
512        Ok(self.leaf_value(node))
513    }
514
515    fn make_table_recursive_le(&mut self, node: i32, table: &mut [HuffmanTableRow], depth: i32) {
516        let curr_table_size = 1 << (self.table_size as i32 - depth);
517        let curr_stride = 1 << depth;
518
519        if self.is_invalid_node(node) {
520            for i in 0..curr_table_size {
521                table[i * curr_stride].length = -1;
522            }
523        } else if self.is_leaf_node(node) {
524            for i in 0..curr_table_size {
525                table[i * curr_stride].length = depth;
526                table[i * curr_stride].value = self.leaf_value(node);
527            }
528        } else if depth == self.table_size as i32 {
529            table[0].length = self.table_size as i32 + 1;
530            table[0].value = node;
531        } else {
532            self.make_table_recursive_le(self.branch(node, false), table, depth + 1);
533            let size = table.len();
534            self.make_table_recursive_le(
535                self.branch(node, true),
536                &mut table[curr_stride..size],
537                depth + 1,
538            );
539        }
540    }
541
542    fn make_table_recursive_be(&mut self, node: i32, table: &mut [HuffmanTableRow], depth: i32) {
543        let curr_table_size = 1 << (self.table_size as i32 - depth);
544
545        if self.is_invalid_node(node) {
546            table
547                .iter_mut()
548                .take(curr_table_size)
549                .for_each(|i| i.length = -1);
550        } else if self.is_leaf_node(node) {
551            table.iter_mut().take(curr_table_size).for_each(|i| {
552                i.length = depth;
553                i.value = self.leaf_value(node);
554            });
555        } else if depth == self.table_size as i32 {
556            table[0].length = self.table_size as i32 + 1;
557            table[0].value = node;
558        } else {
559            self.make_table_recursive_be(self.branch(node, false), table, depth + 1);
560            let size = table.len();
561            self.make_table_recursive_be(
562                self.branch(node, true),
563                &mut table[(curr_table_size / 2)..size],
564                depth + 1,
565            );
566        }
567    }
568
569    pub fn make_table(&mut self, little_endian: bool) {
570        self.table_size = if self.max_length < self.min_length || self.max_length >= 10 {
571            10
572        } else {
573            self.max_length
574        };
575
576        let mut table = vec![Default::default(); 1 << self.table_size];
577        if little_endian {
578            self.make_table_recursive_le(0, &mut table, 0)
579        } else {
580            self.make_table_recursive_be(0, &mut table, 0)
581        }
582        self.table = Some(table);
583    }
584}
585
586#[inline]
587fn reverse(value: u32) -> u32 {
588    let mut val = value;
589    val = ((val >> 1) & 0x55555555) | ((val & 0x55555555) << 1);
590    val = ((val >> 2) & 0x33333333) | ((val & 0x33333333) << 2);
591    val = ((val >> 4) & 0x0F0F0F0F) | ((val & 0x0F0F0F0F) << 4);
592    val = ((val >> 8) & 0x00FF00FF) | ((val & 0x00FF00FF) << 8);
593    val.rotate_left(16)
594}
595
596#[inline]
597fn reverse_n(value: u32, length: usize) -> u32 {
598    reverse(value) >> (32 - length)
599}
600
601pub trait ReadWord {
602    fn peek_word(&mut self, bits: u32) -> io::Result<u32>;
603    fn read_word(&mut self, bits: u32) -> io::Result<u32>;
604}
605
606impl<R: io::Read + io::Seek, E: Endianness> ReadWord for BitReader<R, E> {
607    fn peek_word(&mut self, bits: u32) -> io::Result<u32> {
608        let start = self.position_in_bits().unwrap();
609        match self.read_var::<u32>(bits) {
610            Ok(result) => {
611                self.seek_bits(io::SeekFrom::Current(-(bits as i64)))?;
612                Ok(result)
613            }
614            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
615                let end = self.seek_bits(io::SeekFrom::End(0)).unwrap();
616                self.seek_bits(io::SeekFrom::Start(start)).unwrap();
617
618                if start < end {
619                    let available_bits = end - start;
620                    let result = self.read_var::<u32>(available_bits as u32)?
621                        << (bits as u64 - available_bits);
622                    self.seek_bits(io::SeekFrom::Current(-(available_bits as i64)))?;
623                    return Ok(result);
624                }
625                Err(e)
626            }
627            Err(e) => Err(e),
628        }
629    }
630
631    fn read_word(&mut self, bits: u32) -> io::Result<u32> {
632        let start = self.position_in_bits().unwrap();
633
634        match self.read_var::<u32>(bits) {
635            Ok(result) => Ok(result),
636            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => {
637                let end = self.seek_bits(io::SeekFrom::End(0)).unwrap();
638                if start < end {
639                    let available_bits = end - start;
640                    self.seek_bits(io::SeekFrom::Start(start)).unwrap();
641                    return Ok(self.read_var::<u32>(available_bits as u32)?
642                        << (bits as u64 - available_bits));
643                }
644
645                Err(e)
646            }
647            Err(e) => Err(e),
648        }
649    }
650}
651
652#[cfg(test)]
653mod test {
654    use super::*;
655
656    #[test]
657    fn successful_initialization() {
658        assert!(
659            FixedHuffmanDecoder::initialize(
660                &[
661                    3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
662                    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
663                    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
664                ],
665                8,
666                true,
667            )
668            .is_ok()
669        )
670    }
671}