loona_hpack/
huffman.rs

1//! A module exposing utilities for encoding and decoding Huffman-coded octet
2//! strings, under the Huffman code defined by HPACK.
3//! (HPACK-draft-10, Appendix B)
4
5use std::collections::HashMap;
6
7/// Represents a symbol that can be inserted into a Huffman-encoded octet
8/// string.
9enum HuffmanCodeSymbol {
10    /// Any octet is a valid symbol
11    Symbol(u8),
12    /// A special symbol represents the end of the string
13    EndOfString,
14}
15
16impl HuffmanCodeSymbol {
17    pub fn new(symbol: usize) -> HuffmanCodeSymbol {
18        if symbol == 256 {
19            HuffmanCodeSymbol::EndOfString
20        } else {
21            // It is safe to downcast since now we know that the value
22            // is in the half-open interval [0, 256)
23            HuffmanCodeSymbol::Symbol(symbol as u8)
24        }
25    }
26}
27
28#[derive(thiserror::Error, PartialEq, Copy, Clone, Debug)]
29#[non_exhaustive]
30pub enum HuffmanDecoderError {
31    /// Any padding strictly larger than 7 bits MUST be interpreted as an error
32    #[error("Padding too large")]
33    PaddingTooLarge,
34    /// Any padding that does not correspond to the most significant bits of
35    /// EOS MUST be interpreted as an error.
36    #[error("Invalid padding")]
37    InvalidPadding,
38    /// If EOS is ever found in the string, it causes an error.
39    #[error("EOS in string")]
40    EOSInString,
41}
42
43/// The type that represents the result of the `decode` method of the
44/// `HuffmanDecoder`.
45pub type HuffmanDecoderResult = Result<Vec<u8>, HuffmanDecoderError>;
46
47/// A simple implementation of a Huffman code decoder.
48pub struct HuffmanDecoder {
49    table: HashMap<u8, HashMap<u32, HuffmanCodeSymbol>>,
50    // The representation of the EOS: the left-aligned code representation and
51    // the actual length of the codepoint, as a tuple.
52    eos_codepoint: (u32, u8),
53}
54
55impl HuffmanDecoder {
56    /// Constructs a new `HuffmanDecoder` using the given table of
57    /// (code point, code length) tuples to represent the Huffman code.
58    fn from_table(table: &[(u32, u8)]) -> HuffmanDecoder {
59        if table.len() != 257 {
60            panic!("Invalid Huffman code table. It must define exactly 257 symbols.");
61        }
62
63        let mut decoder_table: HashMap<u8, HashMap<u32, HuffmanCodeSymbol>> = HashMap::new();
64        let mut eos_codepoint: Option<(u32, u8)> = None;
65
66        for (symbol, &(code, code_len)) in table.iter().enumerate() {
67            decoder_table.entry(code_len).or_default();
68            let subtable = decoder_table.get_mut(&code_len).unwrap();
69            let huff_symbol = HuffmanCodeSymbol::new(symbol);
70            if let HuffmanCodeSymbol::EndOfString = huff_symbol {
71                // We also remember the code point of the EOS for easier
72                // reference later on.
73                eos_codepoint = Some((code, code_len));
74            };
75            subtable.insert(code, huff_symbol);
76        }
77
78        HuffmanDecoder {
79            table: decoder_table,
80            eos_codepoint: eos_codepoint.unwrap(),
81        }
82    }
83
84    /// Constructs a new HuffmanDecoder with the default Huffman code table, as
85    /// defined in the HPACK-draft-10, Appendix B.
86    pub fn new() -> HuffmanDecoder {
87        HuffmanDecoder::from_table(HUFFMAN_CODE_TABLE)
88    }
89
90    /// Decodes the buffer `buf` into a newly allocated `Vec`.
91    ///
92    /// It assumes that the entire buffer should be considered as the Huffman
93    /// encoding of an octet string and handles the padding rules
94    /// accordingly.
95    pub fn decode(&mut self, buf: &[u8]) -> HuffmanDecoderResult {
96        let mut current: u32 = 0;
97        let mut current_len: u8 = 0;
98        let mut result: Vec<u8> = Vec::new();
99
100        for b in BitIterator::new(buf.iter()) {
101            current_len += 1;
102            current <<= 1;
103            if b {
104                current |= 1;
105            }
106
107            if self.table.contains_key(&current_len) {
108                let length_table = self.table.get(&current_len).unwrap();
109                if length_table.contains_key(&current) {
110                    let decoded_symbol = match length_table.get(&current).unwrap() {
111                        HuffmanCodeSymbol::Symbol(symbol) => *symbol,
112                        HuffmanCodeSymbol::EndOfString => {
113                            // If the EOS symbol is detected within the stream,
114                            // we need to consider it an error.
115                            return Err(HuffmanDecoderError::EOSInString);
116                        }
117                    };
118                    result.push(decoded_symbol);
119                    current = 0;
120                    current_len = 0;
121                }
122            }
123        }
124
125        // Now we need to verify that the padding is correct.
126        // The spec mandates that the padding must not be strictly longer than
127        // 7 bits and that it must represent the most significant bits of the
128        // EOS symbol's code.
129
130        // First: the check for the length of the padding
131        if current_len > 7 {
132            return Err(HuffmanDecoderError::PaddingTooLarge);
133        }
134
135        // Second: the padding corresponds to the most-significant bits of the
136        // EOS symbol.
137        // Align both of them to have their most significant bit as the most
138        // significant bit of a u32.
139        let right_align_current = if current_len == 0 {
140            0
141        } else {
142            current << (32 - current_len)
143        };
144        let right_align_eos = self.eos_codepoint.0 << (32 - self.eos_codepoint.1);
145        // Now take only the necessary amount of most significant bit of EOS.
146        // The mask defines a bit pattern of `current_len` leading set bits,
147        // followed by the rest of the bits 0.
148        let mask = if current_len == 0 {
149            0
150        } else {
151            ((1 << current_len) - 1) << (32 - current_len)
152        };
153        // The mask is now used to strip the unwanted bits of the EOS
154        let eos_mask = right_align_eos & mask;
155
156        if eos_mask != right_align_current {
157            return Err(HuffmanDecoderError::InvalidPadding);
158        }
159
160        Ok(result)
161    }
162}
163
164/// A helper struct that represents an iterator over individual bits of all
165/// bytes found in a wrapped Iterator over bytes.
166/// Bits are represented as `bool`s, where `true` corresponds to a set bit and
167/// `false` to a 0 bit.
168///
169/// Bits are yielded in order of significance, starting from the
170/// most-significant bit.
171struct BitIterator<'a, I: Iterator> {
172    buffer_iterator: I,
173    current_byte: Option<&'a u8>,
174    /// The bit-position within the current byte
175    pos: u8,
176}
177
178impl<'a, I: Iterator> BitIterator<'a, I>
179where
180    I: Iterator<Item = &'a u8>,
181{
182    pub fn new(iterator: I) -> BitIterator<'a, I> {
183        BitIterator::<'a, I> {
184            buffer_iterator: iterator,
185            current_byte: None,
186            pos: 7,
187        }
188    }
189}
190
191impl<'a, I> Iterator for BitIterator<'a, I>
192where
193    I: Iterator<Item = &'a u8>,
194{
195    type Item = bool;
196
197    fn next(&mut self) -> Option<bool> {
198        if self.current_byte.is_none() {
199            self.current_byte = self.buffer_iterator.next();
200            self.pos = 7;
201        }
202
203        // If we still have `None`, it means the buffer has been exhausted
204        self.current_byte?;
205
206        let b = *self.current_byte.unwrap();
207
208        let is_set = (b & (1 << self.pos)) == (1 << self.pos);
209        if self.pos == 0 {
210            // We have exhausted all bits from the current byte -- try to get
211            // a new one on the next pass.
212            self.current_byte = None;
213        } else {
214            // Still more bits left here...
215            self.pos -= 1;
216        }
217
218        Some(is_set)
219    }
220}
221
222static HUFFMAN_CODE_TABLE: &[(u32, u8)] = &[
223    (0x1ff8, 13),
224    (0x7fffd8, 23),
225    (0xfffffe2, 28),
226    (0xfffffe3, 28),
227    (0xfffffe4, 28),
228    (0xfffffe5, 28),
229    (0xfffffe6, 28),
230    (0xfffffe7, 28),
231    (0xfffffe8, 28),
232    (0xffffea, 24),
233    (0x3ffffffc, 30),
234    (0xfffffe9, 28),
235    (0xfffffea, 28),
236    (0x3ffffffd, 30),
237    (0xfffffeb, 28),
238    (0xfffffec, 28),
239    (0xfffffed, 28),
240    (0xfffffee, 28),
241    (0xfffffef, 28),
242    (0xffffff0, 28),
243    (0xffffff1, 28),
244    (0xffffff2, 28),
245    (0x3ffffffe, 30),
246    (0xffffff3, 28),
247    (0xffffff4, 28),
248    (0xffffff5, 28),
249    (0xffffff6, 28),
250    (0xffffff7, 28),
251    (0xffffff8, 28),
252    (0xffffff9, 28),
253    (0xffffffa, 28),
254    (0xffffffb, 28),
255    (0x14, 6),
256    (0x3f8, 10),
257    (0x3f9, 10),
258    (0xffa, 12),
259    (0x1ff9, 13),
260    (0x15, 6),
261    (0xf8, 8),
262    (0x7fa, 11),
263    (0x3fa, 10),
264    (0x3fb, 10),
265    (0xf9, 8),
266    (0x7fb, 11),
267    (0xfa, 8),
268    (0x16, 6),
269    (0x17, 6),
270    (0x18, 6),
271    (0x0, 5),
272    (0x1, 5),
273    (0x2, 5),
274    (0x19, 6),
275    (0x1a, 6),
276    (0x1b, 6),
277    (0x1c, 6),
278    (0x1d, 6),
279    (0x1e, 6),
280    (0x1f, 6),
281    (0x5c, 7),
282    (0xfb, 8),
283    (0x7ffc, 15),
284    (0x20, 6),
285    (0xffb, 12),
286    (0x3fc, 10),
287    (0x1ffa, 13),
288    (0x21, 6),
289    (0x5d, 7),
290    (0x5e, 7),
291    (0x5f, 7),
292    (0x60, 7),
293    (0x61, 7),
294    (0x62, 7),
295    (0x63, 7),
296    (0x64, 7),
297    (0x65, 7),
298    (0x66, 7),
299    (0x67, 7),
300    (0x68, 7),
301    (0x69, 7),
302    (0x6a, 7),
303    (0x6b, 7),
304    (0x6c, 7),
305    (0x6d, 7),
306    (0x6e, 7),
307    (0x6f, 7),
308    (0x70, 7),
309    (0x71, 7),
310    (0x72, 7),
311    (0xfc, 8),
312    (0x73, 7),
313    (0xfd, 8),
314    (0x1ffb, 13),
315    (0x7fff0, 19),
316    (0x1ffc, 13),
317    (0x3ffc, 14),
318    (0x22, 6),
319    (0x7ffd, 15),
320    (0x3, 5),
321    (0x23, 6),
322    (0x4, 5),
323    (0x24, 6),
324    (0x5, 5),
325    (0x25, 6),
326    (0x26, 6),
327    (0x27, 6),
328    (0x6, 5),
329    (0x74, 7),
330    (0x75, 7),
331    (0x28, 6),
332    (0x29, 6),
333    (0x2a, 6),
334    (0x7, 5),
335    (0x2b, 6),
336    (0x76, 7),
337    (0x2c, 6),
338    (0x8, 5),
339    (0x9, 5),
340    (0x2d, 6),
341    (0x77, 7),
342    (0x78, 7),
343    (0x79, 7),
344    (0x7a, 7),
345    (0x7b, 7),
346    (0x7ffe, 15),
347    (0x7fc, 11),
348    (0x3ffd, 14),
349    (0x1ffd, 13),
350    (0xffffffc, 28),
351    (0xfffe6, 20),
352    (0x3fffd2, 22),
353    (0xfffe7, 20),
354    (0xfffe8, 20),
355    (0x3fffd3, 22),
356    (0x3fffd4, 22),
357    (0x3fffd5, 22),
358    (0x7fffd9, 23),
359    (0x3fffd6, 22),
360    (0x7fffda, 23),
361    (0x7fffdb, 23),
362    (0x7fffdc, 23),
363    (0x7fffdd, 23),
364    (0x7fffde, 23),
365    (0xffffeb, 24),
366    (0x7fffdf, 23),
367    (0xffffec, 24),
368    (0xffffed, 24),
369    (0x3fffd7, 22),
370    (0x7fffe0, 23),
371    (0xffffee, 24),
372    (0x7fffe1, 23),
373    (0x7fffe2, 23),
374    (0x7fffe3, 23),
375    (0x7fffe4, 23),
376    (0x1fffdc, 21),
377    (0x3fffd8, 22),
378    (0x7fffe5, 23),
379    (0x3fffd9, 22),
380    (0x7fffe6, 23),
381    (0x7fffe7, 23),
382    (0xffffef, 24),
383    (0x3fffda, 22),
384    (0x1fffdd, 21),
385    (0xfffe9, 20),
386    (0x3fffdb, 22),
387    (0x3fffdc, 22),
388    (0x7fffe8, 23),
389    (0x7fffe9, 23),
390    (0x1fffde, 21),
391    (0x7fffea, 23),
392    (0x3fffdd, 22),
393    (0x3fffde, 22),
394    (0xfffff0, 24),
395    (0x1fffdf, 21),
396    (0x3fffdf, 22),
397    (0x7fffeb, 23),
398    (0x7fffec, 23),
399    (0x1fffe0, 21),
400    (0x1fffe1, 21),
401    (0x3fffe0, 22),
402    (0x1fffe2, 21),
403    (0x7fffed, 23),
404    (0x3fffe1, 22),
405    (0x7fffee, 23),
406    (0x7fffef, 23),
407    (0xfffea, 20),
408    (0x3fffe2, 22),
409    (0x3fffe3, 22),
410    (0x3fffe4, 22),
411    (0x7ffff0, 23),
412    (0x3fffe5, 22),
413    (0x3fffe6, 22),
414    (0x7ffff1, 23),
415    (0x3ffffe0, 26),
416    (0x3ffffe1, 26),
417    (0xfffeb, 20),
418    (0x7fff1, 19),
419    (0x3fffe7, 22),
420    (0x7ffff2, 23),
421    (0x3fffe8, 22),
422    (0x1ffffec, 25),
423    (0x3ffffe2, 26),
424    (0x3ffffe3, 26),
425    (0x3ffffe4, 26),
426    (0x7ffffde, 27),
427    (0x7ffffdf, 27),
428    (0x3ffffe5, 26),
429    (0xfffff1, 24),
430    (0x1ffffed, 25),
431    (0x7fff2, 19),
432    (0x1fffe3, 21),
433    (0x3ffffe6, 26),
434    (0x7ffffe0, 27),
435    (0x7ffffe1, 27),
436    (0x3ffffe7, 26),
437    (0x7ffffe2, 27),
438    (0xfffff2, 24),
439    (0x1fffe4, 21),
440    (0x1fffe5, 21),
441    (0x3ffffe8, 26),
442    (0x3ffffe9, 26),
443    (0xffffffd, 28),
444    (0x7ffffe3, 27),
445    (0x7ffffe4, 27),
446    (0x7ffffe5, 27),
447    (0xfffec, 20),
448    (0xfffff3, 24),
449    (0xfffed, 20),
450    (0x1fffe6, 21),
451    (0x3fffe9, 22),
452    (0x1fffe7, 21),
453    (0x1fffe8, 21),
454    (0x7ffff3, 23),
455    (0x3fffea, 22),
456    (0x3fffeb, 22),
457    (0x1ffffee, 25),
458    (0x1ffffef, 25),
459    (0xfffff4, 24),
460    (0xfffff5, 24),
461    (0x3ffffea, 26),
462    (0x7ffff4, 23),
463    (0x3ffffeb, 26),
464    (0x7ffffe6, 27),
465    (0x3ffffec, 26),
466    (0x3ffffed, 26),
467    (0x7ffffe7, 27),
468    (0x7ffffe8, 27),
469    (0x7ffffe9, 27),
470    (0x7ffffea, 27),
471    (0x7ffffeb, 27),
472    (0xffffffe, 28),
473    (0x7ffffec, 27),
474    (0x7ffffed, 27),
475    (0x7ffffee, 27),
476    (0x7ffffef, 27),
477    (0x7fffff0, 27),
478    (0x3ffffee, 26),
479    (0x3fffffff, 30),
480];
481
482#[cfg(test)]
483mod tests {
484    use super::BitIterator;
485    use super::HuffmanDecoder;
486    use super::HuffmanDecoderError;
487
488    /// A helper function that converts the given slice containing values `1`
489    /// and `0` to a `Vec` of `bool`s, according to the number.
490    fn to_expected_bit_result(numbers: &[u8]) -> Vec<bool> {
491        numbers.iter().map(|b| -> bool { *b == 1 }).collect()
492    }
493
494    #[test]
495    fn test_bit_iterator_single_byte() {
496        let expected_result = to_expected_bit_result(&[0, 0, 0, 0, 1, 0, 1, 0]);
497
498        let mut res: Vec<bool> = Vec::new();
499        for b in BitIterator::new([10u8].iter()) {
500            res.push(b);
501        }
502
503        assert_eq!(res, expected_result);
504    }
505
506    #[test]
507    fn test_bit_iterator_multiple_bytes() {
508        let expected_result = to_expected_bit_result(&[
509            0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
510            0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
511        ]);
512
513        let mut res: Vec<bool> = Vec::new();
514        for b in BitIterator::new([10u8, 255, 128, 1, 0, 170].iter()) {
515            res.push(b);
516        }
517
518        assert_eq!(res, expected_result);
519    }
520
521    /// Simple tests for the Huffman decoder -- whether it can decode a single
522    /// character code represented additionally with only a single byte.
523    #[test]
524    fn test_huffman_code_single_byte() {
525        let mut decoder = HuffmanDecoder::new();
526        // (The + (2^n - 1) at the final byte is to add the correct expected
527        // padding: 1s)
528        {
529            // We need to shift it by 3, since we need the top-order bytes to
530            // start the code point.
531            let hex_buffer = [(0x7 << 3) + 7];
532            let expected_result = vec![b'o'];
533
534            let result = decoder.decode(&hex_buffer).ok().unwrap();
535
536            assert_eq!(result, expected_result);
537        }
538        {
539            #[allow(clippy::identity_op)] // for communicating intent
540            let hex_buffer = [0x0 + 7];
541            let expected_result = vec![b'0'];
542
543            let result = decoder.decode(&hex_buffer).ok().unwrap();
544
545            assert_eq!(result, expected_result);
546        }
547        {
548            // The length of the codepoint is 6, so we shift by two
549            let hex_buffer = [(0x21 << 2) + 3];
550            let expected_result = vec![b'A'];
551
552            let result = decoder.decode(&hex_buffer).ok().unwrap();
553
554            assert_eq!(result, expected_result);
555        }
556    }
557
558    /// Tests that the Huffman decoder can decode a single character made of
559    /// multiple bytes.
560    #[test]
561    fn test_huffman_code_single_char_multiple_byte() {
562        let mut decoder = HuffmanDecoder::new();
563        // (The + (2^n - 1) at the final byte is to add the correct expected
564        // padding: 1s)
565        {
566            let hex_buffer = [255, 160 + 15];
567            let expected_result = vec![b'#'];
568
569            let result = decoder.decode(&hex_buffer).ok().unwrap();
570
571            assert_eq!(result, expected_result);
572        }
573        {
574            let hex_buffer = [255, 200 + 7];
575            let expected_result = vec![b'$'];
576
577            let result = decoder.decode(&hex_buffer).ok().unwrap();
578
579            assert_eq!(result, expected_result);
580        }
581        {
582            let hex_buffer = [255, 255, 255, 240 + 3];
583            let expected_result = vec![10];
584
585            let result = decoder.decode(&hex_buffer).ok().unwrap();
586
587            assert_eq!(result, expected_result);
588        }
589    }
590
591    #[test]
592    fn test_huffman_code_multiple_chars() {
593        let mut decoder = HuffmanDecoder::new();
594        {
595            let hex_buffer = [254, 1];
596            let expected_result = vec![b'!', b'0'];
597
598            let result = decoder.decode(&hex_buffer).ok().unwrap();
599
600            assert_eq!(result, expected_result);
601        }
602        {
603            let hex_buffer = [(0x14 << 2) | 0x3, 248];
604            let expected_result = vec![b' ', b'!'];
605
606            let result = decoder.decode(&hex_buffer).ok().unwrap();
607
608            assert_eq!(result, expected_result);
609        }
610    }
611
612    /// Tests that if we find the EOS symbol in the stream, we consider it an
613    /// error.
614    #[test]
615    fn test_eos_is_error() {
616        let mut decoder = HuffmanDecoder::new();
617        {
618            // The EOS is an all-ones pattern of length 30
619            let hex_buffer = [0xFF, 0xFF, 0xFF, 0xFF];
620
621            let result = decoder.decode(&hex_buffer);
622
623            assert_eq!(
624                HuffmanDecoderError::EOSInString,
625                match result {
626                    Err(e) => e,
627                    _ => panic!("Expected error due to EOS symbol in string"),
628                }
629            );
630        }
631        {
632            // Full EOS after a valid character.
633            let hex_buffer = [0x3F, 0xFF, 0xFF, 0xFF, 0xFF];
634
635            let result = decoder.decode(&hex_buffer);
636
637            assert_eq!(
638                HuffmanDecoderError::EOSInString,
639                match result {
640                    Err(e) => e,
641                    _ => panic!("Expected error due to EOS symbol in string"),
642                }
643            );
644        }
645    }
646
647    /// Tests that when there are 7 or less padding bits, the string is
648    /// correctly decoded.
649    #[test]
650    fn test_short_padding_okay() {
651        let mut decoder = HuffmanDecoder::new();
652        let hex_buffer = [0x3F];
653
654        let result = decoder.decode(&hex_buffer);
655
656        assert_eq!(b"o".to_vec(), result.ok().unwrap());
657    }
658
659    /// Tests that when there are more than 7 padding bits, we get an error.
660    #[test]
661    fn test_padding_too_long() {
662        let mut decoder = HuffmanDecoder::new();
663        let hex_buffer = [0x3F, 0xFF];
664
665        let result = decoder.decode(&hex_buffer);
666
667        assert_eq!(
668            HuffmanDecoderError::PaddingTooLarge,
669            match result {
670                Err(e) => e,
671                _ => panic!("Expected `PaddingTooLarge` error"),
672            }
673        );
674    }
675
676    /// Tests that when there is a certain number of padding bits that deviate
677    /// from the most significant bits of the EOS symbol, we get an error.
678    #[test]
679    fn test_padding_invalid() {
680        let mut decoder = HuffmanDecoder::new();
681        {
682            let hex_buffer = [0x3E];
683
684            let result = decoder.decode(&hex_buffer);
685
686            assert_eq!(
687                HuffmanDecoderError::InvalidPadding,
688                match result {
689                    Err(e) => e,
690                    _ => panic!("Expected `InvalidPadding` error"),
691                }
692            );
693        }
694        {
695            let hex_buffer = [254, 0];
696
697            let result = decoder.decode(&hex_buffer);
698
699            assert_eq!(
700                HuffmanDecoderError::InvalidPadding,
701                match result {
702                    Err(e) => e,
703                    _ => panic!("Expected `InvalidPadding` error"),
704                }
705            );
706        }
707    }
708}