compression/bzip2/
decoder.rs

1//! rust-compression
2//!
3//! # Licensing
4//! This Source Code is subject to the terms of the Mozilla Public License
5//! version 2.0 (the "License"). You can obtain a copy of the License at
6//! <http://mozilla.org/MPL/2.0/>.
7
8use crate::bitio::direction::left::Left;
9use crate::bitio::reader::{BitRead, BitReader};
10use crate::bitset::BitArray;
11use crate::bzip2::error::BZip2Error;
12use crate::bzip2::mtf::MtfPositionDecoder;
13use crate::bzip2::{HEADER_h, BZ_G_SIZE, HEADER_0, HEADER_B, HEADER_Z};
14use crate::core::hash::{BuildHasher, Hasher};
15use crate::crc32::{BuiltinDigest, IEEE_NORMAL};
16use crate::huffman::decoder::HuffmanDecoder;
17use crate::traits::decoder::{BitDecodeService, BitDecoderImpl, Decoder};
18#[cfg(not(feature = "std"))]
19use alloc::string::String;
20#[cfg(not(feature = "std"))]
21#[allow(unused_imports)]
22use alloc::vec;
23#[cfg(not(feature = "std"))]
24use alloc::vec::Vec;
25use log::debug;
26
27const BZ2_R_NUMS: [usize; 512] = [
28    619, 720, 127, 481, 931, 816, 813, 233, 566, 247, 985, 724, 205, 454, 863,
29    491, 741, 242, 949, 214, 733, 859, 335, 708, 621, 574, 73, 654, 730, 472,
30    419, 436, 278, 496, 867, 210, 399, 680, 480, 51, 878, 465, 811, 169, 869,
31    675, 611, 697, 867, 561, 862, 687, 507, 283, 482, 129, 807, 591, 733, 623,
32    150, 238, 59, 379, 684, 877, 625, 169, 643, 105, 170, 607, 520, 932, 727,
33    476, 693, 425, 174, 647, 73, 122, 335, 530, 442, 853, 695, 249, 445, 515,
34    909, 545, 703, 919, 874, 474, 882, 500, 594, 612, 641, 801, 220, 162, 819,
35    984, 589, 513, 495, 799, 161, 604, 958, 533, 221, 400, 386, 867, 600, 782,
36    382, 596, 414, 171, 516, 375, 682, 485, 911, 276, 98, 553, 163, 354, 666,
37    933, 424, 341, 533, 870, 227, 730, 475, 186, 263, 647, 537, 686, 600, 224,
38    469, 68, 770, 919, 190, 373, 294, 822, 808, 206, 184, 943, 795, 384, 383,
39    461, 404, 758, 839, 887, 715, 67, 618, 276, 204, 918, 873, 777, 604, 560,
40    951, 160, 578, 722, 79, 804, 96, 409, 713, 940, 652, 934, 970, 447, 318,
41    353, 859, 672, 112, 785, 645, 863, 803, 350, 139, 93, 354, 99, 820, 908,
42    609, 772, 154, 274, 580, 184, 79, 626, 630, 742, 653, 282, 762, 623, 680,
43    81, 927, 626, 789, 125, 411, 521, 938, 300, 821, 78, 343, 175, 128, 250,
44    170, 774, 972, 275, 999, 639, 495, 78, 352, 126, 857, 956, 358, 619, 580,
45    124, 737, 594, 701, 612, 669, 112, 134, 694, 363, 992, 809, 743, 168, 974,
46    944, 375, 748, 52, 600, 747, 642, 182, 862, 81, 344, 805, 988, 739, 511,
47    655, 814, 334, 249, 515, 897, 955, 664, 981, 649, 113, 974, 459, 893, 228,
48    433, 837, 553, 268, 926, 240, 102, 654, 459, 51, 686, 754, 806, 760, 493,
49    403, 415, 394, 687, 700, 946, 670, 656, 610, 738, 392, 760, 799, 887, 653,
50    978, 321, 576, 617, 626, 502, 894, 679, 243, 440, 680, 879, 194, 572, 640,
51    724, 926, 56, 204, 700, 707, 151, 457, 449, 797, 195, 791, 558, 945, 679,
52    297, 59, 87, 824, 713, 663, 412, 693, 342, 606, 134, 108, 571, 364, 631,
53    212, 174, 643, 304, 329, 343, 97, 430, 751, 497, 314, 983, 374, 822, 928,
54    140, 206, 73, 263, 980, 736, 876, 478, 430, 305, 170, 514, 364, 692, 829,
55    82, 855, 953, 676, 246, 369, 970, 294, 750, 807, 827, 150, 790, 288, 923,
56    804, 378, 215, 828, 592, 281, 565, 555, 710, 82, 896, 831, 547, 261, 524,
57    462, 293, 465, 502, 56, 661, 821, 976, 991, 658, 869, 905, 758, 745, 193,
58    768, 550, 608, 933, 378, 286, 215, 979, 792, 961, 61, 688, 793, 644, 986,
59    403, 106, 366, 905, 644, 372, 567, 466, 434, 645, 210, 389, 550, 919, 135,
60    780, 773, 635, 389, 707, 100, 626, 958, 165, 504, 920, 176, 193, 713, 857,
61    265, 203, 50, 668, 108, 645, 990, 626, 197, 510, 357, 358, 850, 858, 364,
62    936, 638,
63];
64
65#[derive(Debug)]
66struct BlockRandomise {
67    n2go: usize,
68    t_pos: usize,
69}
70
71impl BlockRandomise {
72    pub(crate) fn new() -> Self {
73        Self { n2go: 0, t_pos: 0 }
74    }
75
76    pub(crate) fn reset(&mut self) {
77        self.n2go = 0;
78        self.t_pos = 0;
79    }
80
81    pub(crate) fn next(&mut self) -> bool {
82        if self.n2go == 0 {
83            self.n2go = BZ2_R_NUMS[self.t_pos];
84            self.t_pos += 1;
85            if self.t_pos == 512 {
86                self.t_pos = 0;
87            }
88        }
89        self.n2go -= 1;
90        self.n2go == 1
91    }
92}
93
94#[derive(Debug)]
95pub(crate) struct BZip2DecoderBase {
96    block_no: usize,
97    block_size_100k: usize,
98    combined_crc: u32,
99    block_crc: u32,
100    block_crc_digest: BuiltinDigest,
101    tt: Vec<u32>,
102    n_block_used: usize,
103    t_pos: u32,
104    block_randomise: BlockRandomise,
105    block_randomised: bool,
106    result_count: usize,
107    result_wrote_count: usize,
108    result_charactor: u8,
109    stream_no: usize,
110}
111
112impl Default for BZip2DecoderBase {
113    fn default() -> Self {
114        Self::new()
115    }
116}
117
118impl BZip2DecoderBase {
119    const RUN_A: u16 = 0;
120    const RUN_B: u16 = 1;
121
122    pub(crate) fn new() -> Self {
123        Self {
124            block_no: 0,
125            block_size_100k: 0,
126            combined_crc: 0,
127            block_crc: 0,
128            block_crc_digest: IEEE_NORMAL.build_hasher(),
129            tt: Vec::new(),
130            n_block_used: 0,
131            t_pos: 0,
132            block_randomise: BlockRandomise::new(),
133            block_randomised: false,
134            result_count: 0,
135            result_wrote_count: 0,
136            result_charactor: 0,
137            stream_no: 1,
138        }
139    }
140
141    fn read_u8<R: BitRead, I: Iterator<Item = u8>>(
142        reader: &mut R,
143        iter: &mut I,
144    ) -> Result<u8, String> {
145        reader.read_bits(8, iter).map(|x| x.data())
146    }
147
148    fn read_u32<R: BitRead, I: Iterator<Item = u8>>(
149        reader: &mut R,
150        iter: &mut I,
151    ) -> Result<u32, String> {
152        reader.read_bits(32, iter).map(|x| x.data())
153    }
154
155    fn check_u8<R: BitRead, I: Iterator<Item = u8>>(
156        reader: &mut R,
157        iter: &mut I,
158        value: u8,
159    ) -> Result<bool, String> {
160        Self::read_u8(reader, iter).map(|x| x == value)
161    }
162
163    fn init_block<R: BitRead, I: Iterator<Item = u8>>(
164        &mut self,
165        reader: &mut R,
166        iter: &mut I,
167    ) -> Result<bool, BZip2Error> {
168        loop {
169            if self.block_no == 0 {
170                let magic_err = if self.stream_no == 1 {
171                    BZip2Error::DataErrorMagicFirst
172                } else {
173                    BZip2Error::DataErrorMagic
174                };
175                let _ = Self::check_u8(reader, iter, HEADER_B)
176                    .map_err(|_| magic_err)?;
177                let _ = Self::check_u8(reader, iter, HEADER_Z)
178                    .map_err(|_| magic_err)?;
179                let _ = Self::check_u8(reader, iter, HEADER_h)
180                    .map_err(|_| magic_err)?;
181                self.block_size_100k = {
182                    let b = Self::read_u8(reader, iter)
183                        .map_err(|_| BZip2Error::UnexpectedEof)?;
184                    if b < 1 + HEADER_0 || b > 9 + HEADER_0 {
185                        return Err(magic_err);
186                    }
187                    usize::from(b - HEADER_0)
188                };
189            } else {
190                let data_block_crc = self.block_crc_digest.finish() as u32;
191                debug!(
192                    " {{0x{:08x}, 0x{:08x}}}]",
193                    self.block_crc, data_block_crc
194                );
195                if data_block_crc != self.block_crc {
196                    return Err(BZip2Error::DataError);
197                }
198                self.combined_crc = ((self.combined_crc << 1)
199                    | (self.combined_crc >> 31))
200                    ^ self.block_crc;
201                self.block_crc_digest = IEEE_NORMAL.build_hasher();
202            }
203
204            let block_head_byte = Self::read_u8(reader, iter)
205                .map_err(|_| BZip2Error::UnexpectedEof)?;
206
207            if block_head_byte == 0x31 {
208                let _ = Self::check_u8(reader, iter, 0x41)
209                    .map_err(|_| BZip2Error::DataError)?;
210
211                let _ = Self::check_u8(reader, iter, 0x59)
212                    .map_err(|_| BZip2Error::DataError)?;
213
214                let _ = Self::check_u8(reader, iter, 0x26)
215                    .map_err(|_| BZip2Error::DataError)?;
216
217                let _ = Self::check_u8(reader, iter, 0x53)
218                    .map_err(|_| BZip2Error::DataError)?;
219
220                let _ = Self::check_u8(reader, iter, 0x59)
221                    .map_err(|_| BZip2Error::DataError)?;
222                self.block_no += 1;
223                debug!("    [{}: huff+mtf ", self.block_no);
224
225                self.block_crc = Self::read_u32(reader, iter)
226                    .map_err(|_| BZip2Error::UnexpectedEof)?;
227                self.block_randomised = reader
228                    .read_bits::<u8, _>(1, iter)
229                    .map_err(|_| BZip2Error::UnexpectedEof)?
230                    .data()
231                    == 1;
232
233                let orig_pos = reader
234                    .read_bits::<u32, _>(24, iter)
235                    .map_err(|_| BZip2Error::UnexpectedEof)?
236                    .data() as usize;
237
238                if orig_pos > 10 + 100_000 * self.block_size_100k {
239                    return Err(BZip2Error::DataError);
240                }
241
242                /*--- Receive the mapping table ---*/
243                let seq2unseq = {
244                    let mut in_use16 = BitArray::new(16);
245                    for i in 0..16 {
246                        in_use16.set(
247                            i,
248                            reader
249                                .read_bits::<u8, _>(1, iter)
250                                .map_err(|_| BZip2Error::UnexpectedEof)?
251                                .data()
252                                == 1,
253                        );
254                    }
255
256                    let mut ret = Vec::with_capacity(256);
257                    for (i, _) in
258                        in_use16.iter().enumerate().filter(|&(_, x)| x)
259                    {
260                        for j in 0..16 {
261                            if reader
262                                .read_bits::<u8, _>(1, iter)
263                                .map_err(|_| BZip2Error::UnexpectedEof)?
264                                .data()
265                                == 1
266                            {
267                                ret.push(i * 16 + j)
268                            }
269                        }
270                    }
271                    ret
272                };
273
274                if seq2unseq.is_empty() {
275                    return Err(BZip2Error::DataError);
276                }
277
278                let alpha_size = seq2unseq.len() + 2;
279
280                /*--- Now the selectors ---*/
281                let n_groups = reader
282                    .read_bits(3, iter)
283                    .map_err(|_| BZip2Error::UnexpectedEof)?
284                    .data();
285                if n_groups < 2 || n_groups > 6 {
286                    return Err(BZip2Error::DataError);
287                }
288                let n_selectors = reader
289                    .read_bits(15, iter)
290                    .map_err(|_| BZip2Error::UnexpectedEof)?
291                    .data();
292                if n_selectors < 1 {
293                    return Err(BZip2Error::DataError);
294                }
295
296                let mut selector = Vec::with_capacity(n_selectors);
297                {
298                    let mut selector_mtf_dec =
299                        MtfPositionDecoder::new(n_groups);
300                    for _ in 0..n_selectors {
301                        let mut j = 0;
302                        while reader
303                            .read_bits::<u8, _>(1, iter)
304                            .map_err(|_| BZip2Error::UnexpectedEof)?
305                            .data()
306                            != 0
307                        {
308                            j += 1;
309                            if j >= n_groups {
310                                return Err(BZip2Error::DataError);
311                            }
312                        }
313                        /*--- Undo the MTF values for the selectors. ---*/
314                        selector.push(selector_mtf_dec.pop(j));
315                    }
316                }
317
318                let mut len = vec![vec![0; alpha_size]; n_groups];
319                /*--- Now the coding tables ---*/
320                for t in &mut len {
321                    let mut curr = reader
322                        .read_bits::<u8, _>(5, iter)
323                        .map_err(|_| BZip2Error::UnexpectedEof)?
324                        .data();
325                    for i in t.iter_mut() {
326                        while reader
327                            .read_bits::<u8, _>(1, iter)
328                            .map_err(|_| BZip2Error::UnexpectedEof)?
329                            .data()
330                            != 0
331                        {
332                            if curr < 1 || curr > 20 {
333                                return Err(BZip2Error::DataError);
334                            }
335                            if reader
336                                .read_bits::<u8, _>(1, iter)
337                                .map_err(|_| BZip2Error::UnexpectedEof)?
338                                .data()
339                                == 0
340                            {
341                                curr += 1;
342                            } else {
343                                curr -= 1;
344                            }
345                        }
346                        *i = curr;
347                    }
348                }
349
350                /*--- Create the Huffman decoding tables ---*/
351                let mut code = Vec::with_capacity(n_groups);
352                for l in &len {
353                    code.push(
354                        HuffmanDecoder::<Left>::new(l, 12)
355                            .map_err(|_| BZip2Error::DataError)?,
356                    );
357                }
358
359                /*--- Now the MTF values ---*/
360                let eob = alpha_size as u16 - 1;
361                let nblock_max = 100_000 * self.block_size_100k;
362
363                let mut unzftab = vec![0; 257]; // LF-mapping Table
364                self.tt.clear();
365                self.tt.reserve_exact(nblock_max);
366
367                {
368                    let mut group_no = 0;
369                    let mut group_pos = 0;
370                    let mut n = 1;
371                    let mut es = 0;
372
373                    let mut mtf_decoder =
374                        MtfPositionDecoder::new(seq2unseq.len());
375
376                    loop {
377                        if group_pos == 0 {
378                            group_no += 1;
379                            if group_no > n_selectors {
380                                return Err(BZip2Error::DataError);
381                            }
382                            group_pos = BZ_G_SIZE;
383                        }
384                        group_pos -= 1;
385                        let next_sym = code[selector[group_no - 1]]
386                            .dec(reader, iter)
387                            .map_err(|_| BZip2Error::DataError)?
388                            .ok_or_else(|| BZip2Error::DataError)?;
389
390                        if es > 0
391                            && next_sym != Self::RUN_A
392                            && next_sym != Self::RUN_B
393                        {
394                            let uc = seq2unseq[mtf_decoder.pop(0)];
395                            unzftab[uc + 1] += es;
396                            for _ in 0..es {
397                                self.tt.push(uc as u32);
398                            }
399                            if self.tt.len() >= nblock_max {
400                                return Err(BZip2Error::DataError);
401                            }
402                            n = 1;
403                            es = 0;
404                        }
405
406                        if next_sym == eob {
407                            break;
408                        }
409
410                        /* Check that N doesn't get too big, so that es
411                        doesn't go negative.  The maximum value that can
412                        be RUNA/RUNB encoded is equal to the block size
413                        (post the initial RLE), viz, 900k, so bounding N
414                        at 2 million should guard against overflow
415                        without rejecting any legitimate inputs. */
416                        if n >= 2 * 1024 * 1024 {
417                            return Err(BZip2Error::DataError);
418                        }
419
420                        if next_sym == Self::RUN_A {
421                            es += n;
422                            n <<= 1;
423                        } else if next_sym == Self::RUN_B {
424                            n <<= 1;
425                            es += n;
426                        } else {
427                            if self.tt.len() >= nblock_max {
428                                return Err(BZip2Error::DataError);
429                            }
430
431                            let uc = seq2unseq
432                                [mtf_decoder.pop(next_sym as usize - 1)];
433                            unzftab[uc + 1] += 1;
434                            self.tt.push(uc as u32);
435                        }
436                    }
437                }
438
439                /* Now we know what nblock is, we can do a better sanity
440                check on s->origPtr. */
441                if orig_pos >= self.tt.len() {
442                    return Err(BZip2Error::DataError);
443                }
444
445                /*-- Set up cftab to facilitate generation of T^(-1) --*/
446                /* Actually generate cftab. */
447                if unzftab[0] != 0 {
448                    return Err(BZip2Error::DataError);
449                }
450
451                for i in 1..unzftab.len() {
452                    // /* Check: unzftab entries in range. */
453                    // if (unzftab[i] < 0 || unzftab[i] > nblock)
454                    //     throw new InvalidDataException();
455                    unzftab[i] += unzftab[i - 1];
456                    /* Check: cftab entries non-descending. */
457                    if unzftab[i - 1] > unzftab[i] {
458                        return Err(BZip2Error::DataError);
459                    }
460                }
461                /* Check: cftab entries in range. */
462                if unzftab[unzftab.len() - 1] != self.tt.len() {
463                    return Err(BZip2Error::DataError);
464                }
465
466                debug!("rt+rld");
467
468                /*-- compute the T^(-1) vector --*/
469                for i in 0..self.tt.len() {
470                    let uc = (self.tt[i] & 0xFF) as usize;
471                    self.tt[unzftab[uc]] |= (i as u32) << 8;
472                    unzftab[uc] += 1;
473                }
474
475                self.t_pos = self.tt[orig_pos] >> 8;
476                self.n_block_used = 0;
477
478                if self.block_randomised {
479                    self.block_randomise.reset();
480                }
481
482                self.result_count = 0;
483                self.result_wrote_count = 0;
484
485                return Ok(true);
486            } else if block_head_byte == 0x17 {
487                let _ = Self::check_u8(reader, iter, 0x72)
488                    .map_err(|_| BZip2Error::DataError)?;
489
490                let _ = Self::check_u8(reader, iter, 0x45)
491                    .map_err(|_| BZip2Error::DataError)?;
492
493                let _ = Self::check_u8(reader, iter, 0x38)
494                    .map_err(|_| BZip2Error::DataError)?;
495
496                let _ = Self::check_u8(reader, iter, 0x50)
497                    .map_err(|_| BZip2Error::DataError)?;
498
499                let _ = Self::check_u8(reader, iter, 0x90)
500                    .map_err(|_| BZip2Error::DataError)?;
501                let stored_combind_crc = Self::read_u32(reader, iter)
502                    .map_err(|_| BZip2Error::UnexpectedEof)?;
503                debug!(
504                    "    combined CRCs: stored = 0x{:08x}, computed = 0x{:08x}",
505                    stored_combind_crc, self.combined_crc
506                );
507                if stored_combind_crc != self.combined_crc {
508                    return Err(BZip2Error::DataError);
509                }
510                let _ = reader.skip_to_next_byte();
511                let next = reader
512                    .peek_bits::<usize, _>(8, iter)
513                    .map_err(|_| BZip2Error::Unexpected)?;
514                if next.len() == 8 {
515                    self.block_no = 0;
516                    self.combined_crc = 0;
517                    self.stream_no += 1;
518                } else {
519                    return Ok(false);
520                }
521            } else {
522                return Err(BZip2Error::DataError);
523            }
524        }
525    }
526
527    fn get_next_lfm(&mut self) -> Result<u8, BZip2Error> {
528        let mut position = self.t_pos;
529        /* c_tPos is unsigned, hence test < 0 is pointless. */
530        if position >= 100_000 * self.block_size_100k as u32 {
531            return Err(BZip2Error::DataError);
532        }
533        position = self.tt[position as usize];
534        let mut k0 = position as u8;
535        self.t_pos = position >> 8;
536        self.n_block_used += 1;
537        if self.block_randomised {
538            k0 ^= if self.block_randomise.next() { 1 } else { 0 };
539        }
540
541        Ok(k0)
542    }
543}
544
545impl BitDecodeService for BZip2DecoderBase {
546    type Direction = Left;
547    type Error = BZip2Error;
548    type Output = u8;
549
550    fn next<I: Iterator<Item = u8>>(
551        &mut self,
552        reader: &mut BitReader<Self::Direction>,
553        iter: &mut I,
554    ) -> Result<Option<u8>, Self::Error> {
555        if self.result_count == self.result_wrote_count {
556            if self.n_block_used == self.tt.len()
557                && !self.init_block(reader, iter)?
558            {
559                return Ok(None);
560            }
561
562            let buffer = self.get_next_lfm()?;
563            if buffer == self.result_charactor && self.result_count < 4 {
564                self.result_count += 1;
565                self.result_wrote_count += 1;
566            } else {
567                self.result_charactor = buffer;
568                self.result_count = 1;
569                self.result_wrote_count = 1;
570            }
571
572            if self.result_count == 4 {
573                self.result_count += usize::from(self.get_next_lfm()?);
574            }
575        } else {
576            self.result_wrote_count += 1;
577        }
578        self.block_crc_digest.write_u8(self.result_charactor);
579        Ok(Some(self.result_charactor))
580    }
581}
582
583#[derive(Debug)]
584pub struct BZip2Decoder {
585    inner: BitDecoderImpl<BZip2DecoderBase>,
586}
587
588impl BZip2Decoder {
589    pub fn new() -> Self {
590        Self {
591            inner: BitDecoderImpl::<BZip2DecoderBase>::new(),
592        }
593    }
594}
595
596impl Default for BZip2Decoder {
597    fn default() -> Self {
598        Self {
599            inner: BitDecoderImpl::<BZip2DecoderBase>::new(),
600        }
601    }
602}
603
604impl Decoder for BZip2Decoder {
605    type Input = u8;
606    type Output = u8;
607    type Error = BZip2Error;
608
609    fn next<I: Iterator<Item = Self::Input>>(
610        &mut self,
611        iter: &mut I,
612    ) -> Option<Result<Self::Output, Self::Error>> {
613        self.inner.next(iter)
614    }
615}