Skip to main content

rar_stream/decompress/
rar29.rs

1//! RAR 2.9 (RAR4) decompression.
2//!
3//! Implements the LZSS + Huffman decompression used in RAR versions 2.x-4.x.
4//! This is the most common format for scene releases.
5
6// Allow disabled debug blocks in test code (written >= 0 && written < 0 is intentionally false)
7#![cfg_attr(test, allow(clippy::logic_bug))]
8
9use super::{
10    bit_reader::BitReader,
11    huffman::HuffmanDecoder,
12    lzss::LzssDecoder,
13    ppm::{PpmModel, RangeCoder},
14    vm::RarVM,
15    DecompressError, Result,
16};
17
/// Number of main codes (literals + length symbols).
const MAIN_CODES: usize = 299;

/// Number of distance codes.
const DIST_CODES: usize = 60;

/// Number of low distance codes.
const LOW_DIST_CODES: usize = 17;

/// Number of length codes.
const LEN_CODES: usize = 28;

/// Maximum match length.
const MAX_MATCH_LEN: u32 = 258;

/// Short distance bases for symbols 263-270.
const SHORT_BASES: [u32; 8] = [0, 4, 8, 16, 32, 64, 128, 192];

/// Short distance extra bits for symbols 263-270.
const SHORT_BITS: [u8; 8] = [2, 2, 3, 4, 5, 6, 6, 6];

/// Base lengths for length codes (one entry per length code).
///
/// Each entry equals the previous base plus `1 << LENGTH_EXTRA[previous]`.
const LENGTH_BASE: [u32; LEN_CODES] = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 128,
    160, 192, 224,
];

/// Extra bits for length codes (one entry per length code).
const LENGTH_EXTRA: [u8; LEN_CODES] = [
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
];

/// Base distances for distance codes (60 entries for RAR3).
///
/// Each entry equals the previous base plus `1 << DIST_EXTRA[previous]`.
const DIST_BASE: [u32; DIST_CODES] = [
    0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536,
    2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304, 131072, 196608,
    262144, 327680, 393216, 458752, 524288, 589824, 655360, 720896, 786432, 851968, 917504, 983040,
    1048576, 1310720, 1572864, 1835008, 2097152, 2359296, 2621440, 2883584, 3145728, 3407872,
    3670016, 3932160,
];

/// Extra bits for distance codes (60 entries for RAR3).
const DIST_EXTRA: [u8; DIST_CODES] = [
    0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
    13, 14, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18,
];
65
66/// RAR 2.9 decoder state.
67pub struct Rar29Decoder {
68    /// LZSS sliding window
69    lzss: LzssDecoder,
70    /// Huffman decoder
71    huffman: HuffmanDecoder,
72    /// VM for filter execution
73    vm: RarVM,
74    /// PPMd model (used when ppm_mode is true)
75    ppm: Option<PpmModel>,
76    /// PPMd range coder (used when ppm_mode is true)
77    ppm_coder: Option<RangeCoder>,
78    /// PPMd escape character
79    ppm_esc_char: i32,
80    /// Previous distances for repeat matches
81    old_dist: [u32; 4],
82    /// Current distance history index
83    old_dist_ptr: usize,
84    /// Last distance used
85    last_dist: u32,
86    /// Last length used
87    last_len: u32,
88    /// PPMd mode flag
89    ppm_mode: bool,
90    /// Tables need reading
91    tables_read: bool,
92    /// Previous low offset value for repeat
93    prev_low_offset: u32,
94    /// Low offset repeat counter
95    low_offset_repeat_count: u32,
96    /// Next position where we need to check filters (optimization to avoid O(n) scan)
97    next_filter_check: u64,
98}
99
100impl Rar29Decoder {
101    /// Create a new RAR29 decoder with default window size (4MB).
102    pub fn new() -> Self {
103        Self::with_window_size(0x400000) // 4MB default (max common size)
104    }
105
106    /// Create a new RAR29 decoder with specified window size.
107    /// Window size must be a power of 2.
108    pub fn with_window_size(window_size: usize) -> Self {
109        Self {
110            lzss: LzssDecoder::new(window_size),
111            huffman: HuffmanDecoder::new(),
112            vm: RarVM::new(),
113            ppm: None,
114            ppm_coder: None,
115            ppm_esc_char: -1,
116            old_dist: [0; 4],
117            old_dist_ptr: 0,
118            last_dist: 0,
119            last_len: 0,
120            ppm_mode: false,
121            tables_read: false,
122            prev_low_offset: 0,
123            low_offset_repeat_count: 0,
124            next_filter_check: u64::MAX,
125        }
126    }
127
128    /// Get partial output (for debugging failed decompression)
129    #[cfg(test)]
130    pub fn get_output(&self) -> Vec<u8> {
131        self.lzss.output().to_vec()
132    }
133
134    /// Decompress a block of data.
135    /// Returns the decompressed data.
136    pub fn decompress(&mut self, data: &[u8], unpacked_size: u64) -> Result<Vec<u8>> {
137        let mut reader = BitReader::new(data);
138
139        // Enable output accumulation for files (especially those larger than window)
140        self.lzss.enable_output(unpacked_size as usize);
141
142        // Read tables if needed
143        if !self.tables_read {
144            self.read_tables(&mut reader)?;
145        }
146
147        // Decompress until we have enough data
148        while self.lzss.total_written() < unpacked_size {
149            if reader.is_eof() {
150                break;
151            }
152
153            self.decode_block(&mut reader, unpacked_size)?;
154        }
155
156        // Execute any remaining pending VM filters
157        let total_written = self.lzss.total_written();
158        let window_mask = self.lzss.window_mask() as usize;
159
160        // Execute filters in order of their block_start position
161        loop {
162            // Find the earliest filter that is ready
163            let (filter_idx, next_pos) = match self.vm.find_ready_filter(total_written) {
164                Some((idx, pos)) => (idx, pos),
165                None => break,
166            };
167
168            // Flush up to filter start
169            let flushed = self.lzss.flushed_pos();
170            if flushed < next_pos {
171                self.lzss.flush_to_output(next_pos);
172            }
173
174            let window = self.lzss.window();
175            if let Some((_filter_end, filtered_data)) =
176                self.vm
177                    .execute_filter_at_index(filter_idx, window, window_mask, total_written)
178            {
179                // Write filtered data directly to output
180                self.lzss.write_filtered_to_output(&filtered_data, next_pos);
181            } else {
182                break;
183            }
184        }
185
186        // Flush any remaining data to output
187        self.lzss.flush_to_output(total_written);
188
189        // Extract the decompressed data
190        Ok(self.lzss.take_output())
191    }
192
193    /// Read Huffman tables from the bit stream.
194    fn read_tables(&mut self, reader: &mut BitReader) -> Result<()> {
195        #[cfg(test)]
196        {
197            let byte_pos = reader.bit_position() / 8;
198            eprintln!(
199                "read_tables ENTRY: bit_pos={}, byte_pos={}",
200                reader.bit_position(),
201                byte_pos
202            );
203            eprintln!("  raw bytes at pos: {:02x?}", reader.peek_bytes(8));
204        }
205        // Align to byte boundary (like unrar)
206        reader.align_to_byte();
207        #[cfg(test)]
208        {
209            let byte_pos = reader.bit_position() / 8;
210            eprintln!(
211                "read_tables AFTER align: bit_pos={}, byte_pos={}",
212                reader.bit_position(),
213                byte_pos
214            );
215            eprintln!("  raw bytes at pos: {:02x?}", reader.peek_bytes(8));
216        };
217
218        // Peek at the high bit to check for PPM mode
219        // In unrar, this is done by peeking 16 bits and checking bit 15
220        let ppm_flag = reader.peek_bits(1) != 0;
221
222        self.ppm_mode = ppm_flag;
223
224        if self.ppm_mode {
225            // DON'T consume the PPM flag bit - it's part of the MaxOrder byte
226            // Initialize or reuse PPMd model
227            let ppm = self.ppm.get_or_insert_with(PpmModel::new);
228            match ppm.init(reader) {
229                Ok((coder, esc_char)) => {
230                    self.ppm_coder = Some(coder);
231                    self.ppm_esc_char = esc_char;
232                    #[cfg(test)]
233                    println!("PPMd initialized: esc_char={}", esc_char);
234                }
235                Err(e) => {
236                    #[cfg(test)]
237                    println!("PPMd init failed: {}", e);
238                    #[cfg(not(test))]
239                    let _ = e;
240                    return Err(DecompressError::UnsupportedMethod(0x33));
241                }
242            }
243        } else {
244            // LZ mode - reset low dist state (per unrar ReadTables30)
245            self.prev_low_offset = 0;
246            self.low_offset_repeat_count = 0;
247
248            // Check bit 1 (0x4000) for reset tables
249            let reset_tables = reader.peek_bits(2) & 1 == 0; // Bit 14 inverted (0 means reset)
250                                                             // Consume the 2 header bits (PPM flag + reset flag)
251            reader.advance_bits(2);
252
253            if reset_tables {
254                self.huffman.reset_tables();
255            }
256
257            // Read Huffman tables
258            self.huffman.read_tables_after_header(reader)?;
259        }
260
261        self.tables_read = true;
262        Ok(())
263    }
264
265    /// Decode a block of data.
266    fn decode_block(&mut self, reader: &mut BitReader, max_size: u64) -> Result<()> {
267        if self.ppm_mode {
268            return self.decode_block_ppm(reader, max_size);
269        }
270
271        // Validate tables exist
272        if self.huffman.main_table.is_none() || self.huffman.dist_table.is_none() {
273            return Err(DecompressError::InvalidHuffmanCode);
274        }
275
276        #[cfg(test)]
277        let mut symbol_count = 0;
278
279        while self.lzss.total_written() < max_size && !reader.is_eof() {
280            // Check if we need to execute pending VM filters
281            self.maybe_execute_filters();
282
283            // Decode main symbol
284            #[cfg(test)]
285            let bit_pos_main_start = reader.bit_position();
286            #[cfg(test)]
287            let peek_bits = reader.peek_bits(16);
288
289            // SAFETY: We validated main_table.is_some() above
290            let symbol = unsafe {
291                self.huffman
292                    .main_table
293                    .as_ref()
294                    .unwrap_unchecked()
295                    .decode(reader)?
296            };
297
298            #[cfg(test)]
299            {
300                let pos = self.lzss.total_written();
301                if pos >= 1498580 && pos <= 1498610 {
302                    let bit_pos_after = reader.bit_position();
303                    eprintln!(
304                        "MAIN sym={} at pos={}, bits {}->{}  peek={:016b}",
305                        symbol, pos, bit_pos_main_start, bit_pos_after, peek_bits
306                    );
307                }
308            }
309
310            if symbol < 256 {
311                // Literal byte
312                #[cfg(test)]
313                {
314                    let pos = self.lzss.total_written();
315                    if pos >= 1498595 && pos <= 1498610 {
316                        eprintln!("WRITING literal 0x{:02x} at output pos {}", symbol, pos);
317                    }
318                }
319                self.lzss.write_literal(symbol as u8);
320            } else if symbol == 256 {
321                // End of block / new tables
322                // From unrar ReadEndOfBlock:
323                // "1"  - no new file, new table just here.
324                // "00" - new file,    no new table.
325                // "01" - new file,    new table (in beginning of next file).
326                #[cfg(test)]
327                eprintln!(
328                    "\n=== SYMBOL 256 (end of block) at output pos {}, bit_pos {} ===",
329                    self.lzss.total_written(),
330                    reader.bit_position()
331                );
332                if !reader.is_eof() {
333                    let first_bit = reader.read_bit()?;
334                    #[cfg(test)]
335                    eprintln!(
336                        "  first_bit={}, bit_pos after={}",
337                        first_bit,
338                        reader.bit_position()
339                    );
340                    if first_bit {
341                        // "1" = new tables, continue decompression
342                        // Reset low dist state when reading new tables
343                        self.prev_low_offset = 0;
344                        self.low_offset_repeat_count = 0;
345                        // Call full read_tables which aligns to byte and reads header
346                        self.read_tables(reader)?;
347                        #[cfg(test)]
348                        {
349                            eprintln!(
350                                "After new tables: bit_pos={}, next 16 bits={:016b}",
351                                reader.bit_position(),
352                                reader.peek_bits(16)
353                            );
354                            eprintln!("  About to decode first symbol after table read");
355                        }
356                        // Continue decompressing - don't break!
357                        continue;
358                    }
359                    // "0x" = new file (end of this file's data)
360                    let _second_bit = reader.read_bit()?; // consume the second bit
361                                                          // Break out - we're done with this file
362                }
363                break;
364            } else if symbol == 257 {
365                // VM filter code - read and skip it
366                #[cfg(test)]
367                eprintln!(
368                    "\n=== SYMBOL 257 (VM code) at output pos {} ===",
369                    self.lzss.total_written()
370                );
371                self.read_vm_code(reader)?;
372            } else if symbol == 258 {
373                // Repeat last match
374                if self.last_len > 0 {
375                    #[cfg(test)]
376                    {
377                        let pos = self.lzss.total_written();
378                        let end = pos + self.last_len as u64;
379                        if pos <= 1498598 && end > 1498598 {
380                            eprintln!(
381                                "!!! AT 1498598: symbol 258 repeat, last_dist={}, last_len={}",
382                                self.last_dist, self.last_len
383                            );
384                        }
385                    }
386                    self.lzss.copy_match(self.last_dist, self.last_len)?;
387                }
388            } else if symbol < 263 {
389                // Use one of the old distances (symbols 259-262 = indices 0-3)
390                let idx = (symbol - 259) as usize;
391                let distance = self.old_dist[idx];
392
393                // Decode length using the length table
394                let length = self.decode_length_from_table(reader)?;
395
396                #[cfg(test)]
397                {
398                    let written = self.lzss.total_written();
399                    let end = written + length as u64;
400                    if written <= 1498598 && end > 1498598 {
401                        eprintln!(
402                            "!!! AT 1498598: old idx={},len={},dist={}",
403                            idx, length, distance
404                        );
405                    }
406                }
407
408                self.lzss.copy_match(distance, length)?;
409
410                // Shift old distances: move entries 0..idx up by 1, put this at 0
411                for i in (1..=idx).rev() {
412                    self.old_dist[i] = self.old_dist[i - 1];
413                }
414                self.old_dist[0] = distance;
415                self.last_dist = distance;
416                self.last_len = length;
417            } else if symbol <= 270 {
418                // Short match (symbols 263-270): fixed length=2, short distance
419                let idx = (symbol - 263) as usize;
420                let base = SHORT_BASES[idx];
421                let bits = SHORT_BITS[idx];
422                let extra = if bits > 0 {
423                    reader.read_bits(bits as u32)?
424                } else {
425                    0
426                };
427                let distance = base + extra + 1;
428                let length = 2u32;
429
430                #[cfg(test)]
431                {
432                    let written = self.lzss.total_written();
433                    let end = written + length as u64;
434                    if written <= 1498598 && end > 1498598 {
435                        eprintln!(
436                            "!!! AT 1498598: short sym={}, idx={}, base={}, bits={}, extra={}, dist={}",
437                            symbol, idx, base, bits, extra, distance
438                        );
439                    }
440                }
441
442                self.lzss.copy_match(distance, length)?;
443
444                // Shift old distances
445                for i in (1..4).rev() {
446                    self.old_dist[i] = self.old_dist[i - 1];
447                }
448                self.old_dist[0] = distance;
449                self.old_dist_ptr = 0;
450                self.last_dist = distance;
451                self.last_len = length;
452            } else {
453                // Long match (symbols 271-298): length from main symbol, distance from offset table
454                #[cfg(test)]
455                let bit_before_len = reader.bit_position();
456
457                let len_idx = (symbol - 271) as usize;
458                let length = if len_idx < LENGTH_BASE.len() {
459                    let base = LENGTH_BASE[len_idx];
460                    let extra = LENGTH_EXTRA[len_idx];
461                    let extra_val = if extra > 0 {
462                        reader.read_bits(extra as u32)?
463                    } else {
464                        0
465                    };
466                    #[cfg(test)]
467                    {
468                        let written = self.lzss.total_written();
469                        if written >= 1498595 && written <= 1498602 {
470                            let bit_after_len = reader.bit_position();
471                            eprintln!(
472                                "!!! LONG DECODE at {}: sym={}, len_idx={}, len={}, bits {}->{}]",
473                                written,
474                                symbol,
475                                len_idx,
476                                base + extra_val + 3,
477                                bit_before_len,
478                                bit_after_len
479                            );
480                        }
481                    }
482                    base + extra_val + 3 // +3 because minimum match length for long matches is 3
483                } else {
484                    #[cfg(test)]
485                    eprintln!(
486                        "\nlen_idx {} out of range at written={}",
487                        len_idx,
488                        self.lzss.total_written()
489                    );
490                    return Err(DecompressError::InvalidHuffmanCode);
491                };
492
493                // Decode distance from offset table
494                let dist_symbol = {
495                    #[cfg(test)]
496                    let bit_pos_before = reader.bit_position();
497
498                    // SAFETY: We validated dist_table.is_some() at function start
499                    let dist_table = unsafe { self.huffman.dist_table.as_ref().unwrap_unchecked() };
500                    match dist_table.decode(reader) {
501                        Ok(s) => {
502                            #[cfg(test)]
503                            {
504                                let written = self.lzss.total_written();
505                                if written >= 1498595 && written <= 1498610 {
506                                    let bit_pos_after = reader.bit_position();
507                                    eprintln!(
508                                        "  dist_symbol={} at pos {} (bits {}->{})",
509                                        s, written, bit_pos_before, bit_pos_after
510                                    );
511                                }
512                            }
513                            s
514                        }
515                        Err(e) => {
516                            #[cfg(test)]
517                            eprintln!(
518                                "\nOffset decode failed at written={}, len={}",
519                                self.lzss.total_written(),
520                                length
521                            );
522                            return Err(e);
523                        }
524                    }
525                };
526
527                let dist_code = dist_symbol as usize;
528                let distance = if dist_code < DIST_BASE.len() {
529                    let base = DIST_BASE[dist_code];
530                    let extra = DIST_EXTRA[dist_code];
531
532                    let extra_val = if extra > 0 {
533                        if dist_code > 9 {
534                            // For dist_code > 9, use low offset table
535                            // First read high bits if extra > 4
536                            let high = if extra > 4 {
537                                #[cfg(test)]
538                                let high_bit_pos = reader.bit_position();
539                                let h = reader.read_bits((extra - 4) as u32)?;
540                                #[cfg(test)]
541                                {
542                                    let written = self.lzss.total_written();
543                                    if (written >= 1498595 && written <= 1498610)
544                                        || (written >= 2176060 && written <= 2176080)
545                                    {
546                                        eprintln!(
547                                            "    high bits at {}: {} bits = {} (0b{:016b}), pos {}->{}",
548                                            written,
549                                            extra - 4,
550                                            h, h,
551                                            high_bit_pos,
552                                            reader.bit_position()
553                                        );
554                                    }
555                                }
556                                h << 4
557                            } else {
558                                0
559                            };
560                            // Then decode low offset (0-15 or 16 for repeat)
561                            let low = if self.low_offset_repeat_count > 0 {
562                                self.low_offset_repeat_count -= 1;
563                                #[cfg(test)]
564                                {
565                                    let written = self.lzss.total_written();
566                                    if written >= 1498550 && written <= 1498610 {
567                                        eprintln!(
568                                            "!!! low_offset REPEAT at {}: prev={}",
569                                            written, self.prev_low_offset
570                                        );
571                                    }
572                                }
573                                self.prev_low_offset
574                            } else {
575                                #[cfg(test)]
576                                let bit_pos_before = reader.bit_position();
577                                #[cfg(test)]
578                                let raw_bits_16 = reader.peek_bits(16);
579                                // SAFETY: low_dist_table is always initialized when we reach here
580                                let low_table = unsafe {
581                                    self.huffman.low_dist_table.as_ref().unwrap_unchecked()
582                                };
583                                #[cfg(test)]
584                                {
585                                    let written = self.lzss.total_written();
586                                    if written == 1498598 {
587                                        // Dump the decode_len array and symbols
588                                        eprintln!(
589                                            "!!! LOW_TABLE at 1498598 decode_len: {:?}",
590                                            low_table.dump_decode_len()
591                                        );
592                                        eprintln!(
593                                            "!!! LOW_TABLE at 1498598 symbols: {:?}",
594                                            low_table.dump_symbols()
595                                        );
596                                    }
597                                }
598                                let sym = low_table.decode(reader)? as u32;
599                                #[cfg(test)]
600                                {
601                                    let written = self.lzss.total_written();
602                                    if written >= 1498550 && written <= 1498610 {
603                                        let bit_pos_after = reader.bit_position();
604                                        eprintln!("!!! low_offset at {}: sym={} (bits {}->{}), raw peek = {:016b}", 
605                                            written, sym, bit_pos_before, bit_pos_after, raw_bits_16);
606                                    }
607                                }
608
609                                if sym == 16 {
610                                    // Repeat previous low offset - total 16 uses (this one + 15 more)
611                                    // unrar: LowDistRepCount=LOW_DIST_REP_COUNT-1 where LOW_DIST_REP_COUNT=16
612                                    self.low_offset_repeat_count = 16 - 1; // 15 more uses after this one
613                                    self.prev_low_offset
614                                } else {
615                                    self.prev_low_offset = sym;
616                                    sym
617                                }
618                            };
619                            #[cfg(test)]
620                            {
621                                let written = self.lzss.total_written();
622                                if written >= 2176060 && written <= 2176080 {
623                                    if self.low_offset_repeat_count > 0 {
624                                        eprintln!(
625                                            "  low_offset REPEAT at {}: prev={}, remaining={}",
626                                            written,
627                                            self.prev_low_offset,
628                                            self.low_offset_repeat_count
629                                        );
630                                    } else {
631                                        eprintln!("  low_offset at {}: dist_code={}, base={}, extra={}, high={}, low={}, dist={}", 
632                                            written, dist_code, base, extra, high, low, base + high + low + 1);
633                                    }
634                                }
635                            }
636                            high + low
637                        } else {
638                            // For dist_code <= 9, read extra bits directly
639                            #[cfg(test)]
640                            let peek = reader.peek_bits(extra as u32);
641                            let val = reader.read_bits(extra as u32)?;
642                            #[cfg(test)]
643                            {
644                                let written = self.lzss.total_written();
645                                if written >= 0 && written < 0 {
646                                    eprintln!("  direct: dist_code={}, base={}, extra_bits={}, peek={:04b}, extra_val={}, distance={}", 
647                                        dist_code, base, extra, peek, val, base + val + 1);
648                                }
649                            }
650                            val
651                        }
652                    } else {
653                        0
654                    };
655                    base + extra_val + 1
656                } else {
657                    #[cfg(test)]
658                    eprintln!(
659                        "\ndist_code {} out of range at written={}",
660                        dist_code,
661                        self.lzss.total_written()
662                    );
663                    return Err(DecompressError::InvalidHuffmanCode);
664                };
665
666                // Length bonus for long distances (RAR3 specific)
667                // Per unrar: if (Distance>=0x2000) { Length++; if (Distance>=0x40000) Length++; }
668                let length = if distance >= 0x2000 {
669                    if distance >= 0x40000 {
670                        length + 2
671                    } else {
672                        length + 1
673                    }
674                } else {
675                    length
676                };
677
678                #[cfg(test)]
679                {
680                    let written = self.lzss.total_written();
681                    let end = written + length as u64;
682                    if written <= 1498598 && end > 1498598 {
683                        eprintln!(
684                            "!!! AT 1498598: long match dist={}, len={}",
685                            distance, length
686                        );
687                        // Check what's in the window at source position
688                        let src_pos = (written as u32).wrapping_sub(distance) as usize;
689                        let mask = self.lzss.window_mask() as usize;
690                        let window = self.lzss.window();
691                        eprintln!(
692                            "  window src[{}..{}]: {:02x?}",
693                            src_pos,
694                            src_pos + length as usize,
695                            &window[src_pos..src_pos + length as usize]
696                        );
697                    }
698                    if written >= 1498595 && written <= 1498602 {
699                        eprintln!(
700                            "LONG MATCH at {}: dist={}, len={}",
701                            written, distance, length
702                        );
703                    }
704                }
705
706                self.lzss.copy_match(distance, length)?;
707
708                // Shift old distances
709                for i in (1..4).rev() {
710                    self.old_dist[i] = self.old_dist[i - 1];
711                }
712                self.old_dist[0] = distance;
713                self.old_dist_ptr = 0;
714                self.last_dist = distance;
715                self.last_len = length;
716            }
717        }
718
719        Ok(())
720    }
721
722    /// Decode a length value using the length table.
723    fn decode_length_from_table(&mut self, reader: &mut BitReader) -> Result<u32> {
724        let symbol = {
725            let len_table = self
726                .huffman
727                .len_table
728                .as_ref()
729                .ok_or(DecompressError::InvalidHuffmanCode)?;
730            len_table.decode(reader)?
731        };
732
733        let sym = symbol as usize;
734        if sym < LENGTH_BASE.len() {
735            let base = LENGTH_BASE[sym];
736            let extra = LENGTH_EXTRA[sym];
737            let extra_val = if extra > 0 {
738                reader.read_bits(extra as u32)?
739            } else {
740                0
741            };
742            Ok(base + extra_val + 2)
743        } else {
744            Err(DecompressError::InvalidHuffmanCode)
745        }
746    }
747
748    /// Read VM filter code from bit stream (for LZ mode, symbol 257).
749    /// We read the VM code and register it with the VM for later execution.
750    fn read_vm_code(&mut self, reader: &mut BitReader) -> Result<()> {
751        #[cfg(test)]
752        let bit_pos_start = reader.bit_position();
753
754        // Read first byte
755        let first_byte = reader.read_bits(8)? as u8;
756
757        // Calculate length based on unrar's ReadVMCode logic:
758        // Length = (FirstByte & 7) + 1
759        // if Length == 7, read another byte and add 7
760        // if Length == 8, read 16 bits as length
761        let length = {
762            let base = (first_byte & 7) + 1;
763            match base {
764                7 => {
765                    // Read one more byte, add 7
766                    let next = reader.read_bits(8)? as u32;
767                    next + 7
768                }
769                8 => {
770                    // Read 16 bits as length
771                    reader.read_bits(16)?
772                }
773                _ => base as u32,
774            }
775        };
776
777        #[cfg(test)]
778        eprintln!(
779            "  read_vm_code: first_byte=0x{:02x}, length={}, bit_pos_start={}",
780            first_byte, length, bit_pos_start
781        );
782
783        if length == 0 {
784            return Ok(());
785        }
786
787        // Read VM code bytes
788        let mut vm_code = vec![0u8; length as usize];
789        for i in 0..length as usize {
790            vm_code[i] = reader.read_bits(8)? as u8;
791        }
792
793        #[cfg(test)]
794        eprintln!("    vm_code end bit_pos={}", reader.bit_position());
795
796        // Add to VM for later execution - use absolute total_written, not wrapped window position
797        let total_written = self.lzss.total_written();
798        let window_mask = self.lzss.window_mask();
799
800        #[cfg(test)]
801        eprintln!(
802            "    add_code: total_written={}, window_mask={:x}",
803            total_written, window_mask
804        );
805
806        #[cfg(test)]
807        {
808            let had_pending_before = self.vm.has_pending_filters();
809            let result = self
810                .vm
811                .add_code(first_byte, &vm_code, total_written, window_mask);
812            let has_pending_after = self.vm.has_pending_filters();
813            if let Some(next_pos) = self.vm.next_filter_pos() {
814                eprintln!(
815                    "    vm.add_code: added={}, pending={}->{}, next_pos={}",
816                    result, had_pending_before, has_pending_after, next_pos
817                );
818            } else {
819                eprintln!(
820                    "    vm.add_code: added={}, pending={}->{}, next_pos=NONE",
821                    result, had_pending_before, has_pending_after
822                );
823            }
824        }
825        #[cfg(not(test))]
826        self.vm
827            .add_code(first_byte, &vm_code, total_written, window_mask);
828
829        // Update next_filter_check when a filter is added
830        if let Some(end) = self.vm.next_filter_end() {
831            self.next_filter_check = self.next_filter_check.min(end);
832        }
833
834        Ok(())
835    }
836
837    /// Execute pending VM filters if we've reached their block_start position.
838    /// Applies filters to window data, writes filtered output directly to output buffer.
839    fn maybe_execute_filters(&mut self) {
840        let total_written = self.lzss.total_written();
841
842        // Fast path: skip if we haven't reached the next filter check position
843        if total_written < self.next_filter_check {
844            return;
845        }
846
847        let window_mask = self.lzss.window_mask() as usize;
848
849        // Execute filters that are ready, in order of their block_start position
850        loop {
851            // Find the earliest filter that is ready to execute
852            let (filter_idx, next_pos) = match self.vm.find_ready_filter(total_written) {
853                Some((idx, pos)) => (idx, pos),
854                None => break,
855            };
856
857            // Flush up to filter start first (unfiltered data before this filter)
858            let flushed = self.lzss.flushed_pos();
859            if flushed < next_pos {
860                self.lzss.flush_to_output(next_pos);
861            }
862
863            // Execute the filter on the window (read-only) and get filtered output
864            let window = self.lzss.window();
865            if let Some((filter_end, filtered_data)) =
866                self.vm
867                    .execute_filter_at_index(filter_idx, window, window_mask, total_written)
868            {
869                // Write filtered data directly to output (bypasses window)
870                self.lzss.write_filtered_to_output(&filtered_data, next_pos);
871                // Update next check to after this filter
872                self.next_filter_check = filter_end;
873            } else {
874                break;
875            }
876        }
877
878        // Update next_filter_check based on remaining filters
879        self.next_filter_check = self.vm.next_filter_end().unwrap_or(u64::MAX);
880    }
881
882    /// Decode a block using PPMd.
883    fn decode_block_ppm(&mut self, reader: &mut BitReader, max_size: u64) -> Result<()> {
884        let ppm = self
885            .ppm
886            .as_mut()
887            .ok_or(DecompressError::UnsupportedMethod(0x33))?;
888        let coder = self
889            .ppm_coder
890            .as_mut()
891            .ok_or(DecompressError::UnsupportedMethod(0x33))?;
892        let esc_char = self.ppm_esc_char;
893
894        while self.lzss.total_written() < max_size && !reader.is_eof() {
895            let ch = ppm.decode_char(coder, reader).map_err(|e| {
896                #[cfg(test)]
897                eprintln!(
898                    "PPM decode_char failed at pos {}: {}",
899                    self.lzss.total_written(),
900                    e
901                );
902                #[cfg(not(test))]
903                let _ = e;
904                DecompressError::InvalidHuffmanCode
905            })?;
906
907            if ch < 0 {
908                // Decode error
909                #[cfg(test)]
910                eprintln!("PPM decode_char returned negative: {}", ch);
911                return Err(DecompressError::InvalidHuffmanCode);
912            }
913
914            #[cfg(test)]
915            {
916                if self.lzss.total_written() < 20 {
917                    eprint!("[{}:{}] ", self.lzss.total_written(), ch);
918                }
919            }
920
921            if ch != esc_char {
922                // Regular character
923                self.lzss.write_literal(ch as u8);
924            } else {
925                // Escape sequence - decode control code
926                let ctrl = ppm
927                    .decode_char(coder, reader)
928                    .map_err(|_| DecompressError::InvalidHuffmanCode)?;
929
930                if ctrl < 0 {
931                    return Err(DecompressError::InvalidHuffmanCode);
932                }
933
934                match ctrl {
935                    0 => {
936                        // Should not happen (NextCh starts at 0)
937                        break;
938                    }
939                    1 => {
940                        // Write escape character itself
941                        self.lzss.write_literal(esc_char as u8);
942                    }
943                    2 => {
944                        // End of PPM block
945                        break;
946                    }
947                    3 => {
948                        // VM code - read and add to VM
949                        let first_byte = ppm
950                            .decode_char(coder, reader)
951                            .map_err(|_| DecompressError::InvalidHuffmanCode)?
952                            as u8;
953
954                        // Decode length from first byte
955                        let mut length = ((first_byte & 7) + 1) as u32;
956                        if length == 7 {
957                            let b1 = ppm
958                                .decode_char(coder, reader)
959                                .map_err(|_| DecompressError::InvalidHuffmanCode)?;
960                            length = (b1 as u32) + 7;
961                        } else if length == 8 {
962                            let b1 = ppm
963                                .decode_char(coder, reader)
964                                .map_err(|_| DecompressError::InvalidHuffmanCode)?;
965                            let b2 = ppm
966                                .decode_char(coder, reader)
967                                .map_err(|_| DecompressError::InvalidHuffmanCode)?;
968                            length = (b1 as u32) * 256 + (b2 as u32);
969                        }
970
971                        if length == 0 {
972                            continue;
973                        }
974
975                        // Read VM code bytes
976                        let mut vm_code = vec![0u8; length as usize];
977                        for i in 0..length as usize {
978                            let ch = ppm
979                                .decode_char(coder, reader)
980                                .map_err(|_| DecompressError::InvalidHuffmanCode)?;
981                            vm_code[i] = ch as u8;
982                        }
983
984                        // Add to VM
985                        let total_written = self.lzss.total_written();
986                        let window_mask = self.lzss.window_mask();
987                        self.vm
988                            .add_code(first_byte, &vm_code, total_written, window_mask);
989
990                        // Update next_filter_check when a filter is added
991                        if let Some(end) = self.vm.next_filter_end() {
992                            self.next_filter_check = self.next_filter_check.min(end);
993                        }
994                    }
995                    4 => {
996                        // LZ match: 3 bytes distance (MSB first), 1 byte length
997                        let mut distance: u32 = 0;
998                        for _ in 0..3 {
999                            let ch = ppm
1000                                .decode_char(coder, reader)
1001                                .map_err(|_| DecompressError::InvalidHuffmanCode)?;
1002                            distance = (distance << 8) + (ch as u32);
1003                        }
1004                        let len = ppm
1005                            .decode_char(coder, reader)
1006                            .map_err(|_| DecompressError::InvalidHuffmanCode)?;
1007
1008                        // Distance+2, Length+32
1009                        let distance = distance + 2;
1010                        let length = (len as u32) + 32;
1011
1012                        self.lzss.copy_match(distance, length)?;
1013                        self.last_dist = distance;
1014                        self.last_len = length;
1015                    }
1016                    5 => {
1017                        // RLE match: 1 byte length, distance = 1
1018                        let len = ppm
1019                            .decode_char(coder, reader)
1020                            .map_err(|_| DecompressError::InvalidHuffmanCode)?;
1021
1022                        // Length+4, Distance=1
1023                        let length = (len as u32) + 4;
1024
1025                        self.lzss.copy_match(1, length)?;
1026                        self.last_dist = 1;
1027                        self.last_len = length;
1028                    }
1029                    _ => {
1030                        // Unknown control code - likely corruption
1031                        #[cfg(test)]
1032                        eprintln!("Unknown PPM control code: {}", ctrl);
1033                        return Err(DecompressError::InvalidHuffmanCode);
1034                    }
1035                }
1036            }
1037        }
1038
1039        Ok(())
1040    }
1041
1042    /// Reset the decoder state for a new file.
1043    pub fn reset(&mut self) {
1044        self.lzss.reset();
1045        self.vm.reset();
1046        // Keep ppm model for reuse (SubAllocator reuses buffer if same size)
1047        self.ppm_coder = None;
1048        self.ppm_esc_char = -1;
1049        self.old_dist = [0; 4];
1050        self.old_dist_ptr = 0;
1051        self.last_dist = 0;
1052        self.last_len = 0;
1053        self.ppm_mode = false;
1054        self.tables_read = false;
1055        self.prev_low_offset = 0;
1056        self.low_offset_repeat_count = 0;
1057        self.next_filter_check = u64::MAX;
1058    }
1059
1060    /// Get total bytes decompressed.
1061    pub fn bytes_written(&self) -> u64 {
1062        self.lzss.total_written()
1063    }
1064}
1065
1066impl Default for Rar29Decoder {
1067    fn default() -> Self {
1068        Self::new()
1069    }
1070}
1071
1072/// Streaming decompressor for RAR29.
1073/// Allows decompressing chunks at a time.
1074pub struct Rar29StreamDecoder {
1075    decoder: Rar29Decoder,
1076    /// Accumulated compressed data
1077    input_buffer: Vec<u8>,
1078    /// Current position in input buffer
1079    input_pos: usize,
1080    /// Total expected unpacked size
1081    unpacked_size: u64,
1082}
1083
1084impl Rar29StreamDecoder {
1085    /// Create a new streaming decoder.
1086    pub fn new(unpacked_size: u64) -> Self {
1087        Self {
1088            decoder: Rar29Decoder::new(),
1089            input_buffer: Vec::new(),
1090            input_pos: 0,
1091            unpacked_size,
1092        }
1093    }
1094
1095    /// Feed compressed data to the decoder.
1096    /// Returns decompressed data available so far.
1097    pub fn feed(&mut self, data: &[u8]) -> Result<Vec<u8>> {
1098        self.input_buffer.extend_from_slice(data);
1099
1100        // Try to decompress with available data
1101        let result = self
1102            .decoder
1103            .decompress(&self.input_buffer[self.input_pos..], self.unpacked_size)?;
1104
1105        Ok(result)
1106    }
1107
1108    /// Check if decompression is complete.
1109    pub fn is_complete(&self) -> bool {
1110        self.decoder.bytes_written() >= self.unpacked_size
1111    }
1112
1113    /// Get total bytes decompressed.
1114    pub fn bytes_written(&self) -> u64 {
1115        self.decoder.bytes_written()
1116    }
1117}
1118
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built decoder has written nothing and has read no tables.
    #[test]
    fn test_decoder_creation() {
        let fresh = Rar29Decoder::new();
        let written = fresh.bytes_written();
        assert_eq!(written, 0);
        assert!(!fresh.tables_read);
    }

    // Further tests need real RAR-compressed input.
}