Skip to main content

rar_stream/decompress/
vm.rs

1//! RAR3 VM filter implementation.
2//!
3//! RAR3 uses a virtual machine for post-processing decompressed data.
4//! In practice, only 6 standard filters are used, identified by CRC.
5
6use crate::crc32::crc32;
7
/// Size of the VM work area in bytes (`0x40000`, i.e. 256 KiB).
pub const VM_MEMSIZE: usize = 0x40000;
/// `VM_MEMSIZE - 1` as a `u32`; `execute_filters` ANDs block lengths with it.
pub const VM_MEMMASK: u32 = (VM_MEMSIZE - 1) as u32;

/// Largest channel count accepted by the delta filter.
///
/// Note: the audio filter does not use this constant; it applies its own
/// hard-coded limit of 128 channels.
pub const MAX_UNPACK_CHANNELS: usize = 1024;
14
/// Standard filter types.
///
/// A filter is recognised by the length and CRC of its VM code (see
/// `FILTER_SIGNATURES`); the bytecode itself is never interpreted.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StandardFilter {
    /// Unrecognised code; the block is passed through unchanged.
    None,
    /// x86 CALL (E8) address conversion
    E8,
    /// x86 CALL/JMP (E8/E9) address conversion
    E8E9,
    /// IA-64 Itanium preprocessing
    Itanium,
    /// Byte delta encoding (audio/images)
    Delta,
    /// RGB predictive filter
    Rgb,
    /// Audio sample predictor
    Audio,
}
32
/// Signature of one known filter program.
struct FilterSignature {
    /// Length of the filter's VM code in bytes.
    length: u32,
    /// CRC-32 of the filter's VM code.
    crc: u32,
    /// Filter selected when both `length` and `crc` match.
    filter_type: StandardFilter,
}
39
/// Table of the six recognised filter programs.
///
/// `RarVM::identify_filter` scans this table and returns the first entry
/// whose `length` and `crc` both match the supplied code.
const FILTER_SIGNATURES: &[FilterSignature] = &[
    FilterSignature { length: 53, crc: 0xad576887, filter_type: StandardFilter::E8 },
    FilterSignature { length: 57, crc: 0x3cd7e57e, filter_type: StandardFilter::E8E9 },
    FilterSignature { length: 120, crc: 0x3769893f, filter_type: StandardFilter::Itanium },
    FilterSignature { length: 29, crc: 0x0e06077d, filter_type: StandardFilter::Delta },
    FilterSignature { length: 149, crc: 0x1c2c5dc8, filter_type: StandardFilter::Rgb },
    FilterSignature { length: 216, crc: 0xbc85e701, filter_type: StandardFilter::Audio },
];
48
/// A filter invocation queued for execution on one block of output.
#[derive(Debug, Clone)]
pub struct PreparedFilter {
    /// Which standard filter to run.
    pub filter_type: StandardFilter,
    /// Initial register values [R0-R6].
    ///
    /// `add_code` presets R3 to `VM_MEMSIZE` and R4 to the block length; the
    /// filters read R4 as the data size, R0/R1 as filter parameters and R6 as
    /// the file offset.
    pub init_r: [u32; 7],
    /// Block start position in output
    pub block_start: u32,
    /// Block length
    pub block_length: u32,
}
60
/// Stored filter definition (reusable).
///
/// `add_code` keeps one of these per filter slot so later invocations can
/// refer to the slot instead of carrying the code again.
#[derive(Debug, Clone)]
pub struct StoredFilter {
    /// Filter type recorded when the slot was created.
    pub filter_type: StandardFilter,
}
66
/// RAR VM state
pub struct RarVM {
    /// VM memory buffer (`VM_MEMSIZE + 4` bytes, allocated in `new`).
    mem: Vec<u8>,
    /// Stored filter definitions (by index)
    filters: Vec<StoredFilter>,
    /// Pending filter invocations.
    ///
    /// Despite the name this is consumed front-first: `execute_filters`
    /// always takes element 0.
    stack: Vec<PreparedFilter>,
    /// Last used filter index
    last_filter: usize,
    /// Most recent block length per filter slot, reused when an invocation
    /// does not carry its own length.
    old_lengths: Vec<u32>,
}
80
81impl RarVM {
82    pub fn new() -> Self {
83        Self {
84            mem: vec![0u8; VM_MEMSIZE + 4],
85            filters: Vec::new(),
86            stack: Vec::new(),
87            last_filter: 0,
88            old_lengths: Vec::new(),
89        }
90    }
91
92    /// Reset VM state
93    pub fn reset(&mut self) {
94        self.filters.clear();
95        self.stack.clear();
96        self.last_filter = 0;
97        self.old_lengths.clear();
98    }
99
100    /// Identify filter type from VM code using CRC
101    fn identify_filter(code: &[u8]) -> StandardFilter {
102        if code.is_empty() {
103            return StandardFilter::None;
104        }
105
106        // Verify XOR checksum
107        let mut xor_sum: u8 = 0;
108        for &b in &code[1..] {
109            xor_sum ^= b;
110        }
111        if xor_sum != code[0] {
112            return StandardFilter::None;
113        }
114
115        // Calculate CRC and match against known filters
116        let code_crc = crc32(code);
117        let code_len = code.len() as u32;
118
119        for sig in FILTER_SIGNATURES {
120            if sig.crc == code_crc && sig.length == code_len {
121                return sig.filter_type;
122            }
123        }
124
125        StandardFilter::None
126    }
127
128    /// Read variable-length data value from bit input
129    fn read_data(data: &[u8], bit_pos: &mut usize) -> u32 {
130        if *bit_pos + 16 > data.len() * 8 {
131            return 0;
132        }
133
134        // Read 16 bits
135        let byte_pos = *bit_pos / 8;
136        let bit_off = *bit_pos % 8;
137        
138        let mut val: u32 = 0;
139        if byte_pos < data.len() {
140            val |= (data[byte_pos] as u32) << 8;
141        }
142        if byte_pos + 1 < data.len() {
143            val |= data[byte_pos + 1] as u32;
144        }
145        val <<= bit_off;
146        val >>= 8;
147        if byte_pos + 2 < data.len() {
148            val |= (data[byte_pos + 2] as u32) >> (8 - bit_off);
149        }
150        val &= 0xffff;
151
152        match val & 0xc000 {
153            0 => {
154                *bit_pos += 6;
155                (val >> 10) & 0xf
156            }
157            0x4000 => {
158                if (val & 0x3c00) == 0 {
159                    *bit_pos += 14;
160                    0xffffff00 | ((val >> 2) & 0xff)
161                } else {
162                    *bit_pos += 10;
163                    (val >> 6) & 0xff
164                }
165            }
166            0x8000 => {
167                *bit_pos += 2;
168                let byte_pos = *bit_pos / 8;
169                let mut result: u32 = 0;
170                if byte_pos < data.len() {
171                    result |= (data[byte_pos] as u32) << 8;
172                }
173                if byte_pos + 1 < data.len() {
174                    result |= data[byte_pos + 1] as u32;
175                }
176                *bit_pos += 16;
177                result
178            }
179            _ => {
180                *bit_pos += 2;
181                let byte_pos = *bit_pos / 8;
182                let mut result: u32 = 0;
183                if byte_pos < data.len() {
184                    result |= (data[byte_pos] as u32) << 24;
185                }
186                if byte_pos + 1 < data.len() {
187                    result |= (data[byte_pos + 1] as u32) << 16;
188                }
189                *bit_pos += 16;
190                let byte_pos = *bit_pos / 8;
191                if byte_pos < data.len() {
192                    result |= (data[byte_pos] as u32) << 8;
193                }
194                if byte_pos + 1 < data.len() {
195                    result |= data[byte_pos + 1] as u32;
196                }
197                *bit_pos += 16;
198                result
199            }
200        }
201    }
202
203    /// Add VM code and create filter
204    pub fn add_code(&mut self, first_byte: u8, code: &[u8]) -> bool {
205        let filter_type = Self::identify_filter(code);
206        
207        let filt_pos = if (first_byte & 0x80) != 0 {
208            let mut bit_pos = 0;
209            let pos = Self::read_data(code, &mut bit_pos);
210            if pos == 0 {
211                // Reset filters
212                self.filters.clear();
213                self.old_lengths.clear();
214            }
215            pos.saturating_sub(1) as usize
216        } else {
217            self.last_filter
218        };
219
220        if filt_pos > self.filters.len() || filt_pos > 1024 {
221            return false;
222        }
223
224        self.last_filter = filt_pos;
225        let new_filter = filt_pos == self.filters.len();
226
227        if new_filter {
228            self.filters.push(StoredFilter { filter_type });
229            self.old_lengths.push(0);
230        }
231
232        // Parse filter parameters from code
233        let mut bit_pos = if (first_byte & 0x80) != 0 { 
234            // Skip the filter position we already read
235            let mut bp = 0;
236            Self::read_data(code, &mut bp);
237            bp
238        } else { 
239            0 
240        };
241
242        let block_start = Self::read_data(code, &mut bit_pos);
243        let block_length = if (first_byte & 0x40) != 0 {
244            Self::read_data(code, &mut bit_pos)
245        } else if filt_pos < self.old_lengths.len() {
246            self.old_lengths[filt_pos]
247        } else {
248            0
249        };
250
251        if filt_pos < self.old_lengths.len() {
252            self.old_lengths[filt_pos] = block_length;
253        }
254
255        // Read initial registers
256        let mut init_r = [0u32; 7];
257        init_r[3] = VM_MEMSIZE as u32;
258        init_r[4] = block_length;
259        init_r[5] = 0; // ExecCount
260        init_r[6] = 0; // FileOffset (set later)
261
262        if (first_byte & 0x20) != 0 {
263            let init_mask = Self::read_data(code, &mut bit_pos) as u8;
264            for i in 0..7 {
265                if (init_mask & (1 << i)) != 0 {
266                    init_r[i] = Self::read_data(code, &mut bit_pos);
267                }
268            }
269        }
270
271        let filter = PreparedFilter {
272            filter_type: self.filters.get(filt_pos).map(|f| f.filter_type).unwrap_or(filter_type),
273            init_r,
274            block_start,
275            block_length,
276        };
277
278        self.stack.push(filter);
279        true
280    }
281
282    /// Check if there are pending filters
283    pub fn has_pending_filters(&self) -> bool {
284        !self.stack.is_empty()
285    }
286
287    /// Get the next filter's block start position
288    pub fn next_filter_pos(&self) -> Option<u32> {
289        self.stack.first().map(|f| f.block_start)
290    }
291
292    /// Execute pending filters on the output window
293    pub fn execute_filters(&mut self, window: &mut [u8], write_pos: u32) -> Option<(usize, usize)> {
294        if self.stack.is_empty() {
295            return None;
296        }
297
298        let filter = &self.stack[0];
299        if filter.block_start > write_pos {
300            return None;
301        }
302
303        let filter = self.stack.remove(0);
304        let block_start = filter.block_start as usize;
305        let block_length = (filter.block_length & VM_MEMMASK) as usize;
306
307        if block_start + block_length > window.len() {
308            return None;
309        }
310
311        // Copy data to VM memory
312        let copy_len = block_length.min(VM_MEMSIZE);
313        self.mem[..copy_len].copy_from_slice(&window[block_start..block_start + copy_len]);
314
315        // Execute filter
316        let (filtered_data, filtered_size) = self.execute_filter(&filter, block_length);
317
318        if filtered_size > 0 && filtered_size <= block_length {
319            // Copy filtered data back
320            window[block_start..block_start + filtered_size]
321                .copy_from_slice(&self.mem[filtered_data..filtered_data + filtered_size]);
322        }
323
324        Some((block_start, filtered_size.max(block_length)))
325    }
326
327    /// Execute a single filter
328    fn execute_filter(&mut self, filter: &PreparedFilter, data_size: usize) -> (usize, usize) {
329        let r = filter.init_r;
330
331        match filter.filter_type {
332            StandardFilter::None => (0, data_size),
333            StandardFilter::E8 | StandardFilter::E8E9 => {
334                self.filter_e8e9(r[4] as usize, r[6], filter.filter_type == StandardFilter::E8E9)
335            }
336            StandardFilter::Itanium => {
337                self.filter_itanium(r[4] as usize, r[6])
338            }
339            StandardFilter::Delta => {
340                self.filter_delta(r[4] as usize, r[0] as usize)
341            }
342            StandardFilter::Rgb => {
343                self.filter_rgb(r[4] as usize, r[0] as usize, r[1] as usize)
344            }
345            StandardFilter::Audio => {
346                self.filter_audio(r[4] as usize, r[0] as usize)
347            }
348        }
349    }
350
351    /// E8/E8E9 filter - x86 CALL/JMP address conversion
352    fn filter_e8e9(&mut self, data_size: usize, file_offset: u32, include_e9: bool) -> (usize, usize) {
353        if !(4..=VM_MEMSIZE).contains(&data_size) {
354            return (0, 0);
355        }
356
357        const FILE_SIZE: u32 = 0x1000000;
358        let cmp_byte2: u8 = if include_e9 { 0xe9 } else { 0xe8 };
359
360        let mut cur_pos: usize = 0;
361        while cur_pos < data_size - 4 {
362            let cur_byte = self.mem[cur_pos];
363            cur_pos += 1;
364
365            if cur_byte == 0xe8 || cur_byte == cmp_byte2 {
366                let offset = cur_pos as u32 + file_offset;
367                let addr = u32::from_le_bytes([
368                    self.mem[cur_pos],
369                    self.mem[cur_pos + 1],
370                    self.mem[cur_pos + 2],
371                    self.mem[cur_pos + 3],
372                ]);
373
374                if (addr & 0x80000000) != 0 {
375                    // addr < 0
376                    if (addr.wrapping_add(offset) & 0x80000000) == 0 {
377                        // addr + offset >= 0
378                        let new_addr = addr.wrapping_add(FILE_SIZE);
379                        self.mem[cur_pos..cur_pos + 4].copy_from_slice(&new_addr.to_le_bytes());
380                    }
381                } else {
382                    // addr >= 0
383                    if (addr.wrapping_sub(FILE_SIZE) & 0x80000000) != 0 {
384                        // addr < FILE_SIZE
385                        let new_addr = addr.wrapping_sub(offset);
386                        self.mem[cur_pos..cur_pos + 4].copy_from_slice(&new_addr.to_le_bytes());
387                    }
388                }
389                cur_pos += 4;
390            }
391        }
392
393        (0, data_size)
394    }
395
396    /// Itanium filter - IA-64 address conversion
397    fn filter_itanium(&mut self, data_size: usize, file_offset: u32) -> (usize, usize) {
398        if !(21..=VM_MEMSIZE).contains(&data_size) {
399            return (0, 0);
400        }
401
402        static MASKS: [u8; 16] = [4, 4, 6, 6, 0, 0, 7, 7, 4, 4, 0, 0, 4, 4, 0, 0];
403
404        let mut cur_pos: usize = 0;
405        let mut file_off = file_offset >> 4;
406
407        while cur_pos < data_size - 21 {
408            let byte_val = (self.mem[cur_pos] & 0x1f) as i32 - 0x10;
409            if byte_val >= 0 {
410                let cmd_mask = MASKS[byte_val as usize];
411                if cmd_mask != 0 {
412                    for i in 0..=2 {
413                        if (cmd_mask & (1 << i)) != 0 {
414                            let start_pos = i * 41 + 5;
415                            let op_type = self.itanium_get_bits(cur_pos, start_pos + 37, 4);
416                            if op_type == 5 {
417                                let offset = self.itanium_get_bits(cur_pos, start_pos + 13, 20);
418                                self.itanium_set_bits(
419                                    cur_pos,
420                                    (offset.wrapping_sub(file_off)) & 0xfffff,
421                                    start_pos + 13,
422                                    20,
423                                );
424                            }
425                        }
426                    }
427                }
428            }
429            cur_pos += 16;
430            file_off = file_off.wrapping_add(1);
431        }
432
433        (0, data_size)
434    }
435
436    fn itanium_get_bits(&self, base: usize, bit_pos: usize, bit_count: usize) -> u32 {
437        let in_addr = base + bit_pos / 8;
438        let in_bit = bit_pos & 7;
439
440        let mut bit_field: u32 = 0;
441        if in_addr < self.mem.len() {
442            bit_field |= self.mem[in_addr] as u32;
443        }
444        if in_addr + 1 < self.mem.len() {
445            bit_field |= (self.mem[in_addr + 1] as u32) << 8;
446        }
447        if in_addr + 2 < self.mem.len() {
448            bit_field |= (self.mem[in_addr + 2] as u32) << 16;
449        }
450        if in_addr + 3 < self.mem.len() {
451            bit_field |= (self.mem[in_addr + 3] as u32) << 24;
452        }
453
454        bit_field >>= in_bit;
455        bit_field & (0xffffffff >> (32 - bit_count))
456    }
457
458    fn itanium_set_bits(&mut self, base: usize, bit_field: u32, bit_pos: usize, bit_count: usize) {
459        let in_addr = base + bit_pos / 8;
460        let in_bit = bit_pos & 7;
461
462        let and_mask = !(((1u32 << bit_count) - 1) << in_bit);
463        let bit_field = bit_field << in_bit;
464
465        for i in 0..4 {
466            if in_addr + i < self.mem.len() {
467                self.mem[in_addr + i] &= (and_mask >> (i * 8)) as u8;
468                self.mem[in_addr + i] |= (bit_field >> (i * 8)) as u8;
469            }
470        }
471    }
472
473    /// Delta filter - byte delta encoding
474    fn filter_delta(&mut self, data_size: usize, channels: usize) -> (usize, usize) {
475        if data_size > VM_MEMSIZE / 2 || channels > MAX_UNPACK_CHANNELS || channels == 0 {
476            return (0, 0);
477        }
478
479        let border = data_size * 2;
480        let mut src_pos = 0;
481
482        for cur_channel in 0..channels {
483            let mut prev_byte: u8 = 0;
484            let mut dest_pos = data_size + cur_channel;
485            while dest_pos < border {
486                prev_byte = prev_byte.wrapping_sub(self.mem[src_pos]);
487                self.mem[dest_pos] = prev_byte;
488                src_pos += 1;
489                dest_pos += channels;
490            }
491        }
492
493        (data_size, data_size)
494    }
495
496    /// RGB filter - predictive color filter
497    fn filter_rgb(&mut self, data_size: usize, width: usize, pos_r: usize) -> (usize, usize) {
498        let width = width.saturating_sub(3);
499        if !(3..=VM_MEMSIZE / 2).contains(&data_size) || width > data_size || pos_r > 2 {
500            return (0, 0);
501        }
502
503        const CHANNELS: usize = 3;
504        let mut src_idx = 0;
505
506        for cur_channel in 0..CHANNELS {
507            let mut prev_byte: u32 = 0;
508
509            let mut i = cur_channel;
510            while i < data_size {
511                let predicted = if i >= width + 3 {
512                    let upper_idx = data_size + i - width;
513                    let upper_byte = self.mem[upper_idx] as u32;
514                    let upper_left_byte = self.mem[upper_idx - 3] as u32;
515
516                    let mut pred = prev_byte.wrapping_add(upper_byte).wrapping_sub(upper_left_byte);
517                    let pa = (pred as i32 - prev_byte as i32).unsigned_abs();
518                    let pb = (pred as i32 - upper_byte as i32).unsigned_abs();
519                    let pc = (pred as i32 - upper_left_byte as i32).unsigned_abs();
520
521                    if pa <= pb && pa <= pc {
522                        pred = prev_byte;
523                    } else if pb <= pc {
524                        pred = upper_byte;
525                    } else {
526                        pred = upper_left_byte;
527                    }
528                    pred
529                } else {
530                    prev_byte
531                };
532
533                prev_byte = predicted.wrapping_sub(self.mem[src_idx] as u32) & 0xff;
534                self.mem[data_size + i] = prev_byte as u8;
535                src_idx += 1;
536                i += CHANNELS;
537            }
538        }
539
540        // Apply RGB correlation
541        let border = data_size - 2;
542        let mut i = pos_r;
543        while i < border {
544            let g = self.mem[data_size + i + 1];
545            self.mem[data_size + i] = self.mem[data_size + i].wrapping_add(g);
546            self.mem[data_size + i + 2] = self.mem[data_size + i + 2].wrapping_add(g);
547            i += 3;
548        }
549
550        (data_size, data_size)
551    }
552
553    /// Audio filter - audio sample predictor
554    fn filter_audio(&mut self, data_size: usize, channels: usize) -> (usize, usize) {
555        if data_size > VM_MEMSIZE / 2 || channels > 128 || channels == 0 {
556            return (0, 0);
557        }
558
559        let mut src_idx = 0;
560
561        for cur_channel in 0..channels {
562            let mut prev_byte: u32 = 0;
563            let mut prev_delta: i32 = 0;
564            let mut dif = [0u32; 7];
565            let mut d1: i32 = 0;
566            let mut d2: i32 = 0;
567            let mut k1: i32 = 0;
568            let mut k2: i32 = 0;
569            let mut k3: i32 = 0;
570
571            let mut i = cur_channel;
572            let mut byte_count = 0u32;
573            while i < data_size {
574                let d3 = d2;
575                d2 = prev_delta - d1;
576                d1 = prev_delta;
577
578                let predicted = (8i32 * prev_byte as i32 + k1 * d1 + k2 * d2 + k3 * d3) >> 3;
579                let predicted = (predicted as u32) & 0xff;
580
581                let cur_byte = self.mem[src_idx] as u32;
582                src_idx += 1;
583
584                let result = predicted.wrapping_sub(cur_byte) & 0xff;
585                self.mem[data_size + i] = result as u8;
586                prev_delta = (result as i8) as i32 - (prev_byte as i8) as i32;
587                prev_byte = result;
588
589                let d = ((cur_byte as i8) as i32) << 3;
590
591                dif[0] = dif[0].wrapping_add(d.unsigned_abs());
592                dif[1] = dif[1].wrapping_add((d - d1).unsigned_abs());
593                dif[2] = dif[2].wrapping_add((d + d1).unsigned_abs());
594                dif[3] = dif[3].wrapping_add((d - d2).unsigned_abs());
595                dif[4] = dif[4].wrapping_add((d + d2).unsigned_abs());
596                dif[5] = dif[5].wrapping_add((d - d3).unsigned_abs());
597                dif[6] = dif[6].wrapping_add((d + d3).unsigned_abs());
598
599                if (byte_count & 0x1f) == 0 {
600                    let mut min_dif = dif[0];
601                    let mut num_min_dif = 0;
602                    dif[0] = 0;
603
604                    for j in 1..7 {
605                        if dif[j] < min_dif {
606                            min_dif = dif[j];
607                            num_min_dif = j;
608                        }
609                        dif[j] = 0;
610                    }
611
612                    match num_min_dif {
613                        1 => { if k1 >= -16 { k1 -= 1; } }
614                        2 => { if k1 < 16 { k1 += 1; } }
615                        3 => { if k2 >= -16 { k2 -= 1; } }
616                        4 => { if k2 < 16 { k2 += 1; } }
617                        5 => { if k3 >= -16 { k3 -= 1; } }
618                        6 => { if k3 < 16 { k3 += 1; } }
619                        _ => {}
620                    }
621                }
622
623                i += channels;
624                byte_count += 1;
625            }
626        }
627
628        (data_size, data_size)
629    }
630}
631
/// `RarVM::default()` is the same as `RarVM::new()`.
impl Default for RarVM {
    fn default() -> Self {
        Self::new()
    }
}
637
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_filter_identification() {
        // Empty code is never a known filter.
        assert_eq!(RarVM::identify_filter(&[]), StandardFilter::None);
        // First byte (0x01) does not equal the XOR of the rest (0x06).
        assert_eq!(RarVM::identify_filter(&[0x01, 0x02, 0x04]), StandardFilter::None);
    }

    #[test]
    fn test_delta_filter() {
        let mut vm = RarVM::new();

        // Simple delta test: 3 channels, 6 bytes
        vm.mem[..6].copy_from_slice(&[10, 20, 30, 5, 10, 15]);

        let (offset, size) = vm.filter_delta(6, 3);
        assert_eq!(offset, 6);
        assert_eq!(size, 6);

        // Channel 0 consumes 10, 20; channel 1 consumes 30, 5; channel 2
        // consumes 10, 15. Each channel writes its running negated sum to
        // every third output byte.
        assert_eq!(&vm.mem[6..12], &[246, 226, 246, 226, 221, 231]);
    }

    #[test]
    fn test_delta_filter_rejects_zero_channels() {
        let mut vm = RarVM::new();
        assert_eq!(vm.filter_delta(6, 0), (0, 0));
    }

    #[test]
    fn test_e8_filter() {
        let mut vm = RarVM::new();

        // E8 followed by the little-endian operand 0x0010_0000.
        vm.mem[..5].copy_from_slice(&[0xe8, 0x00, 0x00, 0x10, 0x00]);

        let (offset, size) = vm.filter_e8e9(5, 0, false);
        assert_eq!(offset, 0);
        assert_eq!(size, 5);

        // The operand is non-negative and below 0x0100_0000, so the offset
        // just past the opcode (1) is subtracted: 0x000f_ffff.
        assert_eq!(&vm.mem[..5], &[0xe8, 0xff, 0xff, 0x0f, 0x00]);
    }

    #[test]
    fn test_e8_filter_rejects_short_block() {
        let mut vm = RarVM::new();
        assert_eq!(vm.filter_e8e9(3, 0, false), (0, 0));
    }
}