beeziptoo/
lib.rs

1// Copyright ® 2023-2024 Andrew Halle and Randy Barlow
2//! beeziptoo
3//!
4//! Because we wanted to implement `bzip2`, too.
5//!
6//! <https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf> was used as a reference
7//! when writing this library.
8use std::io::{self, Cursor, Read};
9
10use crate::burrows_wheeler::BwtEncoded;
11
12mod burrows_wheeler;
13mod file_format;
14mod huffman;
15mod move_to_front;
16mod rle1;
17mod rle2;
18
19/// These are the possible errors that can occur during compression.
20#[derive(Debug, thiserror::Error)]
21pub enum CompressError {
22    /// An IO error occurred.
23    #[error("I/O error: {0}")]
24    IOError(io::Error),
25}
26
27impl From<io::Error> for CompressError {
28    fn from(value: io::Error) -> Self {
29        CompressError::IOError(value)
30    }
31}
32
33/// These are the possible errors that can occur during decompression.
34#[derive(Debug, thiserror::Error)]
35pub enum DecompressError {
36    /// An IO error occurred.
37    #[error("I/O error: {0}")]
38    IOError(io::Error),
39    /// Unable to parse the bzip2 stream.
40    #[error("Unable to parse the bzip2 stream: {0}")]
41    Parse(#[from] file_format::DecodeError),
42    /// The runlength decoder encountered an invalid input.
43    #[error("Failed to decode at a runlength step")]
44    RunLengthDecode,
45    /// The burrows-wheeler decoder encountered an invalid input.
46    #[error("Failed to decode at a burrows-wheeler step")]
47    BurrowsWheelerDecode(#[from] burrows_wheeler::DecodeError),
48    /// The huffman decoder encountered an invalid input.
49    #[error("Failed to decode at a huffman code step")]
50    HuffmanDecode,
51}
52
53impl From<io::Error> for DecompressError {
54    fn from(value: io::Error) -> Self {
55        DecompressError::IOError(value)
56    }
57}
58
59impl From<rle1::Error> for DecompressError {
60    fn from(value: rle1::Error) -> Self {
61        match value {
62            rle1::Error::RunLengthInvalid(_) => DecompressError::RunLengthDecode,
63            rle1::Error::RunLengthTruncated => DecompressError::RunLengthDecode,
64        }
65    }
66}
67
68impl From<huffman::Error> for DecompressError {
69    fn from(_value: huffman::Error) -> Self {
70        Self::HuffmanDecode
71    }
72}
73
74/// Compress the given data.
75pub fn compress<R>(mut data: R) -> Result<impl Read, CompressError>
76where
77    R: Read,
78{
79    let mut all_data = vec![];
80    data.read_to_end(&mut all_data)?;
81
82    let rle_data = rle1::encode(&all_data);
83    // TODO: Origin pointer is unused here. When we write out the file in the correct format, we
84    // will use it then.
85    let burrows_wheeler_data = burrows_wheeler::encode(&rle_data);
86    let move_to_front_data = move_to_front::encode(&burrows_wheeler_data.data);
87    let rle2_data = rle2::encode(&move_to_front_data);
88    let _huffman_data = huffman::encode(&rle2_data);
89    // This is a stub to satisfy the return type. We need to put something here that can turn
90    // huffman_data into the bzip2 file format.
91    let file_data = Vec::new();
92
93    let cursor = Cursor::new(file_data);
94
95    Ok(cursor)
96}
97
98/// Decompress the given data.
99///
100/// # Errors
101///
102/// This function is failable since it is possible the given data isn't a valid `bzip2` archive.
103pub fn decompress<R>(mut data: R) -> Result<impl Read, DecompressError>
104where
105    R: Read,
106{
107    let mut all_data = vec![];
108    let mut decompressed_data = vec![];
109    data.read_to_end(&mut all_data)?;
110
111    let un_file_data = file_format::decode(&all_data)?;
112
113    for block in &un_file_data {
114        let un_huffman_data = huffman::decode(block.symbols());
115        let un_rle2 = rle2::decode(&un_huffman_data);
116        let un_move_to_front_data = move_to_front::decode(&un_rle2, block.symbol_stack());
117        let un_burrows_wheeler_data = burrows_wheeler::decode(&BwtEncoded::new(
118            un_move_to_front_data,
119            block.origin_pointer(),
120        ))?;
121        let mut un_rle_data = rle1::decode(&un_burrows_wheeler_data)?;
122        decompressed_data.append(&mut un_rle_data);
123    }
124
125    let cursor = Cursor::new(decompressed_data);
126
127    Ok(cursor)
128}
129
130#[cfg(test)]
131mod tests {
132    use std::{
133        io::Write as _,
134        process::{Command, Stdio},
135    };
136
137    use super::*;
138
139    #[test]
140    fn can_we_read() {
141        let peter_piper = "If Peter Piper picked a peck of pickled peppers, where's the peck of pickled peppers Peter Piper picked?????";
142
143        for level in 1..=9 {
144            let mut child = Command::new("bzip2")
145                .arg("-c")
146                .arg(format!("-{level}"))
147                .stdin(Stdio::piped())
148                .stdout(Stdio::piped())
149                .spawn()
150                .unwrap();
151            {
152                let mut stdin = child.stdin.take().unwrap();
153                stdin.write_all(peter_piper.as_bytes()).unwrap();
154            }
155            let bytes = child.wait_with_output().unwrap().stdout;
156
157            let mut data = decompress(&bytes[..]).expect("Cannot decompress test data");
158
159            let mut buffer = vec![];
160            let _num_bytes = data
161                .read_to_end(&mut buffer)
162                .expect("Cannot read decompressed data");
163            assert_eq!(std::str::from_utf8(&buffer).unwrap(), peter_piper);
164        }
165    }
166
167    // This was found by the fuzzer.
168    #[test]
169    fn four_bytes() {
170        let sample = [0x04, 0x00, 0x00, 0x00];
171
172        let mut child = Command::new("bzip2")
173            .arg("-c")
174            .stdin(Stdio::piped())
175            .stdout(Stdio::piped())
176            .spawn()
177            .unwrap();
178        {
179            let mut stdin = child.stdin.take().unwrap();
180            stdin.write_all(&sample).unwrap();
181        }
182        let bytes = child.wait_with_output().unwrap().stdout;
183
184        let mut data = decompress(&bytes[..]).expect("Cannot decompress test data");
185
186        let mut buffer = vec![];
187        let _bytes = data
188            .read_to_end(&mut buffer)
189            .expect("Cannot read decompressed data");
190        assert_eq!(buffer, sample);
191    }
192
193    // This was found by the fuzzer.
194    //
195    // The root cause was failing to do the MTF decode after parsing the list of tree selector
196    // indices.
197    #[test]
198    fn parsing_selectors() {
199        let sample = [
200            0, 255, 255, 72, 255, 255, 255, 255, 0, 5, 2, 255, 7, 255, 0, 0, 0, 0, 1, 0, 2, 255, 0,
201            255, 255, 255, 255, 196, 251, 255, 183, 0, 0, 0, 182, 5, 0, 0, 255, 183, 0, 0, 0, 182,
202            5, 0, 0, 0,
203        ];
204
205        let mut child = Command::new("bzip2")
206            .arg("-c")
207            .stdin(Stdio::piped())
208            .stdout(Stdio::piped())
209            .spawn()
210            .unwrap();
211        {
212            let mut stdin = child.stdin.take().unwrap();
213            stdin.write_all(&sample).unwrap();
214        }
215        let bytes = child.wait_with_output().unwrap().stdout;
216
217        let mut data = decompress(&bytes[..]).expect("Cannot decompress test data");
218
219        let mut buffer = vec![];
220        let _bytes = data
221            .read_to_end(&mut buffer)
222            .expect("Cannot read decompressed data");
223        assert_eq!(buffer, sample);
224    }
225}