sfwtools/
compression.rs

1// TODO: distinguish between byte-based and unicode-based compression
2
/*
This entire file used tabs.rs as a starting point, and the structure remains similar.
Instead of detab/entab, we have compress/expand. Both are filters, and they share
much of the same code structure, data structures, user interface, etc.
 */
8
9use std::convert::TryFrom;
10use std::fs::File;
11use std::io::{Error, ErrorKind::Other, Read, Write};
12use std::iter::Peekable;
13
14use peeking_take_while::PeekableExt;
15use seahorse::{App, Command, Context};
16use tailcall::tailcall;
17
18use crate::bytes_iter::BytesIter;
19use crate::error::*;
20use crate::iter_extra::*;
21use crate::util::write_u8;
22
/// Minimum length a streak of identical bytes must reach before `compress_go`
/// writes it in run-length form; shorter streaks are buffered as literals.
const THRESH: usize = 5;
/// Marker byte that `compress_go` writes in front of a run-length triple
/// (marker, repeated byte, count); `expand_go` treats a leading `0` the same way.
const RCODE: u8 = 0;
/// Size passed to `BytesIter::new`, and the cap on how many literal bytes are
/// buffered before they are flushed as one length-prefixed block.
const MAX_CHUNK_SIZE: usize = 255;
26
27pub fn compress_app() -> App {
28    App::new("compress")
29        .author("Brandon Elam Barker")
30        .action(run_compress_seahorse_action)
31        .command(run_compress_seahorse_cmd())
32}
33
/// Usage line shown for `compress`. `DEST_FILE` is bracketed because the
/// action falls back to STDOUT when no destination argument is given.
const COMPRESS_USAGE: &str = "compress SOURCE_FILE [DEST_FILE]";
35
36pub fn run_compress_seahorse_cmd() -> Command {
37    Command::new("compress")
38        .description(
39            "compress: adjacent redundancy compression\
40        ; output to STDOUT is the default",
41        )
42        .usage(COMPRESS_USAGE)
43        .action(run_compress_seahorse_action)
44}
45
46pub fn run_compress_seahorse_action(ctxt: &Context) {
47    let args = &mut ctxt.args.iter();
48    let src = args.next().user_err("compress: missing source");
49    let f_out: Box<dyn Write> = match args.next() {
50        Some(dst) => Box::new(
51            File::create(dst)
52                .user_err(&format!("Couldn't open destination: {}", &dst)),
53        ),
54        None => Box::new(std::io::stdout()),
55    };
56    run_compress(src, f_out);
57}
58
59/// Convenience function for running compress in idiomatic fashion
60/// (i.e.) errors are printed to user and the program exits.
61pub fn run_compress(src: &str, dst: Box<dyn Write>) {
62    compress(src, dst).user_err("Error in compress");
63}
64
65pub fn compress<W: Write>(src: &str, mut f_out: W) -> Result<(), Error> {
66    let f_in =
67        File::open(src).sfw_err(&format!("Couldn't open source '{}'", src))?;
68    let f_in_iter = BytesIter::new(f_in, MAX_CHUNK_SIZE);
69    let mut out_buf: Vec<u8> = Vec::with_capacity(MAX_CHUNK_SIZE);
70    compress_go(
71        &mut f_out,
72        f_in_iter,
73        vec![].into_iter().peekable(),
74        &mut out_buf,
75    )
76}
77
78// This implementation does not compress across boundaries in byte chunks,
79// If this were desired, then a folded approach, as is used for word counts,
80// might be desirable.
81#[tailcall]
82fn compress_go<'a, R, W>(
83    f_out: &mut W,
84    mut bytes_iter: BytesIter<R>,
85    mut buf_iter: Peekable<std::vec::IntoIter<u8>>,
86    out_buf: &mut Vec<u8>,
87) -> Result<(), Error>
88where
89    R: Read,
90    W: Write,
91{
92    match buf_iter.next() {
93        Some(char) => {
94            let char_streak = &mut buf_iter
95                .by_ref()
96                .peeking_take_while(|c| *c == char)
97                .collect::<Vec<u8>>();
98            char_streak.push(char);
99            if char_streak.len() >= THRESH {
100                write_buf_out(out_buf, f_out)?; // Write out non-streak buffer
101                write_u8(f_out, RCODE)?;
102                write_u8(f_out, char)?;
103                let char_streak_len = char_streak.len();
104                let streak_len =
105                    u8::try_from(char_streak_len).map_err(|_| {
106                        Error::new(
107                            Other,
108                            format!(
109                                "Couldn't convert char_streak_len '{}' to a u8",
110                                char_streak_len
111                            ),
112                        )
113                    })?;
114                write_u8(f_out, streak_len)?;
115            } else {
116                if out_buf.len() + char_streak.len() > MAX_CHUNK_SIZE {
117                    write_buf_out(out_buf, f_out)?;
118                }
119                out_buf.append(char_streak);
120            }
121            compress_go(f_out, bytes_iter, buf_iter, out_buf)
122        }
123        None => {
124            match bytes_iter.next() {
125                Some(buf_new) => {
126                    let buf_iter = buf_new?.into_iter().peekable(); //shadow
127                    compress_go(f_out, bytes_iter, buf_iter, out_buf)
128                }
129                None => write_buf_out(out_buf, f_out), /* Finished */
130            }
131        }
132    }
133}
134
135fn write_buf_out<W: Write>(
136    out_buf: &mut Vec<u8>,
137    f_out: &mut W,
138) -> Result<(), Error> {
139    let out_len = out_buf.len();
140    let out_len = u8::try_from(out_len).map_err(|_| {
141        Error::new(
142            Other,
143            format!("Couldn't convert out_len '{}' to a u8", out_len),
144        )
145    })?;
146    write_u8(f_out, out_len)?;
147    f_out.write_all(out_buf)?;
148    out_buf.clear();
149    Ok(())
150}
151
152pub fn expand_app() -> App {
153    App::new("expand")
154        .author("Brandon Elam Barker")
155        .action(run_expand_seahorse_action)
156        .command(run_expand_seahorse_cmd())
157}
/// Usage line shown for `expand`. `DEST_FILE` is bracketed because the action
/// falls back to STDOUT when no destination argument is given.
const EXPAND_USAGE: &str = "expand SOURCE_FILE [DEST_FILE]";
159
160pub fn run_expand_seahorse_cmd() -> Command {
161    Command::new("expand")
162        .description(
163            "expand: replace spaces with tabs in a file\
164            ; output to STDOUT is the default",
165        )
166        .usage(EXPAND_USAGE)
167        .action(run_expand_seahorse_action)
168}
169
170pub fn run_expand_seahorse_action(ctxt: &Context) {
171    let args = &mut ctxt.args.iter();
172    let src = args.next().user_err("expand: missing source");
173    let f_out: Box<dyn Write> = match args.next() {
174        Some(dst) => Box::new(
175            File::create(dst)
176                .user_err(&format!("Couldn't open destination: {}", &dst)),
177        ),
178        None => Box::new(std::io::stdout()),
179    };
180    run_expand(src, f_out);
181}
182
183/// Convenience function for running expand in idiomatic fashion
184/// (i.e.) errors are printed to user and the program exits.
185pub fn run_expand(src: &str, dst: Box<dyn Write>) {
186    expand(src, dst).user_err("Error in expand");
187}
188
189pub fn expand<W: Write>(src: &str, mut f_out: W) -> Result<(), Error> {
190    let f_in = File::open(src).sfw_err("Couldn't open source")?;
191    let f_in_iter = BytesIter::new(f_in, MAX_CHUNK_SIZE);
192    expand_go(&mut f_out, f_in_iter, vec![].into_iter())
193}
194
195#[tailcall]
196fn expand_go<'a, R, W>(
197    f_out: &mut W,
198    mut bytes_iter: BytesIter<R>,
199    mut buf_iter: std::vec::IntoIter<u8>,
200) -> Result<(), Error>
201where
202    R: Read,
203    W: Write,
204{
205    match buf_iter.next() {
206        Some(byte) => {
207            match byte {
208                0 => {
209                    let repeat_char = buf_iter
210                        .next()
211                        .sfw_err("Couldn't read repeat character")?;
212                    let repeat_count = buf_iter
213                        .next()
214                        .sfw_err("Couldn't read repeat count")?;
215                    f_out.write_all(
216                        &(0..repeat_count)
217                            .map(|_| repeat_char)
218                            .collect::<Vec<u8>>(),
219                    )?;
220                }
221                read_size => {
222                    let read_size = read_size as usize;
223                    let non_repeat_string =
224                        buf_iter.by_ref().safe_take(read_size)?;
225                    f_out.write_all(&non_repeat_string)?
226                }
227            };
228            expand_go(f_out, bytes_iter, buf_iter)
229        }
230        None => {
231            match bytes_iter.next() {
232                Some(buf_new) => {
233                    let buf_iter = buf_new?.into_iter(); //shadow
234                    expand_go(f_out, bytes_iter, buf_iter)
235                }
236                None => Ok(()), /* Finished */
237            }
238        }
239    }
240}