grep_cli/decompress.rs
1use std::{
2    ffi::{OsStr, OsString},
3    fs::File,
4    io,
5    path::{Path, PathBuf},
6    process::Command,
7};
8
9use globset::{Glob, GlobSet, GlobSetBuilder};
10
11use crate::process::{CommandError, CommandReader, CommandReaderBuilder};
12
13/// A builder for a matcher that determines which files get decompressed.
14#[derive(Clone, Debug)]
15pub struct DecompressionMatcherBuilder {
16    /// The commands for each matching glob.
17    commands: Vec<DecompressionCommand>,
18    /// Whether to include the default matching rules.
19    defaults: bool,
20}
21
/// One glob-to-command association used for out-of-process decompression:
/// file paths matching `glob` are decompressed by running `bin` with `args`.
#[derive(Debug, Clone)]
struct DecompressionCommand {
    /// The glob pattern selecting the file paths this command applies to.
    glob: String,
    /// The program (name or path) to execute.
    bin: PathBuf,
    /// The arguments handed to the program on every invocation.
    args: Vec<OsString>,
}
33
34impl Default for DecompressionMatcherBuilder {
35    fn default() -> DecompressionMatcherBuilder {
36        DecompressionMatcherBuilder::new()
37    }
38}
39
40impl DecompressionMatcherBuilder {
41    /// Create a new builder for configuring a decompression matcher.
42    pub fn new() -> DecompressionMatcherBuilder {
43        DecompressionMatcherBuilder { commands: vec![], defaults: true }
44    }
45
46    /// Build a matcher for determining how to decompress files.
47    ///
48    /// If there was a problem compiling the matcher, then an error is
49    /// returned.
50    pub fn build(&self) -> Result<DecompressionMatcher, CommandError> {
51        let defaults = if !self.defaults {
52            vec![]
53        } else {
54            default_decompression_commands()
55        };
56        let mut glob_builder = GlobSetBuilder::new();
57        let mut commands = vec![];
58        for decomp_cmd in defaults.iter().chain(&self.commands) {
59            let glob = Glob::new(&decomp_cmd.glob).map_err(|err| {
60                CommandError::io(io::Error::new(io::ErrorKind::Other, err))
61            })?;
62            glob_builder.add(glob);
63            commands.push(decomp_cmd.clone());
64        }
65        let globs = glob_builder.build().map_err(|err| {
66            CommandError::io(io::Error::new(io::ErrorKind::Other, err))
67        })?;
68        Ok(DecompressionMatcher { globs, commands })
69    }
70
71    /// When enabled, the default matching rules will be compiled into this
72    /// matcher before any other associations. When disabled, only the
73    /// rules explicitly given to this builder will be used.
74    ///
75    /// This is enabled by default.
76    pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder {
77        self.defaults = yes;
78        self
79    }
80
81    /// Associates a glob with a command to decompress files matching the glob.
82    ///
83    /// If multiple globs match the same file, then the most recently added
84    /// glob takes precedence.
85    ///
86    /// The syntax for the glob is documented in the
87    /// [`globset` crate](https://docs.rs/globset/#syntax).
88    ///
89    /// The `program` given is resolved with respect to `PATH` and turned
90    /// into an absolute path internally before being executed by the current
91    /// platform. Notably, on Windows, this avoids a security problem where
92    /// passing a relative path to `CreateProcess` will automatically search
93    /// the current directory for a matching program. If the program could
94    /// not be resolved, then it is silently ignored and the association is
95    /// dropped. For this reason, callers should prefer `try_associate`.
96    pub fn associate<P, I, A>(
97        &mut self,
98        glob: &str,
99        program: P,
100        args: I,
101    ) -> &mut DecompressionMatcherBuilder
102    where
103        P: AsRef<OsStr>,
104        I: IntoIterator<Item = A>,
105        A: AsRef<OsStr>,
106    {
107        let _ = self.try_associate(glob, program, args);
108        self
109    }
110
111    /// Associates a glob with a command to decompress files matching the glob.
112    ///
113    /// If multiple globs match the same file, then the most recently added
114    /// glob takes precedence.
115    ///
116    /// The syntax for the glob is documented in the
117    /// [`globset` crate](https://docs.rs/globset/#syntax).
118    ///
119    /// The `program` given is resolved with respect to `PATH` and turned
120    /// into an absolute path internally before being executed by the current
121    /// platform. Notably, on Windows, this avoids a security problem where
122    /// passing a relative path to `CreateProcess` will automatically search
123    /// the current directory for a matching program. If the program could not
124    /// be resolved, then an error is returned.
125    pub fn try_associate<P, I, A>(
126        &mut self,
127        glob: &str,
128        program: P,
129        args: I,
130    ) -> Result<&mut DecompressionMatcherBuilder, CommandError>
131    where
132        P: AsRef<OsStr>,
133        I: IntoIterator<Item = A>,
134        A: AsRef<OsStr>,
135    {
136        let glob = glob.to_string();
137        let bin = try_resolve_binary(Path::new(program.as_ref()))?;
138        let args =
139            args.into_iter().map(|a| a.as_ref().to_os_string()).collect();
140        self.commands.push(DecompressionCommand { glob, bin, args });
141        Ok(self)
142    }
143}
144
145/// A matcher for determining how to decompress files.
146#[derive(Clone, Debug)]
147pub struct DecompressionMatcher {
148    /// The set of globs to match. Each glob has a corresponding entry in
149    /// `commands`. When a glob matches, the corresponding command should be
150    /// used to perform out-of-process decompression.
151    globs: GlobSet,
152    /// The commands for each matching glob.
153    commands: Vec<DecompressionCommand>,
154}
155
156impl Default for DecompressionMatcher {
157    fn default() -> DecompressionMatcher {
158        DecompressionMatcher::new()
159    }
160}
161
162impl DecompressionMatcher {
163    /// Create a new matcher with default rules.
164    ///
165    /// To add more matching rules, build a matcher with
166    /// [`DecompressionMatcherBuilder`].
167    pub fn new() -> DecompressionMatcher {
168        DecompressionMatcherBuilder::new()
169            .build()
170            .expect("built-in matching rules should always compile")
171    }
172
173    /// Return a pre-built command based on the given file path that can
174    /// decompress its contents. If no such decompressor is known, then this
175    /// returns `None`.
176    ///
177    /// If there are multiple possible commands matching the given path, then
178    /// the command added last takes precedence.
179    pub fn command<P: AsRef<Path>>(&self, path: P) -> Option<Command> {
180        if let Some(i) = self.globs.matches(path).into_iter().next_back() {
181            let decomp_cmd = &self.commands[i];
182            let mut cmd = Command::new(&decomp_cmd.bin);
183            cmd.args(&decomp_cmd.args);
184            return Some(cmd);
185        }
186        None
187    }
188
189    /// Returns true if and only if the given file path has at least one
190    /// matching command to perform decompression on.
191    pub fn has_command<P: AsRef<Path>>(&self, path: P) -> bool {
192        self.globs.is_match(path)
193    }
194}
195
196/// Configures and builds a streaming reader for decompressing data.
197#[derive(Clone, Debug, Default)]
198pub struct DecompressionReaderBuilder {
199    matcher: DecompressionMatcher,
200    command_builder: CommandReaderBuilder,
201}
202
203impl DecompressionReaderBuilder {
204    /// Create a new builder with the default configuration.
205    pub fn new() -> DecompressionReaderBuilder {
206        DecompressionReaderBuilder::default()
207    }
208
209    /// Build a new streaming reader for decompressing data.
210    ///
211    /// If decompression is done out-of-process and if there was a problem
212    /// spawning the process, then its error is logged at the debug level and a
213    /// passthru reader is returned that does no decompression. This behavior
214    /// typically occurs when the given file path matches a decompression
215    /// command, but is executing in an environment where the decompression
216    /// command is not available.
217    ///
218    /// If the given file path could not be matched with a decompression
219    /// strategy, then a passthru reader is returned that does no
220    /// decompression.
221    pub fn build<P: AsRef<Path>>(
222        &self,
223        path: P,
224    ) -> Result<DecompressionReader, CommandError> {
225        let path = path.as_ref();
226        let Some(mut cmd) = self.matcher.command(path) else {
227            return DecompressionReader::new_passthru(path);
228        };
229        cmd.arg(path);
230
231        match self.command_builder.build(&mut cmd) {
232            Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }),
233            Err(err) => {
234                log::debug!(
235                    "{}: error spawning command '{:?}': {} \
236                     (falling back to uncompressed reader)",
237                    path.display(),
238                    cmd,
239                    err,
240                );
241                DecompressionReader::new_passthru(path)
242            }
243        }
244    }
245
246    /// Set the matcher to use to look up the decompression command for each
247    /// file path.
248    ///
249    /// A set of sensible rules is enabled by default. Setting this will
250    /// completely replace the current rules.
251    pub fn matcher(
252        &mut self,
253        matcher: DecompressionMatcher,
254    ) -> &mut DecompressionReaderBuilder {
255        self.matcher = matcher;
256        self
257    }
258
259    /// Get the underlying matcher currently used by this builder.
260    pub fn get_matcher(&self) -> &DecompressionMatcher {
261        &self.matcher
262    }
263
264    /// When enabled, the reader will asynchronously read the contents of the
265    /// command's stderr output. When disabled, stderr is only read after the
266    /// stdout stream has been exhausted (or if the process quits with an error
267    /// code).
268    ///
269    /// Note that when enabled, this may require launching an additional
270    /// thread in order to read stderr. This is done so that the process being
271    /// executed is never blocked from writing to stdout or stderr. If this is
272    /// disabled, then it is possible for the process to fill up the stderr
273    /// buffer and deadlock.
274    ///
275    /// This is enabled by default.
276    pub fn async_stderr(
277        &mut self,
278        yes: bool,
279    ) -> &mut DecompressionReaderBuilder {
280        self.command_builder.async_stderr(yes);
281        self
282    }
283}
284
285/// A streaming reader for decompressing the contents of a file.
286///
287/// The purpose of this reader is to provide a seamless way to decompress the
288/// contents of file using existing tools in the current environment. This is
289/// meant to be an alternative to using decompression libraries in favor of the
290/// simplicity and portability of using external commands such as `gzip` and
291/// `xz`. This does impose the overhead of spawning a process, so other means
292/// for performing decompression should be sought if this overhead isn't
293/// acceptable.
294///
295/// A decompression reader comes with a default set of matching rules that are
296/// meant to associate file paths with the corresponding command to use to
297/// decompress them. For example, a glob like `*.gz` matches gzip compressed
298/// files with the command `gzip -d -c`. If a file path does not match any
299/// existing rules, or if it matches a rule whose command does not exist in the
300/// current environment, then the decompression reader passes through the
301/// contents of the underlying file without doing any decompression.
302///
303/// The default matching rules are probably good enough for most cases, and if
304/// they require revision, pull requests are welcome. In cases where they must
305/// be changed or extended, they can be customized through the use of
306/// [`DecompressionMatcherBuilder`] and [`DecompressionReaderBuilder`].
307///
308/// By default, this reader will asynchronously read the processes' stderr.
309/// This prevents subtle deadlocking bugs for noisy processes that write a lot
310/// to stderr. Currently, the entire contents of stderr is read on to the heap.
311///
312/// # Example
313///
314/// This example shows how to read the decompressed contents of a file without
315/// needing to explicitly choose the decompression command to run.
316///
317/// Note that if you need to decompress multiple files, it is better to use
318/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the
319/// matcher.
320///
321/// ```no_run
322/// use std::{io::Read, process::Command};
323///
324/// use grep_cli::DecompressionReader;
325///
326/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?;
327/// let mut contents = vec![];
328/// rdr.read_to_end(&mut contents)?;
329/// # Ok::<(), Box<dyn std::error::Error>>(())
330/// ```
331#[derive(Debug)]
332pub struct DecompressionReader {
333    rdr: Result<CommandReader, File>,
334}
335
336impl DecompressionReader {
337    /// Build a new streaming reader for decompressing data.
338    ///
339    /// If decompression is done out-of-process and if there was a problem
340    /// spawning the process, then its error is returned.
341    ///
342    /// If the given file path could not be matched with a decompression
343    /// strategy, then a passthru reader is returned that does no
344    /// decompression.
345    ///
346    /// This uses the default matching rules for determining how to decompress
347    /// the given file. To change those matching rules, use
348    /// [`DecompressionReaderBuilder`] and [`DecompressionMatcherBuilder`].
349    ///
350    /// When creating readers for many paths. it is better to use the builder
351    /// since it will amortize the cost of constructing the matcher.
352    pub fn new<P: AsRef<Path>>(
353        path: P,
354    ) -> Result<DecompressionReader, CommandError> {
355        DecompressionReaderBuilder::new().build(path)
356    }
357
358    /// Creates a new "passthru" decompression reader that reads from the file
359    /// corresponding to the given path without doing decompression and without
360    /// executing another process.
361    fn new_passthru(path: &Path) -> Result<DecompressionReader, CommandError> {
362        let file = File::open(path)?;
363        Ok(DecompressionReader { rdr: Err(file) })
364    }
365
366    /// Closes this reader, freeing any resources used by its underlying child
367    /// process, if one was used. If the child process exits with a nonzero
368    /// exit code, the returned Err value will include its stderr.
369    ///
370    /// `close` is idempotent, meaning it can be safely called multiple times.
371    /// The first call closes the CommandReader and any subsequent calls do
372    /// nothing.
373    ///
374    /// This method should be called after partially reading a file to prevent
375    /// resource leakage. However there is no need to call `close` explicitly
376    /// if your code always calls `read` to EOF, as `read` takes care of
377    /// calling `close` in this case.
378    ///
379    /// `close` is also called in `drop` as a last line of defense against
380    /// resource leakage. Any error from the child process is then printed as a
381    /// warning to stderr. This can be avoided by explicitly calling `close`
382    /// before the CommandReader is dropped.
383    pub fn close(&mut self) -> io::Result<()> {
384        match self.rdr {
385            Ok(ref mut rdr) => rdr.close(),
386            Err(_) => Ok(()),
387        }
388    }
389}
390
391impl io::Read for DecompressionReader {
392    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
393        match self.rdr {
394            Ok(ref mut rdr) => rdr.read(buf),
395            Err(ref mut rdr) => rdr.read(buf),
396        }
397    }
398}
399
400/// Resolves a path to a program to a path by searching for the program in
401/// `PATH`.
402///
403/// If the program could not be resolved, then an error is returned.
404///
405/// The purpose of doing this instead of passing the path to the program
406/// directly to Command::new is that Command::new will hand relative paths
407/// to CreateProcess on Windows, which will implicitly search the current
408/// working directory for the executable. This could be undesirable for
409/// security reasons. e.g., running ripgrep with the -z/--search-zip flag on an
410/// untrusted directory tree could result in arbitrary programs executing on
411/// Windows.
412///
413/// Note that this could still return a relative path if PATH contains a
414/// relative path. We permit this since it is assumed that the user has set
415/// this explicitly, and thus, desires this behavior.
416///
417/// # Platform behavior
418///
419/// On non-Windows, this is a no-op.
420pub fn resolve_binary<P: AsRef<Path>>(
421    prog: P,
422) -> Result<PathBuf, CommandError> {
423    if !cfg!(windows) {
424        return Ok(prog.as_ref().to_path_buf());
425    }
426    try_resolve_binary(prog)
427}
428
429/// Resolves a path to a program to a path by searching for the program in
430/// `PATH`.
431///
432/// If the program could not be resolved, then an error is returned.
433///
434/// The purpose of doing this instead of passing the path to the program
435/// directly to Command::new is that Command::new will hand relative paths
436/// to CreateProcess on Windows, which will implicitly search the current
437/// working directory for the executable. This could be undesirable for
438/// security reasons. e.g., running ripgrep with the -z/--search-zip flag on an
439/// untrusted directory tree could result in arbitrary programs executing on
440/// Windows.
441///
442/// Note that this could still return a relative path if PATH contains a
443/// relative path. We permit this since it is assumed that the user has set
444/// this explicitly, and thus, desires this behavior.
445///
446/// If `check_exists` is false or the path is already an absolute path this
447/// will return immediately.
448fn try_resolve_binary<P: AsRef<Path>>(
449    prog: P,
450) -> Result<PathBuf, CommandError> {
451    use std::env;
452
453    fn is_exe(path: &Path) -> bool {
454        let Ok(md) = path.metadata() else { return false };
455        !md.is_dir()
456    }
457
458    let prog = prog.as_ref();
459    if prog.is_absolute() {
460        return Ok(prog.to_path_buf());
461    }
462    let Some(syspaths) = env::var_os("PATH") else {
463        let msg = "system PATH environment variable not found";
464        return Err(CommandError::io(io::Error::new(
465            io::ErrorKind::Other,
466            msg,
467        )));
468    };
469    for syspath in env::split_paths(&syspaths) {
470        if syspath.as_os_str().is_empty() {
471            continue;
472        }
473        let abs_prog = syspath.join(prog);
474        if is_exe(&abs_prog) {
475            return Ok(abs_prog.to_path_buf());
476        }
477        if abs_prog.extension().is_none() {
478            for extension in ["com", "exe"] {
479                let abs_prog = abs_prog.with_extension(extension);
480                if is_exe(&abs_prog) {
481                    return Ok(abs_prog.to_path_buf());
482                }
483            }
484        }
485    }
486    let msg = format!("{}: could not find executable in PATH", prog.display());
487    return Err(CommandError::io(io::Error::new(io::ErrorKind::Other, msg)));
488}
489
490fn default_decompression_commands() -> Vec<DecompressionCommand> {
491    const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"];
492    const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"];
493    const ARGS_XZ: &[&str] = &["xz", "-d", "-c"];
494    const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"];
495    const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"];
496    const ARGS_BROTLI: &[&str] = &["brotli", "-d", "-c"];
497    const ARGS_ZSTD: &[&str] = &["zstd", "-q", "-d", "-c"];
498    const ARGS_UNCOMPRESS: &[&str] = &["uncompress", "-c"];
499
500    fn add(glob: &str, args: &[&str], cmds: &mut Vec<DecompressionCommand>) {
501        let bin = match resolve_binary(Path::new(args[0])) {
502            Ok(bin) => bin,
503            Err(err) => {
504                log::debug!("{}", err);
505                return;
506            }
507        };
508        cmds.push(DecompressionCommand {
509            glob: glob.to_string(),
510            bin,
511            args: args
512                .iter()
513                .skip(1)
514                .map(|s| OsStr::new(s).to_os_string())
515                .collect(),
516        });
517    }
518    let mut cmds = vec![];
519    add("*.gz", ARGS_GZIP, &mut cmds);
520    add("*.tgz", ARGS_GZIP, &mut cmds);
521    add("*.bz2", ARGS_BZIP, &mut cmds);
522    add("*.tbz2", ARGS_BZIP, &mut cmds);
523    add("*.xz", ARGS_XZ, &mut cmds);
524    add("*.txz", ARGS_XZ, &mut cmds);
525    add("*.lz4", ARGS_LZ4, &mut cmds);
526    add("*.lzma", ARGS_LZMA, &mut cmds);
527    add("*.br", ARGS_BROTLI, &mut cmds);
528    add("*.zst", ARGS_ZSTD, &mut cmds);
529    add("*.zstd", ARGS_ZSTD, &mut cmds);
530    add("*.Z", ARGS_UNCOMPRESS, &mut cmds);
531    cmds
532}