grep_cli/decompress.rs
1use std::{
2 ffi::{OsStr, OsString},
3 fs::File,
4 io,
5 path::{Path, PathBuf},
6 process::Command,
7};
8
9use globset::{Glob, GlobSet, GlobSetBuilder};
10
11use crate::process::{CommandError, CommandReader, CommandReaderBuilder};
12
13/// A builder for a matcher that determines which files get decompressed.
14#[derive(Clone, Debug)]
15pub struct DecompressionMatcherBuilder {
16 /// The commands for each matching glob.
17 commands: Vec<DecompressionCommand>,
18 /// Whether to include the default matching rules.
19 defaults: bool,
20}
21
/// One decompression rule: a glob paired with the external program (and its
/// leading arguments) used to decompress files matching that glob.
#[derive(Debug, Clone)]
struct DecompressionCommand {
    /// The glob pattern, kept in its uncompiled textual form.
    glob: String,
    /// The program to execute, either a bare name or a path.
    bin: PathBuf,
    /// The arguments passed to `bin`, not including the program itself.
    args: Vec<OsString>,
}
33
34impl Default for DecompressionMatcherBuilder {
35 fn default() -> DecompressionMatcherBuilder {
36 DecompressionMatcherBuilder::new()
37 }
38}
39
40impl DecompressionMatcherBuilder {
41 /// Create a new builder for configuring a decompression matcher.
42 pub fn new() -> DecompressionMatcherBuilder {
43 DecompressionMatcherBuilder { commands: vec![], defaults: true }
44 }
45
46 /// Build a matcher for determining how to decompress files.
47 ///
48 /// If there was a problem compiling the matcher, then an error is
49 /// returned.
50 pub fn build(&self) -> Result<DecompressionMatcher, CommandError> {
51 let defaults = if !self.defaults {
52 vec![]
53 } else {
54 default_decompression_commands()
55 };
56 let mut glob_builder = GlobSetBuilder::new();
57 let mut commands = vec![];
58 for decomp_cmd in defaults.iter().chain(&self.commands) {
59 let glob = Glob::new(&decomp_cmd.glob).map_err(|err| {
60 CommandError::io(io::Error::new(io::ErrorKind::Other, err))
61 })?;
62 glob_builder.add(glob);
63 commands.push(decomp_cmd.clone());
64 }
65 let globs = glob_builder.build().map_err(|err| {
66 CommandError::io(io::Error::new(io::ErrorKind::Other, err))
67 })?;
68 Ok(DecompressionMatcher { globs, commands })
69 }
70
71 /// When enabled, the default matching rules will be compiled into this
72 /// matcher before any other associations. When disabled, only the
73 /// rules explicitly given to this builder will be used.
74 ///
75 /// This is enabled by default.
76 pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder {
77 self.defaults = yes;
78 self
79 }
80
81 /// Associates a glob with a command to decompress files matching the glob.
82 ///
83 /// If multiple globs match the same file, then the most recently added
84 /// glob takes precedence.
85 ///
86 /// The syntax for the glob is documented in the
87 /// [`globset` crate](https://docs.rs/globset/#syntax).
88 ///
89 /// The `program` given is resolved with respect to `PATH` and turned
90 /// into an absolute path internally before being executed by the current
91 /// platform. Notably, on Windows, this avoids a security problem where
92 /// passing a relative path to `CreateProcess` will automatically search
93 /// the current directory for a matching program. If the program could
94 /// not be resolved, then it is silently ignored and the association is
95 /// dropped. For this reason, callers should prefer `try_associate`.
96 pub fn associate<P, I, A>(
97 &mut self,
98 glob: &str,
99 program: P,
100 args: I,
101 ) -> &mut DecompressionMatcherBuilder
102 where
103 P: AsRef<OsStr>,
104 I: IntoIterator<Item = A>,
105 A: AsRef<OsStr>,
106 {
107 let _ = self.try_associate(glob, program, args);
108 self
109 }
110
111 /// Associates a glob with a command to decompress files matching the glob.
112 ///
113 /// If multiple globs match the same file, then the most recently added
114 /// glob takes precedence.
115 ///
116 /// The syntax for the glob is documented in the
117 /// [`globset` crate](https://docs.rs/globset/#syntax).
118 ///
119 /// The `program` given is resolved with respect to `PATH` and turned
120 /// into an absolute path internally before being executed by the current
121 /// platform. Notably, on Windows, this avoids a security problem where
122 /// passing a relative path to `CreateProcess` will automatically search
123 /// the current directory for a matching program. If the program could not
124 /// be resolved, then an error is returned.
125 pub fn try_associate<P, I, A>(
126 &mut self,
127 glob: &str,
128 program: P,
129 args: I,
130 ) -> Result<&mut DecompressionMatcherBuilder, CommandError>
131 where
132 P: AsRef<OsStr>,
133 I: IntoIterator<Item = A>,
134 A: AsRef<OsStr>,
135 {
136 let glob = glob.to_string();
137 let bin = try_resolve_binary(Path::new(program.as_ref()))?;
138 let args =
139 args.into_iter().map(|a| a.as_ref().to_os_string()).collect();
140 self.commands.push(DecompressionCommand { glob, bin, args });
141 Ok(self)
142 }
143}
144
145/// A matcher for determining how to decompress files.
146#[derive(Clone, Debug)]
147pub struct DecompressionMatcher {
148 /// The set of globs to match. Each glob has a corresponding entry in
149 /// `commands`. When a glob matches, the corresponding command should be
150 /// used to perform out-of-process decompression.
151 globs: GlobSet,
152 /// The commands for each matching glob.
153 commands: Vec<DecompressionCommand>,
154}
155
156impl Default for DecompressionMatcher {
157 fn default() -> DecompressionMatcher {
158 DecompressionMatcher::new()
159 }
160}
161
162impl DecompressionMatcher {
163 /// Create a new matcher with default rules.
164 ///
165 /// To add more matching rules, build a matcher with
166 /// [`DecompressionMatcherBuilder`].
167 pub fn new() -> DecompressionMatcher {
168 DecompressionMatcherBuilder::new()
169 .build()
170 .expect("built-in matching rules should always compile")
171 }
172
173 /// Return a pre-built command based on the given file path that can
174 /// decompress its contents. If no such decompressor is known, then this
175 /// returns `None`.
176 ///
177 /// If there are multiple possible commands matching the given path, then
178 /// the command added last takes precedence.
179 pub fn command<P: AsRef<Path>>(&self, path: P) -> Option<Command> {
180 if let Some(i) = self.globs.matches(path).into_iter().next_back() {
181 let decomp_cmd = &self.commands[i];
182 let mut cmd = Command::new(&decomp_cmd.bin);
183 cmd.args(&decomp_cmd.args);
184 return Some(cmd);
185 }
186 None
187 }
188
189 /// Returns true if and only if the given file path has at least one
190 /// matching command to perform decompression on.
191 pub fn has_command<P: AsRef<Path>>(&self, path: P) -> bool {
192 self.globs.is_match(path)
193 }
194}
195
196/// Configures and builds a streaming reader for decompressing data.
197#[derive(Clone, Debug, Default)]
198pub struct DecompressionReaderBuilder {
199 matcher: DecompressionMatcher,
200 command_builder: CommandReaderBuilder,
201}
202
203impl DecompressionReaderBuilder {
204 /// Create a new builder with the default configuration.
205 pub fn new() -> DecompressionReaderBuilder {
206 DecompressionReaderBuilder::default()
207 }
208
209 /// Build a new streaming reader for decompressing data.
210 ///
211 /// If decompression is done out-of-process and if there was a problem
212 /// spawning the process, then its error is logged at the debug level and a
213 /// passthru reader is returned that does no decompression. This behavior
214 /// typically occurs when the given file path matches a decompression
215 /// command, but is executing in an environment where the decompression
216 /// command is not available.
217 ///
218 /// If the given file path could not be matched with a decompression
219 /// strategy, then a passthru reader is returned that does no
220 /// decompression.
221 pub fn build<P: AsRef<Path>>(
222 &self,
223 path: P,
224 ) -> Result<DecompressionReader, CommandError> {
225 let path = path.as_ref();
226 let Some(mut cmd) = self.matcher.command(path) else {
227 return DecompressionReader::new_passthru(path);
228 };
229 cmd.arg(path);
230
231 match self.command_builder.build(&mut cmd) {
232 Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }),
233 Err(err) => {
234 log::debug!(
235 "{}: error spawning command '{:?}': {} \
236 (falling back to uncompressed reader)",
237 path.display(),
238 cmd,
239 err,
240 );
241 DecompressionReader::new_passthru(path)
242 }
243 }
244 }
245
246 /// Set the matcher to use to look up the decompression command for each
247 /// file path.
248 ///
249 /// A set of sensible rules is enabled by default. Setting this will
250 /// completely replace the current rules.
251 pub fn matcher(
252 &mut self,
253 matcher: DecompressionMatcher,
254 ) -> &mut DecompressionReaderBuilder {
255 self.matcher = matcher;
256 self
257 }
258
259 /// Get the underlying matcher currently used by this builder.
260 pub fn get_matcher(&self) -> &DecompressionMatcher {
261 &self.matcher
262 }
263
264 /// When enabled, the reader will asynchronously read the contents of the
265 /// command's stderr output. When disabled, stderr is only read after the
266 /// stdout stream has been exhausted (or if the process quits with an error
267 /// code).
268 ///
269 /// Note that when enabled, this may require launching an additional
270 /// thread in order to read stderr. This is done so that the process being
271 /// executed is never blocked from writing to stdout or stderr. If this is
272 /// disabled, then it is possible for the process to fill up the stderr
273 /// buffer and deadlock.
274 ///
275 /// This is enabled by default.
276 pub fn async_stderr(
277 &mut self,
278 yes: bool,
279 ) -> &mut DecompressionReaderBuilder {
280 self.command_builder.async_stderr(yes);
281 self
282 }
283}
284
285/// A streaming reader for decompressing the contents of a file.
286///
287/// The purpose of this reader is to provide a seamless way to decompress the
288/// contents of file using existing tools in the current environment. This is
289/// meant to be an alternative to using decompression libraries in favor of the
290/// simplicity and portability of using external commands such as `gzip` and
291/// `xz`. This does impose the overhead of spawning a process, so other means
292/// for performing decompression should be sought if this overhead isn't
293/// acceptable.
294///
295/// A decompression reader comes with a default set of matching rules that are
296/// meant to associate file paths with the corresponding command to use to
297/// decompress them. For example, a glob like `*.gz` matches gzip compressed
298/// files with the command `gzip -d -c`. If a file path does not match any
299/// existing rules, or if it matches a rule whose command does not exist in the
300/// current environment, then the decompression reader passes through the
301/// contents of the underlying file without doing any decompression.
302///
303/// The default matching rules are probably good enough for most cases, and if
304/// they require revision, pull requests are welcome. In cases where they must
305/// be changed or extended, they can be customized through the use of
306/// [`DecompressionMatcherBuilder`] and [`DecompressionReaderBuilder`].
307///
308/// By default, this reader will asynchronously read the processes' stderr.
309/// This prevents subtle deadlocking bugs for noisy processes that write a lot
310/// to stderr. Currently, the entire contents of stderr is read on to the heap.
311///
312/// # Example
313///
314/// This example shows how to read the decompressed contents of a file without
315/// needing to explicitly choose the decompression command to run.
316///
317/// Note that if you need to decompress multiple files, it is better to use
318/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the
319/// matcher.
320///
321/// ```no_run
322/// use std::{io::Read, process::Command};
323///
324/// use grep_cli::DecompressionReader;
325///
326/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?;
327/// let mut contents = vec![];
328/// rdr.read_to_end(&mut contents)?;
329/// # Ok::<(), Box<dyn std::error::Error>>(())
330/// ```
331#[derive(Debug)]
332pub struct DecompressionReader {
333 rdr: Result<CommandReader, File>,
334}
335
336impl DecompressionReader {
337 /// Build a new streaming reader for decompressing data.
338 ///
339 /// If decompression is done out-of-process and if there was a problem
340 /// spawning the process, then its error is returned.
341 ///
342 /// If the given file path could not be matched with a decompression
343 /// strategy, then a passthru reader is returned that does no
344 /// decompression.
345 ///
346 /// This uses the default matching rules for determining how to decompress
347 /// the given file. To change those matching rules, use
348 /// [`DecompressionReaderBuilder`] and [`DecompressionMatcherBuilder`].
349 ///
350 /// When creating readers for many paths. it is better to use the builder
351 /// since it will amortize the cost of constructing the matcher.
352 pub fn new<P: AsRef<Path>>(
353 path: P,
354 ) -> Result<DecompressionReader, CommandError> {
355 DecompressionReaderBuilder::new().build(path)
356 }
357
358 /// Creates a new "passthru" decompression reader that reads from the file
359 /// corresponding to the given path without doing decompression and without
360 /// executing another process.
361 fn new_passthru(path: &Path) -> Result<DecompressionReader, CommandError> {
362 let file = File::open(path)?;
363 Ok(DecompressionReader { rdr: Err(file) })
364 }
365
366 /// Closes this reader, freeing any resources used by its underlying child
367 /// process, if one was used. If the child process exits with a nonzero
368 /// exit code, the returned Err value will include its stderr.
369 ///
370 /// `close` is idempotent, meaning it can be safely called multiple times.
371 /// The first call closes the CommandReader and any subsequent calls do
372 /// nothing.
373 ///
374 /// This method should be called after partially reading a file to prevent
375 /// resource leakage. However there is no need to call `close` explicitly
376 /// if your code always calls `read` to EOF, as `read` takes care of
377 /// calling `close` in this case.
378 ///
379 /// `close` is also called in `drop` as a last line of defense against
380 /// resource leakage. Any error from the child process is then printed as a
381 /// warning to stderr. This can be avoided by explicitly calling `close`
382 /// before the CommandReader is dropped.
383 pub fn close(&mut self) -> io::Result<()> {
384 match self.rdr {
385 Ok(ref mut rdr) => rdr.close(),
386 Err(_) => Ok(()),
387 }
388 }
389}
390
391impl io::Read for DecompressionReader {
392 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
393 match self.rdr {
394 Ok(ref mut rdr) => rdr.read(buf),
395 Err(ref mut rdr) => rdr.read(buf),
396 }
397 }
398}
399
400/// Resolves a path to a program to a path by searching for the program in
401/// `PATH`.
402///
403/// If the program could not be resolved, then an error is returned.
404///
405/// The purpose of doing this instead of passing the path to the program
406/// directly to Command::new is that Command::new will hand relative paths
407/// to CreateProcess on Windows, which will implicitly search the current
408/// working directory for the executable. This could be undesirable for
409/// security reasons. e.g., running ripgrep with the -z/--search-zip flag on an
410/// untrusted directory tree could result in arbitrary programs executing on
411/// Windows.
412///
413/// Note that this could still return a relative path if PATH contains a
414/// relative path. We permit this since it is assumed that the user has set
415/// this explicitly, and thus, desires this behavior.
416///
417/// # Platform behavior
418///
419/// On non-Windows, this is a no-op.
420pub fn resolve_binary<P: AsRef<Path>>(
421 prog: P,
422) -> Result<PathBuf, CommandError> {
423 if !cfg!(windows) {
424 return Ok(prog.as_ref().to_path_buf());
425 }
426 try_resolve_binary(prog)
427}
428
429/// Resolves a path to a program to a path by searching for the program in
430/// `PATH`.
431///
432/// If the program could not be resolved, then an error is returned.
433///
434/// The purpose of doing this instead of passing the path to the program
435/// directly to Command::new is that Command::new will hand relative paths
436/// to CreateProcess on Windows, which will implicitly search the current
437/// working directory for the executable. This could be undesirable for
438/// security reasons. e.g., running ripgrep with the -z/--search-zip flag on an
439/// untrusted directory tree could result in arbitrary programs executing on
440/// Windows.
441///
442/// Note that this could still return a relative path if PATH contains a
443/// relative path. We permit this since it is assumed that the user has set
444/// this explicitly, and thus, desires this behavior.
445///
446/// If `check_exists` is false or the path is already an absolute path this
447/// will return immediately.
448fn try_resolve_binary<P: AsRef<Path>>(
449 prog: P,
450) -> Result<PathBuf, CommandError> {
451 use std::env;
452
453 fn is_exe(path: &Path) -> bool {
454 let Ok(md) = path.metadata() else { return false };
455 !md.is_dir()
456 }
457
458 let prog = prog.as_ref();
459 if prog.is_absolute() {
460 return Ok(prog.to_path_buf());
461 }
462 let Some(syspaths) = env::var_os("PATH") else {
463 let msg = "system PATH environment variable not found";
464 return Err(CommandError::io(io::Error::new(
465 io::ErrorKind::Other,
466 msg,
467 )));
468 };
469 for syspath in env::split_paths(&syspaths) {
470 if syspath.as_os_str().is_empty() {
471 continue;
472 }
473 let abs_prog = syspath.join(prog);
474 if is_exe(&abs_prog) {
475 return Ok(abs_prog.to_path_buf());
476 }
477 if abs_prog.extension().is_none() {
478 for extension in ["com", "exe"] {
479 let abs_prog = abs_prog.with_extension(extension);
480 if is_exe(&abs_prog) {
481 return Ok(abs_prog.to_path_buf());
482 }
483 }
484 }
485 }
486 let msg = format!("{}: could not find executable in PATH", prog.display());
487 return Err(CommandError::io(io::Error::new(io::ErrorKind::Other, msg)));
488}
489
490fn default_decompression_commands() -> Vec<DecompressionCommand> {
491 const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"];
492 const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"];
493 const ARGS_XZ: &[&str] = &["xz", "-d", "-c"];
494 const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"];
495 const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"];
496 const ARGS_BROTLI: &[&str] = &["brotli", "-d", "-c"];
497 const ARGS_ZSTD: &[&str] = &["zstd", "-q", "-d", "-c"];
498 const ARGS_UNCOMPRESS: &[&str] = &["uncompress", "-c"];
499
500 fn add(glob: &str, args: &[&str], cmds: &mut Vec<DecompressionCommand>) {
501 let bin = match resolve_binary(Path::new(args[0])) {
502 Ok(bin) => bin,
503 Err(err) => {
504 log::debug!("{}", err);
505 return;
506 }
507 };
508 cmds.push(DecompressionCommand {
509 glob: glob.to_string(),
510 bin,
511 args: args
512 .iter()
513 .skip(1)
514 .map(|s| OsStr::new(s).to_os_string())
515 .collect(),
516 });
517 }
518 let mut cmds = vec![];
519 add("*.gz", ARGS_GZIP, &mut cmds);
520 add("*.tgz", ARGS_GZIP, &mut cmds);
521 add("*.bz2", ARGS_BZIP, &mut cmds);
522 add("*.tbz2", ARGS_BZIP, &mut cmds);
523 add("*.xz", ARGS_XZ, &mut cmds);
524 add("*.txz", ARGS_XZ, &mut cmds);
525 add("*.lz4", ARGS_LZ4, &mut cmds);
526 add("*.lzma", ARGS_LZMA, &mut cmds);
527 add("*.br", ARGS_BROTLI, &mut cmds);
528 add("*.zst", ARGS_ZSTD, &mut cmds);
529 add("*.zstd", ARGS_ZSTD, &mut cmds);
530 add("*.Z", ARGS_UNCOMPRESS, &mut cmds);
531 cmds
532}