grep_cli/
pattern.rs

1use std::{ffi::OsStr, io, path::Path};
2
3use bstr::io::BufReadExt;
4
5use crate::escape::{escape, escape_os};
6
/// The error produced when a pattern cannot be interpreted as valid UTF-8.
///
/// It exists so that patterns supplied by end users which are not valid
/// UTF-8 fail with a focused, actionable message rather than a generic one.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvalidPatternError {
    // Printable rendering of the offending pattern, shown in the message.
    original: String,
    // Byte offset up to which the pattern was verified to be valid UTF-8.
    valid_up_to: usize,
}
16
17impl InvalidPatternError {
18    /// Returns the index in the given string up to which valid UTF-8 was
19    /// verified.
20    pub fn valid_up_to(&self) -> usize {
21        self.valid_up_to
22    }
23}
24
25impl std::error::Error for InvalidPatternError {}
26
27impl std::fmt::Display for InvalidPatternError {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        write!(
30            f,
31            "found invalid UTF-8 in pattern at byte offset {}: {} \
32             (disable Unicode mode and use hex escape sequences to match \
33             arbitrary bytes in a pattern, e.g., '(?-u)\\xFF')",
34            self.valid_up_to, self.original,
35        )
36    }
37}
38
39impl From<InvalidPatternError> for io::Error {
40    fn from(paterr: InvalidPatternError) -> io::Error {
41        io::Error::new(io::ErrorKind::Other, paterr)
42    }
43}
44
45/// Convert an OS string into a regular expression pattern.
46///
47/// This conversion fails if the given pattern is not valid UTF-8, in which
48/// case, a targeted error with more information about where the invalid UTF-8
49/// occurs is given. The error also suggests the use of hex escape sequences,
50/// which are supported by many regex engines.
51pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> {
52    pattern.to_str().ok_or_else(|| {
53        let valid_up_to = pattern
54            .to_string_lossy()
55            .find('\u{FFFD}')
56            .expect("a Unicode replacement codepoint for invalid UTF-8");
57        InvalidPatternError { original: escape_os(pattern), valid_up_to }
58    })
59}
60
61/// Convert arbitrary bytes into a regular expression pattern.
62///
63/// This conversion fails if the given pattern is not valid UTF-8, in which
64/// case, a targeted error with more information about where the invalid UTF-8
65/// occurs is given. The error also suggests the use of hex escape sequences,
66/// which are supported by many regex engines.
67pub fn pattern_from_bytes(
68    pattern: &[u8],
69) -> Result<&str, InvalidPatternError> {
70    std::str::from_utf8(pattern).map_err(|err| InvalidPatternError {
71        original: escape(pattern),
72        valid_up_to: err.valid_up_to(),
73    })
74}
75
76/// Read patterns from a file path, one per line.
77///
78/// If there was a problem reading or if any of the patterns contain invalid
79/// UTF-8, then an error is returned. If there was a problem with a specific
80/// pattern, then the error message will include the line number and the file
81/// path.
82pub fn patterns_from_path<P: AsRef<Path>>(path: P) -> io::Result<Vec<String>> {
83    let path = path.as_ref();
84    let file = std::fs::File::open(path).map_err(|err| {
85        io::Error::new(
86            io::ErrorKind::Other,
87            format!("{}: {}", path.display(), err),
88        )
89    })?;
90    patterns_from_reader(file).map_err(|err| {
91        io::Error::new(
92            io::ErrorKind::Other,
93            format!("{}:{}", path.display(), err),
94        )
95    })
96}
97
98/// Read patterns from stdin, one per line.
99///
100/// If there was a problem reading or if any of the patterns contain invalid
101/// UTF-8, then an error is returned. If there was a problem with a specific
102/// pattern, then the error message will include the line number and the fact
103/// that it came from stdin.
104pub fn patterns_from_stdin() -> io::Result<Vec<String>> {
105    let stdin = io::stdin();
106    let locked = stdin.lock();
107    patterns_from_reader(locked).map_err(|err| {
108        io::Error::new(io::ErrorKind::Other, format!("<stdin>:{}", err))
109    })
110}
111
112/// Read patterns from any reader, one per line.
113///
114/// If there was a problem reading or if any of the patterns contain invalid
115/// UTF-8, then an error is returned. If there was a problem with a specific
116/// pattern, then the error message will include the line number.
117///
118/// Note that this routine uses its own internal buffer, so the caller should
119/// not provide their own buffered reader if possible.
120///
121/// # Example
122///
123/// This shows how to parse patterns, one per line.
124///
125/// ```
126/// use grep_cli::patterns_from_reader;
127///
128/// let patterns = "\
129/// foo
130/// bar\\s+foo
131/// [a-z]{3}
132/// ";
133///
134/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![
135///     r"foo",
136///     r"bar\s+foo",
137///     r"[a-z]{3}",
138/// ]);
139/// # Ok::<(), Box<dyn std::error::Error>>(())
140/// ```
141pub fn patterns_from_reader<R: io::Read>(rdr: R) -> io::Result<Vec<String>> {
142    let mut patterns = vec![];
143    let mut line_number = 0;
144    io::BufReader::new(rdr).for_byte_line(|line| {
145        line_number += 1;
146        match pattern_from_bytes(line) {
147            Ok(pattern) => {
148                patterns.push(pattern.to_string());
149                Ok(true)
150            }
151            Err(err) => Err(io::Error::new(
152                io::ErrorKind::Other,
153                format!("{}: {}", line_number, err),
154            )),
155        }
156    })?;
157    Ok(patterns)
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// A byte-slice pattern with a stray `0xFF` reports the offset of that
    /// byte as the end of the valid prefix.
    #[test]
    fn bytes() {
        let result = pattern_from_bytes(b"abc\xFFxyz");
        let offset = result.unwrap_err().valid_up_to();
        assert_eq!(offset, 3);
    }

    /// Same check as `bytes`, but going through an OS string. Unix-only
    /// because it builds the `OsStr` from raw bytes.
    #[test]
    #[cfg(unix)]
    fn os() {
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        let raw: &[u8] = b"abc\xFFxyz";
        let result = pattern_from_os(OsStr::from_bytes(raw));
        let offset = result.unwrap_err().valid_up_to();
        assert_eq!(offset, 3);
    }
}