grep_printer/
util.rs

1use std::{borrow::Cow, cell::OnceCell, fmt, io, path::Path, time};
2
3use {
4    bstr::ByteVec,
5    grep_matcher::{Captures, LineTerminator, Match, Matcher},
6    grep_searcher::{
7        LineIter, Searcher, SinkContext, SinkContextKind, SinkError, SinkMatch,
8    },
9};
10
11use crate::{MAX_LOOK_AHEAD, hyperlink::HyperlinkPath};
12
13/// A type for handling replacements while amortizing allocation.
14pub(crate) struct Replacer<M: Matcher> {
15    space: Option<Space<M>>,
16}
17
18struct Space<M: Matcher> {
19    /// The place to store capture locations.
20    caps: M::Captures,
21    /// The place to write a replacement to.
22    dst: Vec<u8>,
23    /// The place to store match offsets in terms of `dst`.
24    matches: Vec<Match>,
25}
26
27impl<M: Matcher> fmt::Debug for Replacer<M> {
28    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29        let (dst, matches) = self.replacement().unwrap_or((&[], &[]));
30        f.debug_struct("Replacer")
31            .field("dst", &dst)
32            .field("matches", &matches)
33            .finish()
34    }
35}
36
37impl<M: Matcher> Replacer<M> {
38    /// Create a new replacer for use with a particular matcher.
39    ///
40    /// This constructor does not allocate. Instead, space for dealing with
41    /// replacements is allocated lazily only when needed.
42    pub(crate) fn new() -> Replacer<M> {
43        Replacer { space: None }
44    }
45
46    /// Executes a replacement on the given haystack string by replacing all
47    /// matches with the given replacement. To access the result of the
48    /// replacement, use the `replacement` method.
49    ///
50    /// This can fail if the underlying matcher reports an error.
51    pub(crate) fn replace_all<'a>(
52        &'a mut self,
53        searcher: &Searcher,
54        matcher: &M,
55        mut haystack: &[u8],
56        range: std::ops::Range<usize>,
57        replacement: &[u8],
58    ) -> io::Result<()> {
59        // See the giant comment in 'find_iter_at_in_context' below for why we
60        // do this dance.
61        let is_multi_line = searcher.multi_line_with_matcher(&matcher);
62        // Get the line_terminator that was removed (if any) so we can add it
63        // back.
64        let line_terminator = if is_multi_line {
65            if haystack[range.end..].len() >= MAX_LOOK_AHEAD {
66                haystack = &haystack[..range.end + MAX_LOOK_AHEAD];
67            }
68            &[]
69        } else {
70            // When searching a single line, we should remove the line
71            // terminator. Otherwise, it's possible for the regex (via
72            // look-around) to observe the line terminator and not match
73            // because of it.
74            let mut m = Match::new(0, range.end);
75            let line_terminator =
76                trim_line_terminator(searcher, haystack, &mut m);
77            haystack = &haystack[..m.end()];
78            line_terminator
79        };
80        {
81            let &mut Space { ref mut dst, ref mut caps, ref mut matches } =
82                self.allocate(matcher)?;
83            dst.clear();
84            matches.clear();
85
86            replace_with_captures_in_context(
87                matcher,
88                haystack,
89                line_terminator,
90                range.clone(),
91                caps,
92                dst,
93                |caps, dst| {
94                    let start = dst.len();
95                    caps.interpolate(
96                        |name| matcher.capture_index(name),
97                        haystack,
98                        replacement,
99                        dst,
100                    );
101                    let end = dst.len();
102                    matches.push(Match::new(start, end));
103                    true
104                },
105            )
106            .map_err(io::Error::error_message)?;
107        }
108        Ok(())
109    }
110
111    /// Return the result of the prior replacement and the match offsets for
112    /// all replacement occurrences within the returned replacement buffer.
113    ///
114    /// If no replacement has occurred then `None` is returned.
115    pub(crate) fn replacement<'a>(
116        &'a self,
117    ) -> Option<(&'a [u8], &'a [Match])> {
118        match self.space {
119            None => None,
120            Some(ref space) => {
121                if space.matches.is_empty() {
122                    None
123                } else {
124                    Some((&space.dst, &space.matches))
125                }
126            }
127        }
128    }
129
130    /// Clear space used for performing a replacement.
131    ///
132    /// Subsequent calls to `replacement` after calling `clear` (but before
133    /// executing another replacement) will always return `None`.
134    pub(crate) fn clear(&mut self) {
135        if let Some(ref mut space) = self.space {
136            space.dst.clear();
137            space.matches.clear();
138        }
139    }
140
141    /// Allocate space for replacements when used with the given matcher and
142    /// return a mutable reference to that space.
143    ///
144    /// This can fail if allocating space for capture locations from the given
145    /// matcher fails.
146    fn allocate(&mut self, matcher: &M) -> io::Result<&mut Space<M>> {
147        if self.space.is_none() {
148            let caps =
149                matcher.new_captures().map_err(io::Error::error_message)?;
150            self.space = Some(Space { caps, dst: vec![], matches: vec![] });
151        }
152        Ok(self.space.as_mut().unwrap())
153    }
154}
155
156/// A simple layer of abstraction over either a match or a contextual line
157/// reported by the searcher.
158///
159/// In particular, this provides an API that unions the `SinkMatch` and
160/// `SinkContext` types while also exposing a list of all individual match
161/// locations.
162///
163/// While this serves as a convenient mechanism to abstract over `SinkMatch`
164/// and `SinkContext`, this also provides a way to abstract over replacements.
165/// Namely, after a replacement, a `Sunk` value can be constructed using the
166/// results of the replacement instead of the bytes reported directly by the
167/// searcher.
168#[derive(Debug)]
169pub(crate) struct Sunk<'a> {
170    bytes: &'a [u8],
171    absolute_byte_offset: u64,
172    line_number: Option<u64>,
173    context_kind: Option<&'a SinkContextKind>,
174    matches: &'a [Match],
175    original_matches: &'a [Match],
176}
177
178impl<'a> Sunk<'a> {
179    #[inline]
180    pub(crate) fn empty() -> Sunk<'static> {
181        Sunk {
182            bytes: &[],
183            absolute_byte_offset: 0,
184            line_number: None,
185            context_kind: None,
186            matches: &[],
187            original_matches: &[],
188        }
189    }
190
191    #[inline]
192    pub(crate) fn from_sink_match(
193        sunk: &'a SinkMatch<'a>,
194        original_matches: &'a [Match],
195        replacement: Option<(&'a [u8], &'a [Match])>,
196    ) -> Sunk<'a> {
197        let (bytes, matches) =
198            replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
199        Sunk {
200            bytes,
201            absolute_byte_offset: sunk.absolute_byte_offset(),
202            line_number: sunk.line_number(),
203            context_kind: None,
204            matches,
205            original_matches,
206        }
207    }
208
209    #[inline]
210    pub(crate) fn from_sink_context(
211        sunk: &'a SinkContext<'a>,
212        original_matches: &'a [Match],
213        replacement: Option<(&'a [u8], &'a [Match])>,
214    ) -> Sunk<'a> {
215        let (bytes, matches) =
216            replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
217        Sunk {
218            bytes,
219            absolute_byte_offset: sunk.absolute_byte_offset(),
220            line_number: sunk.line_number(),
221            context_kind: Some(sunk.kind()),
222            matches,
223            original_matches,
224        }
225    }
226
227    #[inline]
228    pub(crate) fn context_kind(&self) -> Option<&'a SinkContextKind> {
229        self.context_kind
230    }
231
232    #[inline]
233    pub(crate) fn bytes(&self) -> &'a [u8] {
234        self.bytes
235    }
236
237    #[inline]
238    pub(crate) fn matches(&self) -> &'a [Match] {
239        self.matches
240    }
241
242    #[inline]
243    pub(crate) fn original_matches(&self) -> &'a [Match] {
244        self.original_matches
245    }
246
247    #[inline]
248    pub(crate) fn lines(&self, line_term: u8) -> LineIter<'a> {
249        LineIter::new(line_term, self.bytes())
250    }
251
252    #[inline]
253    pub(crate) fn absolute_byte_offset(&self) -> u64 {
254        self.absolute_byte_offset
255    }
256
257    #[inline]
258    pub(crate) fn line_number(&self) -> Option<u64> {
259        self.line_number
260    }
261}
262
263/// A simple encapsulation of a file path used by a printer.
264///
265/// This represents any transforms that we might want to perform on the path,
266/// such as converting it to valid UTF-8 and/or replacing its separator with
267/// something else. This allows us to amortize work if we are printing the
268/// file path for every match.
269///
270/// In the common case, no transformation is needed, which lets us avoid
271/// the allocation. Typically, only Windows requires a transform, since
272/// it's fraught to access the raw bytes of a path directly and first need
273/// to lossily convert to UTF-8. Windows is also typically where the path
274/// separator replacement is used, e.g., in cygwin environments to use `/`
275/// instead of `\`.
276///
277/// Users of this type are expected to construct it from a normal `Path`
278/// found in the standard library. It can then be written to any `io::Write`
279/// implementation using the `as_bytes` method. This achieves platform
280/// portability with a small cost: on Windows, paths that are not valid UTF-16
281/// will not roundtrip correctly.
282#[derive(Clone, Debug)]
283pub(crate) struct PrinterPath<'a> {
284    // On Unix, we can re-materialize a `Path` from our `Cow<'a, [u8]>` with
285    // zero cost, so there's no point in storing it. At time of writing,
286    // OsStr::as_os_str_bytes (and its corresponding constructor) are not
287    // stable yet. Those would let us achieve the same end portably. (As long
288    // as we keep our UTF-8 requirement on Windows.)
289    #[cfg(not(unix))]
290    path: &'a Path,
291    bytes: Cow<'a, [u8]>,
292    hyperlink: OnceCell<Option<HyperlinkPath>>,
293}
294
295impl<'a> PrinterPath<'a> {
296    /// Create a new path suitable for printing.
297    pub(crate) fn new(path: &'a Path) -> PrinterPath<'a> {
298        PrinterPath {
299            #[cfg(not(unix))]
300            path,
301            // N.B. This is zero-cost on Unix and requires at least a UTF-8
302            // check on Windows. This doesn't allocate on Windows unless the
303            // path is invalid UTF-8 (which is exceptionally rare).
304            bytes: Vec::from_path_lossy(path),
305            hyperlink: OnceCell::new(),
306        }
307    }
308
309    /// Set the separator on this path.
310    ///
311    /// When set, `PrinterPath::as_bytes` will return the path provided but
312    /// with its separator replaced with the one given.
313    pub(crate) fn with_separator(
314        mut self,
315        sep: Option<u8>,
316    ) -> PrinterPath<'a> {
317        /// Replace the path separator in this path with the given separator
318        /// and do it in place. On Windows, both `/` and `\` are treated as
319        /// path separators that are both replaced by `new_sep`. In all other
320        /// environments, only `/` is treated as a path separator.
321        fn replace_separator(bytes: &[u8], sep: u8) -> Vec<u8> {
322            let mut bytes = bytes.to_vec();
323            for b in bytes.iter_mut() {
324                if *b == b'/' || (cfg!(windows) && *b == b'\\') {
325                    *b = sep;
326                }
327            }
328            bytes
329        }
330        let Some(sep) = sep else { return self };
331        self.bytes = Cow::Owned(replace_separator(self.as_bytes(), sep));
332        self
333    }
334
335    /// Return the raw bytes for this path.
336    pub(crate) fn as_bytes(&self) -> &[u8] {
337        &self.bytes
338    }
339
340    /// Return this path as a hyperlink.
341    ///
342    /// Note that a hyperlink may not be able to be created from a path.
343    /// Namely, computing the hyperlink may require touching the file system
344    /// (e.g., for path canonicalization) and that can fail. This failure is
345    /// silent but is logged.
346    pub(crate) fn as_hyperlink(&self) -> Option<&HyperlinkPath> {
347        self.hyperlink
348            .get_or_init(|| HyperlinkPath::from_path(self.as_path()))
349            .as_ref()
350    }
351
352    /// Return this path as an actual `Path` type.
353    pub(crate) fn as_path(&self) -> &Path {
354        #[cfg(unix)]
355        fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
356            use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
357            Path::new(OsStr::from_bytes(p.as_bytes()))
358        }
359        #[cfg(not(unix))]
360        fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
361            p.path
362        }
363        imp(self)
364    }
365}
366
367/// A type that provides "nicer" Display and Serialize impls for
368/// std::time::Duration. The serialization format should actually be compatible
369/// with the Deserialize impl for std::time::Duration, since this type only
370/// adds new fields.
371#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
372pub(crate) struct NiceDuration(pub time::Duration);
373
374impl fmt::Display for NiceDuration {
375    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
376        write!(f, "{:0.6}s", self.fractional_seconds())
377    }
378}
379
380impl NiceDuration {
381    /// Returns the number of seconds in this duration in fraction form.
382    /// The number to the left of the decimal point is the number of seconds,
383    /// and the number to the right is the number of milliseconds.
384    fn fractional_seconds(&self) -> f64 {
385        let fractional = (self.0.subsec_nanos() as f64) / 1_000_000_000.0;
386        self.0.as_secs() as f64 + fractional
387    }
388}
389
390#[cfg(feature = "serde")]
391impl serde::Serialize for NiceDuration {
392    fn serialize<S: serde::Serializer>(
393        &self,
394        ser: S,
395    ) -> Result<S::Ok, S::Error> {
396        use serde::ser::SerializeStruct;
397
398        let mut state = ser.serialize_struct("Duration", 3)?;
399        state.serialize_field("secs", &self.0.as_secs())?;
400        state.serialize_field("nanos", &self.0.subsec_nanos())?;
401        state.serialize_field("human", &format!("{}", self))?;
402        state.end()
403    }
404}
405
406/// A simple formatter for converting `u64` values to ASCII byte strings.
407///
408/// This avoids going through the formatting machinery which seems to
409/// substantially slow things down.
410///
411/// The `itoa` crate does the same thing as this formatter, but is a bit
412/// faster. We roll our own which is a bit slower, but gets us enough of a win
413/// to be satisfied with and with pure safe code.
414#[derive(Debug)]
415pub(crate) struct DecimalFormatter {
416    buf: [u8; Self::MAX_U64_LEN],
417    start: usize,
418}
419
420impl DecimalFormatter {
421    /// Discovered via `u64::MAX.to_string().len()`.
422    const MAX_U64_LEN: usize = 20;
423
424    /// Create a new decimal formatter for the given 64-bit unsigned integer.
425    pub(crate) fn new(mut n: u64) -> DecimalFormatter {
426        let mut buf = [0; Self::MAX_U64_LEN];
427        let mut i = buf.len();
428        loop {
429            i -= 1;
430
431            let digit = u8::try_from(n % 10).unwrap();
432            n /= 10;
433            buf[i] = b'0' + digit;
434            if n == 0 {
435                break;
436            }
437        }
438        DecimalFormatter { buf, start: i }
439    }
440
441    /// Return the decimal formatted as an ASCII byte string.
442    pub(crate) fn as_bytes(&self) -> &[u8] {
443        &self.buf[self.start..]
444    }
445}
446
447/// Trim prefix ASCII spaces from the given slice and return the corresponding
448/// range.
449///
450/// This stops trimming a prefix as soon as it sees non-whitespace or a line
451/// terminator.
452pub(crate) fn trim_ascii_prefix(
453    line_term: LineTerminator,
454    slice: &[u8],
455    range: Match,
456) -> Match {
457    fn is_space(b: u8) -> bool {
458        match b {
459            b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => true,
460            _ => false,
461        }
462    }
463
464    let count = slice[range]
465        .iter()
466        .take_while(|&&b| -> bool {
467            is_space(b) && !line_term.as_bytes().contains(&b)
468        })
469        .count();
470    range.with_start(range.start() + count)
471}
472
473pub(crate) fn find_iter_at_in_context<M, F>(
474    searcher: &Searcher,
475    matcher: M,
476    mut bytes: &[u8],
477    range: std::ops::Range<usize>,
478    mut matched: F,
479) -> io::Result<()>
480where
481    M: Matcher,
482    F: FnMut(Match) -> bool,
483{
484    // This strange dance is to account for the possibility of look-ahead in
485    // the regex. The problem here is that mat.bytes() doesn't include the
486    // lines beyond the match boundaries in mulit-line mode, which means that
487    // when we try to rediscover the full set of matches here, the regex may no
488    // longer match if it required some look-ahead beyond the matching lines.
489    //
490    // PCRE2 (and the grep-matcher interfaces) has no way of specifying an end
491    // bound of the search. So we kludge it and let the regex engine search the
492    // rest of the buffer... But to avoid things getting too crazy, we cap the
493    // buffer.
494    //
495    // If it weren't for multi-line mode, then none of this would be needed.
496    // Alternatively, if we refactored the grep interfaces to pass along the
497    // full set of matches (if available) from the searcher, then that might
498    // also help here. But that winds up paying an upfront unavoidable cost for
499    // the case where matches don't need to be counted. So then you'd have to
500    // introduce a way to pass along matches conditionally, only when needed.
501    // Yikes.
502    //
503    // Maybe the bigger picture thing here is that the searcher should be
504    // responsible for finding matches when necessary, and the printer
505    // shouldn't be involved in this business in the first place. Sigh. Live
506    // and learn. Abstraction boundaries are hard.
507    let is_multi_line = searcher.multi_line_with_matcher(&matcher);
508    if is_multi_line {
509        if bytes[range.end..].len() >= MAX_LOOK_AHEAD {
510            bytes = &bytes[..range.end + MAX_LOOK_AHEAD];
511        }
512    } else {
513        // When searching a single line, we should remove the line terminator.
514        // Otherwise, it's possible for the regex (via look-around) to observe
515        // the line terminator and not match because of it.
516        let mut m = Match::new(0, range.end);
517        // No need to rember the line terminator as we aren't doing a replace
518        // here.
519        trim_line_terminator(searcher, bytes, &mut m);
520        bytes = &bytes[..m.end()];
521    }
522    matcher
523        .find_iter_at(bytes, range.start, |m| {
524            if m.start() >= range.end {
525                return false;
526            }
527            matched(m)
528        })
529        .map_err(io::Error::error_message)
530}
531
532/// Given a buf and some bounds, if there is a line terminator at the end of
533/// the given bounds in buf, then the bounds are trimmed to remove the line
534/// terminator, returning the slice of the removed line terminator (if any).
535pub(crate) fn trim_line_terminator<'b>(
536    searcher: &Searcher,
537    buf: &'b [u8],
538    line: &mut Match,
539) -> &'b [u8] {
540    let lineterm = searcher.line_terminator();
541    if lineterm.is_suffix(&buf[*line]) {
542        let mut end = line.end() - 1;
543        if lineterm.is_crlf() && end > 0 && buf.get(end - 1) == Some(&b'\r') {
544            end -= 1;
545        }
546        let orig_end = line.end();
547        *line = line.with_end(end);
548        &buf[end..orig_end]
549    } else {
550        &[]
551    }
552}
553
554/// Like `Matcher::replace_with_captures_at`, but accepts an end bound.
555///
556/// See also: `find_iter_at_in_context` for why we need this.
557fn replace_with_captures_in_context<M, F>(
558    matcher: M,
559    bytes: &[u8],
560    line_terminator: &[u8],
561    range: std::ops::Range<usize>,
562    caps: &mut M::Captures,
563    dst: &mut Vec<u8>,
564    mut append: F,
565) -> Result<(), M::Error>
566where
567    M: Matcher,
568    F: FnMut(&M::Captures, &mut Vec<u8>) -> bool,
569{
570    let mut last_match = range.start;
571    matcher.captures_iter_at(bytes, range.start, caps, |caps| {
572        let m = caps.get(0).unwrap();
573        if m.start() >= range.end {
574            return false;
575        }
576        dst.extend(&bytes[last_match..m.start()]);
577        last_match = m.end();
578        append(caps, dst)
579    })?;
580    let end = if last_match > range.end {
581        bytes.len()
582    } else {
583        std::cmp::min(bytes.len(), range.end)
584    };
585    dst.extend(&bytes[last_match..end]);
586    // Add back any line terminator.
587    dst.extend(line_terminator);
588    Ok(())
589}
590
591#[cfg(test)]
592mod tests {
593    use super::*;
594
595    #[test]
596    fn custom_decimal_format() {
597        let fmt = |n: u64| {
598            let bytes = DecimalFormatter::new(n).as_bytes().to_vec();
599            String::from_utf8(bytes).unwrap()
600        };
601        let std = |n: u64| n.to_string();
602
603        let ints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 100, 123, u64::MAX];
604        for n in ints {
605            assert_eq!(std(n), fmt(n));
606        }
607    }
608}
grep_printer/util.rs

grep_printer/
util.rs