grep_searcher/searcher/
mod.rs

1use std::{
2    cell::RefCell,
3    cmp,
4    fs::File,
5    io::{self, Read},
6    path::Path,
7};
8
9use {
10    encoding_rs_io::DecodeReaderBytesBuilder,
11    grep_matcher::{LineTerminator, Match, Matcher},
12};
13
14use crate::{
15    line_buffer::{
16        self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
17        LineBufferBuilder, LineBufferReader, alloc_error,
18    },
19    searcher::glue::{MultiLine, ReadByLine, SliceByLine},
20    sink::{Sink, SinkError},
21};
22
23pub use self::mmap::MmapChoice;
24
25mod core;
26mod glue;
27mod mmap;
28
/// An internal alias for the matcher's `Match` type.
///
/// We use this type alias since we want the ergonomics of a matcher's `Match`
/// type, but in practice, we use it for arbitrary ranges, so give it a more
/// accurate name. This is only used in the searcher's internals.
type Range = Match;
33
/// The behavior of binary detection while searching.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation behind detecting binary data
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
///
/// Unfortunately, binary detection works differently depending on the type of
/// search being executed:
///
/// 1. When performing a search using a fixed size buffer, binary detection is
///    applied to the buffer's contents as it is filled. Binary detection must
///    be applied to the buffer directly because binary files may not contain
///    line terminators, which could result in exorbitant memory usage.
/// 2. When performing a search using memory maps or by reading data off the
///    heap, then binary detection is only guaranteed to be applied to the
///    parts corresponding to a match. When `Quit` is enabled, then the first
///    few KB of the data are searched for binary data.
///
/// Values of this type are created with [`BinaryDetection::none`],
/// [`BinaryDetection::quit`] or [`BinaryDetection::convert`]. The type is a
/// thin wrapper around the line buffer's own binary detection setting.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
56
57impl BinaryDetection {
58    /// No binary detection is performed. Data reported by the searcher may
59    /// contain arbitrary bytes.
60    ///
61    /// This is the default.
62    pub fn none() -> BinaryDetection {
63        BinaryDetection(line_buffer::BinaryDetection::None)
64    }
65
66    /// Binary detection is performed by looking for the given byte.
67    ///
68    /// When searching is performed using a fixed size buffer, then the
69    /// contents of that buffer are always searched for the presence of this
70    /// byte. If it is found, then the underlying data is considered binary
71    /// and the search stops as if it reached EOF.
72    ///
73    /// When searching is performed with the entire contents mapped into
74    /// memory, then binary detection is more conservative. Namely, only a
75    /// fixed sized region at the beginning of the contents are detected for
76    /// binary data. As a compromise, any subsequent matching (or context)
77    /// lines are also searched for binary data. If binary data is detected at
78    /// any point, then the search stops as if it reached EOF.
79    pub fn quit(binary_byte: u8) -> BinaryDetection {
80        BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
81    }
82
83    /// Binary detection is performed by looking for the given byte, and
84    /// replacing it with the line terminator configured on the searcher.
85    /// (If the searcher is configured to use `CRLF` as the line terminator,
86    /// then this byte is replaced by just `LF`.)
87    ///
88    /// When searching is performed using a fixed size buffer, then the
89    /// contents of that buffer are always searched for the presence of this
90    /// byte and replaced with the line terminator. In effect, the caller is
91    /// guaranteed to never observe this byte while searching.
92    ///
93    /// When searching is performed with the entire contents mapped into
94    /// memory, then this setting has no effect and is ignored.
95    pub fn convert(binary_byte: u8) -> BinaryDetection {
96        BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
97    }
98
99    /// If this binary detection uses the "quit" strategy, then this returns
100    /// the byte that will cause a search to quit. In any other case, this
101    /// returns `None`.
102    pub fn quit_byte(&self) -> Option<u8> {
103        match self.0 {
104            line_buffer::BinaryDetection::Quit(b) => Some(b),
105            _ => None,
106        }
107    }
108
109    /// If this binary detection uses the "convert" strategy, then this returns
110    /// the byte that will be replaced by the line terminator. In any other
111    /// case, this returns `None`.
112    pub fn convert_byte(&self) -> Option<u8> {
113        match self.0 {
114            line_buffer::BinaryDetection::Convert(b) => Some(b),
115            _ => None,
116        }
117    }
118}
119
/// An encoding to use when searching.
///
/// An encoding can be used to configure a [`SearcherBuilder`] to transcode
/// source data from an encoding to UTF-8 before searching.
///
/// An `Encoding` will always be cheap to clone: it only wraps a `'static`
/// reference to an `encoding_rs::Encoding`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Encoding(&'static encoding_rs::Encoding);
128
129impl Encoding {
130    /// Create a new encoding for the specified label.
131    ///
132    /// The encoding label provided is mapped to an encoding via the set of
133    /// available choices specified in the
134    /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
135    /// If the given label does not correspond to a valid encoding, then this
136    /// returns an error.
137    pub fn new(label: &str) -> Result<Encoding, ConfigError> {
138        let label = label.as_bytes();
139        match encoding_rs::Encoding::for_label_no_replacement(label) {
140            Some(encoding) => Ok(Encoding(encoding)),
141            None => {
142                Err(ConfigError::UnknownEncoding { label: label.to_vec() })
143            }
144        }
145    }
146}
147
/// The internal configuration of a searcher. This is shared among several
/// search related types, but is only ever written to by the SearcherBuilder.
#[derive(Clone, Debug)]
pub struct Config {
    /// The line terminator to use.
    line_term: LineTerminator,
    /// Whether to invert matching.
    invert_match: bool,
    /// The number of lines after a match to include.
    after_context: usize,
    /// The number of lines before a match to include.
    before_context: usize,
    /// Whether to enable unbounded context or not.
    ///
    /// When this is set, `SearcherBuilder::build` resets both
    /// `before_context` and `after_context` to `0` in the searcher's copy of
    /// the configuration.
    passthru: bool,
    /// Whether to count line numbers.
    line_number: bool,
    /// The maximum amount of heap memory to use.
    ///
    /// When not given, no explicit limit is enforced. When set to `0`, then
    /// only the memory map search strategy is available.
    heap_limit: Option<usize>,
    /// The memory map strategy.
    mmap: MmapChoice,
    /// The binary data detection strategy.
    binary: BinaryDetection,
    /// Whether to enable matching across multiple lines.
    multi_line: bool,
    /// An encoding that, when present, causes the searcher to transcode all
    /// input from the encoding to UTF-8.
    encoding: Option<Encoding>,
    /// Whether to do automatic transcoding based on a BOM or not.
    bom_sniffing: bool,
    /// Whether to stop searching when a non-matching line is found after a
    /// matching line.
    stop_on_nonmatch: bool,
    /// The maximum number of matches this searcher should emit.
    ///
    /// `None` means that no limit is enforced.
    max_matches: Option<u64>,
}
186
187impl Default for Config {
188    fn default() -> Config {
189        Config {
190            line_term: LineTerminator::default(),
191            invert_match: false,
192            after_context: 0,
193            before_context: 0,
194            passthru: false,
195            line_number: true,
196            heap_limit: None,
197            mmap: MmapChoice::default(),
198            binary: BinaryDetection::default(),
199            multi_line: false,
200            encoding: None,
201            bom_sniffing: true,
202            stop_on_nonmatch: false,
203            max_matches: None,
204        }
205    }
206}
207
208impl Config {
209    /// Return the maximal amount of lines needed to fulfill this
210    /// configuration's context.
211    ///
212    /// If this returns `0`, then no context is ever needed.
213    fn max_context(&self) -> usize {
214        cmp::max(self.before_context, self.after_context)
215    }
216
217    /// Build a line buffer from this configuration.
218    fn line_buffer(&self) -> LineBuffer {
219        let mut builder = LineBufferBuilder::new();
220        builder
221            .line_terminator(self.line_term.as_byte())
222            .binary_detection(self.binary.0);
223
224        if let Some(limit) = self.heap_limit {
225            let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY {
226                (limit, 0)
227            } else {
228                (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
229            };
230            builder
231                .capacity(capacity)
232                .buffer_alloc(BufferAllocation::Error(additional));
233        }
234        builder.build()
235    }
236}
237
/// An error that can occur when building a searcher.
///
/// This error occurs when a non-sensical configuration is present when trying
/// to construct a `Searcher` from a `SearcherBuilder`.
///
/// The enum is marked `#[non_exhaustive]`, so code outside this crate must
/// include a wildcard arm when matching on it.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ConfigError {
    /// Indicates that the heap limit configuration prevents all possible
    /// search strategies from being used. For example, if the heap limit is
    /// set to 0 and memory map searching is disabled or unavailable.
    SearchUnavailable,
    /// Occurs when a matcher reports a line terminator that is different than
    /// the one configured in the searcher.
    MismatchedLineTerminators {
        /// The matcher's line terminator.
        matcher: LineTerminator,
        /// The searcher's line terminator.
        searcher: LineTerminator,
    },
    /// Occurs when no encoding could be found for a particular label.
    UnknownEncoding {
        /// The provided encoding label that could not be found, as raw
        /// bytes.
        label: Vec<u8>,
    },
}
263
// No variant of `ConfigError` wraps another error, so the provided methods of
// `std::error::Error` are used as-is.
impl std::error::Error for ConfigError {}
265
266impl std::fmt::Display for ConfigError {
267    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
268        match *self {
269            ConfigError::SearchUnavailable => {
270                write!(f, "grep config error: no available searchers")
271            }
272            ConfigError::MismatchedLineTerminators { matcher, searcher } => {
273                write!(
274                    f,
275                    "grep config error: mismatched line terminators, \
276                     matcher has {:?} but searcher has {:?}",
277                    matcher, searcher
278                )
279            }
280            ConfigError::UnknownEncoding { ref label } => write!(
281                f,
282                "grep config error: unknown encoding: {}",
283                String::from_utf8_lossy(label),
284            ),
285        }
286    }
287}
288
/// A builder for configuring a searcher.
///
/// A search builder permits specifying the configuration of a searcher,
/// including options like whether to invert the search or to enable multi
/// line search.
///
/// Once a searcher has been built, it is beneficial to reuse that searcher
/// for multiple searches, if possible.
#[derive(Clone, Debug)]
pub struct SearcherBuilder {
    /// The configuration accumulated so far. `build` clones it into every
    /// searcher it creates.
    config: Config,
}
301
302impl Default for SearcherBuilder {
303    fn default() -> SearcherBuilder {
304        SearcherBuilder::new()
305    }
306}
307
308impl SearcherBuilder {
309    /// Create a new searcher builder with a default configuration.
310    pub fn new() -> SearcherBuilder {
311        SearcherBuilder { config: Config::default() }
312    }
313
314    /// Build a searcher with the given matcher.
315    pub fn build(&self) -> Searcher {
316        let mut config = self.config.clone();
317        if config.passthru {
318            config.before_context = 0;
319            config.after_context = 0;
320        }
321
322        let mut decode_builder = DecodeReaderBytesBuilder::new();
323        decode_builder
324            .encoding(self.config.encoding.as_ref().map(|e| e.0))
325            .utf8_passthru(true)
326            .strip_bom(self.config.bom_sniffing)
327            .bom_override(true)
328            .bom_sniffing(self.config.bom_sniffing);
329
330        Searcher {
331            config,
332            decode_builder,
333            decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
334            line_buffer: RefCell::new(self.config.line_buffer()),
335            multi_line_buffer: RefCell::new(vec![]),
336        }
337    }
338
339    /// Set the line terminator that is used by the searcher.
340    ///
341    /// When using a searcher, if the matcher provided has a line terminator
342    /// set, then it must be the same as this one. If they aren't, building
343    /// a searcher will return an error.
344    ///
345    /// By default, this is set to `b'\n'`.
346    pub fn line_terminator(
347        &mut self,
348        line_term: LineTerminator,
349    ) -> &mut SearcherBuilder {
350        self.config.line_term = line_term;
351        self
352    }
353
354    /// Whether to invert matching, whereby lines that don't match are reported
355    /// instead of reporting lines that do match.
356    ///
357    /// By default, this is disabled.
358    pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
359        self.config.invert_match = yes;
360        self
361    }
362
363    /// Whether to count and include line numbers with matching lines.
364    ///
365    /// This is enabled by default. There is a small performance penalty
366    /// associated with computing line numbers, so this can be disabled when
367    /// this isn't desirable.
368    pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
369        self.config.line_number = yes;
370        self
371    }
372
373    /// Whether to enable multi line search or not.
374    ///
375    /// When multi line search is enabled, matches *may* match across multiple
376    /// lines. Conversely, when multi line search is disabled, it is impossible
377    /// for any match to span more than one line.
378    ///
379    /// **Warning:** multi line search requires having the entire contents to
380    /// search mapped in memory at once. When searching files, memory maps
381    /// will be used if possible and if they are enabled, which avoids using
382    /// your program's heap. However, if memory maps cannot be used (e.g.,
383    /// for searching streams like `stdin` or if transcoding is necessary),
384    /// then the entire contents of the stream are read on to the heap before
385    /// starting the search.
386    ///
387    /// This is disabled by default.
388    pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
389        self.config.multi_line = yes;
390        self
391    }
392
393    /// Whether to include a fixed number of lines after every match.
394    ///
395    /// When this is set to a non-zero number, then the searcher will report
396    /// `line_count` contextual lines after every match.
397    ///
398    /// This is set to `0` by default.
399    pub fn after_context(
400        &mut self,
401        line_count: usize,
402    ) -> &mut SearcherBuilder {
403        self.config.after_context = line_count;
404        self
405    }
406
407    /// Whether to include a fixed number of lines before every match.
408    ///
409    /// When this is set to a non-zero number, then the searcher will report
410    /// `line_count` contextual lines before every match.
411    ///
412    /// This is set to `0` by default.
413    pub fn before_context(
414        &mut self,
415        line_count: usize,
416    ) -> &mut SearcherBuilder {
417        self.config.before_context = line_count;
418        self
419    }
420
421    /// Whether to enable the "passthru" feature or not.
422    ///
423    /// When passthru is enabled, it effectively treats all non-matching lines
424    /// as contextual lines. In other words, enabling this is akin to
425    /// requesting an unbounded number of before and after contextual lines.
426    ///
427    /// When passthru mode is enabled, any `before_context` or `after_context`
428    /// settings are ignored by setting them to `0`.
429    ///
430    /// This is disabled by default.
431    pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
432        self.config.passthru = yes;
433        self
434    }
435
436    /// Set an approximate limit on the amount of heap space used by a
437    /// searcher.
438    ///
439    /// The heap limit is enforced in two scenarios:
440    ///
441    /// * When searching using a fixed size buffer, the heap limit controls
442    ///   how big this buffer is allowed to be. Assuming contexts are disabled,
443    ///   the minimum size of this buffer is the length (in bytes) of the
444    ///   largest single line in the contents being searched. If any line
445    ///   exceeds the heap limit, then an error will be returned.
446    /// * When performing a multi line search, a fixed size buffer cannot be
447    ///   used. Thus, the only choices are to read the entire contents on to
448    ///   the heap, or use memory maps. In the former case, the heap limit set
449    ///   here is enforced.
450    ///
451    /// If a heap limit is set to `0`, then no heap space is used. If there are
452    /// no alternative strategies available for searching without heap space
453    /// (e.g., memory maps are disabled), then the searcher wil return an error
454    /// immediately.
455    ///
456    /// By default, no limit is set.
457    pub fn heap_limit(
458        &mut self,
459        bytes: Option<usize>,
460    ) -> &mut SearcherBuilder {
461        self.config.heap_limit = bytes;
462        self
463    }
464
465    /// Set the strategy to employ use of memory maps.
466    ///
467    /// Currently, there are only two strategies that can be employed:
468    ///
469    /// * **Automatic** - A searcher will use heuristics, including but not
470    ///   limited to file size and platform, to determine whether to use memory
471    ///   maps or not.
472    /// * **Never** - Memory maps will never be used. If multi line search is
473    ///   enabled, then the entire contents will be read on to the heap before
474    ///   searching begins.
475    ///
476    /// The default behavior is **never**. Generally speaking, and perhaps
477    /// against conventional wisdom, memory maps don't necessarily enable
478    /// faster searching. For example, depending on the platform, using memory
479    /// maps while searching a large directory can actually be quite a bit
480    /// slower than using normal read calls because of the overhead of managing
481    /// the memory maps.
482    ///
483    /// Memory maps can be faster in some cases however. On some platforms,
484    /// when searching a very large file that *is already in memory*, it can
485    /// be slightly faster to search it as a memory map instead of using
486    /// normal read calls.
487    ///
488    /// Finally, memory maps have a somewhat complicated safety story in Rust.
489    /// If you aren't sure whether enabling memory maps is worth it, then just
490    /// don't bother with it.
491    ///
492    /// **WARNING**: If your process is searching a file backed memory map
493    /// at the same time that file is truncated, then it's possible for the
494    /// process to terminate with a bus error.
495    pub fn memory_map(
496        &mut self,
497        strategy: MmapChoice,
498    ) -> &mut SearcherBuilder {
499        self.config.mmap = strategy;
500        self
501    }
502
503    /// Set the binary detection strategy.
504    ///
505    /// The binary detection strategy determines not only how the searcher
506    /// detects binary data, but how it responds to the presence of binary
507    /// data. See the [`BinaryDetection`] type for more information.
508    ///
509    /// By default, binary detection is disabled.
510    pub fn binary_detection(
511        &mut self,
512        detection: BinaryDetection,
513    ) -> &mut SearcherBuilder {
514        self.config.binary = detection;
515        self
516    }
517
518    /// Set the encoding used to read the source data before searching.
519    ///
520    /// When an encoding is provided, then the source data is _unconditionally_
521    /// transcoded using the encoding, unless a BOM is present. If a BOM is
522    /// present, then the encoding indicated by the BOM is used instead. If the
523    /// transcoding process encounters an error, then bytes are replaced with
524    /// the Unicode replacement codepoint.
525    ///
526    /// When no encoding is specified (the default), then BOM sniffing is
527    /// used (if it's enabled, which it is, by default) to determine whether
528    /// the source data is UTF-8 or UTF-16, and transcoding will be performed
529    /// automatically. If no BOM could be found, then the source data is
530    /// searched _as if_ it were UTF-8. However, so long as the source data is
531    /// at least ASCII compatible, then it is possible for a search to produce
532    /// useful results.
533    pub fn encoding(
534        &mut self,
535        encoding: Option<Encoding>,
536    ) -> &mut SearcherBuilder {
537        self.config.encoding = encoding;
538        self
539    }
540
541    /// Enable automatic transcoding based on BOM sniffing.
542    ///
543    /// When this is enabled and an explicit encoding is not set, then this
544    /// searcher will try to detect the encoding of the bytes being searched
545    /// by sniffing its byte-order mark (BOM). In particular, when this is
546    /// enabled, UTF-16 encoded files will be searched seamlessly.
547    ///
548    /// When this is disabled and if an explicit encoding is not set, then
549    /// the bytes from the source stream will be passed through unchanged,
550    /// including its BOM, if one is present.
551    ///
552    /// This is enabled by default.
553    pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
554        self.config.bom_sniffing = yes;
555        self
556    }
557
558    /// Stop searching a file when a non-matching line is found after a
559    /// matching line.
560    ///
561    /// This is useful for searching sorted files where it is expected that all
562    /// the matches will be on adjacent lines.
563    pub fn stop_on_nonmatch(
564        &mut self,
565        stop_on_nonmatch: bool,
566    ) -> &mut SearcherBuilder {
567        self.config.stop_on_nonmatch = stop_on_nonmatch;
568        self
569    }
570
571    /// Sets the maximum number of matches that should be emitted by this
572    /// searcher.
573    ///
574    /// If multi line search is enabled and a match spans multiple lines, then
575    /// that match is counted exactly once for the purposes of enforcing this
576    /// limit, regardless of how many lines it spans.
577    ///
578    /// Note that `0` is a legal value. This will cause the searcher to
579    /// immediately quick without searching anything.
580    ///
581    /// By default, no limit is set.
582    #[inline]
583    pub fn max_matches(&mut self, limit: Option<u64>) -> &mut SearcherBuilder {
584        self.config.max_matches = limit;
585        self
586    }
587}
588
/// A searcher executes searches over a haystack and writes results to a caller
/// provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which must
/// be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space. It is handed to
    /// the transcoding reader for the duration of a reader based search.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    multi_line_buffer: RefCell<Vec<u8>>,
}
626
627impl Searcher {
628    /// Create a new searcher with a default configuration.
629    ///
630    /// To configure the searcher (e.g., invert matching, enable memory maps,
631    /// enable contexts, etc.), use the [`SearcherBuilder`].
632    pub fn new() -> Searcher {
633        SearcherBuilder::new().build()
634    }
635
636    /// Execute a search over the file with the given path and write the
637    /// results to the given sink.
638    ///
639    /// If memory maps are enabled and the searcher heuristically believes
640    /// memory maps will help the search run faster, then this will use
641    /// memory maps. For this reason, callers should prefer using this method
642    /// or `search_file` over the more generic `search_reader` when possible.
643    pub fn search_path<P, M, S>(
644        &mut self,
645        matcher: M,
646        path: P,
647        write_to: S,
648    ) -> Result<(), S::Error>
649    where
650        P: AsRef<Path>,
651        M: Matcher,
652        S: Sink,
653    {
654        let path = path.as_ref();
655        let file = File::open(path).map_err(S::Error::error_io)?;
656        self.search_file_maybe_path(matcher, Some(path), &file, write_to)
657    }
658
659    /// Execute a search over a file and write the results to the given sink.
660    ///
661    /// If memory maps are enabled and the searcher heuristically believes
662    /// memory maps will help the search run faster, then this will use
663    /// memory maps. For this reason, callers should prefer using this method
664    /// or `search_path` over the more generic `search_reader` when possible.
665    pub fn search_file<M, S>(
666        &mut self,
667        matcher: M,
668        file: &File,
669        write_to: S,
670    ) -> Result<(), S::Error>
671    where
672        M: Matcher,
673        S: Sink,
674    {
675        self.search_file_maybe_path(matcher, None, file, write_to)
676    }
677
678    fn search_file_maybe_path<M, S>(
679        &mut self,
680        matcher: M,
681        path: Option<&Path>,
682        file: &File,
683        write_to: S,
684    ) -> Result<(), S::Error>
685    where
686        M: Matcher,
687        S: Sink,
688    {
689        if let Some(mmap) = self.config.mmap.open(file, path) {
690            log::trace!("{:?}: searching via memory map", path);
691            return self.search_slice(matcher, &mmap, write_to);
692        }
693        // Fast path for multi-line searches of files when memory maps are not
694        // enabled. This pre-allocates a buffer roughly the size of the file,
695        // which isn't possible when searching an arbitrary std::io::Read.
696        if self.multi_line_with_matcher(&matcher) {
697            log::trace!(
698                "{:?}: reading entire file on to heap for mulitline",
699                path
700            );
701            self.fill_multi_line_buffer_from_file::<S>(file)?;
702            log::trace!("{:?}: searching via multiline strategy", path);
703            MultiLine::new(
704                self,
705                matcher,
706                &*self.multi_line_buffer.borrow(),
707                write_to,
708            )
709            .run()
710        } else {
711            log::trace!("{:?}: searching using generic reader", path);
712            self.search_reader(matcher, file, write_to)
713        }
714    }
715
716    /// Execute a search over any implementation of `std::io::Read` and write
717    /// the results to the given sink.
718    ///
719    /// When possible, this implementation will search the reader incrementally
720    /// without reading it into memory. In some cases---for example, if multi
721    /// line search is enabled---an incremental search isn't possible and the
722    /// given reader is consumed completely and placed on the heap before
723    /// searching begins. For this reason, when multi line search is enabled,
724    /// one should try to use higher level APIs (e.g., searching by file or
725    /// file path) so that memory maps can be used if they are available and
726    /// enabled.
727    pub fn search_reader<M, R, S>(
728        &mut self,
729        matcher: M,
730        read_from: R,
731        write_to: S,
732    ) -> Result<(), S::Error>
733    where
734        M: Matcher,
735        R: io::Read,
736        S: Sink,
737    {
738        self.check_config(&matcher).map_err(S::Error::error_config)?;
739
740        let mut decode_buffer = self.decode_buffer.borrow_mut();
741        let decoder = self
742            .decode_builder
743            .build_with_buffer(read_from, &mut *decode_buffer)
744            .map_err(S::Error::error_io)?;
745
746        if self.multi_line_with_matcher(&matcher) {
747            log::trace!(
748                "generic reader: reading everything to heap for multiline"
749            );
750            self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?;
751            log::trace!("generic reader: searching via multiline strategy");
752            MultiLine::new(
753                self,
754                matcher,
755                &*self.multi_line_buffer.borrow(),
756                write_to,
757            )
758            .run()
759        } else {
760            let mut line_buffer = self.line_buffer.borrow_mut();
761            let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
762            log::trace!("generic reader: searching via roll buffer strategy");
763            ReadByLine::new(self, matcher, rdr, write_to).run()
764        }
765    }
766
767    /// Execute a search over the given slice and write the results to the
768    /// given sink.
769    pub fn search_slice<M, S>(
770        &mut self,
771        matcher: M,
772        slice: &[u8],
773        write_to: S,
774    ) -> Result<(), S::Error>
775    where
776        M: Matcher,
777        S: Sink,
778    {
779        self.check_config(&matcher).map_err(S::Error::error_config)?;
780
781        // We can search the slice directly, unless we need to do transcoding.
782        if self.slice_needs_transcoding(slice) {
783            log::trace!(
784                "slice reader: needs transcoding, using generic reader"
785            );
786            return self.search_reader(matcher, slice, write_to);
787        }
788        if self.multi_line_with_matcher(&matcher) {
789            log::trace!("slice reader: searching via multiline strategy");
790            MultiLine::new(self, matcher, slice, write_to).run()
791        } else {
792            log::trace!("slice reader: searching via slice-by-line strategy");
793            SliceByLine::new(self, matcher, slice, write_to).run()
794        }
795    }
796
797    /// Set the binary detection method used on this searcher.
798    pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
799        self.config.binary = detection.clone();
800        self.line_buffer.borrow_mut().set_binary_detection(detection.0);
801    }
802
803    /// Check that the searcher's configuration and the matcher are consistent
804    /// with each other.
805    fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
806        if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled()
807        {
808            return Err(ConfigError::SearchUnavailable);
809        }
810        let matcher_line_term = match matcher.line_terminator() {
811            None => return Ok(()),
812            Some(line_term) => line_term,
813        };
814        if matcher_line_term != self.config.line_term {
815            return Err(ConfigError::MismatchedLineTerminators {
816                matcher: matcher_line_term,
817                searcher: self.config.line_term,
818            });
819        }
820        Ok(())
821    }
822
823    /// Returns true if and only if the given slice needs to be transcoded.
824    fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
825        self.config.encoding.is_some()
826            || (self.config.bom_sniffing && slice_has_bom(slice))
827    }
828}
829
830/// The following methods permit querying the configuration of a searcher.
831/// These can be useful in generic implementations of [`Sink`], where the
832/// output may be tailored based on how the searcher is configured.
833impl Searcher {
834    /// Returns the line terminator used by this searcher.
835    #[inline]
836    pub fn line_terminator(&self) -> LineTerminator {
837        self.config.line_term
838    }
839
840    /// Returns the type of binary detection configured on this searcher.
841    #[inline]
842    pub fn binary_detection(&self) -> &BinaryDetection {
843        &self.config.binary
844    }
845
846    /// Returns true if and only if this searcher is configured to invert its
847    /// search results. That is, matching lines are lines that do **not** match
848    /// the searcher's matcher.
849    #[inline]
850    pub fn invert_match(&self) -> bool {
851        self.config.invert_match
852    }
853
854    /// Returns true if and only if this searcher is configured to count line
855    /// numbers.
856    #[inline]
857    pub fn line_number(&self) -> bool {
858        self.config.line_number
859    }
860
861    /// Returns true if and only if this searcher is configured to perform
862    /// multi line search.
863    #[inline]
864    pub fn multi_line(&self) -> bool {
865        self.config.multi_line
866    }
867
868    /// Returns true if and only if this searcher is configured to stop when it
869    /// finds a non-matching line after a matching one.
870    #[inline]
871    pub fn stop_on_nonmatch(&self) -> bool {
872        self.config.stop_on_nonmatch
873    }
874
875    /// Returns the maximum number of matches emitted by this searcher, if
876    /// such a limit was set.
877    ///
878    /// If multi line search is enabled and a match spans multiple lines, then
879    /// that match is counted exactly once for the purposes of enforcing this
880    /// limit, regardless of how many lines it spans.
881    ///
882    /// Note that `0` is a legal value. This will cause the searcher to
883    /// immediately quick without searching anything.
884    #[inline]
885    pub fn max_matches(&self) -> Option<u64> {
886        self.config.max_matches
887    }
888
889    /// Returns true if and only if this searcher will choose a multi-line
890    /// strategy given the provided matcher.
891    ///
892    /// This may diverge from the result of `multi_line` in cases where the
893    /// searcher has been configured to execute a search that can report
894    /// matches over multiple lines, but where the matcher guarantees that it
895    /// will never produce a match over multiple lines.
896    pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
897        if !self.multi_line() {
898            return false;
899        }
900        if let Some(line_term) = matcher.line_terminator() {
901            if line_term == self.line_terminator() {
902                return false;
903            }
904        }
905        if let Some(non_matching) = matcher.non_matching_bytes() {
906            // If the line terminator is CRLF, we don't actually need to care
907            // whether the regex can match `\r` or not. Namely, a `\r` is
908            // neither necessary nor sufficient to terminate a line. A `\n` is
909            // always required.
910            if non_matching.contains(self.line_terminator().as_byte()) {
911                return false;
912            }
913        }
914        true
915    }
916
917    /// Returns the number of "after" context lines to report. When context
918    /// reporting is not enabled, this returns `0`.
919    #[inline]
920    pub fn after_context(&self) -> usize {
921        self.config.after_context
922    }
923
924    /// Returns the number of "before" context lines to report. When context
925    /// reporting is not enabled, this returns `0`.
926    #[inline]
927    pub fn before_context(&self) -> usize {
928        self.config.before_context
929    }
930
931    /// Returns true if and only if the searcher has "passthru" mode enabled.
932    #[inline]
933    pub fn passthru(&self) -> bool {
934        self.config.passthru
935    }
936
937    /// Fill the buffer for use with multi-line searching from the given file.
938    /// This reads from the file until EOF or until an error occurs. If the
939    /// contents exceed the configured heap limit, then an error is returned.
940    fn fill_multi_line_buffer_from_file<S: Sink>(
941        &self,
942        file: &File,
943    ) -> Result<(), S::Error> {
944        assert!(self.config.multi_line);
945
946        let mut decode_buffer = self.decode_buffer.borrow_mut();
947        let mut read_from = self
948            .decode_builder
949            .build_with_buffer(file, &mut *decode_buffer)
950            .map_err(S::Error::error_io)?;
951
952        // If we don't have a heap limit, then we can defer to std's
953        // read_to_end implementation. fill_multi_line_buffer_from_reader will
954        // do this too, but since we have a File, we can be a bit smarter about
955        // pre-allocating here.
956        //
957        // If we're transcoding, then our pre-allocation might not be exact,
958        // but is probably still better than nothing.
959        if self.config.heap_limit.is_none() {
960            let mut buf = self.multi_line_buffer.borrow_mut();
961            buf.clear();
962            let cap =
963                file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
964            buf.reserve(cap);
965            read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
966            return Ok(());
967        }
968        self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
969    }
970
971    /// Fill the buffer for use with multi-line searching from the given
972    /// reader. This reads from the reader until EOF or until an error occurs.
973    /// If the contents exceed the configured heap limit, then an error is
974    /// returned.
975    fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
976        &self,
977        mut read_from: R,
978    ) -> Result<(), S::Error> {
979        assert!(self.config.multi_line);
980
981        let mut buf = self.multi_line_buffer.borrow_mut();
982        buf.clear();
983
984        // If we don't have a heap limit, then we can defer to std's
985        // read_to_end implementation...
986        let heap_limit = match self.config.heap_limit {
987            Some(heap_limit) => heap_limit,
988            None => {
989                read_from
990                    .read_to_end(&mut *buf)
991                    .map_err(S::Error::error_io)?;
992                return Ok(());
993            }
994        };
995        if heap_limit == 0 {
996            return Err(S::Error::error_io(alloc_error(heap_limit)));
997        }
998
999        // ... otherwise we need to roll our own. This is likely quite a bit
1000        // slower than what is optimal, but we avoid worry about memory safety
1001        // until there's a compelling reason to speed this up.
1002        buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
1003        let mut pos = 0;
1004        loop {
1005            let nread = match read_from.read(&mut buf[pos..]) {
1006                Ok(nread) => nread,
1007                Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
1008                    continue;
1009                }
1010                Err(err) => return Err(S::Error::error_io(err)),
1011            };
1012            if nread == 0 {
1013                buf.resize(pos, 0);
1014                return Ok(());
1015            }
1016
1017            pos += nread;
1018            if buf[pos..].is_empty() {
1019                let additional = heap_limit - buf.len();
1020                if additional == 0 {
1021                    return Err(S::Error::error_io(alloc_error(heap_limit)));
1022                }
1023                let limit = buf.len() + additional;
1024                let doubled = 2 * buf.len();
1025                buf.resize(cmp::min(doubled, limit), 0);
1026            }
1027        }
1028    }
1029}
1030
1031/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
1032/// BOM.
1033///
1034/// This is used by the searcher to determine if a transcoder is necessary.
1035/// Otherwise, it is advantageous to search the slice directly.
1036fn slice_has_bom(slice: &[u8]) -> bool {
1037    let enc = match encoding_rs::Encoding::for_bom(slice) {
1038        None => return false,
1039        Some((enc, _)) => enc,
1040    };
1041    log::trace!("found byte-order mark (BOM) for encoding {enc:?}");
1042    [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
1043        .contains(&enc)
1044}
1045
#[cfg(test)]
mod tests {
    use crate::testutil::{KitchenSink, RegexMatcher};

    use super::*;

    /// A searcher built with a heap limit of zero must refuse to search.
    #[test]
    fn config_error_heap_limit() {
        let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build();
        let (matcher, sink) = (RegexMatcher::new(""), KitchenSink::new());
        assert!(searcher.search_slice(matcher, &[], sink).is_err());
    }

    /// A matcher whose line terminator (`z`) disagrees with the searcher's
    /// must be rejected.
    #[test]
    fn config_error_line_terminator() {
        let matcher = {
            let mut m = RegexMatcher::new("");
            m.set_line_term(Some(LineTerminator::byte(b'z')));
            m
        };

        let mut searcher = Searcher::new();
        let outcome = searcher.search_slice(matcher, &[], KitchenSink::new());
        assert!(outcome.is_err());
    }

    #[test]
    fn uft8_bom_sniffing() {
        // See: https://github.com/BurntSushi/ripgrep/issues/1638
        // A UTF-8 BOM must be sniffed, just like a UTF-16 one is.
        //
        // The haystack is the UTF-8 BOM (EF BB BF) followed by "foo".
        let haystack: &[u8] = b"\xEF\xBB\xBFfoo";
        let matcher = RegexMatcher::new("foo");

        let mut searcher = SearcherBuilder::new().build();
        let mut sink = KitchenSink::new();

        let outcome = searcher.search_slice(matcher, haystack, &mut sink);
        assert!(outcome.is_ok());

        let printed = std::str::from_utf8(sink.as_bytes()).unwrap();
        assert_eq!(printed, "1:0:foo\nbyte count:3\n");
    }
}