grep_searcher/searcher/
mod.rs

1use std::{
2    cell::RefCell,
3    cmp,
4    fs::File,
5    io::{self, Read},
6    path::Path,
7};
8
9use {
10    encoding_rs_io::DecodeReaderBytesBuilder,
11    grep_matcher::{LineTerminator, Match, Matcher},
12};
13
14use crate::{
15    line_buffer::{
16        self, alloc_error, BufferAllocation, LineBuffer, LineBufferBuilder,
17        LineBufferReader, DEFAULT_BUFFER_CAPACITY,
18    },
19    searcher::glue::{MultiLine, ReadByLine, SliceByLine},
20    sink::{Sink, SinkError},
21};
22
23pub use self::mmap::MmapChoice;
24
25mod core;
26mod glue;
27mod mmap;
28
/// An alias of the matcher's `Match` type used for arbitrary byte ranges.
///
/// We want the ergonomics of `Match`, but in practice the searcher uses it
/// for ranges that are not necessarily matches, so it gets a more accurate
/// name here. This is only used in the searcher's internals.
type Range = Match;
33
/// The behavior of binary detection while searching.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation behind detecting binary data
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
///
/// Unfortunately, binary detection works differently depending on the type of
/// search being executed:
///
/// 1. When performing a search using a fixed size buffer, binary detection is
///    applied to the buffer's contents as it is filled. Binary detection must
///    be applied to the buffer directly because binary files may not contain
///    line terminators, which could result in exorbitant memory usage.
/// 2. When performing a search using memory maps or by reading data off the
///    heap, binary detection is only guaranteed to be applied to the parts
///    corresponding to a match. When `Quit` is enabled, the first few KB of
///    the data are also searched for binary data.
///
/// This type is a thin wrapper around the line buffer's own
/// `BinaryDetection` strategy.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
56
57impl BinaryDetection {
58    /// No binary detection is performed. Data reported by the searcher may
59    /// contain arbitrary bytes.
60    ///
61    /// This is the default.
62    pub fn none() -> BinaryDetection {
63        BinaryDetection(line_buffer::BinaryDetection::None)
64    }
65
66    /// Binary detection is performed by looking for the given byte.
67    ///
68    /// When searching is performed using a fixed size buffer, then the
69    /// contents of that buffer are always searched for the presence of this
70    /// byte. If it is found, then the underlying data is considered binary
71    /// and the search stops as if it reached EOF.
72    ///
73    /// When searching is performed with the entire contents mapped into
74    /// memory, then binary detection is more conservative. Namely, only a
75    /// fixed sized region at the beginning of the contents are detected for
76    /// binary data. As a compromise, any subsequent matching (or context)
77    /// lines are also searched for binary data. If binary data is detected at
78    /// any point, then the search stops as if it reached EOF.
79    pub fn quit(binary_byte: u8) -> BinaryDetection {
80        BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
81    }
82
83    /// Binary detection is performed by looking for the given byte, and
84    /// replacing it with the line terminator configured on the searcher.
85    /// (If the searcher is configured to use `CRLF` as the line terminator,
86    /// then this byte is replaced by just `LF`.)
87    ///
88    /// When searching is performed using a fixed size buffer, then the
89    /// contents of that buffer are always searched for the presence of this
90    /// byte and replaced with the line terminator. In effect, the caller is
91    /// guaranteed to never observe this byte while searching.
92    ///
93    /// When searching is performed with the entire contents mapped into
94    /// memory, then this setting has no effect and is ignored.
95    pub fn convert(binary_byte: u8) -> BinaryDetection {
96        BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
97    }
98
99    /// If this binary detection uses the "quit" strategy, then this returns
100    /// the byte that will cause a search to quit. In any other case, this
101    /// returns `None`.
102    pub fn quit_byte(&self) -> Option<u8> {
103        match self.0 {
104            line_buffer::BinaryDetection::Quit(b) => Some(b),
105            _ => None,
106        }
107    }
108
109    /// If this binary detection uses the "convert" strategy, then this returns
110    /// the byte that will be replaced by the line terminator. In any other
111    /// case, this returns `None`.
112    pub fn convert_byte(&self) -> Option<u8> {
113        match self.0 {
114            line_buffer::BinaryDetection::Convert(b) => Some(b),
115            _ => None,
116        }
117    }
118}
119
/// An encoding to use when searching.
///
/// An encoding can be used to configure a [`SearcherBuilder`] to transcode
/// source data from an encoding to UTF-8 before searching.
///
/// An `Encoding` is always cheap to clone: it only wraps a `&'static`
/// reference to an `encoding_rs::Encoding`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Encoding(&'static encoding_rs::Encoding);
128
129impl Encoding {
130    /// Create a new encoding for the specified label.
131    ///
132    /// The encoding label provided is mapped to an encoding via the set of
133    /// available choices specified in the
134    /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
135    /// If the given label does not correspond to a valid encoding, then this
136    /// returns an error.
137    pub fn new(label: &str) -> Result<Encoding, ConfigError> {
138        let label = label.as_bytes();
139        match encoding_rs::Encoding::for_label_no_replacement(label) {
140            Some(encoding) => Ok(Encoding(encoding)),
141            None => {
142                Err(ConfigError::UnknownEncoding { label: label.to_vec() })
143            }
144        }
145    }
146}
147
/// The internal configuration of a searcher.
///
/// This is shared among several search related types, but is only ever
/// written to by the `SearcherBuilder` (and by
/// `Searcher::set_binary_detection`, which updates `binary`).
#[derive(Clone, Debug)]
pub struct Config {
    /// The line terminator to use.
    line_term: LineTerminator,
    /// Whether to invert matching.
    invert_match: bool,
    /// The number of lines after a match to include.
    after_context: usize,
    /// The number of lines before a match to include.
    before_context: usize,
    /// Whether to enable unbounded context or not.
    ///
    /// When this is set, `SearcherBuilder::build` resets both
    /// `before_context` and `after_context` to `0`.
    passthru: bool,
    /// Whether to count line numbers.
    line_number: bool,
    /// The maximum amount of heap memory to use.
    ///
    /// When not given, no explicit limit is enforced. When set to `0`, then
    /// only the memory map search strategy is available.
    heap_limit: Option<usize>,
    /// The memory map strategy.
    mmap: MmapChoice,
    /// The binary data detection strategy.
    binary: BinaryDetection,
    /// Whether to enable matching across multiple lines.
    multi_line: bool,
    /// An encoding that, when present, causes the searcher to transcode all
    /// input from the encoding to UTF-8.
    encoding: Option<Encoding>,
    /// Whether to do automatic transcoding based on a BOM or not.
    bom_sniffing: bool,
    /// Whether to stop searching when a non-matching line is found after a
    /// matching line.
    stop_on_nonmatch: bool,
}
184
185impl Default for Config {
186    fn default() -> Config {
187        Config {
188            line_term: LineTerminator::default(),
189            invert_match: false,
190            after_context: 0,
191            before_context: 0,
192            passthru: false,
193            line_number: true,
194            heap_limit: None,
195            mmap: MmapChoice::default(),
196            binary: BinaryDetection::default(),
197            multi_line: false,
198            encoding: None,
199            bom_sniffing: true,
200            stop_on_nonmatch: false,
201        }
202    }
203}
204
205impl Config {
206    /// Return the maximal amount of lines needed to fulfill this
207    /// configuration's context.
208    ///
209    /// If this returns `0`, then no context is ever needed.
210    fn max_context(&self) -> usize {
211        cmp::max(self.before_context, self.after_context)
212    }
213
214    /// Build a line buffer from this configuration.
215    fn line_buffer(&self) -> LineBuffer {
216        let mut builder = LineBufferBuilder::new();
217        builder
218            .line_terminator(self.line_term.as_byte())
219            .binary_detection(self.binary.0);
220
221        if let Some(limit) = self.heap_limit {
222            let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY {
223                (limit, 0)
224            } else {
225                (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
226            };
227            builder
228                .capacity(capacity)
229                .buffer_alloc(BufferAllocation::Error(additional));
230        }
231        builder.build()
232    }
233}
234
/// An error that can occur when configuring or using a searcher.
///
/// This error occurs when a nonsensical configuration is present, e.g. when
/// a search is executed with a configuration that leaves no usable search
/// strategy, or when an [`Encoding`] is requested for an unknown label.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ConfigError {
    /// Indicates that the heap limit configuration prevents all possible
    /// search strategies from being used. For example, if the heap limit is
    /// set to 0 and memory map searching is disabled or unavailable.
    SearchUnavailable,
    /// Occurs when a matcher reports a line terminator that is different than
    /// the one configured in the searcher.
    MismatchedLineTerminators {
        /// The matcher's line terminator.
        matcher: LineTerminator,
        /// The searcher's line terminator.
        searcher: LineTerminator,
    },
    /// Occurs when no encoding could be found for a particular label.
    UnknownEncoding {
        /// The provided encoding label that could not be found.
        label: Vec<u8>,
    },
}
260
// No methods are overridden: the trait's default implementations are used.
impl std::error::Error for ConfigError {}
262
263impl std::fmt::Display for ConfigError {
264    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
265        match *self {
266            ConfigError::SearchUnavailable => {
267                write!(f, "grep config error: no available searchers")
268            }
269            ConfigError::MismatchedLineTerminators { matcher, searcher } => {
270                write!(
271                    f,
272                    "grep config error: mismatched line terminators, \
273                     matcher has {:?} but searcher has {:?}",
274                    matcher, searcher
275                )
276            }
277            ConfigError::UnknownEncoding { ref label } => write!(
278                f,
279                "grep config error: unknown encoding: {}",
280                String::from_utf8_lossy(label),
281            ),
282        }
283    }
284}
285
/// A builder for configuring a searcher.
///
/// A search builder permits specifying the configuration of a searcher,
/// including options like whether to invert the search or to enable multi
/// line search.
///
/// Once a searcher has been built, it is beneficial to reuse that searcher
/// for multiple searches, if possible.
#[derive(Clone, Debug)]
pub struct SearcherBuilder {
    /// The configuration accumulated so far; cloned into every searcher
    /// that is built.
    config: Config,
}
298
299impl Default for SearcherBuilder {
300    fn default() -> SearcherBuilder {
301        SearcherBuilder::new()
302    }
303}
304
305impl SearcherBuilder {
306    /// Create a new searcher builder with a default configuration.
307    pub fn new() -> SearcherBuilder {
308        SearcherBuilder { config: Config::default() }
309    }
310
311    /// Build a searcher with the given matcher.
312    pub fn build(&self) -> Searcher {
313        let mut config = self.config.clone();
314        if config.passthru {
315            config.before_context = 0;
316            config.after_context = 0;
317        }
318
319        let mut decode_builder = DecodeReaderBytesBuilder::new();
320        decode_builder
321            .encoding(self.config.encoding.as_ref().map(|e| e.0))
322            .utf8_passthru(true)
323            .strip_bom(self.config.bom_sniffing)
324            .bom_override(true)
325            .bom_sniffing(self.config.bom_sniffing);
326
327        Searcher {
328            config,
329            decode_builder,
330            decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
331            line_buffer: RefCell::new(self.config.line_buffer()),
332            multi_line_buffer: RefCell::new(vec![]),
333        }
334    }
335
336    /// Set the line terminator that is used by the searcher.
337    ///
338    /// When using a searcher, if the matcher provided has a line terminator
339    /// set, then it must be the same as this one. If they aren't, building
340    /// a searcher will return an error.
341    ///
342    /// By default, this is set to `b'\n'`.
343    pub fn line_terminator(
344        &mut self,
345        line_term: LineTerminator,
346    ) -> &mut SearcherBuilder {
347        self.config.line_term = line_term;
348        self
349    }
350
351    /// Whether to invert matching, whereby lines that don't match are reported
352    /// instead of reporting lines that do match.
353    ///
354    /// By default, this is disabled.
355    pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
356        self.config.invert_match = yes;
357        self
358    }
359
360    /// Whether to count and include line numbers with matching lines.
361    ///
362    /// This is enabled by default. There is a small performance penalty
363    /// associated with computing line numbers, so this can be disabled when
364    /// this isn't desirable.
365    pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
366        self.config.line_number = yes;
367        self
368    }
369
370    /// Whether to enable multi line search or not.
371    ///
372    /// When multi line search is enabled, matches *may* match across multiple
373    /// lines. Conversely, when multi line search is disabled, it is impossible
374    /// for any match to span more than one line.
375    ///
376    /// **Warning:** multi line search requires having the entire contents to
377    /// search mapped in memory at once. When searching files, memory maps
378    /// will be used if possible and if they are enabled, which avoids using
379    /// your program's heap. However, if memory maps cannot be used (e.g.,
380    /// for searching streams like `stdin` or if transcoding is necessary),
381    /// then the entire contents of the stream are read on to the heap before
382    /// starting the search.
383    ///
384    /// This is disabled by default.
385    pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
386        self.config.multi_line = yes;
387        self
388    }
389
390    /// Whether to include a fixed number of lines after every match.
391    ///
392    /// When this is set to a non-zero number, then the searcher will report
393    /// `line_count` contextual lines after every match.
394    ///
395    /// This is set to `0` by default.
396    pub fn after_context(
397        &mut self,
398        line_count: usize,
399    ) -> &mut SearcherBuilder {
400        self.config.after_context = line_count;
401        self
402    }
403
404    /// Whether to include a fixed number of lines before every match.
405    ///
406    /// When this is set to a non-zero number, then the searcher will report
407    /// `line_count` contextual lines before every match.
408    ///
409    /// This is set to `0` by default.
410    pub fn before_context(
411        &mut self,
412        line_count: usize,
413    ) -> &mut SearcherBuilder {
414        self.config.before_context = line_count;
415        self
416    }
417
418    /// Whether to enable the "passthru" feature or not.
419    ///
420    /// When passthru is enabled, it effectively treats all non-matching lines
421    /// as contextual lines. In other words, enabling this is akin to
422    /// requesting an unbounded number of before and after contextual lines.
423    ///
424    /// When passthru mode is enabled, any `before_context` or `after_context`
425    /// settings are ignored by setting them to `0`.
426    ///
427    /// This is disabled by default.
428    pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
429        self.config.passthru = yes;
430        self
431    }
432
433    /// Set an approximate limit on the amount of heap space used by a
434    /// searcher.
435    ///
436    /// The heap limit is enforced in two scenarios:
437    ///
438    /// * When searching using a fixed size buffer, the heap limit controls
439    ///   how big this buffer is allowed to be. Assuming contexts are disabled,
440    ///   the minimum size of this buffer is the length (in bytes) of the
441    ///   largest single line in the contents being searched. If any line
442    ///   exceeds the heap limit, then an error will be returned.
443    /// * When performing a multi line search, a fixed size buffer cannot be
444    ///   used. Thus, the only choices are to read the entire contents on to
445    ///   the heap, or use memory maps. In the former case, the heap limit set
446    ///   here is enforced.
447    ///
448    /// If a heap limit is set to `0`, then no heap space is used. If there are
449    /// no alternative strategies available for searching without heap space
450    /// (e.g., memory maps are disabled), then the searcher wil return an error
451    /// immediately.
452    ///
453    /// By default, no limit is set.
454    pub fn heap_limit(
455        &mut self,
456        bytes: Option<usize>,
457    ) -> &mut SearcherBuilder {
458        self.config.heap_limit = bytes;
459        self
460    }
461
462    /// Set the strategy to employ use of memory maps.
463    ///
464    /// Currently, there are only two strategies that can be employed:
465    ///
466    /// * **Automatic** - A searcher will use heuristics, including but not
467    ///   limited to file size and platform, to determine whether to use memory
468    ///   maps or not.
469    /// * **Never** - Memory maps will never be used. If multi line search is
470    ///   enabled, then the entire contents will be read on to the heap before
471    ///   searching begins.
472    ///
473    /// The default behavior is **never**. Generally speaking, and perhaps
474    /// against conventional wisdom, memory maps don't necessarily enable
475    /// faster searching. For example, depending on the platform, using memory
476    /// maps while searching a large directory can actually be quite a bit
477    /// slower than using normal read calls because of the overhead of managing
478    /// the memory maps.
479    ///
480    /// Memory maps can be faster in some cases however. On some platforms,
481    /// when searching a very large file that *is already in memory*, it can
482    /// be slightly faster to search it as a memory map instead of using
483    /// normal read calls.
484    ///
485    /// Finally, memory maps have a somewhat complicated safety story in Rust.
486    /// If you aren't sure whether enabling memory maps is worth it, then just
487    /// don't bother with it.
488    ///
489    /// **WARNING**: If your process is searching a file backed memory map
490    /// at the same time that file is truncated, then it's possible for the
491    /// process to terminate with a bus error.
492    pub fn memory_map(
493        &mut self,
494        strategy: MmapChoice,
495    ) -> &mut SearcherBuilder {
496        self.config.mmap = strategy;
497        self
498    }
499
500    /// Set the binary detection strategy.
501    ///
502    /// The binary detection strategy determines not only how the searcher
503    /// detects binary data, but how it responds to the presence of binary
504    /// data. See the [`BinaryDetection`] type for more information.
505    ///
506    /// By default, binary detection is disabled.
507    pub fn binary_detection(
508        &mut self,
509        detection: BinaryDetection,
510    ) -> &mut SearcherBuilder {
511        self.config.binary = detection;
512        self
513    }
514
515    /// Set the encoding used to read the source data before searching.
516    ///
517    /// When an encoding is provided, then the source data is _unconditionally_
518    /// transcoded using the encoding, unless a BOM is present. If a BOM is
519    /// present, then the encoding indicated by the BOM is used instead. If the
520    /// transcoding process encounters an error, then bytes are replaced with
521    /// the Unicode replacement codepoint.
522    ///
523    /// When no encoding is specified (the default), then BOM sniffing is
524    /// used (if it's enabled, which it is, by default) to determine whether
525    /// the source data is UTF-8 or UTF-16, and transcoding will be performed
526    /// automatically. If no BOM could be found, then the source data is
527    /// searched _as if_ it were UTF-8. However, so long as the source data is
528    /// at least ASCII compatible, then it is possible for a search to produce
529    /// useful results.
530    pub fn encoding(
531        &mut self,
532        encoding: Option<Encoding>,
533    ) -> &mut SearcherBuilder {
534        self.config.encoding = encoding;
535        self
536    }
537
538    /// Enable automatic transcoding based on BOM sniffing.
539    ///
540    /// When this is enabled and an explicit encoding is not set, then this
541    /// searcher will try to detect the encoding of the bytes being searched
542    /// by sniffing its byte-order mark (BOM). In particular, when this is
543    /// enabled, UTF-16 encoded files will be searched seamlessly.
544    ///
545    /// When this is disabled and if an explicit encoding is not set, then
546    /// the bytes from the source stream will be passed through unchanged,
547    /// including its BOM, if one is present.
548    ///
549    /// This is enabled by default.
550    pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
551        self.config.bom_sniffing = yes;
552        self
553    }
554
555    /// Stop searching a file when a non-matching line is found after a
556    /// matching line.
557    ///
558    /// This is useful for searching sorted files where it is expected that all
559    /// the matches will be on adjacent lines.
560    pub fn stop_on_nonmatch(
561        &mut self,
562        stop_on_nonmatch: bool,
563    ) -> &mut SearcherBuilder {
564        self.config.stop_on_nonmatch = stop_on_nonmatch;
565        self
566    }
567}
568
/// A searcher executes searches over a haystack and writes results to a caller
/// provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which must
/// be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused, since it owns several buffers
/// that would otherwise have to be allocated again for every search.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    multi_line_buffer: RefCell<Vec<u8>>,
}
606
607impl Searcher {
608    /// Create a new searcher with a default configuration.
609    ///
610    /// To configure the searcher (e.g., invert matching, enable memory maps,
611    /// enable contexts, etc.), use the [`SearcherBuilder`].
612    pub fn new() -> Searcher {
613        SearcherBuilder::new().build()
614    }
615
616    /// Execute a search over the file with the given path and write the
617    /// results to the given sink.
618    ///
619    /// If memory maps are enabled and the searcher heuristically believes
620    /// memory maps will help the search run faster, then this will use
621    /// memory maps. For this reason, callers should prefer using this method
622    /// or `search_file` over the more generic `search_reader` when possible.
623    pub fn search_path<P, M, S>(
624        &mut self,
625        matcher: M,
626        path: P,
627        write_to: S,
628    ) -> Result<(), S::Error>
629    where
630        P: AsRef<Path>,
631        M: Matcher,
632        S: Sink,
633    {
634        let path = path.as_ref();
635        let file = File::open(path).map_err(S::Error::error_io)?;
636        self.search_file_maybe_path(matcher, Some(path), &file, write_to)
637    }
638
639    /// Execute a search over a file and write the results to the given sink.
640    ///
641    /// If memory maps are enabled and the searcher heuristically believes
642    /// memory maps will help the search run faster, then this will use
643    /// memory maps. For this reason, callers should prefer using this method
644    /// or `search_path` over the more generic `search_reader` when possible.
645    pub fn search_file<M, S>(
646        &mut self,
647        matcher: M,
648        file: &File,
649        write_to: S,
650    ) -> Result<(), S::Error>
651    where
652        M: Matcher,
653        S: Sink,
654    {
655        self.search_file_maybe_path(matcher, None, file, write_to)
656    }
657
658    fn search_file_maybe_path<M, S>(
659        &mut self,
660        matcher: M,
661        path: Option<&Path>,
662        file: &File,
663        write_to: S,
664    ) -> Result<(), S::Error>
665    where
666        M: Matcher,
667        S: Sink,
668    {
669        if let Some(mmap) = self.config.mmap.open(file, path) {
670            log::trace!("{:?}: searching via memory map", path);
671            return self.search_slice(matcher, &mmap, write_to);
672        }
673        // Fast path for multi-line searches of files when memory maps are not
674        // enabled. This pre-allocates a buffer roughly the size of the file,
675        // which isn't possible when searching an arbitrary std::io::Read.
676        if self.multi_line_with_matcher(&matcher) {
677            log::trace!(
678                "{:?}: reading entire file on to heap for mulitline",
679                path
680            );
681            self.fill_multi_line_buffer_from_file::<S>(file)?;
682            log::trace!("{:?}: searching via multiline strategy", path);
683            MultiLine::new(
684                self,
685                matcher,
686                &*self.multi_line_buffer.borrow(),
687                write_to,
688            )
689            .run()
690        } else {
691            log::trace!("{:?}: searching using generic reader", path);
692            self.search_reader(matcher, file, write_to)
693        }
694    }
695
    /// Execute a search over any implementation of `std::io::Read` and write
    /// the results to the given sink.
    ///
    /// When possible, this implementation will search the reader incrementally
    /// without reading it into memory. In some cases---for example, if multi
    /// line search is enabled---an incremental search isn't possible and the
    /// given reader is consumed completely and placed on the heap before
    /// searching begins. For this reason, when multi line search is enabled,
    /// one should try to use higher level APIs (e.g., searching by file or
    /// file path) so that memory maps can be used if they are available and
    /// enabled.
    ///
    /// # Errors
    ///
    /// An inconsistent configuration is reported through the sink's
    /// `error_config`, and a failure to build the decoding reader through
    /// its `error_io`. Errors from the search itself are returned as is.
    pub fn search_reader<M, R, S>(
        &mut self,
        matcher: M,
        read_from: R,
        write_to: S,
    ) -> Result<(), S::Error>
    where
        M: Matcher,
        R: io::Read,
        S: Sink,
    {
        // Fail before touching the reader if the configuration is unusable.
        self.check_config(&matcher).map_err(S::Error::error_config)?;

        // All reads go through the decoder, which uses the searcher's
        // reusable scratch buffer.
        let mut decode_buffer = self.decode_buffer.borrow_mut();
        let decoder = self
            .decode_builder
            .build_with_buffer(read_from, &mut *decode_buffer)
            .map_err(S::Error::error_io)?;

        if self.multi_line_with_matcher(&matcher) {
            log::trace!(
                "generic reader: reading everything to heap for multiline"
            );
            self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?;
            log::trace!("generic reader: searching via multiline strategy");
            MultiLine::new(
                self,
                matcher,
                &*self.multi_line_buffer.borrow(),
                write_to,
            )
            .run()
        } else {
            // Line oriented search: read through the searcher's line buffer
            // instead of loading everything at once.
            let mut line_buffer = self.line_buffer.borrow_mut();
            let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
            log::trace!("generic reader: searching via roll buffer strategy");
            ReadByLine::new(self, matcher, rdr, write_to).run()
        }
    }
746
747    /// Execute a search over the given slice and write the results to the
748    /// given sink.
749    pub fn search_slice<M, S>(
750        &mut self,
751        matcher: M,
752        slice: &[u8],
753        write_to: S,
754    ) -> Result<(), S::Error>
755    where
756        M: Matcher,
757        S: Sink,
758    {
759        self.check_config(&matcher).map_err(S::Error::error_config)?;
760
761        // We can search the slice directly, unless we need to do transcoding.
762        if self.slice_needs_transcoding(slice) {
763            log::trace!(
764                "slice reader: needs transcoding, using generic reader"
765            );
766            return self.search_reader(matcher, slice, write_to);
767        }
768        if self.multi_line_with_matcher(&matcher) {
769            log::trace!("slice reader: searching via multiline strategy");
770            MultiLine::new(self, matcher, slice, write_to).run()
771        } else {
772            log::trace!("slice reader: searching via slice-by-line strategy");
773            SliceByLine::new(self, matcher, slice, write_to).run()
774        }
775    }
776
777    /// Set the binary detection method used on this searcher.
778    pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
779        self.config.binary = detection.clone();
780        self.line_buffer.borrow_mut().set_binary_detection(detection.0);
781    }
782
783    /// Check that the searcher's configuration and the matcher are consistent
784    /// with each other.
785    fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
786        if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled()
787        {
788            return Err(ConfigError::SearchUnavailable);
789        }
790        let matcher_line_term = match matcher.line_terminator() {
791            None => return Ok(()),
792            Some(line_term) => line_term,
793        };
794        if matcher_line_term != self.config.line_term {
795            return Err(ConfigError::MismatchedLineTerminators {
796                matcher: matcher_line_term,
797                searcher: self.config.line_term,
798            });
799        }
800        Ok(())
801    }
802
803    /// Returns true if and only if the given slice needs to be transcoded.
804    fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
805        self.config.encoding.is_some()
806            || (self.config.bom_sniffing && slice_has_bom(slice))
807    }
808}
809
810/// The following methods permit querying the configuration of a searcher.
811/// These can be useful in generic implementations of [`Sink`], where the
812/// output may be tailored based on how the searcher is configured.
813impl Searcher {
814    /// Returns the line terminator used by this searcher.
815    #[inline]
816    pub fn line_terminator(&self) -> LineTerminator {
817        self.config.line_term
818    }
819
820    /// Returns the type of binary detection configured on this searcher.
821    #[inline]
822    pub fn binary_detection(&self) -> &BinaryDetection {
823        &self.config.binary
824    }
825
826    /// Returns true if and only if this searcher is configured to invert its
827    /// search results. That is, matching lines are lines that do **not** match
828    /// the searcher's matcher.
829    #[inline]
830    pub fn invert_match(&self) -> bool {
831        self.config.invert_match
832    }
833
834    /// Returns true if and only if this searcher is configured to count line
835    /// numbers.
836    #[inline]
837    pub fn line_number(&self) -> bool {
838        self.config.line_number
839    }
840
841    /// Returns true if and only if this searcher is configured to perform
842    /// multi line search.
843    #[inline]
844    pub fn multi_line(&self) -> bool {
845        self.config.multi_line
846    }
847
848    /// Returns true if and only if this searcher is configured to stop when in
849    /// finds a non-matching line after a matching one.
850    #[inline]
851    pub fn stop_on_nonmatch(&self) -> bool {
852        self.config.stop_on_nonmatch
853    }
854
855    /// Returns true if and only if this searcher will choose a multi-line
856    /// strategy given the provided matcher.
857    ///
858    /// This may diverge from the result of `multi_line` in cases where the
859    /// searcher has been configured to execute a search that can report
860    /// matches over multiple lines, but where the matcher guarantees that it
861    /// will never produce a match over multiple lines.
862    pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
863        if !self.multi_line() {
864            return false;
865        }
866        if let Some(line_term) = matcher.line_terminator() {
867            if line_term == self.line_terminator() {
868                return false;
869            }
870        }
871        if let Some(non_matching) = matcher.non_matching_bytes() {
872            // If the line terminator is CRLF, we don't actually need to care
873            // whether the regex can match `\r` or not. Namely, a `\r` is
874            // neither necessary nor sufficient to terminate a line. A `\n` is
875            // always required.
876            if non_matching.contains(self.line_terminator().as_byte()) {
877                return false;
878            }
879        }
880        true
881    }
882
883    /// Returns the number of "after" context lines to report. When context
884    /// reporting is not enabled, this returns `0`.
885    #[inline]
886    pub fn after_context(&self) -> usize {
887        self.config.after_context
888    }
889
890    /// Returns the number of "before" context lines to report. When context
891    /// reporting is not enabled, this returns `0`.
892    #[inline]
893    pub fn before_context(&self) -> usize {
894        self.config.before_context
895    }
896
897    /// Returns true if and only if the searcher has "passthru" mode enabled.
898    #[inline]
899    pub fn passthru(&self) -> bool {
900        self.config.passthru
901    }
902
903    /// Fill the buffer for use with multi-line searching from the given file.
904    /// This reads from the file until EOF or until an error occurs. If the
905    /// contents exceed the configured heap limit, then an error is returned.
906    fn fill_multi_line_buffer_from_file<S: Sink>(
907        &self,
908        file: &File,
909    ) -> Result<(), S::Error> {
910        assert!(self.config.multi_line);
911
912        let mut decode_buffer = self.decode_buffer.borrow_mut();
913        let mut read_from = self
914            .decode_builder
915            .build_with_buffer(file, &mut *decode_buffer)
916            .map_err(S::Error::error_io)?;
917
918        // If we don't have a heap limit, then we can defer to std's
919        // read_to_end implementation. fill_multi_line_buffer_from_reader will
920        // do this too, but since we have a File, we can be a bit smarter about
921        // pre-allocating here.
922        //
923        // If we're transcoding, then our pre-allocation might not be exact,
924        // but is probably still better than nothing.
925        if self.config.heap_limit.is_none() {
926            let mut buf = self.multi_line_buffer.borrow_mut();
927            buf.clear();
928            let cap =
929                file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
930            buf.reserve(cap);
931            read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
932            return Ok(());
933        }
934        self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
935    }
936
937    /// Fill the buffer for use with multi-line searching from the given
938    /// reader. This reads from the reader until EOF or until an error occurs.
939    /// If the contents exceed the configured heap limit, then an error is
940    /// returned.
941    fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
942        &self,
943        mut read_from: R,
944    ) -> Result<(), S::Error> {
945        assert!(self.config.multi_line);
946
947        let mut buf = self.multi_line_buffer.borrow_mut();
948        buf.clear();
949
950        // If we don't have a heap limit, then we can defer to std's
951        // read_to_end implementation...
952        let heap_limit = match self.config.heap_limit {
953            Some(heap_limit) => heap_limit,
954            None => {
955                read_from
956                    .read_to_end(&mut *buf)
957                    .map_err(S::Error::error_io)?;
958                return Ok(());
959            }
960        };
961        if heap_limit == 0 {
962            return Err(S::Error::error_io(alloc_error(heap_limit)));
963        }
964
965        // ... otherwise we need to roll our own. This is likely quite a bit
966        // slower than what is optimal, but we avoid worry about memory safety
967        // until there's a compelling reason to speed this up.
968        buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
969        let mut pos = 0;
970        loop {
971            let nread = match read_from.read(&mut buf[pos..]) {
972                Ok(nread) => nread,
973                Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
974                    continue;
975                }
976                Err(err) => return Err(S::Error::error_io(err)),
977            };
978            if nread == 0 {
979                buf.resize(pos, 0);
980                return Ok(());
981            }
982
983            pos += nread;
984            if buf[pos..].is_empty() {
985                let additional = heap_limit - buf.len();
986                if additional == 0 {
987                    return Err(S::Error::error_io(alloc_error(heap_limit)));
988                }
989                let limit = buf.len() + additional;
990                let doubled = 2 * buf.len();
991                buf.resize(cmp::min(doubled, limit), 0);
992            }
993        }
994    }
995}
996
997/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
998/// BOM.
999///
1000/// This is used by the searcher to determine if a transcoder is necessary.
1001/// Otherwise, it is advantageous to search the slice directly.
1002fn slice_has_bom(slice: &[u8]) -> bool {
1003    let enc = match encoding_rs::Encoding::for_bom(slice) {
1004        None => return false,
1005        Some((enc, _)) => enc,
1006    };
1007    [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
1008        .contains(&enc)
1009}
1010
#[cfg(test)]
mod tests {
    use crate::testutil::{KitchenSink, RegexMatcher};

    use super::*;

    /// Searching with a heap limit of zero must be reported as an error.
    #[test]
    fn config_error_heap_limit() {
        let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build();
        let result = searcher.search_slice(
            RegexMatcher::new(""),
            &[],
            KitchenSink::new(),
        );
        assert!(result.is_err());
    }

    /// A matcher whose line terminator differs from the searcher's must be
    /// rejected.
    #[test]
    fn config_error_line_terminator() {
        let mut matcher = RegexMatcher::new("");
        matcher.set_line_term(Some(LineTerminator::byte(b'z')));

        let result =
            Searcher::new().search_slice(matcher, &[], KitchenSink::new());
        assert!(result.is_err());
    }

    /// A leading UTF-8 BOM must be sniffed, just like a UTF-16 one, so that
    /// the match is reported at offset 0 and only 3 bytes are counted.
    #[test]
    fn uft8_bom_sniffing() {
        // See: https://github.com/BurntSushi/ripgrep/issues/1638
        let haystack: &[u8] = b"\xEF\xBB\xBFfoo";
        let mut sink = KitchenSink::new();

        let res = SearcherBuilder::new().build().search_slice(
            RegexMatcher::new("foo"),
            haystack,
            &mut sink,
        );
        assert!(res.is_ok());

        let printed = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
        assert_eq!(printed, "1:0:foo\nbyte count:3\n");
    }
}