grep_searcher/searcher/mod.rs
1use std::{
2 cell::RefCell,
3 cmp,
4 fs::File,
5 io::{self, Read},
6 path::Path,
7};
8
9use {
10 encoding_rs_io::DecodeReaderBytesBuilder,
11 grep_matcher::{LineTerminator, Match, Matcher},
12};
13
14use crate::{
15 line_buffer::{
16 self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
17 LineBufferBuilder, LineBufferReader, alloc_error,
18 },
19 searcher::glue::{MultiLine, ReadByLine, SliceByLine},
20 sink::{Sink, SinkError},
21};
22
23pub use self::mmap::MmapChoice;
24
25mod core;
26mod glue;
27mod mmap;
28
29/// We use this type alias since we want the ergonomics of a matcher's `Match`
30/// type, but in practice, we use it for arbitrary ranges, so give it a more
31/// accurate name. This is only used in the searcher's internals.
32type Range = Match;
33
34/// The behavior of binary detection while searching.
35///
36/// Binary detection is the process of _heuristically_ identifying whether a
37/// given chunk of data is binary or not, and then taking an action based on
38/// the result of that heuristic. The motivation behind detecting binary data
39/// is that binary data often indicates data that is undesirable to search
40/// using textual patterns. Of course, there are many cases in which this isn't
41/// true, which is why binary detection is disabled by default.
42///
43/// Unfortunately, binary detection works differently depending on the type of
44/// search being executed:
45///
46/// 1. When performing a search using a fixed size buffer, binary detection is
47/// applied to the buffer's contents as it is filled. Binary detection must
48/// be applied to the buffer directly because binary files may not contain
49/// line terminators, which could result in exorbitant memory usage.
50/// 2. When performing a search using memory maps or by reading data off the
51/// heap, then binary detection is only guaranteed to be applied to the
52/// parts corresponding to a match. When `Quit` is enabled, then the first
53/// few KB of the data are searched for binary data.
54#[derive(Clone, Debug, Default, Eq, PartialEq)]
55pub struct BinaryDetection(line_buffer::BinaryDetection);
56
57impl BinaryDetection {
58 /// No binary detection is performed. Data reported by the searcher may
59 /// contain arbitrary bytes.
60 ///
61 /// This is the default.
62 pub fn none() -> BinaryDetection {
63 BinaryDetection(line_buffer::BinaryDetection::None)
64 }
65
66 /// Binary detection is performed by looking for the given byte.
67 ///
68 /// When searching is performed using a fixed size buffer, then the
69 /// contents of that buffer are always searched for the presence of this
70 /// byte. If it is found, then the underlying data is considered binary
71 /// and the search stops as if it reached EOF.
72 ///
73 /// When searching is performed with the entire contents mapped into
74 /// memory, then binary detection is more conservative. Namely, only a
75 /// fixed sized region at the beginning of the contents are detected for
76 /// binary data. As a compromise, any subsequent matching (or context)
77 /// lines are also searched for binary data. If binary data is detected at
78 /// any point, then the search stops as if it reached EOF.
79 pub fn quit(binary_byte: u8) -> BinaryDetection {
80 BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
81 }
82
83 /// Binary detection is performed by looking for the given byte, and
84 /// replacing it with the line terminator configured on the searcher.
85 /// (If the searcher is configured to use `CRLF` as the line terminator,
86 /// then this byte is replaced by just `LF`.)
87 ///
88 /// When searching is performed using a fixed size buffer, then the
89 /// contents of that buffer are always searched for the presence of this
90 /// byte and replaced with the line terminator. In effect, the caller is
91 /// guaranteed to never observe this byte while searching.
92 ///
93 /// When searching is performed with the entire contents mapped into
94 /// memory, then this setting has no effect and is ignored.
95 pub fn convert(binary_byte: u8) -> BinaryDetection {
96 BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
97 }
98
99 /// If this binary detection uses the "quit" strategy, then this returns
100 /// the byte that will cause a search to quit. In any other case, this
101 /// returns `None`.
102 pub fn quit_byte(&self) -> Option<u8> {
103 match self.0 {
104 line_buffer::BinaryDetection::Quit(b) => Some(b),
105 _ => None,
106 }
107 }
108
109 /// If this binary detection uses the "convert" strategy, then this returns
110 /// the byte that will be replaced by the line terminator. In any other
111 /// case, this returns `None`.
112 pub fn convert_byte(&self) -> Option<u8> {
113 match self.0 {
114 line_buffer::BinaryDetection::Convert(b) => Some(b),
115 _ => None,
116 }
117 }
118}
119
120/// An encoding to use when searching.
121///
122/// An encoding can be used to configure a [`SearcherBuilder`] to transcode
123/// source data from an encoding to UTF-8 before searching.
124///
125/// An `Encoding` will always be cheap to clone.
126#[derive(Clone, Debug, Eq, PartialEq)]
127pub struct Encoding(&'static encoding_rs::Encoding);
128
129impl Encoding {
130 /// Create a new encoding for the specified label.
131 ///
132 /// The encoding label provided is mapped to an encoding via the set of
133 /// available choices specified in the
134 /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
135 /// If the given label does not correspond to a valid encoding, then this
136 /// returns an error.
137 pub fn new(label: &str) -> Result<Encoding, ConfigError> {
138 let label = label.as_bytes();
139 match encoding_rs::Encoding::for_label_no_replacement(label) {
140 Some(encoding) => Ok(Encoding(encoding)),
141 None => {
142 Err(ConfigError::UnknownEncoding { label: label.to_vec() })
143 }
144 }
145 }
146}
147
148/// The internal configuration of a searcher. This is shared among several
149/// search related types, but is only ever written to by the SearcherBuilder.
150#[derive(Clone, Debug)]
151pub struct Config {
152 /// The line terminator to use.
153 line_term: LineTerminator,
154 /// Whether to invert matching.
155 invert_match: bool,
156 /// The number of lines after a match to include.
157 after_context: usize,
158 /// The number of lines before a match to include.
159 before_context: usize,
160 /// Whether to enable unbounded context or not.
161 passthru: bool,
162 /// Whether to count line numbers.
163 line_number: bool,
164 /// The maximum amount of heap memory to use.
165 ///
166 /// When not given, no explicit limit is enforced. When set to `0`, then
167 /// only the memory map search strategy is available.
168 heap_limit: Option<usize>,
169 /// The memory map strategy.
170 mmap: MmapChoice,
171 /// The binary data detection strategy.
172 binary: BinaryDetection,
173 /// Whether to enable matching across multiple lines.
174 multi_line: bool,
175 /// An encoding that, when present, causes the searcher to transcode all
176 /// input from the encoding to UTF-8.
177 encoding: Option<Encoding>,
178 /// Whether to do automatic transcoding based on a BOM or not.
179 bom_sniffing: bool,
180 /// Whether to stop searching when a non-matching line is found after a
181 /// matching line.
182 stop_on_nonmatch: bool,
183 /// The maximum number of matches this searcher should emit.
184 max_matches: Option<u64>,
185}
186
187impl Default for Config {
188 fn default() -> Config {
189 Config {
190 line_term: LineTerminator::default(),
191 invert_match: false,
192 after_context: 0,
193 before_context: 0,
194 passthru: false,
195 line_number: true,
196 heap_limit: None,
197 mmap: MmapChoice::default(),
198 binary: BinaryDetection::default(),
199 multi_line: false,
200 encoding: None,
201 bom_sniffing: true,
202 stop_on_nonmatch: false,
203 max_matches: None,
204 }
205 }
206}
207
208impl Config {
209 /// Return the maximal amount of lines needed to fulfill this
210 /// configuration's context.
211 ///
212 /// If this returns `0`, then no context is ever needed.
213 fn max_context(&self) -> usize {
214 cmp::max(self.before_context, self.after_context)
215 }
216
217 /// Build a line buffer from this configuration.
218 fn line_buffer(&self) -> LineBuffer {
219 let mut builder = LineBufferBuilder::new();
220 builder
221 .line_terminator(self.line_term.as_byte())
222 .binary_detection(self.binary.0);
223
224 if let Some(limit) = self.heap_limit {
225 let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY {
226 (limit, 0)
227 } else {
228 (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
229 };
230 builder
231 .capacity(capacity)
232 .buffer_alloc(BufferAllocation::Error(additional));
233 }
234 builder.build()
235 }
236}
237
238/// An error that can occur when building a searcher.
239///
240/// This error occurs when a non-sensical configuration is present when trying
241/// to construct a `Searcher` from a `SearcherBuilder`.
242#[derive(Clone, Debug, Eq, PartialEq)]
243#[non_exhaustive]
244pub enum ConfigError {
245 /// Indicates that the heap limit configuration prevents all possible
246 /// search strategies from being used. For example, if the heap limit is
247 /// set to 0 and memory map searching is disabled or unavailable.
248 SearchUnavailable,
249 /// Occurs when a matcher reports a line terminator that is different than
250 /// the one configured in the searcher.
251 MismatchedLineTerminators {
252 /// The matcher's line terminator.
253 matcher: LineTerminator,
254 /// The searcher's line terminator.
255 searcher: LineTerminator,
256 },
257 /// Occurs when no encoding could be found for a particular label.
258 UnknownEncoding {
259 /// The provided encoding label that could not be found.
260 label: Vec<u8>,
261 },
262}
263
264impl std::error::Error for ConfigError {}
265
266impl std::fmt::Display for ConfigError {
267 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
268 match *self {
269 ConfigError::SearchUnavailable => {
270 write!(f, "grep config error: no available searchers")
271 }
272 ConfigError::MismatchedLineTerminators { matcher, searcher } => {
273 write!(
274 f,
275 "grep config error: mismatched line terminators, \
276 matcher has {:?} but searcher has {:?}",
277 matcher, searcher
278 )
279 }
280 ConfigError::UnknownEncoding { ref label } => write!(
281 f,
282 "grep config error: unknown encoding: {}",
283 String::from_utf8_lossy(label),
284 ),
285 }
286 }
287}
288
289/// A builder for configuring a searcher.
290///
291/// A search builder permits specifying the configuration of a searcher,
292/// including options like whether to invert the search or to enable multi
293/// line search.
294///
295/// Once a searcher has been built, it is beneficial to reuse that searcher
296/// for multiple searches, if possible.
297#[derive(Clone, Debug)]
298pub struct SearcherBuilder {
299 config: Config,
300}
301
302impl Default for SearcherBuilder {
303 fn default() -> SearcherBuilder {
304 SearcherBuilder::new()
305 }
306}
307
308impl SearcherBuilder {
309 /// Create a new searcher builder with a default configuration.
310 pub fn new() -> SearcherBuilder {
311 SearcherBuilder { config: Config::default() }
312 }
313
314 /// Build a searcher with the given matcher.
315 pub fn build(&self) -> Searcher {
316 let mut config = self.config.clone();
317 if config.passthru {
318 config.before_context = 0;
319 config.after_context = 0;
320 }
321
322 let mut decode_builder = DecodeReaderBytesBuilder::new();
323 decode_builder
324 .encoding(self.config.encoding.as_ref().map(|e| e.0))
325 .utf8_passthru(true)
326 .strip_bom(self.config.bom_sniffing)
327 .bom_override(true)
328 .bom_sniffing(self.config.bom_sniffing);
329
330 Searcher {
331 config,
332 decode_builder,
333 decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
334 line_buffer: RefCell::new(self.config.line_buffer()),
335 multi_line_buffer: RefCell::new(vec![]),
336 }
337 }
338
339 /// Set the line terminator that is used by the searcher.
340 ///
341 /// When using a searcher, if the matcher provided has a line terminator
342 /// set, then it must be the same as this one. If they aren't, building
343 /// a searcher will return an error.
344 ///
345 /// By default, this is set to `b'\n'`.
346 pub fn line_terminator(
347 &mut self,
348 line_term: LineTerminator,
349 ) -> &mut SearcherBuilder {
350 self.config.line_term = line_term;
351 self
352 }
353
354 /// Whether to invert matching, whereby lines that don't match are reported
355 /// instead of reporting lines that do match.
356 ///
357 /// By default, this is disabled.
358 pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
359 self.config.invert_match = yes;
360 self
361 }
362
363 /// Whether to count and include line numbers with matching lines.
364 ///
365 /// This is enabled by default. There is a small performance penalty
366 /// associated with computing line numbers, so this can be disabled when
367 /// this isn't desirable.
368 pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
369 self.config.line_number = yes;
370 self
371 }
372
373 /// Whether to enable multi line search or not.
374 ///
375 /// When multi line search is enabled, matches *may* match across multiple
376 /// lines. Conversely, when multi line search is disabled, it is impossible
377 /// for any match to span more than one line.
378 ///
379 /// **Warning:** multi line search requires having the entire contents to
380 /// search mapped in memory at once. When searching files, memory maps
381 /// will be used if possible and if they are enabled, which avoids using
382 /// your program's heap. However, if memory maps cannot be used (e.g.,
383 /// for searching streams like `stdin` or if transcoding is necessary),
384 /// then the entire contents of the stream are read on to the heap before
385 /// starting the search.
386 ///
387 /// This is disabled by default.
388 pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
389 self.config.multi_line = yes;
390 self
391 }
392
393 /// Whether to include a fixed number of lines after every match.
394 ///
395 /// When this is set to a non-zero number, then the searcher will report
396 /// `line_count` contextual lines after every match.
397 ///
398 /// This is set to `0` by default.
399 pub fn after_context(
400 &mut self,
401 line_count: usize,
402 ) -> &mut SearcherBuilder {
403 self.config.after_context = line_count;
404 self
405 }
406
407 /// Whether to include a fixed number of lines before every match.
408 ///
409 /// When this is set to a non-zero number, then the searcher will report
410 /// `line_count` contextual lines before every match.
411 ///
412 /// This is set to `0` by default.
413 pub fn before_context(
414 &mut self,
415 line_count: usize,
416 ) -> &mut SearcherBuilder {
417 self.config.before_context = line_count;
418 self
419 }
420
421 /// Whether to enable the "passthru" feature or not.
422 ///
423 /// When passthru is enabled, it effectively treats all non-matching lines
424 /// as contextual lines. In other words, enabling this is akin to
425 /// requesting an unbounded number of before and after contextual lines.
426 ///
427 /// When passthru mode is enabled, any `before_context` or `after_context`
428 /// settings are ignored by setting them to `0`.
429 ///
430 /// This is disabled by default.
431 pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
432 self.config.passthru = yes;
433 self
434 }
435
436 /// Set an approximate limit on the amount of heap space used by a
437 /// searcher.
438 ///
439 /// The heap limit is enforced in two scenarios:
440 ///
441 /// * When searching using a fixed size buffer, the heap limit controls
442 /// how big this buffer is allowed to be. Assuming contexts are disabled,
443 /// the minimum size of this buffer is the length (in bytes) of the
444 /// largest single line in the contents being searched. If any line
445 /// exceeds the heap limit, then an error will be returned.
446 /// * When performing a multi line search, a fixed size buffer cannot be
447 /// used. Thus, the only choices are to read the entire contents on to
448 /// the heap, or use memory maps. In the former case, the heap limit set
449 /// here is enforced.
450 ///
451 /// If a heap limit is set to `0`, then no heap space is used. If there are
452 /// no alternative strategies available for searching without heap space
453 /// (e.g., memory maps are disabled), then the searcher wil return an error
454 /// immediately.
455 ///
456 /// By default, no limit is set.
457 pub fn heap_limit(
458 &mut self,
459 bytes: Option<usize>,
460 ) -> &mut SearcherBuilder {
461 self.config.heap_limit = bytes;
462 self
463 }
464
465 /// Set the strategy to employ use of memory maps.
466 ///
467 /// Currently, there are only two strategies that can be employed:
468 ///
469 /// * **Automatic** - A searcher will use heuristics, including but not
470 /// limited to file size and platform, to determine whether to use memory
471 /// maps or not.
472 /// * **Never** - Memory maps will never be used. If multi line search is
473 /// enabled, then the entire contents will be read on to the heap before
474 /// searching begins.
475 ///
476 /// The default behavior is **never**. Generally speaking, and perhaps
477 /// against conventional wisdom, memory maps don't necessarily enable
478 /// faster searching. For example, depending on the platform, using memory
479 /// maps while searching a large directory can actually be quite a bit
480 /// slower than using normal read calls because of the overhead of managing
481 /// the memory maps.
482 ///
483 /// Memory maps can be faster in some cases however. On some platforms,
484 /// when searching a very large file that *is already in memory*, it can
485 /// be slightly faster to search it as a memory map instead of using
486 /// normal read calls.
487 ///
488 /// Finally, memory maps have a somewhat complicated safety story in Rust.
489 /// If you aren't sure whether enabling memory maps is worth it, then just
490 /// don't bother with it.
491 ///
492 /// **WARNING**: If your process is searching a file backed memory map
493 /// at the same time that file is truncated, then it's possible for the
494 /// process to terminate with a bus error.
495 pub fn memory_map(
496 &mut self,
497 strategy: MmapChoice,
498 ) -> &mut SearcherBuilder {
499 self.config.mmap = strategy;
500 self
501 }
502
503 /// Set the binary detection strategy.
504 ///
505 /// The binary detection strategy determines not only how the searcher
506 /// detects binary data, but how it responds to the presence of binary
507 /// data. See the [`BinaryDetection`] type for more information.
508 ///
509 /// By default, binary detection is disabled.
510 pub fn binary_detection(
511 &mut self,
512 detection: BinaryDetection,
513 ) -> &mut SearcherBuilder {
514 self.config.binary = detection;
515 self
516 }
517
518 /// Set the encoding used to read the source data before searching.
519 ///
520 /// When an encoding is provided, then the source data is _unconditionally_
521 /// transcoded using the encoding, unless a BOM is present. If a BOM is
522 /// present, then the encoding indicated by the BOM is used instead. If the
523 /// transcoding process encounters an error, then bytes are replaced with
524 /// the Unicode replacement codepoint.
525 ///
526 /// When no encoding is specified (the default), then BOM sniffing is
527 /// used (if it's enabled, which it is, by default) to determine whether
528 /// the source data is UTF-8 or UTF-16, and transcoding will be performed
529 /// automatically. If no BOM could be found, then the source data is
530 /// searched _as if_ it were UTF-8. However, so long as the source data is
531 /// at least ASCII compatible, then it is possible for a search to produce
532 /// useful results.
533 pub fn encoding(
534 &mut self,
535 encoding: Option<Encoding>,
536 ) -> &mut SearcherBuilder {
537 self.config.encoding = encoding;
538 self
539 }
540
541 /// Enable automatic transcoding based on BOM sniffing.
542 ///
543 /// When this is enabled and an explicit encoding is not set, then this
544 /// searcher will try to detect the encoding of the bytes being searched
545 /// by sniffing its byte-order mark (BOM). In particular, when this is
546 /// enabled, UTF-16 encoded files will be searched seamlessly.
547 ///
548 /// When this is disabled and if an explicit encoding is not set, then
549 /// the bytes from the source stream will be passed through unchanged,
550 /// including its BOM, if one is present.
551 ///
552 /// This is enabled by default.
553 pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
554 self.config.bom_sniffing = yes;
555 self
556 }
557
558 /// Stop searching a file when a non-matching line is found after a
559 /// matching line.
560 ///
561 /// This is useful for searching sorted files where it is expected that all
562 /// the matches will be on adjacent lines.
563 pub fn stop_on_nonmatch(
564 &mut self,
565 stop_on_nonmatch: bool,
566 ) -> &mut SearcherBuilder {
567 self.config.stop_on_nonmatch = stop_on_nonmatch;
568 self
569 }
570
571 /// Sets the maximum number of matches that should be emitted by this
572 /// searcher.
573 ///
574 /// If multi line search is enabled and a match spans multiple lines, then
575 /// that match is counted exactly once for the purposes of enforcing this
576 /// limit, regardless of how many lines it spans.
577 ///
578 /// Note that `0` is a legal value. This will cause the searcher to
579 /// immediately quick without searching anything.
580 ///
581 /// By default, no limit is set.
582 #[inline]
583 pub fn max_matches(&mut self, limit: Option<u64>) -> &mut SearcherBuilder {
584 self.config.max_matches = limit;
585 self
586 }
587}
588
589/// A searcher executes searches over a haystack and writes results to a caller
590/// provided sink.
591///
592/// Matches are detected via implementations of the `Matcher` trait, which must
593/// be provided by the caller when executing a search.
594///
595/// When possible, a searcher should be reused.
596#[derive(Clone, Debug)]
597pub struct Searcher {
598 /// The configuration for this searcher.
599 ///
600 /// We make most of these settings available to users of `Searcher` via
601 /// public API methods, which can be queried in implementations of `Sink`
602 /// if necessary.
603 config: Config,
604 /// A builder for constructing a streaming reader that transcodes source
605 /// data according to either an explicitly specified encoding or via an
606 /// automatically detected encoding via BOM sniffing.
607 ///
608 /// When no transcoding is needed, then the transcoder built will pass
609 /// through the underlying bytes with no additional overhead.
610 decode_builder: DecodeReaderBytesBuilder,
611 /// A buffer that is used for transcoding scratch space.
612 decode_buffer: RefCell<Vec<u8>>,
613 /// A line buffer for use in line oriented searching.
614 ///
615 /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
616 /// to sinks. We still require a mutable borrow to execute a search, so
617 /// we statically prevent callers from causing RefCell to panic at runtime
618 /// due to a borrowing violation.
619 line_buffer: RefCell<LineBuffer>,
620 /// A buffer in which to store the contents of a reader when performing a
621 /// multi line search. In particular, multi line searches cannot be
622 /// performed incrementally, and need the entire haystack in memory at
623 /// once.
624 multi_line_buffer: RefCell<Vec<u8>>,
625}
626
627impl Searcher {
628 /// Create a new searcher with a default configuration.
629 ///
630 /// To configure the searcher (e.g., invert matching, enable memory maps,
631 /// enable contexts, etc.), use the [`SearcherBuilder`].
632 pub fn new() -> Searcher {
633 SearcherBuilder::new().build()
634 }
635
636 /// Execute a search over the file with the given path and write the
637 /// results to the given sink.
638 ///
639 /// If memory maps are enabled and the searcher heuristically believes
640 /// memory maps will help the search run faster, then this will use
641 /// memory maps. For this reason, callers should prefer using this method
642 /// or `search_file` over the more generic `search_reader` when possible.
643 pub fn search_path<P, M, S>(
644 &mut self,
645 matcher: M,
646 path: P,
647 write_to: S,
648 ) -> Result<(), S::Error>
649 where
650 P: AsRef<Path>,
651 M: Matcher,
652 S: Sink,
653 {
654 let path = path.as_ref();
655 let file = File::open(path).map_err(S::Error::error_io)?;
656 self.search_file_maybe_path(matcher, Some(path), &file, write_to)
657 }
658
659 /// Execute a search over a file and write the results to the given sink.
660 ///
661 /// If memory maps are enabled and the searcher heuristically believes
662 /// memory maps will help the search run faster, then this will use
663 /// memory maps. For this reason, callers should prefer using this method
664 /// or `search_path` over the more generic `search_reader` when possible.
665 pub fn search_file<M, S>(
666 &mut self,
667 matcher: M,
668 file: &File,
669 write_to: S,
670 ) -> Result<(), S::Error>
671 where
672 M: Matcher,
673 S: Sink,
674 {
675 self.search_file_maybe_path(matcher, None, file, write_to)
676 }
677
678 fn search_file_maybe_path<M, S>(
679 &mut self,
680 matcher: M,
681 path: Option<&Path>,
682 file: &File,
683 write_to: S,
684 ) -> Result<(), S::Error>
685 where
686 M: Matcher,
687 S: Sink,
688 {
689 if let Some(mmap) = self.config.mmap.open(file, path) {
690 log::trace!("{:?}: searching via memory map", path);
691 return self.search_slice(matcher, &mmap, write_to);
692 }
693 // Fast path for multi-line searches of files when memory maps are not
694 // enabled. This pre-allocates a buffer roughly the size of the file,
695 // which isn't possible when searching an arbitrary std::io::Read.
696 if self.multi_line_with_matcher(&matcher) {
697 log::trace!(
698 "{:?}: reading entire file on to heap for mulitline",
699 path
700 );
701 self.fill_multi_line_buffer_from_file::<S>(file)?;
702 log::trace!("{:?}: searching via multiline strategy", path);
703 MultiLine::new(
704 self,
705 matcher,
706 &*self.multi_line_buffer.borrow(),
707 write_to,
708 )
709 .run()
710 } else {
711 log::trace!("{:?}: searching using generic reader", path);
712 self.search_reader(matcher, file, write_to)
713 }
714 }
715
716 /// Execute a search over any implementation of `std::io::Read` and write
717 /// the results to the given sink.
718 ///
719 /// When possible, this implementation will search the reader incrementally
720 /// without reading it into memory. In some cases---for example, if multi
721 /// line search is enabled---an incremental search isn't possible and the
722 /// given reader is consumed completely and placed on the heap before
723 /// searching begins. For this reason, when multi line search is enabled,
724 /// one should try to use higher level APIs (e.g., searching by file or
725 /// file path) so that memory maps can be used if they are available and
726 /// enabled.
727 pub fn search_reader<M, R, S>(
728 &mut self,
729 matcher: M,
730 read_from: R,
731 write_to: S,
732 ) -> Result<(), S::Error>
733 where
734 M: Matcher,
735 R: io::Read,
736 S: Sink,
737 {
738 self.check_config(&matcher).map_err(S::Error::error_config)?;
739
740 let mut decode_buffer = self.decode_buffer.borrow_mut();
741 let decoder = self
742 .decode_builder
743 .build_with_buffer(read_from, &mut *decode_buffer)
744 .map_err(S::Error::error_io)?;
745
746 if self.multi_line_with_matcher(&matcher) {
747 log::trace!(
748 "generic reader: reading everything to heap for multiline"
749 );
750 self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?;
751 log::trace!("generic reader: searching via multiline strategy");
752 MultiLine::new(
753 self,
754 matcher,
755 &*self.multi_line_buffer.borrow(),
756 write_to,
757 )
758 .run()
759 } else {
760 let mut line_buffer = self.line_buffer.borrow_mut();
761 let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
762 log::trace!("generic reader: searching via roll buffer strategy");
763 ReadByLine::new(self, matcher, rdr, write_to).run()
764 }
765 }
766
767 /// Execute a search over the given slice and write the results to the
768 /// given sink.
769 pub fn search_slice<M, S>(
770 &mut self,
771 matcher: M,
772 slice: &[u8],
773 write_to: S,
774 ) -> Result<(), S::Error>
775 where
776 M: Matcher,
777 S: Sink,
778 {
779 self.check_config(&matcher).map_err(S::Error::error_config)?;
780
781 // We can search the slice directly, unless we need to do transcoding.
782 if self.slice_needs_transcoding(slice) {
783 log::trace!(
784 "slice reader: needs transcoding, using generic reader"
785 );
786 return self.search_reader(matcher, slice, write_to);
787 }
788 if self.multi_line_with_matcher(&matcher) {
789 log::trace!("slice reader: searching via multiline strategy");
790 MultiLine::new(self, matcher, slice, write_to).run()
791 } else {
792 log::trace!("slice reader: searching via slice-by-line strategy");
793 SliceByLine::new(self, matcher, slice, write_to).run()
794 }
795 }
796
797 /// Set the binary detection method used on this searcher.
798 pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
799 self.config.binary = detection.clone();
800 self.line_buffer.borrow_mut().set_binary_detection(detection.0);
801 }
802
803 /// Check that the searcher's configuration and the matcher are consistent
804 /// with each other.
805 fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
806 if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled()
807 {
808 return Err(ConfigError::SearchUnavailable);
809 }
810 let matcher_line_term = match matcher.line_terminator() {
811 None => return Ok(()),
812 Some(line_term) => line_term,
813 };
814 if matcher_line_term != self.config.line_term {
815 return Err(ConfigError::MismatchedLineTerminators {
816 matcher: matcher_line_term,
817 searcher: self.config.line_term,
818 });
819 }
820 Ok(())
821 }
822
823 /// Returns true if and only if the given slice needs to be transcoded.
824 fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
825 self.config.encoding.is_some()
826 || (self.config.bom_sniffing && slice_has_bom(slice))
827 }
828}
829
830/// The following methods permit querying the configuration of a searcher.
831/// These can be useful in generic implementations of [`Sink`], where the
832/// output may be tailored based on how the searcher is configured.
833impl Searcher {
834 /// Returns the line terminator used by this searcher.
835 #[inline]
836 pub fn line_terminator(&self) -> LineTerminator {
837 self.config.line_term
838 }
839
840 /// Returns the type of binary detection configured on this searcher.
841 #[inline]
842 pub fn binary_detection(&self) -> &BinaryDetection {
843 &self.config.binary
844 }
845
846 /// Returns true if and only if this searcher is configured to invert its
847 /// search results. That is, matching lines are lines that do **not** match
848 /// the searcher's matcher.
849 #[inline]
850 pub fn invert_match(&self) -> bool {
851 self.config.invert_match
852 }
853
854 /// Returns true if and only if this searcher is configured to count line
855 /// numbers.
856 #[inline]
857 pub fn line_number(&self) -> bool {
858 self.config.line_number
859 }
860
861 /// Returns true if and only if this searcher is configured to perform
862 /// multi line search.
863 #[inline]
864 pub fn multi_line(&self) -> bool {
865 self.config.multi_line
866 }
867
868 /// Returns true if and only if this searcher is configured to stop when it
869 /// finds a non-matching line after a matching one.
870 #[inline]
871 pub fn stop_on_nonmatch(&self) -> bool {
872 self.config.stop_on_nonmatch
873 }
874
875 /// Returns the maximum number of matches emitted by this searcher, if
876 /// such a limit was set.
877 ///
878 /// If multi line search is enabled and a match spans multiple lines, then
879 /// that match is counted exactly once for the purposes of enforcing this
880 /// limit, regardless of how many lines it spans.
881 ///
882 /// Note that `0` is a legal value. This will cause the searcher to
883 /// immediately quick without searching anything.
884 #[inline]
885 pub fn max_matches(&self) -> Option<u64> {
886 self.config.max_matches
887 }
888
889 /// Returns true if and only if this searcher will choose a multi-line
890 /// strategy given the provided matcher.
891 ///
892 /// This may diverge from the result of `multi_line` in cases where the
893 /// searcher has been configured to execute a search that can report
894 /// matches over multiple lines, but where the matcher guarantees that it
895 /// will never produce a match over multiple lines.
896 pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
897 if !self.multi_line() {
898 return false;
899 }
900 if let Some(line_term) = matcher.line_terminator() {
901 if line_term == self.line_terminator() {
902 return false;
903 }
904 }
905 if let Some(non_matching) = matcher.non_matching_bytes() {
906 // If the line terminator is CRLF, we don't actually need to care
907 // whether the regex can match `\r` or not. Namely, a `\r` is
908 // neither necessary nor sufficient to terminate a line. A `\n` is
909 // always required.
910 if non_matching.contains(self.line_terminator().as_byte()) {
911 return false;
912 }
913 }
914 true
915 }
916
917 /// Returns the number of "after" context lines to report. When context
918 /// reporting is not enabled, this returns `0`.
919 #[inline]
920 pub fn after_context(&self) -> usize {
921 self.config.after_context
922 }
923
924 /// Returns the number of "before" context lines to report. When context
925 /// reporting is not enabled, this returns `0`.
926 #[inline]
927 pub fn before_context(&self) -> usize {
928 self.config.before_context
929 }
930
931 /// Returns true if and only if the searcher has "passthru" mode enabled.
932 #[inline]
933 pub fn passthru(&self) -> bool {
934 self.config.passthru
935 }
936
937 /// Fill the buffer for use with multi-line searching from the given file.
938 /// This reads from the file until EOF or until an error occurs. If the
939 /// contents exceed the configured heap limit, then an error is returned.
940 fn fill_multi_line_buffer_from_file<S: Sink>(
941 &self,
942 file: &File,
943 ) -> Result<(), S::Error> {
944 assert!(self.config.multi_line);
945
946 let mut decode_buffer = self.decode_buffer.borrow_mut();
947 let mut read_from = self
948 .decode_builder
949 .build_with_buffer(file, &mut *decode_buffer)
950 .map_err(S::Error::error_io)?;
951
952 // If we don't have a heap limit, then we can defer to std's
953 // read_to_end implementation. fill_multi_line_buffer_from_reader will
954 // do this too, but since we have a File, we can be a bit smarter about
955 // pre-allocating here.
956 //
957 // If we're transcoding, then our pre-allocation might not be exact,
958 // but is probably still better than nothing.
959 if self.config.heap_limit.is_none() {
960 let mut buf = self.multi_line_buffer.borrow_mut();
961 buf.clear();
962 let cap =
963 file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
964 buf.reserve(cap);
965 read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
966 return Ok(());
967 }
968 self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
969 }
970
971 /// Fill the buffer for use with multi-line searching from the given
972 /// reader. This reads from the reader until EOF or until an error occurs.
973 /// If the contents exceed the configured heap limit, then an error is
974 /// returned.
975 fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
976 &self,
977 mut read_from: R,
978 ) -> Result<(), S::Error> {
979 assert!(self.config.multi_line);
980
981 let mut buf = self.multi_line_buffer.borrow_mut();
982 buf.clear();
983
984 // If we don't have a heap limit, then we can defer to std's
985 // read_to_end implementation...
986 let heap_limit = match self.config.heap_limit {
987 Some(heap_limit) => heap_limit,
988 None => {
989 read_from
990 .read_to_end(&mut *buf)
991 .map_err(S::Error::error_io)?;
992 return Ok(());
993 }
994 };
995 if heap_limit == 0 {
996 return Err(S::Error::error_io(alloc_error(heap_limit)));
997 }
998
999 // ... otherwise we need to roll our own. This is likely quite a bit
1000 // slower than what is optimal, but we avoid worry about memory safety
1001 // until there's a compelling reason to speed this up.
1002 buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
1003 let mut pos = 0;
1004 loop {
1005 let nread = match read_from.read(&mut buf[pos..]) {
1006 Ok(nread) => nread,
1007 Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
1008 continue;
1009 }
1010 Err(err) => return Err(S::Error::error_io(err)),
1011 };
1012 if nread == 0 {
1013 buf.resize(pos, 0);
1014 return Ok(());
1015 }
1016
1017 pos += nread;
1018 if buf[pos..].is_empty() {
1019 let additional = heap_limit - buf.len();
1020 if additional == 0 {
1021 return Err(S::Error::error_io(alloc_error(heap_limit)));
1022 }
1023 let limit = buf.len() + additional;
1024 let doubled = 2 * buf.len();
1025 buf.resize(cmp::min(doubled, limit), 0);
1026 }
1027 }
1028 }
1029}
1030
1031/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
1032/// BOM.
1033///
1034/// This is used by the searcher to determine if a transcoder is necessary.
1035/// Otherwise, it is advantageous to search the slice directly.
1036fn slice_has_bom(slice: &[u8]) -> bool {
1037 let enc = match encoding_rs::Encoding::for_bom(slice) {
1038 None => return false,
1039 Some((enc, _)) => enc,
1040 };
1041 log::trace!("found byte-order mark (BOM) for encoding {enc:?}");
1042 [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
1043 .contains(&enc)
1044}
1045
#[cfg(test)]
mod tests {
    use crate::testutil::{KitchenSink, RegexMatcher};

    use super::*;

    /// A searcher built with a heap limit of zero is expected to report an
    /// error when asked to search.
    #[test]
    fn config_error_heap_limit() {
        let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build();
        let outcome = searcher.search_slice(
            RegexMatcher::new(""),
            &[],
            KitchenSink::new(),
        );
        assert!(outcome.is_err());
    }

    /// The matcher is given `z` as its line terminator while the searcher is
    /// left at its defaults; the search is expected to fail (presumably
    /// because the two line terminators disagree).
    #[test]
    fn config_error_line_terminator() {
        let mut matcher = RegexMatcher::new("");
        matcher.set_line_term(Some(LineTerminator::byte(b'z')));

        let mut searcher = Searcher::new();
        let outcome = searcher.search_slice(matcher, &[], KitchenSink::new());
        assert!(outcome.is_err());
    }

    /// A haystack that starts with a UTF-8 BOM must still match on the text
    /// that follows the BOM.
    #[test]
    fn uft8_bom_sniffing() {
        // See: https://github.com/BurntSushi/ripgrep/issues/1638
        // ripgrep must sniff utf-8 BOM, just like it does with utf-16
        //
        // The haystack is the UTF-8 BOM (EF BB BF) followed by "foo".
        let haystack: &[u8] = b"\xef\xbb\xbffoo";
        let matcher = RegexMatcher::new("foo");

        let mut searcher = SearcherBuilder::new().build();
        let mut sink = KitchenSink::new();

        let outcome = searcher.search_slice(matcher, haystack, &mut sink);
        assert!(outcome.is_ok());

        let printed = sink.as_bytes().to_vec();
        let sink_output = String::from_utf8(printed).unwrap();
        assert_eq!(sink_output, "1:0:foo\nbyte count:3\n");
    }
}