grep_searcher/searcher/mod.rs
1use std::{
2 cell::RefCell,
3 cmp,
4 fs::File,
5 io::{self, Read},
6 path::Path,
7};
8
9use {
10 encoding_rs_io::DecodeReaderBytesBuilder,
11 grep_matcher::{LineTerminator, Match, Matcher},
12};
13
14use crate::{
15 line_buffer::{
16 self, alloc_error, BufferAllocation, LineBuffer, LineBufferBuilder,
17 LineBufferReader, DEFAULT_BUFFER_CAPACITY,
18 },
19 searcher::glue::{MultiLine, ReadByLine, SliceByLine},
20 sink::{Sink, SinkError},
21};
22
23pub use self::mmap::MmapChoice;
24
25mod core;
26mod glue;
27mod mmap;
28
/// An internal alias for the matcher's `Match` type.
///
/// The searcher wants the ergonomics of `Match`, but in practice uses it to
/// describe arbitrary ranges rather than only matches, so it is given a more
/// accurate name here. This is only used in the searcher's internals.
type Range = Match;
33
/// The behavior of binary detection while searching.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation is that binary data is often
/// undesirable to search with textual patterns. There are many cases in
/// which this isn't true, which is why binary detection is disabled by
/// default.
///
/// Unfortunately, binary detection works differently depending on the type
/// of search being executed:
///
/// 1. When performing a search using a fixed size buffer, binary detection
///    is applied to the buffer's contents as it is filled. It must be
///    applied to the buffer directly because binary files may not contain
///    line terminators, which could result in exorbitant memory usage.
/// 2. When performing a search using memory maps or by reading data off the
///    heap, binary detection is only guaranteed to be applied to the parts
///    corresponding to a match. When `Quit` is enabled, the first few KB of
///    the data are also searched for binary data.
///
/// This is a thin wrapper around the line buffer's own binary detection
/// setting.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
56
57impl BinaryDetection {
58 /// No binary detection is performed. Data reported by the searcher may
59 /// contain arbitrary bytes.
60 ///
61 /// This is the default.
62 pub fn none() -> BinaryDetection {
63 BinaryDetection(line_buffer::BinaryDetection::None)
64 }
65
66 /// Binary detection is performed by looking for the given byte.
67 ///
68 /// When searching is performed using a fixed size buffer, then the
69 /// contents of that buffer are always searched for the presence of this
70 /// byte. If it is found, then the underlying data is considered binary
71 /// and the search stops as if it reached EOF.
72 ///
73 /// When searching is performed with the entire contents mapped into
74 /// memory, then binary detection is more conservative. Namely, only a
75 /// fixed sized region at the beginning of the contents are detected for
76 /// binary data. As a compromise, any subsequent matching (or context)
77 /// lines are also searched for binary data. If binary data is detected at
78 /// any point, then the search stops as if it reached EOF.
79 pub fn quit(binary_byte: u8) -> BinaryDetection {
80 BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
81 }
82
83 /// Binary detection is performed by looking for the given byte, and
84 /// replacing it with the line terminator configured on the searcher.
85 /// (If the searcher is configured to use `CRLF` as the line terminator,
86 /// then this byte is replaced by just `LF`.)
87 ///
88 /// When searching is performed using a fixed size buffer, then the
89 /// contents of that buffer are always searched for the presence of this
90 /// byte and replaced with the line terminator. In effect, the caller is
91 /// guaranteed to never observe this byte while searching.
92 ///
93 /// When searching is performed with the entire contents mapped into
94 /// memory, then this setting has no effect and is ignored.
95 pub fn convert(binary_byte: u8) -> BinaryDetection {
96 BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
97 }
98
99 /// If this binary detection uses the "quit" strategy, then this returns
100 /// the byte that will cause a search to quit. In any other case, this
101 /// returns `None`.
102 pub fn quit_byte(&self) -> Option<u8> {
103 match self.0 {
104 line_buffer::BinaryDetection::Quit(b) => Some(b),
105 _ => None,
106 }
107 }
108
109 /// If this binary detection uses the "convert" strategy, then this returns
110 /// the byte that will be replaced by the line terminator. In any other
111 /// case, this returns `None`.
112 pub fn convert_byte(&self) -> Option<u8> {
113 match self.0 {
114 line_buffer::BinaryDetection::Convert(b) => Some(b),
115 _ => None,
116 }
117 }
118}
119
/// An encoding to use when searching.
///
/// An encoding can be used to configure a [`SearcherBuilder`] to transcode
/// source data from that encoding to UTF-8 before searching.
///
/// An `Encoding` is always cheap to clone: it only wraps a `'static`
/// reference to an `encoding_rs::Encoding`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Encoding(&'static encoding_rs::Encoding);
128
129impl Encoding {
130 /// Create a new encoding for the specified label.
131 ///
132 /// The encoding label provided is mapped to an encoding via the set of
133 /// available choices specified in the
134 /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
135 /// If the given label does not correspond to a valid encoding, then this
136 /// returns an error.
137 pub fn new(label: &str) -> Result<Encoding, ConfigError> {
138 let label = label.as_bytes();
139 match encoding_rs::Encoding::for_label_no_replacement(label) {
140 Some(encoding) => Ok(Encoding(encoding)),
141 None => {
142 Err(ConfigError::UnknownEncoding { label: label.to_vec() })
143 }
144 }
145 }
146}
147
/// The internal configuration of a searcher.
///
/// This is shared among several search related types. It is primarily
/// written to by the `SearcherBuilder`; `Searcher::set_binary_detection`
/// also updates the `binary` field on an already built searcher.
#[derive(Clone, Debug)]
pub struct Config {
    /// The line terminator to use.
    line_term: LineTerminator,
    /// Whether to invert matching.
    invert_match: bool,
    /// The number of lines after a match to include.
    after_context: usize,
    /// The number of lines before a match to include.
    before_context: usize,
    /// Whether to enable unbounded context or not.
    ///
    /// When this is enabled, `SearcherBuilder::build` resets both
    /// `before_context` and `after_context` to `0` in the built searcher.
    passthru: bool,
    /// Whether to count line numbers.
    line_number: bool,
    /// The maximum amount of heap memory to use.
    ///
    /// When not given, no explicit limit is enforced. When set to `0`, then
    /// only the memory map search strategy is available.
    heap_limit: Option<usize>,
    /// The memory map strategy.
    mmap: MmapChoice,
    /// The binary data detection strategy.
    binary: BinaryDetection,
    /// Whether to enable matching across multiple lines.
    multi_line: bool,
    /// An encoding that, when present, causes the searcher to transcode all
    /// input from the encoding to UTF-8.
    encoding: Option<Encoding>,
    /// Whether to do automatic transcoding based on a BOM or not.
    bom_sniffing: bool,
    /// Whether to stop searching when a non-matching line is found after a
    /// matching line.
    stop_on_nonmatch: bool,
}
184
185impl Default for Config {
186 fn default() -> Config {
187 Config {
188 line_term: LineTerminator::default(),
189 invert_match: false,
190 after_context: 0,
191 before_context: 0,
192 passthru: false,
193 line_number: true,
194 heap_limit: None,
195 mmap: MmapChoice::default(),
196 binary: BinaryDetection::default(),
197 multi_line: false,
198 encoding: None,
199 bom_sniffing: true,
200 stop_on_nonmatch: false,
201 }
202 }
203}
204
205impl Config {
206 /// Return the maximal amount of lines needed to fulfill this
207 /// configuration's context.
208 ///
209 /// If this returns `0`, then no context is ever needed.
210 fn max_context(&self) -> usize {
211 cmp::max(self.before_context, self.after_context)
212 }
213
214 /// Build a line buffer from this configuration.
215 fn line_buffer(&self) -> LineBuffer {
216 let mut builder = LineBufferBuilder::new();
217 builder
218 .line_terminator(self.line_term.as_byte())
219 .binary_detection(self.binary.0);
220
221 if let Some(limit) = self.heap_limit {
222 let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY {
223 (limit, 0)
224 } else {
225 (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
226 };
227 builder
228 .capacity(capacity)
229 .buffer_alloc(BufferAllocation::Error(additional));
230 }
231 builder.build()
232 }
233}
234
/// An error that can occur because of a searcher's configuration.
///
/// This error occurs when a non-sensical configuration is present. In this
/// module, it is produced when an encoding label cannot be resolved (see
/// [`Encoding::new`]) and when a search is started with a configuration
/// that is inconsistent with itself or with the given matcher.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ConfigError {
    /// Indicates that the heap limit configuration prevents all possible
    /// search strategies from being used. For example, if the heap limit is
    /// set to 0 and memory map searching is disabled or unavailable.
    SearchUnavailable,
    /// Occurs when a matcher reports a line terminator that is different than
    /// the one configured in the searcher.
    MismatchedLineTerminators {
        /// The matcher's line terminator.
        matcher: LineTerminator,
        /// The searcher's line terminator.
        searcher: LineTerminator,
    },
    /// Occurs when no encoding could be found for a particular label.
    UnknownEncoding {
        /// The provided encoding label that could not be found, as raw
        /// bytes.
        label: Vec<u8>,
    },
}
260
// The empty body means `ConfigError` relies entirely on the default method
// implementations of `std::error::Error`.
impl std::error::Error for ConfigError {}
262
263impl std::fmt::Display for ConfigError {
264 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
265 match *self {
266 ConfigError::SearchUnavailable => {
267 write!(f, "grep config error: no available searchers")
268 }
269 ConfigError::MismatchedLineTerminators { matcher, searcher } => {
270 write!(
271 f,
272 "grep config error: mismatched line terminators, \
273 matcher has {:?} but searcher has {:?}",
274 matcher, searcher
275 )
276 }
277 ConfigError::UnknownEncoding { ref label } => write!(
278 f,
279 "grep config error: unknown encoding: {}",
280 String::from_utf8_lossy(label),
281 ),
282 }
283 }
284}
285
/// A builder for configuring a searcher.
///
/// A search builder permits specifying the configuration of a searcher,
/// including options like whether to invert the search or to enable multi
/// line search.
///
/// Once a searcher has been built, it is beneficial to reuse that searcher
/// for multiple searches, if possible.
#[derive(Clone, Debug)]
pub struct SearcherBuilder {
    /// The configuration accumulated so far. A copy of it is handed to each
    /// searcher built from this builder.
    config: Config,
}
298
299impl Default for SearcherBuilder {
300 fn default() -> SearcherBuilder {
301 SearcherBuilder::new()
302 }
303}
304
305impl SearcherBuilder {
306 /// Create a new searcher builder with a default configuration.
307 pub fn new() -> SearcherBuilder {
308 SearcherBuilder { config: Config::default() }
309 }
310
311 /// Build a searcher with the given matcher.
312 pub fn build(&self) -> Searcher {
313 let mut config = self.config.clone();
314 if config.passthru {
315 config.before_context = 0;
316 config.after_context = 0;
317 }
318
319 let mut decode_builder = DecodeReaderBytesBuilder::new();
320 decode_builder
321 .encoding(self.config.encoding.as_ref().map(|e| e.0))
322 .utf8_passthru(true)
323 .strip_bom(self.config.bom_sniffing)
324 .bom_override(true)
325 .bom_sniffing(self.config.bom_sniffing);
326
327 Searcher {
328 config,
329 decode_builder,
330 decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
331 line_buffer: RefCell::new(self.config.line_buffer()),
332 multi_line_buffer: RefCell::new(vec![]),
333 }
334 }
335
336 /// Set the line terminator that is used by the searcher.
337 ///
338 /// When using a searcher, if the matcher provided has a line terminator
339 /// set, then it must be the same as this one. If they aren't, building
340 /// a searcher will return an error.
341 ///
342 /// By default, this is set to `b'\n'`.
343 pub fn line_terminator(
344 &mut self,
345 line_term: LineTerminator,
346 ) -> &mut SearcherBuilder {
347 self.config.line_term = line_term;
348 self
349 }
350
351 /// Whether to invert matching, whereby lines that don't match are reported
352 /// instead of reporting lines that do match.
353 ///
354 /// By default, this is disabled.
355 pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
356 self.config.invert_match = yes;
357 self
358 }
359
360 /// Whether to count and include line numbers with matching lines.
361 ///
362 /// This is enabled by default. There is a small performance penalty
363 /// associated with computing line numbers, so this can be disabled when
364 /// this isn't desirable.
365 pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
366 self.config.line_number = yes;
367 self
368 }
369
370 /// Whether to enable multi line search or not.
371 ///
372 /// When multi line search is enabled, matches *may* match across multiple
373 /// lines. Conversely, when multi line search is disabled, it is impossible
374 /// for any match to span more than one line.
375 ///
376 /// **Warning:** multi line search requires having the entire contents to
377 /// search mapped in memory at once. When searching files, memory maps
378 /// will be used if possible and if they are enabled, which avoids using
379 /// your program's heap. However, if memory maps cannot be used (e.g.,
380 /// for searching streams like `stdin` or if transcoding is necessary),
381 /// then the entire contents of the stream are read on to the heap before
382 /// starting the search.
383 ///
384 /// This is disabled by default.
385 pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
386 self.config.multi_line = yes;
387 self
388 }
389
390 /// Whether to include a fixed number of lines after every match.
391 ///
392 /// When this is set to a non-zero number, then the searcher will report
393 /// `line_count` contextual lines after every match.
394 ///
395 /// This is set to `0` by default.
396 pub fn after_context(
397 &mut self,
398 line_count: usize,
399 ) -> &mut SearcherBuilder {
400 self.config.after_context = line_count;
401 self
402 }
403
404 /// Whether to include a fixed number of lines before every match.
405 ///
406 /// When this is set to a non-zero number, then the searcher will report
407 /// `line_count` contextual lines before every match.
408 ///
409 /// This is set to `0` by default.
410 pub fn before_context(
411 &mut self,
412 line_count: usize,
413 ) -> &mut SearcherBuilder {
414 self.config.before_context = line_count;
415 self
416 }
417
418 /// Whether to enable the "passthru" feature or not.
419 ///
420 /// When passthru is enabled, it effectively treats all non-matching lines
421 /// as contextual lines. In other words, enabling this is akin to
422 /// requesting an unbounded number of before and after contextual lines.
423 ///
424 /// When passthru mode is enabled, any `before_context` or `after_context`
425 /// settings are ignored by setting them to `0`.
426 ///
427 /// This is disabled by default.
428 pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
429 self.config.passthru = yes;
430 self
431 }
432
433 /// Set an approximate limit on the amount of heap space used by a
434 /// searcher.
435 ///
436 /// The heap limit is enforced in two scenarios:
437 ///
438 /// * When searching using a fixed size buffer, the heap limit controls
439 /// how big this buffer is allowed to be. Assuming contexts are disabled,
440 /// the minimum size of this buffer is the length (in bytes) of the
441 /// largest single line in the contents being searched. If any line
442 /// exceeds the heap limit, then an error will be returned.
443 /// * When performing a multi line search, a fixed size buffer cannot be
444 /// used. Thus, the only choices are to read the entire contents on to
445 /// the heap, or use memory maps. In the former case, the heap limit set
446 /// here is enforced.
447 ///
448 /// If a heap limit is set to `0`, then no heap space is used. If there are
449 /// no alternative strategies available for searching without heap space
450 /// (e.g., memory maps are disabled), then the searcher wil return an error
451 /// immediately.
452 ///
453 /// By default, no limit is set.
454 pub fn heap_limit(
455 &mut self,
456 bytes: Option<usize>,
457 ) -> &mut SearcherBuilder {
458 self.config.heap_limit = bytes;
459 self
460 }
461
462 /// Set the strategy to employ use of memory maps.
463 ///
464 /// Currently, there are only two strategies that can be employed:
465 ///
466 /// * **Automatic** - A searcher will use heuristics, including but not
467 /// limited to file size and platform, to determine whether to use memory
468 /// maps or not.
469 /// * **Never** - Memory maps will never be used. If multi line search is
470 /// enabled, then the entire contents will be read on to the heap before
471 /// searching begins.
472 ///
473 /// The default behavior is **never**. Generally speaking, and perhaps
474 /// against conventional wisdom, memory maps don't necessarily enable
475 /// faster searching. For example, depending on the platform, using memory
476 /// maps while searching a large directory can actually be quite a bit
477 /// slower than using normal read calls because of the overhead of managing
478 /// the memory maps.
479 ///
480 /// Memory maps can be faster in some cases however. On some platforms,
481 /// when searching a very large file that *is already in memory*, it can
482 /// be slightly faster to search it as a memory map instead of using
483 /// normal read calls.
484 ///
485 /// Finally, memory maps have a somewhat complicated safety story in Rust.
486 /// If you aren't sure whether enabling memory maps is worth it, then just
487 /// don't bother with it.
488 ///
489 /// **WARNING**: If your process is searching a file backed memory map
490 /// at the same time that file is truncated, then it's possible for the
491 /// process to terminate with a bus error.
492 pub fn memory_map(
493 &mut self,
494 strategy: MmapChoice,
495 ) -> &mut SearcherBuilder {
496 self.config.mmap = strategy;
497 self
498 }
499
500 /// Set the binary detection strategy.
501 ///
502 /// The binary detection strategy determines not only how the searcher
503 /// detects binary data, but how it responds to the presence of binary
504 /// data. See the [`BinaryDetection`] type for more information.
505 ///
506 /// By default, binary detection is disabled.
507 pub fn binary_detection(
508 &mut self,
509 detection: BinaryDetection,
510 ) -> &mut SearcherBuilder {
511 self.config.binary = detection;
512 self
513 }
514
515 /// Set the encoding used to read the source data before searching.
516 ///
517 /// When an encoding is provided, then the source data is _unconditionally_
518 /// transcoded using the encoding, unless a BOM is present. If a BOM is
519 /// present, then the encoding indicated by the BOM is used instead. If the
520 /// transcoding process encounters an error, then bytes are replaced with
521 /// the Unicode replacement codepoint.
522 ///
523 /// When no encoding is specified (the default), then BOM sniffing is
524 /// used (if it's enabled, which it is, by default) to determine whether
525 /// the source data is UTF-8 or UTF-16, and transcoding will be performed
526 /// automatically. If no BOM could be found, then the source data is
527 /// searched _as if_ it were UTF-8. However, so long as the source data is
528 /// at least ASCII compatible, then it is possible for a search to produce
529 /// useful results.
530 pub fn encoding(
531 &mut self,
532 encoding: Option<Encoding>,
533 ) -> &mut SearcherBuilder {
534 self.config.encoding = encoding;
535 self
536 }
537
538 /// Enable automatic transcoding based on BOM sniffing.
539 ///
540 /// When this is enabled and an explicit encoding is not set, then this
541 /// searcher will try to detect the encoding of the bytes being searched
542 /// by sniffing its byte-order mark (BOM). In particular, when this is
543 /// enabled, UTF-16 encoded files will be searched seamlessly.
544 ///
545 /// When this is disabled and if an explicit encoding is not set, then
546 /// the bytes from the source stream will be passed through unchanged,
547 /// including its BOM, if one is present.
548 ///
549 /// This is enabled by default.
550 pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
551 self.config.bom_sniffing = yes;
552 self
553 }
554
555 /// Stop searching a file when a non-matching line is found after a
556 /// matching line.
557 ///
558 /// This is useful for searching sorted files where it is expected that all
559 /// the matches will be on adjacent lines.
560 pub fn stop_on_nonmatch(
561 &mut self,
562 stop_on_nonmatch: bool,
563 ) -> &mut SearcherBuilder {
564 self.config.stop_on_nonmatch = stop_on_nonmatch;
565 self
566 }
567}
568
/// A searcher executes searches over a haystack and writes results to a
/// caller provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which
/// must be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused, since it owns several buffers
/// that are kept between searches.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    multi_line_buffer: RefCell<Vec<u8>>,
}
606
607impl Searcher {
608 /// Create a new searcher with a default configuration.
609 ///
610 /// To configure the searcher (e.g., invert matching, enable memory maps,
611 /// enable contexts, etc.), use the [`SearcherBuilder`].
612 pub fn new() -> Searcher {
613 SearcherBuilder::new().build()
614 }
615
616 /// Execute a search over the file with the given path and write the
617 /// results to the given sink.
618 ///
619 /// If memory maps are enabled and the searcher heuristically believes
620 /// memory maps will help the search run faster, then this will use
621 /// memory maps. For this reason, callers should prefer using this method
622 /// or `search_file` over the more generic `search_reader` when possible.
623 pub fn search_path<P, M, S>(
624 &mut self,
625 matcher: M,
626 path: P,
627 write_to: S,
628 ) -> Result<(), S::Error>
629 where
630 P: AsRef<Path>,
631 M: Matcher,
632 S: Sink,
633 {
634 let path = path.as_ref();
635 let file = File::open(path).map_err(S::Error::error_io)?;
636 self.search_file_maybe_path(matcher, Some(path), &file, write_to)
637 }
638
639 /// Execute a search over a file and write the results to the given sink.
640 ///
641 /// If memory maps are enabled and the searcher heuristically believes
642 /// memory maps will help the search run faster, then this will use
643 /// memory maps. For this reason, callers should prefer using this method
644 /// or `search_path` over the more generic `search_reader` when possible.
645 pub fn search_file<M, S>(
646 &mut self,
647 matcher: M,
648 file: &File,
649 write_to: S,
650 ) -> Result<(), S::Error>
651 where
652 M: Matcher,
653 S: Sink,
654 {
655 self.search_file_maybe_path(matcher, None, file, write_to)
656 }
657
658 fn search_file_maybe_path<M, S>(
659 &mut self,
660 matcher: M,
661 path: Option<&Path>,
662 file: &File,
663 write_to: S,
664 ) -> Result<(), S::Error>
665 where
666 M: Matcher,
667 S: Sink,
668 {
669 if let Some(mmap) = self.config.mmap.open(file, path) {
670 log::trace!("{:?}: searching via memory map", path);
671 return self.search_slice(matcher, &mmap, write_to);
672 }
673 // Fast path for multi-line searches of files when memory maps are not
674 // enabled. This pre-allocates a buffer roughly the size of the file,
675 // which isn't possible when searching an arbitrary std::io::Read.
676 if self.multi_line_with_matcher(&matcher) {
677 log::trace!(
678 "{:?}: reading entire file on to heap for mulitline",
679 path
680 );
681 self.fill_multi_line_buffer_from_file::<S>(file)?;
682 log::trace!("{:?}: searching via multiline strategy", path);
683 MultiLine::new(
684 self,
685 matcher,
686 &*self.multi_line_buffer.borrow(),
687 write_to,
688 )
689 .run()
690 } else {
691 log::trace!("{:?}: searching using generic reader", path);
692 self.search_reader(matcher, file, write_to)
693 }
694 }
695
696 /// Execute a search over any implementation of `std::io::Read` and write
697 /// the results to the given sink.
698 ///
699 /// When possible, this implementation will search the reader incrementally
700 /// without reading it into memory. In some cases---for example, if multi
701 /// line search is enabled---an incremental search isn't possible and the
702 /// given reader is consumed completely and placed on the heap before
703 /// searching begins. For this reason, when multi line search is enabled,
704 /// one should try to use higher level APIs (e.g., searching by file or
705 /// file path) so that memory maps can be used if they are available and
706 /// enabled.
707 pub fn search_reader<M, R, S>(
708 &mut self,
709 matcher: M,
710 read_from: R,
711 write_to: S,
712 ) -> Result<(), S::Error>
713 where
714 M: Matcher,
715 R: io::Read,
716 S: Sink,
717 {
718 self.check_config(&matcher).map_err(S::Error::error_config)?;
719
720 let mut decode_buffer = self.decode_buffer.borrow_mut();
721 let decoder = self
722 .decode_builder
723 .build_with_buffer(read_from, &mut *decode_buffer)
724 .map_err(S::Error::error_io)?;
725
726 if self.multi_line_with_matcher(&matcher) {
727 log::trace!(
728 "generic reader: reading everything to heap for multiline"
729 );
730 self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?;
731 log::trace!("generic reader: searching via multiline strategy");
732 MultiLine::new(
733 self,
734 matcher,
735 &*self.multi_line_buffer.borrow(),
736 write_to,
737 )
738 .run()
739 } else {
740 let mut line_buffer = self.line_buffer.borrow_mut();
741 let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
742 log::trace!("generic reader: searching via roll buffer strategy");
743 ReadByLine::new(self, matcher, rdr, write_to).run()
744 }
745 }
746
747 /// Execute a search over the given slice and write the results to the
748 /// given sink.
749 pub fn search_slice<M, S>(
750 &mut self,
751 matcher: M,
752 slice: &[u8],
753 write_to: S,
754 ) -> Result<(), S::Error>
755 where
756 M: Matcher,
757 S: Sink,
758 {
759 self.check_config(&matcher).map_err(S::Error::error_config)?;
760
761 // We can search the slice directly, unless we need to do transcoding.
762 if self.slice_needs_transcoding(slice) {
763 log::trace!(
764 "slice reader: needs transcoding, using generic reader"
765 );
766 return self.search_reader(matcher, slice, write_to);
767 }
768 if self.multi_line_with_matcher(&matcher) {
769 log::trace!("slice reader: searching via multiline strategy");
770 MultiLine::new(self, matcher, slice, write_to).run()
771 } else {
772 log::trace!("slice reader: searching via slice-by-line strategy");
773 SliceByLine::new(self, matcher, slice, write_to).run()
774 }
775 }
776
777 /// Set the binary detection method used on this searcher.
778 pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
779 self.config.binary = detection.clone();
780 self.line_buffer.borrow_mut().set_binary_detection(detection.0);
781 }
782
783 /// Check that the searcher's configuration and the matcher are consistent
784 /// with each other.
785 fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
786 if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled()
787 {
788 return Err(ConfigError::SearchUnavailable);
789 }
790 let matcher_line_term = match matcher.line_terminator() {
791 None => return Ok(()),
792 Some(line_term) => line_term,
793 };
794 if matcher_line_term != self.config.line_term {
795 return Err(ConfigError::MismatchedLineTerminators {
796 matcher: matcher_line_term,
797 searcher: self.config.line_term,
798 });
799 }
800 Ok(())
801 }
802
803 /// Returns true if and only if the given slice needs to be transcoded.
804 fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
805 self.config.encoding.is_some()
806 || (self.config.bom_sniffing && slice_has_bom(slice))
807 }
808}
809
810/// The following methods permit querying the configuration of a searcher.
811/// These can be useful in generic implementations of [`Sink`], where the
812/// output may be tailored based on how the searcher is configured.
813impl Searcher {
    /// Returns the line terminator this searcher is configured with.
    #[inline]
    pub fn line_terminator(&self) -> LineTerminator {
        self.config.line_term
    }
819
    /// Returns the binary detection strategy configured on this searcher.
    #[inline]
    pub fn binary_detection(&self) -> &BinaryDetection {
        &self.config.binary
    }
825
    /// Returns true if and only if this searcher is configured to invert its
    /// search results. That is, the lines reported as matching are the lines
    /// that do **not** match the searcher's matcher.
    #[inline]
    pub fn invert_match(&self) -> bool {
        self.config.invert_match
    }
833
    /// Returns true if and only if this searcher is configured to count line
    /// numbers.
    #[inline]
    pub fn line_number(&self) -> bool {
        self.config.line_number
    }
840
    /// Returns true if and only if this searcher is configured to perform
    /// multi line search.
    ///
    /// This reports the configured setting only. Whether a multi line
    /// strategy is actually chosen for a particular matcher is answered by
    /// `multi_line_with_matcher`.
    #[inline]
    pub fn multi_line(&self) -> bool {
        self.config.multi_line
    }
847
    /// Returns true if and only if this searcher is configured to stop when
    /// it finds a non-matching line after a matching one.
    #[inline]
    pub fn stop_on_nonmatch(&self) -> bool {
        self.config.stop_on_nonmatch
    }
854
855 /// Returns true if and only if this searcher will choose a multi-line
856 /// strategy given the provided matcher.
857 ///
858 /// This may diverge from the result of `multi_line` in cases where the
859 /// searcher has been configured to execute a search that can report
860 /// matches over multiple lines, but where the matcher guarantees that it
861 /// will never produce a match over multiple lines.
862 pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
863 if !self.multi_line() {
864 return false;
865 }
866 if let Some(line_term) = matcher.line_terminator() {
867 if line_term == self.line_terminator() {
868 return false;
869 }
870 }
871 if let Some(non_matching) = matcher.non_matching_bytes() {
872 // If the line terminator is CRLF, we don't actually need to care
873 // whether the regex can match `\r` or not. Namely, a `\r` is
874 // neither necessary nor sufficient to terminate a line. A `\n` is
875 // always required.
876 if non_matching.contains(self.line_terminator().as_byte()) {
877 return false;
878 }
879 }
880 true
881 }
882
883 /// Returns the number of "after" context lines to report. When context
884 /// reporting is not enabled, this returns `0`.
885 #[inline]
886 pub fn after_context(&self) -> usize {
887 self.config.after_context
888 }
889
890 /// Returns the number of "before" context lines to report. When context
891 /// reporting is not enabled, this returns `0`.
892 #[inline]
893 pub fn before_context(&self) -> usize {
894 self.config.before_context
895 }
896
897 /// Returns true if and only if the searcher has "passthru" mode enabled.
898 #[inline]
899 pub fn passthru(&self) -> bool {
900 self.config.passthru
901 }
902
903 /// Fill the buffer for use with multi-line searching from the given file.
904 /// This reads from the file until EOF or until an error occurs. If the
905 /// contents exceed the configured heap limit, then an error is returned.
906 fn fill_multi_line_buffer_from_file<S: Sink>(
907 &self,
908 file: &File,
909 ) -> Result<(), S::Error> {
910 assert!(self.config.multi_line);
911
912 let mut decode_buffer = self.decode_buffer.borrow_mut();
913 let mut read_from = self
914 .decode_builder
915 .build_with_buffer(file, &mut *decode_buffer)
916 .map_err(S::Error::error_io)?;
917
918 // If we don't have a heap limit, then we can defer to std's
919 // read_to_end implementation. fill_multi_line_buffer_from_reader will
920 // do this too, but since we have a File, we can be a bit smarter about
921 // pre-allocating here.
922 //
923 // If we're transcoding, then our pre-allocation might not be exact,
924 // but is probably still better than nothing.
925 if self.config.heap_limit.is_none() {
926 let mut buf = self.multi_line_buffer.borrow_mut();
927 buf.clear();
928 let cap =
929 file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
930 buf.reserve(cap);
931 read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
932 return Ok(());
933 }
934 self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
935 }
936
937 /// Fill the buffer for use with multi-line searching from the given
938 /// reader. This reads from the reader until EOF or until an error occurs.
939 /// If the contents exceed the configured heap limit, then an error is
940 /// returned.
941 fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
942 &self,
943 mut read_from: R,
944 ) -> Result<(), S::Error> {
945 assert!(self.config.multi_line);
946
947 let mut buf = self.multi_line_buffer.borrow_mut();
948 buf.clear();
949
950 // If we don't have a heap limit, then we can defer to std's
951 // read_to_end implementation...
952 let heap_limit = match self.config.heap_limit {
953 Some(heap_limit) => heap_limit,
954 None => {
955 read_from
956 .read_to_end(&mut *buf)
957 .map_err(S::Error::error_io)?;
958 return Ok(());
959 }
960 };
961 if heap_limit == 0 {
962 return Err(S::Error::error_io(alloc_error(heap_limit)));
963 }
964
965 // ... otherwise we need to roll our own. This is likely quite a bit
966 // slower than what is optimal, but we avoid worry about memory safety
967 // until there's a compelling reason to speed this up.
968 buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
969 let mut pos = 0;
970 loop {
971 let nread = match read_from.read(&mut buf[pos..]) {
972 Ok(nread) => nread,
973 Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
974 continue;
975 }
976 Err(err) => return Err(S::Error::error_io(err)),
977 };
978 if nread == 0 {
979 buf.resize(pos, 0);
980 return Ok(());
981 }
982
983 pos += nread;
984 if buf[pos..].is_empty() {
985 let additional = heap_limit - buf.len();
986 if additional == 0 {
987 return Err(S::Error::error_io(alloc_error(heap_limit)));
988 }
989 let limit = buf.len() + additional;
990 let doubled = 2 * buf.len();
991 buf.resize(cmp::min(doubled, limit), 0);
992 }
993 }
994 }
995}
996
997/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
998/// BOM.
999///
1000/// This is used by the searcher to determine if a transcoder is necessary.
1001/// Otherwise, it is advantageous to search the slice directly.
1002fn slice_has_bom(slice: &[u8]) -> bool {
1003 let enc = match encoding_rs::Encoding::for_bom(slice) {
1004 None => return false,
1005 Some((enc, _)) => enc,
1006 };
1007 [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
1008 .contains(&enc)
1009}
1010
#[cfg(test)]
mod tests {
    use crate::testutil::{KitchenSink, RegexMatcher};

    use super::*;

    /// A heap limit of zero must be reported as an error when searching.
    #[test]
    fn config_error_heap_limit() {
        let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build();
        let outcome = searcher.search_slice(
            RegexMatcher::new(""),
            &[],
            KitchenSink::new(),
        );
        assert!(outcome.is_err());
    }

    /// A matcher whose line terminator is set to `z` must be rejected by a
    /// default searcher.
    #[test]
    fn config_error_line_terminator() {
        let mut matcher = RegexMatcher::new("");
        matcher.set_line_term(Some(LineTerminator::byte(b'z')));

        let mut searcher = Searcher::new();
        let outcome = searcher.search_slice(matcher, &[], KitchenSink::new());
        assert!(outcome.is_err());
    }

    /// See: https://github.com/BurntSushi/ripgrep/issues/1638
    ///
    /// ripgrep must sniff a UTF-8 BOM, just like it does with UTF-16.
    #[test]
    fn uft8_bom_sniffing() {
        // The UTF-8 BOM (EF BB BF) followed by the ASCII text "foo".
        let haystack: &[u8] = b"\xEF\xBB\xBFfoo";
        let matcher = RegexMatcher::new("foo");

        let mut searcher = SearcherBuilder::new().build();
        let mut sink = KitchenSink::new();

        let outcome = searcher.search_slice(matcher, haystack, &mut sink);
        assert!(outcome.is_ok());

        let printed = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
        assert_eq!(printed, "1:0:foo\nbyte count:3\n");
    }
}