grep_printer/
json.rs

1use std::{
2    io::{self, Write},
3    path::Path,
4    time::Instant,
5};
6
7use {
8    grep_matcher::{Match, Matcher},
9    grep_searcher::{
10        Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, SinkMatch,
11    },
12    serde_json as json,
13};
14
15use crate::{
16    counter::CounterWriter, jsont, stats::Stats, util::find_iter_at_in_context,
17};
18
/// The configuration for the JSON printer.
///
/// This is manipulated by the JSONBuilder and then referenced by the actual
/// implementation. Once a printer is built, the configuration is frozen and
/// cannot be changed.
///
/// The derived `Default` leaves every option disabled: no pretty printing,
/// no match limit, and `begin`/`end` messages are not forced.
#[derive(Debug, Clone, Default)]
struct Config {
    /// When true, messages are serialized with the pretty printer instead of
    /// the compact one.
    pretty: bool,
    /// The maximum number of matches to print, or `None` for no limit.
    max_matches: Option<u64>,
    /// When true, the `begin` message is written as soon as a search starts,
    /// rather than lazily on the first `match` or `context` message.
    always_begin_end: bool,
}
36
/// A builder for a JSON lines printer.
///
/// The builder permits configuring how the printer behaves. The JSON printer
/// has fewer configuration options than the standard printer because it is
/// a structured format, and the printer always attempts to find the most
/// information possible.
///
/// Some configuration options, such as whether line numbers are included or
/// whether contextual lines are shown, are drawn directly from the
/// `grep_searcher::Searcher`'s configuration.
///
/// Once a `JSON` printer is built, its configuration cannot be changed.
#[derive(Clone, Debug)]
pub struct JSONBuilder {
    /// The options accumulated so far; cloned into each printer that is
    /// built from this builder.
    config: Config,
}
53
54impl JSONBuilder {
55    /// Return a new builder for configuring the JSON printer.
56    pub fn new() -> JSONBuilder {
57        JSONBuilder { config: Config::default() }
58    }
59
60    /// Create a JSON printer that writes results to the given writer.
61    pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> {
62        JSON {
63            config: self.config.clone(),
64            wtr: CounterWriter::new(wtr),
65            matches: vec![],
66        }
67    }
68
69    /// Print JSON in a pretty printed format.
70    ///
71    /// Enabling this will no longer produce a "JSON lines" format, in that
72    /// each JSON object printed may span multiple lines.
73    ///
74    /// This is disabled by default.
75    pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder {
76        self.config.pretty = yes;
77        self
78    }
79
80    /// Set the maximum amount of matches that are printed.
81    ///
82    /// If multi line search is enabled and a match spans multiple lines, then
83    /// that match is counted exactly once for the purposes of enforcing this
84    /// limit, regardless of how many lines it spans.
85    pub fn max_matches(&mut self, limit: Option<u64>) -> &mut JSONBuilder {
86        self.config.max_matches = limit;
87        self
88    }
89
90    /// When enabled, the `begin` and `end` messages are always emitted, even
91    /// when no match is found.
92    ///
93    /// When disabled, the `begin` and `end` messages are only shown if there
94    /// is at least one `match` or `context` message.
95    ///
96    /// This is disabled by default.
97    pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder {
98        self.config.always_begin_end = yes;
99        self
100    }
101}
102
103/// The JSON printer, which emits results in a JSON lines format.
104///
105/// This type is generic over `W`, which represents any implementation of
106/// the standard library `io::Write` trait.
107///
108/// # Format
109///
110/// This section describes the JSON format used by this printer.
111///
112/// To skip the rigamarole, take a look at the
113/// [example](#example)
114/// at the end.
115///
116/// ## Overview
117///
118/// The format of this printer is the [JSON Lines](https://jsonlines.org/)
119/// format. Specifically, this printer emits a sequence of messages, where
120/// each message is encoded as a single JSON value on a single line. There are
121/// four different types of messages (and this number may expand over time):
122///
123/// * **begin** - A message that indicates a file is being searched.
/// * **end** - A message that indicates a file is done being searched. This
///   message also includes summary statistics about the search.
126/// * **match** - A message that indicates a match was found. This includes
127///   the text and offsets of the match.
128/// * **context** - A message that indicates a contextual line was found.
129///   This includes the text of the line, along with any match information if
130///   the search was inverted.
131///
132/// Every message is encoded in the same envelope format, which includes a tag
133/// indicating the message type along with an object for the payload:
134///
135/// ```json
136/// {
137///     "type": "{begin|end|match|context}",
138///     "data": { ... }
139/// }
140/// ```
141///
142/// The message itself is encoded in the envelope's `data` key.
143///
144/// ## Text encoding
145///
146/// Before describing each message format, we first must briefly discuss text
147/// encoding, since it factors into every type of message. In particular, JSON
148/// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this
149/// printer, we need only worry about UTF-8. The problem here is that searching
150/// is not limited to UTF-8 exclusively, which in turn implies that matches
151/// may be reported that contain invalid UTF-8. Moreover, this printer may
152/// also print file paths, and the encoding of file paths is itself not
153/// guaranteed to be valid UTF-8. Therefore, this printer must deal with the
154/// presence of invalid UTF-8 somehow. The printer could silently ignore such
155/// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8
156/// by replacing all invalid sequences with the Unicode replacement character.
157/// However, this would prevent consumers of this format from accessing the
158/// original data in a non-lossy way.
159///
160/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal
161/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To
162/// communicate whether this process occurs or not, strings are keyed by the
/// name `text` whereas arbitrary bytes are keyed by `bytes`.
164///
165/// For example, when a path is included in a message, it is formatted like so,
166/// if and only if the path is valid UTF-8:
167///
168/// ```json
169/// {
170///     "path": {
171///         "text": "/home/ubuntu/lib.rs"
172///     }
173/// }
174/// ```
175///
176/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte
177/// makes it invalid UTF-8, the path would instead be encoded like so:
178///
179/// ```json
180/// {
181///     "path": {
182///         "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM="
183///     }
184/// }
185/// ```
186///
187/// This same representation is used for reporting matches as well.
188///
189/// The printer guarantees that the `text` field is used whenever the
190/// underlying bytes are valid UTF-8.
191///
192/// ## Wire format
193///
194/// This section documents the wire format emitted by this printer, starting
195/// with the four types of messages.
196///
197/// Each message has its own format, and is contained inside an envelope that
198/// indicates the type of message. The envelope has these fields:
199///
200/// * **type** - A string indicating the type of this message. It may be one
201///   of four possible strings: `begin`, `end`, `match` or `context`. This
202///   list may expand over time.
203/// * **data** - The actual message data. The format of this field depends on
204///   the value of `type`. The possible message formats are
205///   [`begin`](#message-begin),
206///   [`end`](#message-end),
207///   [`match`](#message-match),
208///   [`context`](#message-context).
209///
210/// #### Message: **begin**
211///
212/// This message indicates that a search has begun. It has these fields:
213///
214/// * **path** - An
215///   [arbitrary data object](#object-arbitrary-data)
216///   representing the file path corresponding to the search, if one is
217///   present. If no file path is available, then this field is `null`.
218///
219/// #### Message: **end**
220///
221/// This message indicates that a search has finished. It has these fields:
222///
223/// * **path** - An
224///   [arbitrary data object](#object-arbitrary-data)
225///   representing the file path corresponding to the search, if one is
226///   present. If no file path is available, then this field is `null`.
227/// * **binary_offset** - The absolute offset in the data searched
228///   corresponding to the place at which binary data was detected. If no
229///   binary data was detected (or if binary detection was disabled), then this
230///   field is `null`.
231/// * **stats** - A [`stats` object](#object-stats) that contains summary
232///   statistics for the previous search.
233///
234/// #### Message: **match**
235///
236/// This message indicates that a match has been found. A match generally
237/// corresponds to a single line of text, although it may correspond to
238/// multiple lines if the search can emit matches over multiple lines. It
239/// has these fields:
240///
241/// * **path** - An
242///   [arbitrary data object](#object-arbitrary-data)
243///   representing the file path corresponding to the search, if one is
244///   present. If no file path is available, then this field is `null`.
245/// * **lines** - An
246///   [arbitrary data object](#object-arbitrary-data)
247///   representing one or more lines contained in this match.
248/// * **line_number** - If the searcher has been configured to report line
249///   numbers, then this corresponds to the line number of the first line
250///   in `lines`. If no line numbers are available, then this is `null`.
251/// * **absolute_offset** - The absolute byte offset corresponding to the start
252///   of `lines` in the data being searched.
253/// * **submatches** - An array of [`submatch` objects](#object-submatch)
254///   corresponding to matches in `lines`. The offsets included in each
255///   `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
256///   encoded, then the byte offsets correspond to the data after base64
257///   decoding.) The `submatch` objects are guaranteed to be sorted by their
258///   starting offsets. Note that it is possible for this array to be empty,
259///   for example, when searching reports inverted matches.
260///
261/// #### Message: **context**
262///
263/// This message indicates that a contextual line has been found. A contextual
264/// line is a line that doesn't contain a match, but is generally adjacent to
265/// a line that does contain a match. The precise way in which contextual lines
266/// are reported is determined by the searcher. It has these fields, which are
267/// exactly the same fields found in a [`match`](#message-match):
268///
269/// * **path** - An
270///   [arbitrary data object](#object-arbitrary-data)
271///   representing the file path corresponding to the search, if one is
272///   present. If no file path is available, then this field is `null`.
273/// * **lines** - An
274///   [arbitrary data object](#object-arbitrary-data)
275///   representing one or more lines contained in this context. This includes
276///   line terminators, if they're present.
277/// * **line_number** - If the searcher has been configured to report line
278///   numbers, then this corresponds to the line number of the first line
279///   in `lines`. If no line numbers are available, then this is `null`.
280/// * **absolute_offset** - The absolute byte offset corresponding to the start
281///   of `lines` in the data being searched.
282/// * **submatches** - An array of [`submatch` objects](#object-submatch)
283///   corresponding to matches in `lines`. The offsets included in each
284///   `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
285///   encoded, then the byte offsets correspond to the data after base64
286///   decoding.) The `submatch` objects are guaranteed to be sorted by
287///   their starting offsets. Note that it is possible for this array to be
288///   non-empty, for example, when searching reports inverted matches such that
289///   the original matcher could match things in the contextual lines.
290///
291/// #### Object: **submatch**
292///
293/// This object describes submatches found within `match` or `context`
294/// messages. The `start` and `end` fields indicate the half-open interval on
295/// which the match occurs (`start` is included, but `end` is not). It is
296/// guaranteed that `start <= end`. It has these fields:
297///
298/// * **match** - An
299///   [arbitrary data object](#object-arbitrary-data)
300///   corresponding to the text in this submatch.
301/// * **start** - A byte offset indicating the start of this match. This offset
302///   is generally reported in terms of the parent object's data. For example,
303///   the `lines` field in the
304///   [`match`](#message-match) or [`context`](#message-context)
305///   messages.
306/// * **end** - A byte offset indicating the end of this match. This offset
307///   is generally reported in terms of the parent object's data. For example,
308///   the `lines` field in the
309///   [`match`](#message-match) or [`context`](#message-context)
310///   messages.
311///
312/// #### Object: **stats**
313///
314/// This object is included in messages and contains summary statistics about
315/// a search. It has these fields:
316///
317/// * **elapsed** - A [`duration` object](#object-duration) describing the
318///   length of time that elapsed while performing the search.
319/// * **searches** - The number of searches that have run. For this printer,
320///   this value is always `1`. (Implementations may emit additional message
321///   types that use this same `stats` object that represents summary
322///   statistics over multiple searches.)
323/// * **searches_with_match** - The number of searches that have run that have
324///   found at least one match. This is never more than `searches`.
325/// * **bytes_searched** - The total number of bytes that have been searched.
326/// * **bytes_printed** - The total number of bytes that have been printed.
327///   This includes everything emitted by this printer.
328/// * **matched_lines** - The total number of lines that participated in a
329///   match. When matches may contain multiple lines, then this includes every
330///   line that is part of every match.
331/// * **matches** - The total number of matches. There may be multiple matches
332///   per line. When matches may contain multiple lines, each match is counted
333///   only once, regardless of how many lines it spans.
334///
335/// #### Object: **duration**
336///
337/// This object includes a few fields for describing a duration. Two of its
338/// fields, `secs` and `nanos`, can be combined to give nanosecond precision
339/// on systems that support it. It has these fields:
340///
341/// * **secs** - A whole number of seconds indicating the length of this
342///   duration.
/// * **nanos** - A fractional part of this duration represented by nanoseconds.
344///   If nanosecond precision isn't supported, then this is typically rounded
345///   up to the nearest number of nanoseconds.
346/// * **human** - A human readable string describing the length of the
347///   duration. The format of the string is itself unspecified.
348///
349/// #### Object: **arbitrary data**
350///
351/// This object is used whenever arbitrary data needs to be represented as a
352/// JSON value. This object contains two fields, where generally only one of
353/// the fields is present:
354///
355/// * **text** - A normal JSON string that is UTF-8 encoded. This field is
356///   populated if and only if the underlying data is valid UTF-8.
357/// * **bytes** - A normal JSON string that is a base64 encoding of the
358///   underlying bytes.
359///
360/// More information on the motivation for this representation can be seen in
361/// the section [text encoding](#text-encoding) above.
362///
363/// ## Example
364///
365/// This section shows a small example that includes all message types.
366///
367/// Here's the file we want to search, located at `/home/andrew/sherlock`:
368///
369/// ```text
370/// For the Doctor Watsons of this world, as opposed to the Sherlock
371/// Holmeses, success in the province of detective work must always
372/// be, to a very large extent, the result of luck. Sherlock Holmes
373/// can extract a clew from a wisp of straw or a flake of cigar ash;
374/// but Doctor Watson has to have it taken out for him and dusted,
375/// and exhibited clearly, with a label attached.
376/// ```
377///
378/// Searching for `Watson` with a `before_context` of `1` with line numbers
379/// enabled shows something like this using the standard printer:
380///
381/// ```text
382/// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock
383/// --
384/// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash;
385/// sherlock:5:but Doctor Watson has to have it taken out for him and dusted,
386/// ```
387///
388/// Here's what the same search looks like using the JSON wire format described
/// above, wherein we show semi-prettified JSON (instead of a strict JSON
390/// Lines format), for illustrative purposes:
391///
392/// ```json
393/// {
394///   "type": "begin",
395///   "data": {
///     "path": {"text": "/home/andrew/sherlock"}
397///   }
398/// }
399/// {
400///   "type": "match",
401///   "data": {
402///     "path": {"text": "/home/andrew/sherlock"},
403///     "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
404///     "line_number": 1,
405///     "absolute_offset": 0,
406///     "submatches": [
407///       {"match": {"text": "Watson"}, "start": 15, "end": 21}
408///     ]
409///   }
410/// }
411/// {
412///   "type": "context",
413///   "data": {
414///     "path": {"text": "/home/andrew/sherlock"},
415///     "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"},
416///     "line_number": 4,
417///     "absolute_offset": 193,
418///     "submatches": []
419///   }
420/// }
421/// {
422///   "type": "match",
423///   "data": {
424///     "path": {"text": "/home/andrew/sherlock"},
425///     "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"},
426///     "line_number": 5,
427///     "absolute_offset": 258,
428///     "submatches": [
429///       {"match": {"text": "Watson"}, "start": 11, "end": 17}
430///     ]
431///   }
432/// }
433/// {
434///   "type": "end",
435///   "data": {
436///     "path": {"text": "/home/andrew/sherlock"},
437///     "binary_offset": null,
438///     "stats": {
439///       "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"},
440///       "searches": 1,
441///       "searches_with_match": 1,
442///       "bytes_searched": 367,
443///       "bytes_printed": 1151,
444///       "matched_lines": 2,
445///       "matches": 2
446///     }
447///   }
448/// }
449/// ```
#[derive(Clone, Debug)]
pub struct JSON<W> {
    /// The frozen configuration copied from the builder.
    config: Config,
    /// The destination writer, wrapped so that the number of bytes written
    /// can be queried and reset.
    wtr: CounterWriter<W>,
    /// Scratch space for the matches found in the line(s) currently being
    /// printed. It is cleared and refilled for each message so the
    /// allocation is reused.
    matches: Vec<Match>,
}
456
457impl<W: io::Write> JSON<W> {
458    /// Return a JSON lines printer with a default configuration that writes
459    /// matches to the given writer.
460    pub fn new(wtr: W) -> JSON<W> {
461        JSONBuilder::new().build(wtr)
462    }
463
464    /// Return an implementation of `Sink` for the JSON printer.
465    ///
466    /// This does not associate the printer with a file path, which means this
467    /// implementation will never print a file path along with the matches.
468    pub fn sink<'s, M: Matcher>(
469        &'s mut self,
470        matcher: M,
471    ) -> JSONSink<'static, 's, M, W> {
472        JSONSink {
473            matcher,
474            json: self,
475            path: None,
476            start_time: Instant::now(),
477            match_count: 0,
478            after_context_remaining: 0,
479            binary_byte_offset: None,
480            begin_printed: false,
481            stats: Stats::new(),
482        }
483    }
484
485    /// Return an implementation of `Sink` associated with a file path.
486    ///
487    /// When the printer is associated with a path, then it may, depending on
488    /// its configuration, print the path along with the matches found.
489    pub fn sink_with_path<'p, 's, M, P>(
490        &'s mut self,
491        matcher: M,
492        path: &'p P,
493    ) -> JSONSink<'p, 's, M, W>
494    where
495        M: Matcher,
496        P: ?Sized + AsRef<Path>,
497    {
498        JSONSink {
499            matcher,
500            json: self,
501            path: Some(path.as_ref()),
502            start_time: Instant::now(),
503            match_count: 0,
504            after_context_remaining: 0,
505            binary_byte_offset: None,
506            begin_printed: false,
507            stats: Stats::new(),
508        }
509    }
510
511    /// Write the given message followed by a new line. The new line is
512    /// determined from the configuration of the given searcher.
513    fn write_message(
514        &mut self,
515        message: &jsont::Message<'_>,
516    ) -> io::Result<()> {
517        if self.config.pretty {
518            json::to_writer_pretty(&mut self.wtr, message)?;
519        } else {
520            json::to_writer(&mut self.wtr, message)?;
521        }
522        self.wtr.write(&[b'\n'])?;
523        Ok(())
524    }
525}
526
527impl<W> JSON<W> {
528    /// Returns true if and only if this printer has written at least one byte
529    /// to the underlying writer during any of the previous searches.
530    pub fn has_written(&self) -> bool {
531        self.wtr.total_count() > 0
532    }
533
534    /// Return a mutable reference to the underlying writer.
535    pub fn get_mut(&mut self) -> &mut W {
536        self.wtr.get_mut()
537    }
538
539    /// Consume this printer and return back ownership of the underlying
540    /// writer.
541    pub fn into_inner(self) -> W {
542        self.wtr.into_inner()
543    }
544}
545
/// An implementation of `Sink` associated with a matcher and an optional file
/// path for the JSON printer.
///
/// This type is generic over a few type parameters:
///
/// * `'p` refers to the lifetime of the file path, if one is provided. When
/// no file path is given, then this is `'static`.
/// * `'s` refers to the lifetime of the [`JSON`] printer that this type
/// borrows.
/// * `M` refers to the type of matcher used by
/// `grep_searcher::Searcher` that is reporting results to this sink.
/// * `W` refers to the underlying writer that this printer is writing its
/// output to.
#[derive(Debug)]
pub struct JSONSink<'p, 's, M: Matcher, W> {
    /// The matcher used to locate individual submatches inside reported
    /// lines.
    matcher: M,
    /// The printer that messages are written through.
    json: &'s mut JSON<W>,
    /// The file path attached to every message, if any.
    path: Option<&'p Path>,
    /// When the current search started; used to compute elapsed time.
    start_time: Instant,
    /// Number of times `matched` has been called in the current search.
    match_count: u64,
    /// Number of "after" context lines still owed once the match limit has
    /// been reached.
    after_context_remaining: u64,
    /// Offset of detected binary data as reported by `SinkFinish`; reset to
    /// `None` when a search begins.
    binary_byte_offset: Option<u64>,
    /// Whether the `begin` message has already been written by this sink.
    begin_printed: bool,
    /// Statistics accumulated by this sink.
    stats: Stats,
}
571
572impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
573    /// Returns true if and only if this printer received a match in the
574    /// previous search.
575    ///
576    /// This is unaffected by the result of searches before the previous
577    /// search.
578    pub fn has_match(&self) -> bool {
579        self.match_count > 0
580    }
581
    /// Return the number of matches reported to this sink during the most
    /// recent search.
    ///
    /// This corresponds to the number of times `Sink::matched` is called.
    /// The counter is reset to zero each time a new search begins.
    pub fn match_count(&self) -> u64 {
        self.match_count
    }
588
    /// If binary data was found in the previous search, this returns the
    /// offset at which the binary data was first detected.
    ///
    /// The offset returned is an absolute offset relative to the entire
    /// set of bytes searched.
    ///
    /// This is unaffected by the result of searches before the previous
    /// search. e.g., If the search prior to the previous search found binary
    /// data but the previous search found no binary data, then this will
    /// return `None`. (The stored value is cleared whenever a new search
    /// begins.)
    pub fn binary_byte_offset(&self) -> Option<u64> {
        self.binary_byte_offset
    }
602
    /// Return a reference to the stats produced by the printer for all
    /// searches executed on this sink.
    ///
    /// Unlike the per-search accessors on this type, the returned statistics
    /// are accumulated and are not cleared when a new search begins.
    pub fn stats(&self) -> &Stats {
        &self.stats
    }
608
609    /// Execute the matcher over the given bytes and record the match
610    /// locations if the current configuration demands match granularity.
611    fn record_matches(
612        &mut self,
613        searcher: &Searcher,
614        bytes: &[u8],
615        range: std::ops::Range<usize>,
616    ) -> io::Result<()> {
617        self.json.matches.clear();
618        // If printing requires knowing the location of each individual match,
619        // then compute and stored those right now for use later. While this
620        // adds an extra copy for storing the matches, we do amortize the
621        // allocation for it and this greatly simplifies the printing logic to
622        // the extent that it's easy to ensure that we never do more than
623        // one search to find the matches.
624        let matches = &mut self.json.matches;
625        find_iter_at_in_context(
626            searcher,
627            &self.matcher,
628            bytes,
629            range.clone(),
630            |m| {
631                let (s, e) = (m.start() - range.start, m.end() - range.start);
632                matches.push(Match::new(s, e));
633                true
634            },
635        )?;
636        // Don't report empty matches appearing at the end of the bytes.
637        if !matches.is_empty()
638            && matches.last().unwrap().is_empty()
639            && matches.last().unwrap().start() >= bytes.len()
640        {
641            matches.pop().unwrap();
642        }
643        Ok(())
644    }
645
646    /// Returns true if this printer should quit.
647    ///
648    /// This implements the logic for handling quitting after seeing a certain
649    /// amount of matches. In most cases, the logic is simple, but we must
650    /// permit all "after" contextual lines to print after reaching the limit.
651    fn should_quit(&self) -> bool {
652        let limit = match self.json.config.max_matches {
653            None => return false,
654            Some(limit) => limit,
655        };
656        if self.match_count < limit {
657            return false;
658        }
659        self.after_context_remaining == 0
660    }
661
662    /// Returns whether the current match count exceeds the configured limit.
663    /// If there is no limit, then this always returns false.
664    fn match_more_than_limit(&self) -> bool {
665        let limit = match self.json.config.max_matches {
666            None => return false,
667            Some(limit) => limit,
668        };
669        self.match_count > limit
670    }
671
672    /// Write the "begin" message.
673    fn write_begin_message(&mut self) -> io::Result<()> {
674        if self.begin_printed {
675            return Ok(());
676        }
677        let msg = jsont::Message::Begin(jsont::Begin { path: self.path });
678        self.json.write_message(&msg)?;
679        self.begin_printed = true;
680        Ok(())
681    }
682}
683
684impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> {
685    type Error = io::Error;
686
    /// Handle a match reported by the searcher by writing a `match` message.
    ///
    /// Writes the `begin` message first if it has not been written yet,
    /// updates the match counter and the remaining "after" context budget,
    /// records the submatches and statistics, and then writes the message.
    ///
    /// Returns `Ok(false)` when the configured match limit indicates that
    /// the search should stop, and `Ok(true)` otherwise.
    fn matched(
        &mut self,
        searcher: &Searcher,
        mat: &SinkMatch<'_>,
    ) -> Result<bool, io::Error> {
        self.write_begin_message()?;

        self.match_count += 1;
        // When we've exceeded our match count, then the remaining context
        // lines should not be reset, but instead, decremented. This avoids a
        // bug where we display more matches than a configured limit. The main
        // idea here is that 'matched' might be called again while printing
        // an after-context line. In that case, we should treat this as a
        // contextual line rather than a matching line for the purposes of
        // termination.
        if self.match_more_than_limit() {
            self.after_context_remaining =
                self.after_context_remaining.saturating_sub(1);
        } else {
            self.after_context_remaining = searcher.after_context() as u64;
        }

        // Search the whole buffer at the match's range; the recorded
        // submatches land in `self.json.matches`.
        self.record_matches(
            searcher,
            mat.buffer(),
            mat.bytes_range_in_buffer(),
        )?;
        self.stats.add_matches(self.json.matches.len() as u64);
        self.stats.add_matched_lines(mat.lines().count() as u64);

        let submatches = SubMatches::new(mat.bytes(), &self.json.matches);
        let msg = jsont::Message::Match(jsont::Match {
            path: self.path,
            lines: mat.bytes(),
            line_number: mat.line_number(),
            absolute_offset: mat.absolute_byte_offset(),
            submatches: submatches.as_slice(),
        });
        self.json.write_message(&msg)?;
        Ok(!self.should_quit())
    }
728
729    fn context(
730        &mut self,
731        searcher: &Searcher,
732        ctx: &SinkContext<'_>,
733    ) -> Result<bool, io::Error> {
734        self.write_begin_message()?;
735        self.json.matches.clear();
736
737        if ctx.kind() == &SinkContextKind::After {
738            self.after_context_remaining =
739                self.after_context_remaining.saturating_sub(1);
740        }
741        let submatches = if searcher.invert_match() {
742            self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?;
743            SubMatches::new(ctx.bytes(), &self.json.matches)
744        } else {
745            SubMatches::empty()
746        };
747        let msg = jsont::Message::Context(jsont::Context {
748            path: self.path,
749            lines: ctx.bytes(),
750            line_number: ctx.line_number(),
751            absolute_offset: ctx.absolute_byte_offset(),
752            submatches: submatches.as_slice(),
753        });
754        self.json.write_message(&msg)?;
755        Ok(!self.should_quit())
756    }
757
758    fn binary_data(
759        &mut self,
760        searcher: &Searcher,
761        binary_byte_offset: u64,
762    ) -> Result<bool, io::Error> {
763        if searcher.binary_detection().quit_byte().is_some() {
764            if let Some(ref path) = self.path {
765                log::debug!(
766                    "ignoring {path}: found binary data at \
767                     offset {binary_byte_offset}",
768                    path = path.display(),
769                );
770            }
771        }
772        Ok(true)
773    }
774
775    fn begin(&mut self, _searcher: &Searcher) -> Result<bool, io::Error> {
776        self.json.wtr.reset_count();
777        self.start_time = Instant::now();
778        self.match_count = 0;
779        self.after_context_remaining = 0;
780        self.binary_byte_offset = None;
781        if self.json.config.max_matches == Some(0) {
782            return Ok(false);
783        }
784
785        if !self.json.config.always_begin_end {
786            return Ok(true);
787        }
788        self.write_begin_message()?;
789        Ok(true)
790    }
791
792    fn finish(
793        &mut self,
794        _searcher: &Searcher,
795        finish: &SinkFinish,
796    ) -> Result<(), io::Error> {
797        if !self.begin_printed {
798            return Ok(());
799        }
800
801        self.binary_byte_offset = finish.binary_byte_offset();
802        self.stats.add_elapsed(self.start_time.elapsed());
803        self.stats.add_searches(1);
804        if self.match_count > 0 {
805            self.stats.add_searches_with_match(1);
806        }
807        self.stats.add_bytes_searched(finish.byte_count());
808        self.stats.add_bytes_printed(self.json.wtr.count());
809
810        let msg = jsont::Message::End(jsont::End {
811            path: self.path,
812            binary_offset: finish.binary_byte_offset(),
813            stats: self.stats.clone(),
814        });
815        self.json.write_message(&msg)?;
816        Ok(())
817    }
818}
819
/// SubMatches represents a set of matches in a contiguous range of bytes.
///
/// A simpler representation for this would just be `Vec<SubMatch>`, but the
/// common case is exactly one match per range of bytes, which we specialize
/// here using a fixed size array without any allocation.
enum SubMatches<'a> {
    /// No submatches; viewed as an empty slice.
    Empty,
    /// Exactly one submatch, stored inline in a one-element array.
    Small([jsont::SubMatch<'a>; 1]),
    /// Any other number of submatches, stored in a `Vec`.
    Big(Vec<jsont::SubMatch<'a>>),
}
830
831impl<'a> SubMatches<'a> {
832    /// Create a new set of match ranges from a set of matches and the
833    /// corresponding bytes that those matches apply to.
834    fn new(bytes: &'a [u8], matches: &[Match]) -> SubMatches<'a> {
835        if matches.len() == 1 {
836            let mat = matches[0];
837            SubMatches::Small([jsont::SubMatch {
838                m: &bytes[mat],
839                start: mat.start(),
840                end: mat.end(),
841            }])
842        } else {
843            let mut match_ranges = vec![];
844            for &mat in matches {
845                match_ranges.push(jsont::SubMatch {
846                    m: &bytes[mat],
847                    start: mat.start(),
848                    end: mat.end(),
849                });
850            }
851            SubMatches::Big(match_ranges)
852        }
853    }
854
855    /// Create an empty set of match ranges.
856    fn empty() -> SubMatches<'static> {
857        SubMatches::Empty
858    }
859
860    /// Return this set of match ranges as a slice.
861    fn as_slice(&self) -> &[jsont::SubMatch<'_>] {
862        match *self {
863            SubMatches::Empty => &[],
864            SubMatches::Small(ref x) => x,
865            SubMatches::Big(ref x) => x,
866        }
867    }
868}
869
#[cfg(test)]
mod tests {
    use grep_matcher::LineTerminator;
    use grep_regex::{RegexMatcher, RegexMatcherBuilder};
    use grep_searcher::{Searcher, SearcherBuilder};

    use super::{JSONBuilder, JSON};

    /// Six-line text fixture shared by most of the tests below.
    const SHERLOCK: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";

    /// Returns a copy of everything written to `printer`'s underlying
    /// buffer, decoded as UTF-8.
    fn printer_contents(printer: &mut JSON<Vec<u8>>) -> String {
        String::from_utf8(printer.get_mut().to_owned()).unwrap()
    }

    /// Searches `haystack` with `searcher` and `matcher`, sending results
    /// to `printer`, and returns the printer's accumulated output.
    ///
    /// Panics if the search itself fails.
    fn run_search(
        mut searcher: Searcher,
        matcher: &RegexMatcher,
        printer: &mut JSON<Vec<u8>>,
        haystack: &[u8],
    ) -> String {
        searcher
            .search_reader(matcher, haystack, printer.sink(matcher))
            .unwrap();
        printer_contents(printer)
    }

    /// With a quit byte configured, the final output line must carry the
    /// binary offset.
    #[test]
    fn binary_detection() {
        use grep_searcher::BinaryDetection;

        const BINARY: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew \x00 from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let searcher = SearcherBuilder::new()
            .binary_detection(BinaryDetection::quit(b'\x00'))
            .heap_limit(Some(80))
            .build();
        let got = run_search(searcher, &matcher, &mut printer, BINARY);

        assert_eq!(got.lines().count(), 3);
        let last = got.lines().last().unwrap();
        assert!(last.contains(r#""binary_offset":212,"#));
    }

    /// A limit of one match yields exactly three output lines.
    #[test]
    fn max_matches() {
        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer =
            JSONBuilder::new().max_matches(Some(1)).build(vec![]);
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, &matcher, &mut printer, SHERLOCK);

        assert_eq!(got.lines().count(), 3);
    }

    /// A limit of one match plus two lines of after-context yields exactly
    /// five output lines, even though later lines also match.
    #[test]
    fn max_matches_after_context() {
        let haystack = "\
a
b
c
d
e
d
e
d
e
d
e
";
        let matcher = RegexMatcher::new(r"d").unwrap();
        let mut printer =
            JSONBuilder::new().max_matches(Some(1)).build(vec![]);
        let searcher = SearcherBuilder::new().after_context(2).build();
        let got =
            run_search(searcher, &matcher, &mut printer, haystack.as_bytes());

        assert_eq!(got.lines().count(), 5);
    }

    /// By default, a search without matches prints nothing at all.
    #[test]
    fn no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, &matcher, &mut printer, SHERLOCK);

        assert!(got.is_empty());
    }

    /// With `always_begin_end` enabled, a search without matches still
    /// prints two lines mentioning "begin" and "end".
    #[test]
    fn always_begin_end_no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let mut printer =
            JSONBuilder::new().always_begin_end(true).build(vec![]);
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, &matcher, &mut printer, SHERLOCK);

        assert_eq!(got.lines().count(), 2);
        assert!(got.contains("begin") && got.contains("end"));
    }

    /// The printed line must keep its `\r\n` terminator, both with the
    /// default configuration and with CRLF mode enabled on the matcher and
    /// the searcher.
    #[test]
    fn missing_crlf() {
        let haystack = "test\r\n".as_bytes();

        // Shared expectation: three output lines, the second of which
        // contains the escaped CRLF-terminated text.
        let assert_keeps_crlf = |got: &str| {
            assert_eq!(got.lines().count(), 3);
            let second = got.lines().nth(1).unwrap();
            assert!(
                second.contains(r"test\r\n"),
                r"missing 'test\r\n' in '{}'",
                second,
            );
        };

        let matcher = RegexMatcherBuilder::new().build("test").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, &matcher, &mut printer, haystack);
        assert_keeps_crlf(&got);

        let matcher =
            RegexMatcherBuilder::new().crlf(true).build("test").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let searcher = SearcherBuilder::new()
            .line_terminator(LineTerminator::crlf())
            .build();
        let got = run_search(searcher, &matcher, &mut printer, haystack);
        assert_keeps_crlf(&got);
    }
}