grep_printer/
json.rs

1use std::{
2    io::{self, Write},
3    path::Path,
4    sync::Arc,
5    time::Instant,
6};
7
8use {
9    grep_matcher::{Match, Matcher},
10    grep_searcher::{Searcher, Sink, SinkContext, SinkFinish, SinkMatch},
11    serde_json as json,
12};
13
14use crate::{
15    counter::CounterWriter, jsont, stats::Stats, util::Replacer,
16    util::find_iter_at_in_context,
17};
18
/// The configuration for the JSON printer.
///
/// This is manipulated by the JSONBuilder and then referenced by the actual
/// implementation. Once a printer is built, the configuration is frozen and
/// cannot be changed.
///
/// The derived `Default` leaves every option off: compact output, no
/// unconditional `begin`/`end` messages and no replacement.
#[derive(Debug, Clone, Default)]
struct Config {
    /// Whether each message is pretty printed instead of being written on a
    /// single line.
    pretty: bool,
    /// Whether `begin` and `end` messages are emitted even when a search
    /// produces no `match` or `context` message.
    always_begin_end: bool,
    /// The replacement bytes, if any. Stored in an `Arc`, so cloning the
    /// configuration only bumps a reference count.
    replacement: Arc<Option<Vec<u8>>>,
}
40
41/// A builder for a JSON lines printer.
42///
43/// The builder permits configuring how the printer behaves. The JSON printer
44/// has fewer configuration options than the standard printer because it is
45/// a structured format, and the printer always attempts to find the most
46/// information possible.
47///
48/// Some configuration options, such as whether line numbers are included or
49/// whether contextual lines are shown, are drawn directly from the
50/// `grep_searcher::Searcher`'s configuration.
51///
52/// Once a `JSON` printer is built, its configuration cannot be changed.
53#[derive(Clone, Debug)]
54pub struct JSONBuilder {
55    config: Config,
56}
57
58impl JSONBuilder {
59    /// Return a new builder for configuring the JSON printer.
60    pub fn new() -> JSONBuilder {
61        JSONBuilder { config: Config::default() }
62    }
63
64    /// Create a JSON printer that writes results to the given writer.
65    pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> {
66        JSON {
67            config: self.config.clone(),
68            wtr: CounterWriter::new(wtr),
69            matches: vec![],
70        }
71    }
72
73    /// Print JSON in a pretty printed format.
74    ///
75    /// Enabling this will no longer produce a "JSON lines" format, in that
76    /// each JSON object printed may span multiple lines.
77    ///
78    /// This is disabled by default.
79    pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder {
80        self.config.pretty = yes;
81        self
82    }
83
84    /// When enabled, the `begin` and `end` messages are always emitted, even
85    /// when no match is found.
86    ///
87    /// When disabled, the `begin` and `end` messages are only shown if there
88    /// is at least one `match` or `context` message.
89    ///
90    /// This is disabled by default.
91    pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder {
92        self.config.always_begin_end = yes;
93        self
94    }
95
96    /// Set the bytes that will be used to replace each occurrence of a match
97    /// found.
98    ///
99    /// The replacement bytes given may include references to capturing groups,
100    /// which may either be in index form (e.g., `$2`) or can reference named
101    /// capturing groups if present in the original pattern (e.g., `$foo`).
102    ///
103    /// For documentation on the full format, please see the `Capture` trait's
104    /// `interpolate` method in the
105    /// [grep-printer](https://docs.rs/grep-printer) crate.
106    pub fn replacement(
107        &mut self,
108        replacement: Option<Vec<u8>>,
109    ) -> &mut JSONBuilder {
110        self.config.replacement = Arc::new(replacement);
111        self
112    }
113}
114
115/// The JSON printer, which emits results in a JSON lines format.
116///
117/// This type is generic over `W`, which represents any implementation of
118/// the standard library `io::Write` trait.
119///
120/// # Format
121///
122/// This section describes the JSON format used by this printer.
123///
124/// To skip the rigamarole, take a look at the
125/// [example](#example)
126/// at the end.
127///
128/// ## Overview
129///
130/// The format of this printer is the [JSON Lines](https://jsonlines.org/)
131/// format. Specifically, this printer emits a sequence of messages, where
132/// each message is encoded as a single JSON value on a single line. There are
133/// four different types of messages (and this number may expand over time):
134///
135/// * **begin** - A message that indicates a file is being searched.
/// * **end** - A message that indicates a file is done being searched. This
///   message also includes summary statistics about the search.
138/// * **match** - A message that indicates a match was found. This includes
139///   the text and offsets of the match.
140/// * **context** - A message that indicates a contextual line was found.
141///   This includes the text of the line, along with any match information if
142///   the search was inverted.
143///
144/// Every message is encoded in the same envelope format, which includes a tag
145/// indicating the message type along with an object for the payload:
146///
147/// ```json
148/// {
149///     "type": "{begin|end|match|context}",
150///     "data": { ... }
151/// }
152/// ```
153///
154/// The message itself is encoded in the envelope's `data` key.
155///
156/// ## Text encoding
157///
158/// Before describing each message format, we first must briefly discuss text
159/// encoding, since it factors into every type of message. In particular, JSON
160/// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this
161/// printer, we need only worry about UTF-8. The problem here is that searching
162/// is not limited to UTF-8 exclusively, which in turn implies that matches
163/// may be reported that contain invalid UTF-8. Moreover, this printer may
164/// also print file paths, and the encoding of file paths is itself not
165/// guaranteed to be valid UTF-8. Therefore, this printer must deal with the
166/// presence of invalid UTF-8 somehow. The printer could silently ignore such
167/// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8
168/// by replacing all invalid sequences with the Unicode replacement character.
169/// However, this would prevent consumers of this format from accessing the
170/// original data in a non-lossy way.
171///
172/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal
173/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To
174/// communicate whether this process occurs or not, strings are keyed by the
/// name `text` whereas arbitrary bytes are keyed by `bytes`.
176///
177/// For example, when a path is included in a message, it is formatted like so,
178/// if and only if the path is valid UTF-8:
179///
180/// ```json
181/// {
182///     "path": {
183///         "text": "/home/ubuntu/lib.rs"
184///     }
185/// }
186/// ```
187///
188/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte
189/// makes it invalid UTF-8, the path would instead be encoded like so:
190///
191/// ```json
192/// {
193///     "path": {
194///         "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM="
195///     }
196/// }
197/// ```
198///
199/// This same representation is used for reporting matches as well.
200///
201/// The printer guarantees that the `text` field is used whenever the
202/// underlying bytes are valid UTF-8.
203///
204/// ## Wire format
205///
206/// This section documents the wire format emitted by this printer, starting
207/// with the four types of messages.
208///
209/// Each message has its own format, and is contained inside an envelope that
210/// indicates the type of message. The envelope has these fields:
211///
212/// * **type** - A string indicating the type of this message. It may be one
213///   of four possible strings: `begin`, `end`, `match` or `context`. This
214///   list may expand over time.
215/// * **data** - The actual message data. The format of this field depends on
216///   the value of `type`. The possible message formats are
217///   [`begin`](#message-begin),
218///   [`end`](#message-end),
219///   [`match`](#message-match),
220///   [`context`](#message-context).
221///
222/// #### Message: **begin**
223///
224/// This message indicates that a search has begun. It has these fields:
225///
226/// * **path** - An
227///   [arbitrary data object](#object-arbitrary-data)
228///   representing the file path corresponding to the search, if one is
229///   present. If no file path is available, then this field is `null`.
230///
231/// #### Message: **end**
232///
233/// This message indicates that a search has finished. It has these fields:
234///
235/// * **path** - An
236///   [arbitrary data object](#object-arbitrary-data)
237///   representing the file path corresponding to the search, if one is
238///   present. If no file path is available, then this field is `null`.
239/// * **binary_offset** - The absolute offset in the data searched
240///   corresponding to the place at which binary data was detected. If no
241///   binary data was detected (or if binary detection was disabled), then this
242///   field is `null`.
243/// * **stats** - A [`stats` object](#object-stats) that contains summary
244///   statistics for the previous search.
245///
246/// #### Message: **match**
247///
248/// This message indicates that a match has been found. A match generally
249/// corresponds to a single line of text, although it may correspond to
250/// multiple lines if the search can emit matches over multiple lines. It
251/// has these fields:
252///
253/// * **path** - An
254///   [arbitrary data object](#object-arbitrary-data)
255///   representing the file path corresponding to the search, if one is
256///   present. If no file path is available, then this field is `null`.
257/// * **lines** - An
258///   [arbitrary data object](#object-arbitrary-data)
259///   representing one or more lines contained in this match.
260/// * **line_number** - If the searcher has been configured to report line
261///   numbers, then this corresponds to the line number of the first line
262///   in `lines`. If no line numbers are available, then this is `null`.
263/// * **absolute_offset** - The absolute byte offset corresponding to the start
264///   of `lines` in the data being searched.
265/// * **submatches** - An array of [`submatch` objects](#object-submatch)
266///   corresponding to matches in `lines`. The offsets included in each
267///   `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
268///   encoded, then the byte offsets correspond to the data after base64
269///   decoding.) The `submatch` objects are guaranteed to be sorted by their
270///   starting offsets. Note that it is possible for this array to be empty,
271///   for example, when searching reports inverted matches. If the configuration
272///   specifies a replacement, the resulting replacement text is also present.
273///
274/// #### Message: **context**
275///
276/// This message indicates that a contextual line has been found. A contextual
277/// line is a line that doesn't contain a match, but is generally adjacent to
278/// a line that does contain a match. The precise way in which contextual lines
279/// are reported is determined by the searcher. It has these fields, which are
280/// exactly the same fields found in a [`match`](#message-match):
281///
282/// * **path** - An
283///   [arbitrary data object](#object-arbitrary-data)
284///   representing the file path corresponding to the search, if one is
285///   present. If no file path is available, then this field is `null`.
286/// * **lines** - An
287///   [arbitrary data object](#object-arbitrary-data)
288///   representing one or more lines contained in this context. This includes
289///   line terminators, if they're present.
290/// * **line_number** - If the searcher has been configured to report line
291///   numbers, then this corresponds to the line number of the first line
292///   in `lines`. If no line numbers are available, then this is `null`.
293/// * **absolute_offset** - The absolute byte offset corresponding to the start
294///   of `lines` in the data being searched.
295/// * **submatches** - An array of [`submatch` objects](#object-submatch)
296///   corresponding to matches in `lines`. The offsets included in each
297///   `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
298///   encoded, then the byte offsets correspond to the data after base64
299///   decoding.) The `submatch` objects are guaranteed to be sorted by
300///   their starting offsets. Note that it is possible for this array to be
301///   non-empty, for example, when searching reports inverted matches such that
302///   the original matcher could match things in the contextual lines. If the
///   configuration specifies a replacement, the resulting replacement text
304///   is also present.
305///
306/// #### Object: **submatch**
307///
308/// This object describes submatches found within `match` or `context`
309/// messages. The `start` and `end` fields indicate the half-open interval on
310/// which the match occurs (`start` is included, but `end` is not). It is
311/// guaranteed that `start <= end`. It has these fields:
312///
313/// * **match** - An
314///   [arbitrary data object](#object-arbitrary-data)
315///   corresponding to the text in this submatch.
316/// * **start** - A byte offset indicating the start of this match. This offset
317///   is generally reported in terms of the parent object's data. For example,
318///   the `lines` field in the
319///   [`match`](#message-match) or [`context`](#message-context)
320///   messages.
321/// * **end** - A byte offset indicating the end of this match. This offset
322///   is generally reported in terms of the parent object's data. For example,
323///   the `lines` field in the
324///   [`match`](#message-match) or [`context`](#message-context)
325///   messages.
326/// * **replacement** (optional) - An
327///   [arbitrary data object](#object-arbitrary-data) corresponding to the
328///   replacement text for this submatch, if the configuration specifies
329///   a replacement.
330///
331/// #### Object: **stats**
332///
333/// This object is included in messages and contains summary statistics about
334/// a search. It has these fields:
335///
336/// * **elapsed** - A [`duration` object](#object-duration) describing the
337///   length of time that elapsed while performing the search.
338/// * **searches** - The number of searches that have run. For this printer,
339///   this value is always `1`. (Implementations may emit additional message
340///   types that use this same `stats` object that represents summary
341///   statistics over multiple searches.)
342/// * **searches_with_match** - The number of searches that have run that have
343///   found at least one match. This is never more than `searches`.
344/// * **bytes_searched** - The total number of bytes that have been searched.
345/// * **bytes_printed** - The total number of bytes that have been printed.
346///   This includes everything emitted by this printer.
347/// * **matched_lines** - The total number of lines that participated in a
348///   match. When matches may contain multiple lines, then this includes every
349///   line that is part of every match.
350/// * **matches** - The total number of matches. There may be multiple matches
351///   per line. When matches may contain multiple lines, each match is counted
352///   only once, regardless of how many lines it spans.
353///
354/// #### Object: **duration**
355///
356/// This object includes a few fields for describing a duration. Two of its
357/// fields, `secs` and `nanos`, can be combined to give nanosecond precision
358/// on systems that support it. It has these fields:
359///
360/// * **secs** - A whole number of seconds indicating the length of this
361///   duration.
/// * **nanos** - A fractional part of this duration represented by
///   nanoseconds. If nanosecond precision isn't supported, then this is
///   typically rounded up to the nearest number of nanoseconds.
365/// * **human** - A human readable string describing the length of the
366///   duration. The format of the string is itself unspecified.
367///
368/// #### Object: **arbitrary data**
369///
370/// This object is used whenever arbitrary data needs to be represented as a
371/// JSON value. This object contains two fields, where generally only one of
372/// the fields is present:
373///
374/// * **text** - A normal JSON string that is UTF-8 encoded. This field is
375///   populated if and only if the underlying data is valid UTF-8.
376/// * **bytes** - A normal JSON string that is a base64 encoding of the
377///   underlying bytes.
378///
379/// More information on the motivation for this representation can be seen in
380/// the section [text encoding](#text-encoding) above.
381///
382/// ## Example
383///
384/// This section shows a small example that includes all message types.
385///
386/// Here's the file we want to search, located at `/home/andrew/sherlock`:
387///
388/// ```text
389/// For the Doctor Watsons of this world, as opposed to the Sherlock
390/// Holmeses, success in the province of detective work must always
391/// be, to a very large extent, the result of luck. Sherlock Holmes
392/// can extract a clew from a wisp of straw or a flake of cigar ash;
393/// but Doctor Watson has to have it taken out for him and dusted,
394/// and exhibited clearly, with a label attached.
395/// ```
396///
397/// Searching for `Watson` with a `before_context` of `1` with line numbers
398/// enabled shows something like this using the standard printer:
399///
400/// ```text
401/// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock
402/// --
403/// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash;
404/// sherlock:5:but Doctor Watson has to have it taken out for him and dusted,
405/// ```
406///
407/// Here's what the same search looks like using the JSON wire format described
/// above, wherein we show semi-prettified JSON (instead of a strict JSON
409/// Lines format), for illustrative purposes:
410///
411/// ```json
412/// {
413///   "type": "begin",
414///   "data": {
///     "path": {"text": "/home/andrew/sherlock"}
416///   }
417/// }
418/// {
419///   "type": "match",
420///   "data": {
421///     "path": {"text": "/home/andrew/sherlock"},
422///     "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
423///     "line_number": 1,
424///     "absolute_offset": 0,
425///     "submatches": [
426///       {"match": {"text": "Watson"}, "start": 15, "end": 21}
427///     ]
428///   }
429/// }
430/// {
431///   "type": "context",
432///   "data": {
433///     "path": {"text": "/home/andrew/sherlock"},
434///     "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"},
435///     "line_number": 4,
436///     "absolute_offset": 193,
437///     "submatches": []
438///   }
439/// }
440/// {
441///   "type": "match",
442///   "data": {
443///     "path": {"text": "/home/andrew/sherlock"},
444///     "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"},
445///     "line_number": 5,
446///     "absolute_offset": 258,
447///     "submatches": [
448///       {"match": {"text": "Watson"}, "start": 11, "end": 17}
449///     ]
450///   }
451/// }
452/// {
453///   "type": "end",
454///   "data": {
455///     "path": {"text": "/home/andrew/sherlock"},
456///     "binary_offset": null,
457///     "stats": {
458///       "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"},
459///       "searches": 1,
460///       "searches_with_match": 1,
461///       "bytes_searched": 367,
462///       "bytes_printed": 1151,
463///       "matched_lines": 2,
464///       "matches": 2
465///     }
466///   }
467/// }
468/// ```
/// and here's what a match type item would look like if a replacement text
470/// of 'Moriarity' was given as a parameter:
471/// ```json
472/// {
473///   "type": "match",
474///   "data": {
475///     "path": {"text": "/home/andrew/sherlock"},
476///     "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
477///     "line_number": 1,
478///     "absolute_offset": 0,
479///     "submatches": [
480///       {"match": {"text": "Watson"}, "replacement": {"text": "Moriarity"}, "start": 15, "end": 21}
481///     ]
482///   }
483/// }
484/// ```
485
486#[derive(Clone, Debug)]
487pub struct JSON<W> {
488    config: Config,
489    wtr: CounterWriter<W>,
490    matches: Vec<Match>,
491}
492
493impl<W: io::Write> JSON<W> {
494    /// Return a JSON lines printer with a default configuration that writes
495    /// matches to the given writer.
496    pub fn new(wtr: W) -> JSON<W> {
497        JSONBuilder::new().build(wtr)
498    }
499
500    /// Return an implementation of `Sink` for the JSON printer.
501    ///
502    /// This does not associate the printer with a file path, which means this
503    /// implementation will never print a file path along with the matches.
504    pub fn sink<'s, M: Matcher>(
505        &'s mut self,
506        matcher: M,
507    ) -> JSONSink<'static, 's, M, W> {
508        JSONSink {
509            matcher,
510            replacer: Replacer::new(),
511            json: self,
512            path: None,
513            start_time: Instant::now(),
514            match_count: 0,
515            binary_byte_offset: None,
516            begin_printed: false,
517            stats: Stats::new(),
518        }
519    }
520
521    /// Return an implementation of `Sink` associated with a file path.
522    ///
523    /// When the printer is associated with a path, then it may, depending on
524    /// its configuration, print the path along with the matches found.
525    pub fn sink_with_path<'p, 's, M, P>(
526        &'s mut self,
527        matcher: M,
528        path: &'p P,
529    ) -> JSONSink<'p, 's, M, W>
530    where
531        M: Matcher,
532        P: ?Sized + AsRef<Path>,
533    {
534        JSONSink {
535            matcher,
536            replacer: Replacer::new(),
537            json: self,
538            path: Some(path.as_ref()),
539            start_time: Instant::now(),
540            match_count: 0,
541            binary_byte_offset: None,
542            begin_printed: false,
543            stats: Stats::new(),
544        }
545    }
546
547    /// Write the given message followed by a new line. The new line is
548    /// determined from the configuration of the given searcher.
549    fn write_message(
550        &mut self,
551        message: &jsont::Message<'_>,
552    ) -> io::Result<()> {
553        if self.config.pretty {
554            json::to_writer_pretty(&mut self.wtr, message)?;
555        } else {
556            json::to_writer(&mut self.wtr, message)?;
557        }
558        let _ = self.wtr.write(b"\n")?; // This will always be Ok(1) when successful.
559        Ok(())
560    }
561}
562
563impl<W> JSON<W> {
564    /// Returns true if and only if this printer has written at least one byte
565    /// to the underlying writer during any of the previous searches.
566    pub fn has_written(&self) -> bool {
567        self.wtr.total_count() > 0
568    }
569
570    /// Return a mutable reference to the underlying writer.
571    pub fn get_mut(&mut self) -> &mut W {
572        self.wtr.get_mut()
573    }
574
575    /// Consume this printer and return back ownership of the underlying
576    /// writer.
577    pub fn into_inner(self) -> W {
578        self.wtr.into_inner()
579    }
580}
581
582/// An implementation of `Sink` associated with a matcher and an optional file
583/// path for the JSON printer.
584///
585/// This type is generic over a few type parameters:
586///
587/// * `'p` refers to the lifetime of the file path, if one is provided. When
588/// no file path is given, then this is `'static`.
589/// * `'s` refers to the lifetime of the [`JSON`] printer that this type
590/// borrows.
591/// * `M` refers to the type of matcher used by
592/// `grep_searcher::Searcher` that is reporting results to this sink.
593/// * `W` refers to the underlying writer that this printer is writing its
594/// output to.
595#[derive(Debug)]
596pub struct JSONSink<'p, 's, M: Matcher, W> {
597    matcher: M,
598    replacer: Replacer<M>,
599    json: &'s mut JSON<W>,
600    path: Option<&'p Path>,
601    start_time: Instant,
602    match_count: u64,
603    binary_byte_offset: Option<u64>,
604    begin_printed: bool,
605    stats: Stats,
606}
607
608impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
609    /// Returns true if and only if this printer received a match in the
610    /// previous search.
611    ///
612    /// This is unaffected by the result of searches before the previous
613    /// search.
614    pub fn has_match(&self) -> bool {
615        self.match_count > 0
616    }
617
618    /// Return the total number of matches reported to this sink.
619    ///
620    /// This corresponds to the number of times `Sink::matched` is called.
621    pub fn match_count(&self) -> u64 {
622        self.match_count
623    }
624
625    /// If binary data was found in the previous search, this returns the
626    /// offset at which the binary data was first detected.
627    ///
628    /// The offset returned is an absolute offset relative to the entire
629    /// set of bytes searched.
630    ///
631    /// This is unaffected by the result of searches before the previous
632    /// search. e.g., If the search prior to the previous search found binary
633    /// data but the previous search found no binary data, then this will
634    /// return `None`.
635    pub fn binary_byte_offset(&self) -> Option<u64> {
636        self.binary_byte_offset
637    }
638
639    /// Return a reference to the stats produced by the printer for all
640    /// searches executed on this sink.
641    pub fn stats(&self) -> &Stats {
642        &self.stats
643    }
644
645    /// Execute the matcher over the given bytes and record the match
646    /// locations if the current configuration demands match granularity.
647    fn record_matches(
648        &mut self,
649        searcher: &Searcher,
650        bytes: &[u8],
651        range: std::ops::Range<usize>,
652    ) -> io::Result<()> {
653        self.json.matches.clear();
654        // If printing requires knowing the location of each individual match,
655        // then compute and stored those right now for use later. While this
656        // adds an extra copy for storing the matches, we do amortize the
657        // allocation for it and this greatly simplifies the printing logic to
658        // the extent that it's easy to ensure that we never do more than
659        // one search to find the matches.
660        let matches = &mut self.json.matches;
661        find_iter_at_in_context(
662            searcher,
663            &self.matcher,
664            bytes,
665            range.clone(),
666            |m| {
667                let (s, e) = (m.start() - range.start, m.end() - range.start);
668                matches.push(Match::new(s, e));
669                true
670            },
671        )?;
672        // Don't report empty matches appearing at the end of the bytes.
673        if !matches.is_empty()
674            && matches.last().unwrap().is_empty()
675            && matches.last().unwrap().start() >= bytes.len()
676        {
677            matches.pop().unwrap();
678        }
679        Ok(())
680    }
681
682    /// If the configuration specifies a replacement, then this executes the
683    /// replacement, lazily allocating memory if necessary.
684    ///
685    /// To access the result of a replacement, use `replacer.replacement()`.
686    fn replace(
687        &mut self,
688        searcher: &Searcher,
689        bytes: &[u8],
690        range: std::ops::Range<usize>,
691    ) -> io::Result<()> {
692        self.replacer.clear();
693        if self.json.config.replacement.is_some() {
694            let replacement =
695                (*self.json.config.replacement).as_ref().map(|r| &*r).unwrap();
696            self.replacer.replace_all(
697                searcher,
698                &self.matcher,
699                bytes,
700                range,
701                replacement,
702            )?;
703        }
704        Ok(())
705    }
706
707    /// Write the "begin" message.
708    fn write_begin_message(&mut self) -> io::Result<()> {
709        if self.begin_printed {
710            return Ok(());
711        }
712        let msg = jsont::Message::Begin(jsont::Begin { path: self.path });
713        self.json.write_message(&msg)?;
714        self.begin_printed = true;
715        Ok(())
716    }
717}
718
719impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> {
720    type Error = io::Error;
721
722    fn matched(
723        &mut self,
724        searcher: &Searcher,
725        mat: &SinkMatch<'_>,
726    ) -> Result<bool, io::Error> {
727        self.match_count += 1;
728        self.write_begin_message()?;
729
730        self.record_matches(
731            searcher,
732            mat.buffer(),
733            mat.bytes_range_in_buffer(),
734        )?;
735        self.replace(searcher, mat.buffer(), mat.bytes_range_in_buffer())?;
736        self.stats.add_matches(self.json.matches.len() as u64);
737        self.stats.add_matched_lines(mat.lines().count() as u64);
738
739        let submatches = SubMatches::new(
740            mat.bytes(),
741            &self.json.matches,
742            self.replacer.replacement(),
743        );
744        let msg = jsont::Message::Match(jsont::Match {
745            path: self.path,
746            lines: mat.bytes(),
747            line_number: mat.line_number(),
748            absolute_offset: mat.absolute_byte_offset(),
749            submatches: submatches.as_slice(),
750        });
751        self.json.write_message(&msg)?;
752        Ok(true)
753    }
754
755    fn context(
756        &mut self,
757        searcher: &Searcher,
758        ctx: &SinkContext<'_>,
759    ) -> Result<bool, io::Error> {
760        self.write_begin_message()?;
761        self.json.matches.clear();
762
763        let submatches = if searcher.invert_match() {
764            self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?;
765            self.replace(searcher, ctx.bytes(), 0..ctx.bytes().len())?;
766            SubMatches::new(
767                ctx.bytes(),
768                &self.json.matches,
769                self.replacer.replacement(),
770            )
771        } else {
772            SubMatches::empty()
773        };
774        let msg = jsont::Message::Context(jsont::Context {
775            path: self.path,
776            lines: ctx.bytes(),
777            line_number: ctx.line_number(),
778            absolute_offset: ctx.absolute_byte_offset(),
779            submatches: submatches.as_slice(),
780        });
781        self.json.write_message(&msg)?;
782        Ok(true)
783    }
784
785    fn binary_data(
786        &mut self,
787        searcher: &Searcher,
788        binary_byte_offset: u64,
789    ) -> Result<bool, io::Error> {
790        if searcher.binary_detection().quit_byte().is_some() {
791            if let Some(ref path) = self.path {
792                log::debug!(
793                    "ignoring {path}: found binary data at \
794                     offset {binary_byte_offset}",
795                    path = path.display(),
796                );
797            }
798        }
799        Ok(true)
800    }
801
802    fn begin(&mut self, _searcher: &Searcher) -> Result<bool, io::Error> {
803        self.json.wtr.reset_count();
804        self.start_time = Instant::now();
805        self.match_count = 0;
806        self.binary_byte_offset = None;
807
808        if !self.json.config.always_begin_end {
809            return Ok(true);
810        }
811        self.write_begin_message()?;
812        Ok(true)
813    }
814
815    fn finish(
816        &mut self,
817        _searcher: &Searcher,
818        finish: &SinkFinish,
819    ) -> Result<(), io::Error> {
820        self.binary_byte_offset = finish.binary_byte_offset();
821        self.stats.add_elapsed(self.start_time.elapsed());
822        self.stats.add_searches(1);
823        if self.match_count > 0 {
824            self.stats.add_searches_with_match(1);
825        }
826        self.stats.add_bytes_searched(finish.byte_count());
827        self.stats.add_bytes_printed(self.json.wtr.count());
828
829        if !self.begin_printed {
830            return Ok(());
831        }
832        let msg = jsont::Message::End(jsont::End {
833            path: self.path,
834            binary_offset: finish.binary_byte_offset(),
835            stats: self.stats.clone(),
836        });
837        self.json.write_message(&msg)?;
838        Ok(())
839    }
840}
841
842/// SubMatches represents a set of matches in a contiguous range of bytes.
843///
844/// A simpler representation for this would just simply be `Vec<SubMatch>`,
845/// but the common case is exactly one match per range of bytes, which we
846/// specialize here using a fixed size array without any allocation.
847enum SubMatches<'a> {
848    Empty,
849    Small([jsont::SubMatch<'a>; 1]),
850    Big(Vec<jsont::SubMatch<'a>>),
851}
852
853impl<'a> SubMatches<'a> {
854    /// Create a new set of match ranges from a set of matches and the
855    /// corresponding bytes that those matches apply to.
856    fn new(
857        bytes: &'a [u8],
858        matches: &[Match],
859        replacement: Option<(&'a [u8], &'a [Match])>,
860    ) -> SubMatches<'a> {
861        if matches.len() == 1 {
862            let mat = matches[0];
863            SubMatches::Small([jsont::SubMatch {
864                m: &bytes[mat],
865                replacement: replacement
866                    .map(|(rbuf, rmatches)| &rbuf[rmatches[0]]),
867                start: mat.start(),
868                end: mat.end(),
869            }])
870        } else {
871            let mut match_ranges = vec![];
872            for (i, &mat) in matches.iter().enumerate() {
873                match_ranges.push(jsont::SubMatch {
874                    m: &bytes[mat],
875                    replacement: replacement
876                        .map(|(rbuf, rmatches)| &rbuf[rmatches[i]]),
877                    start: mat.start(),
878                    end: mat.end(),
879                });
880            }
881            SubMatches::Big(match_ranges)
882        }
883    }
884
885    /// Create an empty set of match ranges.
886    fn empty() -> SubMatches<'static> {
887        SubMatches::Empty
888    }
889
890    /// Return this set of match ranges as a slice.
891    fn as_slice(&self) -> &[jsont::SubMatch<'_>] {
892        match *self {
893            SubMatches::Empty => &[],
894            SubMatches::Small(ref x) => x,
895            SubMatches::Big(ref x) => x,
896        }
897    }
898}
899
#[cfg(test)]
mod tests {
    use grep_matcher::LineTerminator;
    use grep_regex::{RegexMatcher, RegexMatcherBuilder};
    use grep_searcher::{Searcher, SearcherBuilder};

    use super::{JSON, JSONBuilder};

    // Shared haystack for the tests that do not need special input.
    const SHERLOCK: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";

    /// Returns everything written to the printer so far as a string.
    fn printer_contents(printer: &mut JSON<Vec<u8>>) -> String {
        let written = printer.get_mut().clone();
        String::from_utf8(written).unwrap()
    }

    /// Returns a JSON printer with the default configuration that writes
    /// into an in-memory buffer.
    fn default_printer() -> JSON<Vec<u8>> {
        JSONBuilder::new().build(vec![])
    }

    /// Searches `haystack` with the given searcher, matcher and printer,
    /// and returns the printer's output.
    fn run_search(
        mut searcher: Searcher,
        mut printer: JSON<Vec<u8>>,
        matcher: &RegexMatcher,
        haystack: &[u8],
    ) -> String {
        searcher
            .search_reader(matcher, haystack, printer.sink(matcher))
            .unwrap();
        printer_contents(&mut printer)
    }

    #[test]
    fn binary_detection() {
        use grep_searcher::BinaryDetection;

        const BINARY: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew \x00 from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let searcher = SearcherBuilder::new()
            .binary_detection(BinaryDetection::quit(b'\x00'))
            .heap_limit(Some(80))
            .build();
        let got = run_search(searcher, default_printer(), &matcher, BINARY);

        assert_eq!(got.lines().count(), 3);
        let last = got.lines().last().unwrap();
        assert!(last.contains(r#""binary_offset":212,"#));
    }

    #[test]
    fn max_matches() {
        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let searcher = SearcherBuilder::new().max_matches(Some(1)).build();
        let got = run_search(searcher, default_printer(), &matcher, SHERLOCK);

        assert_eq!(got.lines().count(), 3);
    }

    #[test]
    fn max_matches_after_context() {
        let haystack = "a\nb\nc\nd\ne\nd\ne\nd\ne\nd\ne\n";
        let matcher = RegexMatcher::new(r"d").unwrap();
        let searcher = SearcherBuilder::new()
            .after_context(2)
            .max_matches(Some(1))
            .build();
        let got = run_search(
            searcher,
            default_printer(),
            &matcher,
            haystack.as_bytes(),
        );

        assert_eq!(got.lines().count(), 5);
    }

    #[test]
    fn no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, default_printer(), &matcher, SHERLOCK);

        assert!(got.is_empty());
    }

    #[test]
    fn always_begin_end_no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let printer = JSONBuilder::new().always_begin_end(true).build(vec![]);
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, printer, &matcher, SHERLOCK);

        assert_eq!(got.lines().count(), 2);
        assert!(got.contains("begin") && got.contains("end"));
    }

    #[test]
    fn missing_crlf() {
        let haystack = "test\r\n".as_bytes();

        // First pass: default matcher and default line terminator.
        let matcher = RegexMatcherBuilder::new().build("test").unwrap();
        let searcher = SearcherBuilder::new().build();
        let got = run_search(searcher, default_printer(), &matcher, haystack);
        assert_eq!(got.lines().count(), 3);
        let second = got.lines().nth(1).unwrap();
        assert!(
            second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            second,
        );

        // Second pass: CRLF enabled on both the matcher and the searcher.
        let matcher =
            RegexMatcherBuilder::new().crlf(true).build("test").unwrap();
        let searcher = SearcherBuilder::new()
            .line_terminator(LineTerminator::crlf())
            .build();
        let got = run_search(searcher, default_printer(), &matcher, haystack);
        assert_eq!(got.lines().count(), 3);
        let second = got.lines().nth(1).unwrap();
        assert!(
            second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            second,
        );
    }
}
grep_printer/json.rs

grep_printer/
json.rs