grep_printer/json.rs
1use std::{
2 io::{self, Write},
3 path::Path,
4 sync::Arc,
5 time::Instant,
6};
7
8use {
9 grep_matcher::{Match, Matcher},
10 grep_searcher::{Searcher, Sink, SinkContext, SinkFinish, SinkMatch},
11 serde_json as json,
12};
13
14use crate::{
15 counter::CounterWriter, jsont, stats::Stats, util::Replacer,
16 util::find_iter_at_in_context,
17};
18
/// The configuration for the JSON printer.
///
/// This is manipulated by the `JSONBuilder` and then referenced by the actual
/// implementation. Once a printer is built, the configuration is frozen and
/// cannot be changed.
///
/// The derived `Default` gives the printer's default configuration: compact
/// (non-pretty) output, `begin`/`end` messages not forced, and no
/// replacement.
#[derive(Debug, Clone, Default)]
struct Config {
    /// Whether each JSON message is pretty printed.
    pretty: bool,
    /// Whether `begin` and `end` messages are emitted unconditionally.
    always_begin_end: bool,
    /// The replacement bytes, if any. Held behind an `Arc` so that cloning
    /// the configuration does not copy the bytes.
    replacement: Arc<Option<Vec<u8>>>,
}
40
41/// A builder for a JSON lines printer.
42///
43/// The builder permits configuring how the printer behaves. The JSON printer
44/// has fewer configuration options than the standard printer because it is
45/// a structured format, and the printer always attempts to find the most
46/// information possible.
47///
48/// Some configuration options, such as whether line numbers are included or
49/// whether contextual lines are shown, are drawn directly from the
50/// `grep_searcher::Searcher`'s configuration.
51///
52/// Once a `JSON` printer is built, its configuration cannot be changed.
53#[derive(Clone, Debug)]
54pub struct JSONBuilder {
55 config: Config,
56}
57
58impl JSONBuilder {
59 /// Return a new builder for configuring the JSON printer.
60 pub fn new() -> JSONBuilder {
61 JSONBuilder { config: Config::default() }
62 }
63
64 /// Create a JSON printer that writes results to the given writer.
65 pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> {
66 JSON {
67 config: self.config.clone(),
68 wtr: CounterWriter::new(wtr),
69 matches: vec![],
70 }
71 }
72
73 /// Print JSON in a pretty printed format.
74 ///
75 /// Enabling this will no longer produce a "JSON lines" format, in that
76 /// each JSON object printed may span multiple lines.
77 ///
78 /// This is disabled by default.
79 pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder {
80 self.config.pretty = yes;
81 self
82 }
83
84 /// When enabled, the `begin` and `end` messages are always emitted, even
85 /// when no match is found.
86 ///
87 /// When disabled, the `begin` and `end` messages are only shown if there
88 /// is at least one `match` or `context` message.
89 ///
90 /// This is disabled by default.
91 pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder {
92 self.config.always_begin_end = yes;
93 self
94 }
95
96 /// Set the bytes that will be used to replace each occurrence of a match
97 /// found.
98 ///
99 /// The replacement bytes given may include references to capturing groups,
100 /// which may either be in index form (e.g., `$2`) or can reference named
101 /// capturing groups if present in the original pattern (e.g., `$foo`).
102 ///
103 /// For documentation on the full format, please see the `Capture` trait's
104 /// `interpolate` method in the
105 /// [grep-printer](https://docs.rs/grep-printer) crate.
106 pub fn replacement(
107 &mut self,
108 replacement: Option<Vec<u8>>,
109 ) -> &mut JSONBuilder {
110 self.config.replacement = Arc::new(replacement);
111 self
112 }
113}
114
115/// The JSON printer, which emits results in a JSON lines format.
116///
117/// This type is generic over `W`, which represents any implementation of
118/// the standard library `io::Write` trait.
119///
120/// # Format
121///
122/// This section describes the JSON format used by this printer.
123///
124/// To skip the rigamarole, take a look at the
125/// [example](#example)
126/// at the end.
127///
128/// ## Overview
129///
130/// The format of this printer is the [JSON Lines](https://jsonlines.org/)
131/// format. Specifically, this printer emits a sequence of messages, where
132/// each message is encoded as a single JSON value on a single line. There are
133/// four different types of messages (and this number may expand over time):
134///
135/// * **begin** - A message that indicates a file is being searched.
136/// * **end** - A message that indicates a file is done being searched. This
137/// message also includes summary statistics about the search.
138/// * **match** - A message that indicates a match was found. This includes
139/// the text and offsets of the match.
140/// * **context** - A message that indicates a contextual line was found.
141/// This includes the text of the line, along with any match information if
142/// the search was inverted.
143///
144/// Every message is encoded in the same envelope format, which includes a tag
145/// indicating the message type along with an object for the payload:
146///
147/// ```json
148/// {
149/// "type": "{begin|end|match|context}",
150/// "data": { ... }
151/// }
152/// ```
153///
154/// The message itself is encoded in the envelope's `data` key.
155///
156/// ## Text encoding
157///
158/// Before describing each message format, we first must briefly discuss text
159/// encoding, since it factors into every type of message. In particular, JSON
160/// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this
161/// printer, we need only worry about UTF-8. The problem here is that searching
162/// is not limited to UTF-8 exclusively, which in turn implies that matches
163/// may be reported that contain invalid UTF-8. Moreover, this printer may
164/// also print file paths, and the encoding of file paths is itself not
165/// guaranteed to be valid UTF-8. Therefore, this printer must deal with the
166/// presence of invalid UTF-8 somehow. The printer could silently ignore such
167/// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8
168/// by replacing all invalid sequences with the Unicode replacement character.
169/// However, this would prevent consumers of this format from accessing the
170/// original data in a non-lossy way.
171///
172/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal
173/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To
174/// communicate whether this process occurs or not, strings are keyed by the
175/// name `text` whereas arbitrary bytes are keyed by `bytes`.
176///
177/// For example, when a path is included in a message, it is formatted like so,
178/// if and only if the path is valid UTF-8:
179///
180/// ```json
181/// {
182/// "path": {
183/// "text": "/home/ubuntu/lib.rs"
184/// }
185/// }
186/// ```
187///
188/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte
189/// makes it invalid UTF-8, the path would instead be encoded like so:
190///
191/// ```json
192/// {
193/// "path": {
194/// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM="
195/// }
196/// }
197/// ```
198///
199/// This same representation is used for reporting matches as well.
200///
201/// The printer guarantees that the `text` field is used whenever the
202/// underlying bytes are valid UTF-8.
203///
204/// ## Wire format
205///
206/// This section documents the wire format emitted by this printer, starting
207/// with the four types of messages.
208///
209/// Each message has its own format, and is contained inside an envelope that
210/// indicates the type of message. The envelope has these fields:
211///
212/// * **type** - A string indicating the type of this message. It may be one
213/// of four possible strings: `begin`, `end`, `match` or `context`. This
214/// list may expand over time.
215/// * **data** - The actual message data. The format of this field depends on
216/// the value of `type`. The possible message formats are
217/// [`begin`](#message-begin),
218/// [`end`](#message-end),
219/// [`match`](#message-match),
220/// [`context`](#message-context).
221///
222/// #### Message: **begin**
223///
224/// This message indicates that a search has begun. It has these fields:
225///
226/// * **path** - An
227/// [arbitrary data object](#object-arbitrary-data)
228/// representing the file path corresponding to the search, if one is
229/// present. If no file path is available, then this field is `null`.
230///
231/// #### Message: **end**
232///
233/// This message indicates that a search has finished. It has these fields:
234///
235/// * **path** - An
236/// [arbitrary data object](#object-arbitrary-data)
237/// representing the file path corresponding to the search, if one is
238/// present. If no file path is available, then this field is `null`.
239/// * **binary_offset** - The absolute offset in the data searched
240/// corresponding to the place at which binary data was detected. If no
241/// binary data was detected (or if binary detection was disabled), then this
242/// field is `null`.
243/// * **stats** - A [`stats` object](#object-stats) that contains summary
244/// statistics for the previous search.
245///
246/// #### Message: **match**
247///
248/// This message indicates that a match has been found. A match generally
249/// corresponds to a single line of text, although it may correspond to
250/// multiple lines if the search can emit matches over multiple lines. It
251/// has these fields:
252///
253/// * **path** - An
254/// [arbitrary data object](#object-arbitrary-data)
255/// representing the file path corresponding to the search, if one is
256/// present. If no file path is available, then this field is `null`.
257/// * **lines** - An
258/// [arbitrary data object](#object-arbitrary-data)
259/// representing one or more lines contained in this match.
260/// * **line_number** - If the searcher has been configured to report line
261/// numbers, then this corresponds to the line number of the first line
262/// in `lines`. If no line numbers are available, then this is `null`.
263/// * **absolute_offset** - The absolute byte offset corresponding to the start
264/// of `lines` in the data being searched.
265/// * **submatches** - An array of [`submatch` objects](#object-submatch)
266/// corresponding to matches in `lines`. The offsets included in each
267/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
268/// encoded, then the byte offsets correspond to the data after base64
269/// decoding.) The `submatch` objects are guaranteed to be sorted by their
270/// starting offsets. Note that it is possible for this array to be empty,
271/// for example, when searching reports inverted matches. If the configuration
272/// specifies a replacement, the resulting replacement text is also present.
273///
274/// #### Message: **context**
275///
276/// This message indicates that a contextual line has been found. A contextual
277/// line is a line that doesn't contain a match, but is generally adjacent to
278/// a line that does contain a match. The precise way in which contextual lines
279/// are reported is determined by the searcher. It has these fields, which are
280/// exactly the same fields found in a [`match`](#message-match):
281///
282/// * **path** - An
283/// [arbitrary data object](#object-arbitrary-data)
284/// representing the file path corresponding to the search, if one is
285/// present. If no file path is available, then this field is `null`.
286/// * **lines** - An
287/// [arbitrary data object](#object-arbitrary-data)
288/// representing one or more lines contained in this context. This includes
289/// line terminators, if they're present.
290/// * **line_number** - If the searcher has been configured to report line
291/// numbers, then this corresponds to the line number of the first line
292/// in `lines`. If no line numbers are available, then this is `null`.
293/// * **absolute_offset** - The absolute byte offset corresponding to the start
294/// of `lines` in the data being searched.
295/// * **submatches** - An array of [`submatch` objects](#object-submatch)
296/// corresponding to matches in `lines`. The offsets included in each
297/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
298/// encoded, then the byte offsets correspond to the data after base64
299/// decoding.) The `submatch` objects are guaranteed to be sorted by
300/// their starting offsets. Note that it is possible for this array to be
301/// non-empty, for example, when searching reports inverted matches such that
302/// the original matcher could match things in the contextual lines. If the
303/// configuration specifies a replacement, the resulting replacement text
304/// is also present.
305///
306/// #### Object: **submatch**
307///
308/// This object describes submatches found within `match` or `context`
309/// messages. The `start` and `end` fields indicate the half-open interval on
310/// which the match occurs (`start` is included, but `end` is not). It is
311/// guaranteed that `start <= end`. It has these fields:
312///
313/// * **match** - An
314/// [arbitrary data object](#object-arbitrary-data)
315/// corresponding to the text in this submatch.
316/// * **start** - A byte offset indicating the start of this match. This offset
317/// is generally reported in terms of the parent object's data. For example,
318/// the `lines` field in the
319/// [`match`](#message-match) or [`context`](#message-context)
320/// messages.
321/// * **end** - A byte offset indicating the end of this match. This offset
322/// is generally reported in terms of the parent object's data. For example,
323/// the `lines` field in the
324/// [`match`](#message-match) or [`context`](#message-context)
325/// messages.
326/// * **replacement** (optional) - An
327/// [arbitrary data object](#object-arbitrary-data) corresponding to the
328/// replacement text for this submatch, if the configuration specifies
329/// a replacement.
330///
331/// #### Object: **stats**
332///
333/// This object is included in messages and contains summary statistics about
334/// a search. It has these fields:
335///
336/// * **elapsed** - A [`duration` object](#object-duration) describing the
337/// length of time that elapsed while performing the search.
338/// * **searches** - The number of searches that have run. For this printer,
339/// this value is always `1`. (Implementations may emit additional message
340/// types that use this same `stats` object that represents summary
341/// statistics over multiple searches.)
342/// * **searches_with_match** - The number of searches that have run that have
343/// found at least one match. This is never more than `searches`.
344/// * **bytes_searched** - The total number of bytes that have been searched.
345/// * **bytes_printed** - The total number of bytes that have been printed.
346/// This includes everything emitted by this printer.
347/// * **matched_lines** - The total number of lines that participated in a
348/// match. When matches may contain multiple lines, then this includes every
349/// line that is part of every match.
350/// * **matches** - The total number of matches. There may be multiple matches
351/// per line. When matches may contain multiple lines, each match is counted
352/// only once, regardless of how many lines it spans.
353///
354/// #### Object: **duration**
355///
356/// This object includes a few fields for describing a duration. Two of its
357/// fields, `secs` and `nanos`, can be combined to give nanosecond precision
358/// on systems that support it. It has these fields:
359///
360/// * **secs** - A whole number of seconds indicating the length of this
361/// duration.
362/// * **nanos** - A fractional part of this duration represented by nanoseconds.
363/// If nanosecond precision isn't supported, then this is typically rounded
364/// up to the nearest number of nanoseconds.
365/// * **human** - A human readable string describing the length of the
366/// duration. The format of the string is itself unspecified.
367///
368/// #### Object: **arbitrary data**
369///
370/// This object is used whenever arbitrary data needs to be represented as a
371/// JSON value. This object contains two fields, where generally only one of
372/// the fields is present:
373///
374/// * **text** - A normal JSON string that is UTF-8 encoded. This field is
375/// populated if and only if the underlying data is valid UTF-8.
376/// * **bytes** - A normal JSON string that is a base64 encoding of the
377/// underlying bytes.
378///
379/// More information on the motivation for this representation can be seen in
380/// the section [text encoding](#text-encoding) above.
381///
382/// ## Example
383///
384/// This section shows a small example that includes all message types.
385///
386/// Here's the file we want to search, located at `/home/andrew/sherlock`:
387///
388/// ```text
389/// For the Doctor Watsons of this world, as opposed to the Sherlock
390/// Holmeses, success in the province of detective work must always
391/// be, to a very large extent, the result of luck. Sherlock Holmes
392/// can extract a clew from a wisp of straw or a flake of cigar ash;
393/// but Doctor Watson has to have it taken out for him and dusted,
394/// and exhibited clearly, with a label attached.
395/// ```
396///
397/// Searching for `Watson` with a `before_context` of `1` with line numbers
398/// enabled shows something like this using the standard printer:
399///
400/// ```text
401/// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock
402/// --
403/// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash;
404/// sherlock:5:but Doctor Watson has to have it taken out for him and dusted,
405/// ```
406///
407/// Here's what the same search looks like using the JSON wire format described
408/// above, wherein we show semi-prettified JSON (instead of a strict JSON
409/// Lines format), for illustrative purposes:
410///
411/// ```json
412/// {
413/// "type": "begin",
414/// "data": {
415/// "path": {"text": "/home/andrew/sherlock"}
416/// }
417/// }
418/// {
419/// "type": "match",
420/// "data": {
421/// "path": {"text": "/home/andrew/sherlock"},
422/// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
423/// "line_number": 1,
424/// "absolute_offset": 0,
425/// "submatches": [
426/// {"match": {"text": "Watson"}, "start": 15, "end": 21}
427/// ]
428/// }
429/// }
430/// {
431/// "type": "context",
432/// "data": {
433/// "path": {"text": "/home/andrew/sherlock"},
434/// "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"},
435/// "line_number": 4,
436/// "absolute_offset": 193,
437/// "submatches": []
438/// }
439/// }
440/// {
441/// "type": "match",
442/// "data": {
443/// "path": {"text": "/home/andrew/sherlock"},
444/// "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"},
445/// "line_number": 5,
446/// "absolute_offset": 258,
447/// "submatches": [
448/// {"match": {"text": "Watson"}, "start": 11, "end": 17}
449/// ]
450/// }
451/// }
452/// {
453/// "type": "end",
454/// "data": {
455/// "path": {"text": "/home/andrew/sherlock"},
456/// "binary_offset": null,
457/// "stats": {
458/// "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"},
459/// "searches": 1,
460/// "searches_with_match": 1,
461/// "bytes_searched": 367,
462/// "bytes_printed": 1151,
463/// "matched_lines": 2,
464/// "matches": 2
465/// }
466/// }
467/// }
468/// ```
469/// And here's what a `match` message would look like if a replacement text
470/// of 'Moriarity' was given as a parameter:
471/// ```json
472/// {
473/// "type": "match",
474/// "data": {
475/// "path": {"text": "/home/andrew/sherlock"},
476/// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
477/// "line_number": 1,
478/// "absolute_offset": 0,
479/// "submatches": [
480/// {"match": {"text": "Watson"}, "replacement": {"text": "Moriarity"}, "start": 15, "end": 21}
481/// ]
482/// }
483/// }
484/// ```
485
486#[derive(Clone, Debug)]
487pub struct JSON<W> {
488 config: Config,
489 wtr: CounterWriter<W>,
490 matches: Vec<Match>,
491}
492
493impl<W: io::Write> JSON<W> {
494 /// Return a JSON lines printer with a default configuration that writes
495 /// matches to the given writer.
496 pub fn new(wtr: W) -> JSON<W> {
497 JSONBuilder::new().build(wtr)
498 }
499
500 /// Return an implementation of `Sink` for the JSON printer.
501 ///
502 /// This does not associate the printer with a file path, which means this
503 /// implementation will never print a file path along with the matches.
504 pub fn sink<'s, M: Matcher>(
505 &'s mut self,
506 matcher: M,
507 ) -> JSONSink<'static, 's, M, W> {
508 JSONSink {
509 matcher,
510 replacer: Replacer::new(),
511 json: self,
512 path: None,
513 start_time: Instant::now(),
514 match_count: 0,
515 binary_byte_offset: None,
516 begin_printed: false,
517 stats: Stats::new(),
518 }
519 }
520
521 /// Return an implementation of `Sink` associated with a file path.
522 ///
523 /// When the printer is associated with a path, then it may, depending on
524 /// its configuration, print the path along with the matches found.
525 pub fn sink_with_path<'p, 's, M, P>(
526 &'s mut self,
527 matcher: M,
528 path: &'p P,
529 ) -> JSONSink<'p, 's, M, W>
530 where
531 M: Matcher,
532 P: ?Sized + AsRef<Path>,
533 {
534 JSONSink {
535 matcher,
536 replacer: Replacer::new(),
537 json: self,
538 path: Some(path.as_ref()),
539 start_time: Instant::now(),
540 match_count: 0,
541 binary_byte_offset: None,
542 begin_printed: false,
543 stats: Stats::new(),
544 }
545 }
546
547 /// Write the given message followed by a new line. The new line is
548 /// determined from the configuration of the given searcher.
549 fn write_message(
550 &mut self,
551 message: &jsont::Message<'_>,
552 ) -> io::Result<()> {
553 if self.config.pretty {
554 json::to_writer_pretty(&mut self.wtr, message)?;
555 } else {
556 json::to_writer(&mut self.wtr, message)?;
557 }
558 let _ = self.wtr.write(b"\n")?; // This will always be Ok(1) when successful.
559 Ok(())
560 }
561}
562
563impl<W> JSON<W> {
564 /// Returns true if and only if this printer has written at least one byte
565 /// to the underlying writer during any of the previous searches.
566 pub fn has_written(&self) -> bool {
567 self.wtr.total_count() > 0
568 }
569
570 /// Return a mutable reference to the underlying writer.
571 pub fn get_mut(&mut self) -> &mut W {
572 self.wtr.get_mut()
573 }
574
575 /// Consume this printer and return back ownership of the underlying
576 /// writer.
577 pub fn into_inner(self) -> W {
578 self.wtr.into_inner()
579 }
580}
581
582/// An implementation of `Sink` associated with a matcher and an optional file
583/// path for the JSON printer.
584///
585/// This type is generic over a few type parameters:
586///
587/// * `'p` refers to the lifetime of the file path, if one is provided. When
588/// no file path is given, then this is `'static`.
589/// * `'s` refers to the lifetime of the [`JSON`] printer that this type
590/// borrows.
591/// * `M` refers to the type of matcher used by
592/// `grep_searcher::Searcher` that is reporting results to this sink.
593/// * `W` refers to the underlying writer that this printer is writing its
594/// output to.
595#[derive(Debug)]
596pub struct JSONSink<'p, 's, M: Matcher, W> {
597 matcher: M,
598 replacer: Replacer<M>,
599 json: &'s mut JSON<W>,
600 path: Option<&'p Path>,
601 start_time: Instant,
602 match_count: u64,
603 binary_byte_offset: Option<u64>,
604 begin_printed: bool,
605 stats: Stats,
606}
607
608impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
609 /// Returns true if and only if this printer received a match in the
610 /// previous search.
611 ///
612 /// This is unaffected by the result of searches before the previous
613 /// search.
614 pub fn has_match(&self) -> bool {
615 self.match_count > 0
616 }
617
618 /// Return the total number of matches reported to this sink.
619 ///
620 /// This corresponds to the number of times `Sink::matched` is called.
621 pub fn match_count(&self) -> u64 {
622 self.match_count
623 }
624
625 /// If binary data was found in the previous search, this returns the
626 /// offset at which the binary data was first detected.
627 ///
628 /// The offset returned is an absolute offset relative to the entire
629 /// set of bytes searched.
630 ///
631 /// This is unaffected by the result of searches before the previous
632 /// search. e.g., If the search prior to the previous search found binary
633 /// data but the previous search found no binary data, then this will
634 /// return `None`.
635 pub fn binary_byte_offset(&self) -> Option<u64> {
636 self.binary_byte_offset
637 }
638
639 /// Return a reference to the stats produced by the printer for all
640 /// searches executed on this sink.
641 pub fn stats(&self) -> &Stats {
642 &self.stats
643 }
644
645 /// Execute the matcher over the given bytes and record the match
646 /// locations if the current configuration demands match granularity.
647 fn record_matches(
648 &mut self,
649 searcher: &Searcher,
650 bytes: &[u8],
651 range: std::ops::Range<usize>,
652 ) -> io::Result<()> {
653 self.json.matches.clear();
654 // If printing requires knowing the location of each individual match,
655 // then compute and stored those right now for use later. While this
656 // adds an extra copy for storing the matches, we do amortize the
657 // allocation for it and this greatly simplifies the printing logic to
658 // the extent that it's easy to ensure that we never do more than
659 // one search to find the matches.
660 let matches = &mut self.json.matches;
661 find_iter_at_in_context(
662 searcher,
663 &self.matcher,
664 bytes,
665 range.clone(),
666 |m| {
667 let (s, e) = (m.start() - range.start, m.end() - range.start);
668 matches.push(Match::new(s, e));
669 true
670 },
671 )?;
672 // Don't report empty matches appearing at the end of the bytes.
673 if !matches.is_empty()
674 && matches.last().unwrap().is_empty()
675 && matches.last().unwrap().start() >= bytes.len()
676 {
677 matches.pop().unwrap();
678 }
679 Ok(())
680 }
681
682 /// If the configuration specifies a replacement, then this executes the
683 /// replacement, lazily allocating memory if necessary.
684 ///
685 /// To access the result of a replacement, use `replacer.replacement()`.
686 fn replace(
687 &mut self,
688 searcher: &Searcher,
689 bytes: &[u8],
690 range: std::ops::Range<usize>,
691 ) -> io::Result<()> {
692 self.replacer.clear();
693 if self.json.config.replacement.is_some() {
694 let replacement =
695 (*self.json.config.replacement).as_ref().map(|r| &*r).unwrap();
696 self.replacer.replace_all(
697 searcher,
698 &self.matcher,
699 bytes,
700 range,
701 replacement,
702 )?;
703 }
704 Ok(())
705 }
706
707 /// Write the "begin" message.
708 fn write_begin_message(&mut self) -> io::Result<()> {
709 if self.begin_printed {
710 return Ok(());
711 }
712 let msg = jsont::Message::Begin(jsont::Begin { path: self.path });
713 self.json.write_message(&msg)?;
714 self.begin_printed = true;
715 Ok(())
716 }
717}
718
719impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> {
720 type Error = io::Error;
721
722 fn matched(
723 &mut self,
724 searcher: &Searcher,
725 mat: &SinkMatch<'_>,
726 ) -> Result<bool, io::Error> {
727 self.match_count += 1;
728 self.write_begin_message()?;
729
730 self.record_matches(
731 searcher,
732 mat.buffer(),
733 mat.bytes_range_in_buffer(),
734 )?;
735 self.replace(searcher, mat.buffer(), mat.bytes_range_in_buffer())?;
736 self.stats.add_matches(self.json.matches.len() as u64);
737 self.stats.add_matched_lines(mat.lines().count() as u64);
738
739 let submatches = SubMatches::new(
740 mat.bytes(),
741 &self.json.matches,
742 self.replacer.replacement(),
743 );
744 let msg = jsont::Message::Match(jsont::Match {
745 path: self.path,
746 lines: mat.bytes(),
747 line_number: mat.line_number(),
748 absolute_offset: mat.absolute_byte_offset(),
749 submatches: submatches.as_slice(),
750 });
751 self.json.write_message(&msg)?;
752 Ok(true)
753 }
754
755 fn context(
756 &mut self,
757 searcher: &Searcher,
758 ctx: &SinkContext<'_>,
759 ) -> Result<bool, io::Error> {
760 self.write_begin_message()?;
761 self.json.matches.clear();
762
763 let submatches = if searcher.invert_match() {
764 self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?;
765 self.replace(searcher, ctx.bytes(), 0..ctx.bytes().len())?;
766 SubMatches::new(
767 ctx.bytes(),
768 &self.json.matches,
769 self.replacer.replacement(),
770 )
771 } else {
772 SubMatches::empty()
773 };
774 let msg = jsont::Message::Context(jsont::Context {
775 path: self.path,
776 lines: ctx.bytes(),
777 line_number: ctx.line_number(),
778 absolute_offset: ctx.absolute_byte_offset(),
779 submatches: submatches.as_slice(),
780 });
781 self.json.write_message(&msg)?;
782 Ok(true)
783 }
784
785 fn binary_data(
786 &mut self,
787 searcher: &Searcher,
788 binary_byte_offset: u64,
789 ) -> Result<bool, io::Error> {
790 if searcher.binary_detection().quit_byte().is_some() {
791 if let Some(ref path) = self.path {
792 log::debug!(
793 "ignoring {path}: found binary data at \
794 offset {binary_byte_offset}",
795 path = path.display(),
796 );
797 }
798 }
799 Ok(true)
800 }
801
802 fn begin(&mut self, _searcher: &Searcher) -> Result<bool, io::Error> {
803 self.json.wtr.reset_count();
804 self.start_time = Instant::now();
805 self.match_count = 0;
806 self.binary_byte_offset = None;
807
808 if !self.json.config.always_begin_end {
809 return Ok(true);
810 }
811 self.write_begin_message()?;
812 Ok(true)
813 }
814
815 fn finish(
816 &mut self,
817 _searcher: &Searcher,
818 finish: &SinkFinish,
819 ) -> Result<(), io::Error> {
820 self.binary_byte_offset = finish.binary_byte_offset();
821 self.stats.add_elapsed(self.start_time.elapsed());
822 self.stats.add_searches(1);
823 if self.match_count > 0 {
824 self.stats.add_searches_with_match(1);
825 }
826 self.stats.add_bytes_searched(finish.byte_count());
827 self.stats.add_bytes_printed(self.json.wtr.count());
828
829 if !self.begin_printed {
830 return Ok(());
831 }
832 let msg = jsont::Message::End(jsont::End {
833 path: self.path,
834 binary_offset: finish.binary_byte_offset(),
835 stats: self.stats.clone(),
836 });
837 self.json.write_message(&msg)?;
838 Ok(())
839 }
840}
841
842/// SubMatches represents a set of matches in a contiguous range of bytes.
843///
844/// A simpler representation for this would just simply be `Vec<SubMatch>`,
845/// but the common case is exactly one match per range of bytes, which we
846/// specialize here using a fixed size array without any allocation.
847enum SubMatches<'a> {
848 Empty,
849 Small([jsont::SubMatch<'a>; 1]),
850 Big(Vec<jsont::SubMatch<'a>>),
851}
852
853impl<'a> SubMatches<'a> {
854 /// Create a new set of match ranges from a set of matches and the
855 /// corresponding bytes that those matches apply to.
856 fn new(
857 bytes: &'a [u8],
858 matches: &[Match],
859 replacement: Option<(&'a [u8], &'a [Match])>,
860 ) -> SubMatches<'a> {
861 if matches.len() == 1 {
862 let mat = matches[0];
863 SubMatches::Small([jsont::SubMatch {
864 m: &bytes[mat],
865 replacement: replacement
866 .map(|(rbuf, rmatches)| &rbuf[rmatches[0]]),
867 start: mat.start(),
868 end: mat.end(),
869 }])
870 } else {
871 let mut match_ranges = vec![];
872 for (i, &mat) in matches.iter().enumerate() {
873 match_ranges.push(jsont::SubMatch {
874 m: &bytes[mat],
875 replacement: replacement
876 .map(|(rbuf, rmatches)| &rbuf[rmatches[i]]),
877 start: mat.start(),
878 end: mat.end(),
879 });
880 }
881 SubMatches::Big(match_ranges)
882 }
883 }
884
885 /// Create an empty set of match ranges.
886 fn empty() -> SubMatches<'static> {
887 SubMatches::Empty
888 }
889
890 /// Return this set of match ranges as a slice.
891 fn as_slice(&self) -> &[jsont::SubMatch<'_>] {
892 match *self {
893 SubMatches::Empty => &[],
894 SubMatches::Small(ref x) => x,
895 SubMatches::Big(ref x) => x,
896 }
897 }
898}
899
#[cfg(test)]
mod tests {
    use grep_matcher::LineTerminator;
    use grep_regex::{RegexMatcher, RegexMatcherBuilder};
    use grep_searcher::SearcherBuilder;

    use super::{JSON, JSONBuilder};

    /// Six-line text excerpt used as the haystack by most tests below.
    const SHERLOCK: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";

    /// Return a copy of everything the printer has written so far as a
    /// `String`. Panics if the written bytes are not valid UTF-8.
    fn printer_contents(printer: &mut JSON<Vec<u8>>) -> String {
        let written = printer.get_mut().clone();
        String::from_utf8(written).unwrap()
    }

    // Searching a haystack containing a NUL byte with "quit" binary
    // detection must emit three lines, the last of which reports the
    // offset of the NUL byte (212) as the binary offset.
    #[test]
    fn binary_detection() {
        use grep_searcher::BinaryDetection;

        const BINARY: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew \x00 from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new()
            .binary_detection(BinaryDetection::quit(b'\x00'))
            .heap_limit(Some(80))
            .build();
        searcher
            .search_reader(&matcher, BINARY, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);

        let lines: Vec<&str> = output.lines().collect();
        assert_eq!(lines.len(), 3);
        assert!(lines.last().unwrap().contains(r#""binary_offset":212,"#));
    }

    // With the searcher limited to one match, exactly three lines are
    // printed even though "Watson" occurs more than once in the haystack.
    #[test]
    fn max_matches() {
        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new().max_matches(Some(1)).build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);

        assert_eq!(output.lines().count(), 3);
    }

    // With one match allowed and two lines of trailing context requested,
    // exactly five lines are printed; later "d" lines must not add output.
    #[test]
    fn max_matches_after_context() {
        let haystack = "\
a
b
c
d
e
d
e
d
e
d
e
";
        let matcher = RegexMatcher::new(r"d").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new()
            .after_context(2)
            .max_matches(Some(1))
            .build();
        searcher
            .search_reader(
                &matcher,
                haystack.as_bytes(),
                printer.sink(&matcher),
            )
            .unwrap();
        let output = printer_contents(&mut printer);

        assert_eq!(output.lines().count(), 5);
    }

    // A search that finds nothing prints nothing by default.
    #[test]
    fn no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);

        assert!(output.is_empty());
    }

    // With `always_begin_end` enabled, a search that finds nothing still
    // prints two lines that mention "begin" and "end".
    #[test]
    fn always_begin_end_no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let builder = JSONBuilder::new().always_begin_end(true);
        let mut printer = builder.build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);

        assert_eq!(output.lines().count(), 2);
        assert!(output.contains("begin") && output.contains("end"));
    }

    // The second output line must contain the escaped text `test\r\n` in
    // full, both with the default configuration and with CRLF handling
    // enabled in the matcher and the searcher.
    #[test]
    fn missing_crlf() {
        let haystack = "test\r\n".as_bytes();

        // Default matcher and default line terminator.
        let matcher = RegexMatcherBuilder::new().build("test").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, haystack, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);
        assert_eq!(output.lines().count(), 3);
        let second = output.lines().nth(1).unwrap();
        assert!(
            second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            second,
        );

        // CRLF mode in the regex and CRLF as the line terminator.
        let matcher =
            RegexMatcherBuilder::new().crlf(true).build("test").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new()
            .line_terminator(LineTerminator::crlf())
            .build();
        searcher
            .search_reader(&matcher, haystack, printer.sink(&matcher))
            .unwrap();
        let output = printer_contents(&mut printer);
        assert_eq!(output.lines().count(), 3);
        let second = output.lines().nth(1).unwrap();
        assert!(
            second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            second,
        );
    }
}