json_carver/
lib.rs

1//! JSON Carver
2//!
3//! Carve JSON structs from a binary stream of data.
4
5#![deny(
6    ambiguous_glob_reexports,
7    anonymous_parameters,
8    array_into_iter,
9    asm_sub_register,
10    bad_asm_style,
11    bare_trait_objects,
12    break_with_label_and_loop,
13    clashing_extern_declarations,
14    coherence_leak_check,
15    confusable_idents,
16    const_evaluatable_unchecked,
17    const_item_mutation,
18    dead_code,
19    deprecated,
20    deprecated_where_clause_location,
21    deref_into_dyn_supertrait,
22    deref_nullptr,
23    drop_bounds,
24    dropping_copy_types,
25    dropping_references,
26    duplicate_macro_attributes,
27    dyn_drop,
28    ellipsis_inclusive_range_patterns,
29    exported_private_dependencies,
30    for_loops_over_fallibles,
31    forbidden_lint_groups,
32    forgetting_copy_types,
33    forgetting_references,
34    function_item_references,
35    improper_ctypes,
36    improper_ctypes_definitions,
37    incomplete_features,
38    inline_no_sanitize,
39    invalid_doc_attributes,
40    invalid_macro_export_arguments,
41    invalid_value,
42    irrefutable_let_patterns,
43    large_assignments,
44    late_bound_lifetime_arguments,
45    legacy_derive_helpers,
46    map_unit_fn,
47    missing_docs,
48    named_arguments_used_positionally,
49    no_mangle_generic_items,
50    non_camel_case_types,
51    non_fmt_panics,
52    non_shorthand_field_patterns,
53    non_snake_case,
54    non_upper_case_globals,
55    opaque_hidden_inferred_bound,
56    overlapping_range_endpoints,
57    path_statements,
58    redundant_semicolons,
59    renamed_and_removed_lints,
60    repr_transparent_external_private_fields,
61    semicolon_in_expressions_from_macros,
62    special_module_name,
63    stable_features,
64    suspicious_double_ref_op,
65    trivial_bounds,
66    //trivial_casts,
67    trivial_numeric_casts,
68    type_alias_bounds,
69    tyvar_behind_raw_pointer,
70    uncommon_codepoints,
71    unconditional_recursion,
72    undefined_naked_function_abi,
73    unexpected_cfgs,
74    ungated_async_fn_track_caller,
75    uninhabited_static,
76    unknown_lints,
77    unnameable_test_items,
78    unreachable_code,
79    unreachable_patterns,
80    unsafe_code,
81    unstable_features,
82    unstable_name_collisions,
83    unstable_syntax_pre_expansion,
84    unused_allocation,
85    unused_assignments,
86    unused_attributes,
87    unused_braces,
88    unused_braces,
89    unused_comparisons,
90    unused_doc_comments,
91    unused_features,
92    unused_features,
93    unused_import_braces,
94    unused_imports,
95    unused_imports,
96    unused_labels,
97    unused_labels,
98    unused_macros,
99    unused_macros,
100    unused_must_use,
101    unused_mut,
102    unused_mut,
103    unused_parens,
104    unused_parens,
105    unused_qualifications,
106    unused_unsafe,
107    unused_unsafe,
108    unused_variables,
109    warnings,
110    while_true
111)]
112
113use std::fs::File;
114use std::io;
115use std::io::{BufRead, BufReader, BufWriter, Read, StderrLock, StdinLock, StdoutLock, Write};
116
117use memchr;
118
119mod errors;
120
// Growth step, in bytes, applied to the internal buffer whenever a JSON
// string no longer fits in it.
const BUF_EXTEND_SIZE: usize = 4 * 1024 * 1024; // 4 MiB

// Default nesting (indentation) depth of the JSON string that will be handled.
const DEFAULT_MAX_IDENT_DEPTH: usize = 4 * 1024 * 1024;

/// The minimum size of a JSON string that we will report.
pub const DEFAULT_MIN_JSON_SIZE: usize = 4;

// Byte values used while parsing JSON strings.
// See https://www.rfc-editor.org/rfc/rfc8259#section-2
//
// Structural characters
const CHAR_LEFT_SQUARE_BRACKET: u8 = b'['; // 0x5B
const CHAR_LEFT_CURLY_BRACKET: u8 = b'{'; // 0x7B
const CHAR_RIGHT_SQUARE_BRACKET: u8 = b']'; // 0x5D
const CHAR_RIGHT_CURLY_BRACKET: u8 = b'}'; // 0x7D
const CHAR_COLON: u8 = b':'; // 0x3A
const CHAR_COMMA: u8 = b','; // 0x2C

// Insignificant whitespace
const CHAR_SPACE: u8 = b' '; // 0x20
const CHAR_TAB: u8 = b'\t'; // 0x09
const CHAR_NEWLINE: u8 = b'\n'; // 0x0A
const CHAR_CARRIAGE_RETURN: u8 = b'\r'; // 0x0D

// First byte of each literal (`false`, `null`, `true`)
const CHAR_START_FALSE: u8 = b'f'; // 0x66
const CHAR_START_NULL: u8 = b'n'; // 0x6E
const CHAR_START_TRUE: u8 = b't'; // 0x74

// Numbers
const CHAR_MINUS: u8 = b'-'; // 0x2D
const CHAR_PLUS: u8 = b'+'; // 0x2B
const CHAR_ZERO: u8 = b'0'; // 0x30
const CHAR_NINE: u8 = b'9'; // 0x39
const CHAR_DECIMAL: u8 = b'.'; // 0x2E
const CHAR_EXP_LOWER: u8 = b'e'; // 0x65
const CHAR_EXP_UPPER: u8 = b'E'; // 0x45

// Strings: the delimiters and the bytes that may follow a backslash
const CHAR_QUOT_MARK: u8 = b'"'; // 0x22
const CHAR_ESCAPE: u8 = b'\\'; // 0x5C
const CHAR_SLASH: u8 = b'/'; // 0x2F
const CHAR_ESC_BACKSPACE: u8 = b'b'; // 0x62
const CHAR_ESC_FORM_FEED: u8 = b'f'; // 0x66
const CHAR_ESC_LINE_FEED: u8 = b'n'; // 0x6E
const CHAR_ESC_CARRIAGE_RETURN: u8 = b'r'; // 0x72
const CHAR_ESC_TAB: u8 = b't'; // 0x74
const CHAR_U: u8 = b'u'; // 0x75
172
/// Outcome of a single parsing step and, ultimately, of a carving attempt.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Cause {
    /// Parsing can continue. The payload is the next significant byte, which
    /// has been read from the stream but not yet recorded.
    Found(u8),
    /// The payload is a byte that is not valid at the current position.
    Corrupted(u8),
    /// The outermost bracket was closed, i.e. a whole JSON value was carved.
    Completed,
    /// The stream ended before the JSON value was closed.
    Exhausted,
}
179
/// Report whether `b` is a control character that may only appear inside a
/// JSON string in escaped form.
///
/// RFC 8259, section 7, requires every character from U+0000 through U+001F
/// (both ends included) to be escaped.
fn byte_needs_escape(b: u8) -> bool {
    b <= 0x1F
}
183
184fn byte_can_escape(b: u8) -> bool {
185    match b {
186        CHAR_QUOT_MARK
187        | CHAR_ESCAPE
188        | CHAR_SLASH
189        | CHAR_ESC_BACKSPACE
190        | CHAR_ESC_FORM_FEED
191        | CHAR_ESC_LINE_FEED
192        | CHAR_ESC_CARRIAGE_RETURN
193        | CHAR_ESC_TAB
194        | CHAR_U => true,
195        _ => false,
196    }
197}
198
/// Map an opening bracket byte to its closing counterpart.
///
/// In ASCII, `]` sits two code points after `[`, and `}` two after `{`, so
/// the conversion is a fixed offset. The visible callers only pass `[` or `{`.
fn _closing_ident(b: u8) -> u8 {
    const OPEN_TO_CLOSE_OFFSET: u8 = 0x02;
    b + OPEN_TO_CLOSE_OFFSET
}
202
203struct Report {
204    status: Cause,
205    start: usize,
206    end: usize,
207    partial_end: usize,
208}
209
210impl Report {
211    /// Print a status report.
212    ///
213    /// Status reports are comma-separated CSVs with the following fields:
214    ///
215    /// ```text
216    /// status,start,end,partial_end
217    /// ```
218    ///
219    /// where:
220    /// * `status` is either "corrupted", "exhausted", or "completed".
221    /// * (`start`, `end`) is the position of the JSON string within the byte
222    ///   stream, last character included.
223    /// * `partial_end` is the position of the last character where the JSON
224    ///    string could have ended.
225    fn print(&self, writer: &mut Writer) -> Result<(), errors::Err> {
226        let w = writer.mut_ref();
227
228        let status = match self.status {
229            Cause::Exhausted => "exhausted",
230            Cause::Corrupted(_) => "corrupted",
231            Cause::Completed => "completed",
232            _ => unreachable!(),
233        };
234        w.write_all(
235            format!(
236                "{},{},{},{}\n",
237                status, self.start, self.end, self.partial_end
238            )
239            .as_ref(),
240        )?;
241        Ok(())
242    }
243}
244
245#[derive(Debug)]
246struct JsonTracker {
247    cur: usize,
248    partial_close_end: usize,
249    ident_levels: Vec<u8>,
250    cur_ident_level: usize,
251    in_key: bool,
252    processed: Vec<u8>,
253    replace_newlines: bool,
254}
255
256impl JsonTracker {
257    fn new(max_size: Option<usize>, max_ident_depth: Option<usize>) -> JsonTracker {
258        let _max_size = match max_size {
259            Some(size) => size,
260            None => BUF_EXTEND_SIZE,
261        };
262
263        let _max_ident_depth = match max_ident_depth {
264            Some(size) => size,
265            None => DEFAULT_MAX_IDENT_DEPTH,
266        };
267        JsonTracker {
268            cur: 0,
269            partial_close_end: 0,
270            ident_levels: vec![0u8; _max_ident_depth],
271            cur_ident_level: 0,
272            in_key: false,
273            processed: vec![0u8; _max_size],
274            replace_newlines: false,
275        }
276    }
277
278    fn advance(&mut self, mut b: u8) {
279        // See how ripgrep handles the "very large lines" problem:
280        // https://github.com/BurntSushi/ripgrep/issues/2959
281        // FIXME: Handle the case where we are asked to advance, but there is
282        // no identation level remaining.
283        //
284        if self.replace_newlines && b == CHAR_NEWLINE {
285            b = CHAR_SPACE;
286        }
287        if self.cur < self.processed.len() {
288            self.processed[self.cur] = b;
289        } else {
290            self.processed.reserve(BUF_EXTEND_SIZE);
291            self.processed.push(b);
292        }
293        self.cur += 1;
294    }
295
296    fn last_byte(&self) -> Option<u8> {
297        if self.cur == 0 {
298            return None;
299        }
300
301        Some(self.processed[self.cur - 1])
302    }
303
304    fn last_ident(&self) -> Option<u8> {
305        if self.cur_ident_level == 0 {
306            return None;
307        }
308
309        Some(self.ident_levels[self.cur_ident_level - 1])
310    }
311
312    fn add_ident(&mut self, b: u8) {
313        self.cur_ident_level += 1;
314        self.ident_levels[self.cur_ident_level - 1] = b;
315        self.partial_close_end = self.cur;
316        self.advance(b);
317    }
318
319    fn remove_ident(&mut self, expected: u8) -> Result<bool, ()> {
320        match self.last_ident() {
321            None => return Err(()),
322            Some(ident) => {
323                if ident != expected {
324                    return Err(());
325                }
326            }
327        }
328
329        self.partial_close_end = self.cur;
330        self.cur_ident_level -= 1;
331        self.advance(_closing_ident(expected)); // That's the closing bracket.
332
333        match self.cur_ident_level {
334            0 => Ok(true),
335            _ => Ok(false),
336        }
337    }
338
339    fn quick_clean(&mut self) -> () {
340        self.cur = 0;
341        self.partial_close_end = 0;
342        self.cur_ident_level = 0;
343        self.in_key = false;
344    }
345}
346
/// Source of the byte stream that is searched for JSON strings.
pub enum Reader<'a> {
    /// Reads from a file through a buffer.
    File(BufReader<File>),
    /// Reads from the locked standard input of the process.
    Stdin(StdinLock<'a>),
    /// Reads from an in-memory byte slice.
    Local(BufReader<&'a [u8]>),
}

impl<'a> Reader<'a> {
    /// Wrap `file` in a buffered reader.
    ///
    /// `buf_size` sets the capacity of the buffer; `None` keeps the default
    /// capacity of `BufReader`.
    pub fn from_file(file: File, buf_size: Option<usize>) -> Reader<'a> {
        let buffered = if let Some(capacity) = buf_size {
            BufReader::with_capacity(capacity, file)
        } else {
            BufReader::new(file)
        };
        Reader::File(buffered)
    }

    /// Lock the standard input of the process and read from it.
    pub fn from_stdin() -> Reader<'a> {
        let locked = io::stdin().lock();
        Reader::Stdin(locked)
    }

    /// Borrow the wrapped reader as a `BufRead` trait object, whichever
    /// variant this is.
    fn mut_ref(&mut self) -> &mut dyn BufRead {
        // Matching on `self` binds each payload as `&mut`, which then coerces
        // to the trait object:
        // https://users.rust-lang.org/t/why-ref-mut-and-not-mut-in-enum-matching/95721/8
        match self {
            Self::File(inner) => inner,
            Self::Stdin(inner) => inner,
            Self::Local(inner) => inner,
        }
    }
}
381
/// Destination for carved JSON strings or for status reports.
pub enum Writer<'a> {
    /// Writes to a file through a buffer.
    File(BufWriter<File>),
    /// Writes to the locked standard output of the process.
    Stdout(StdoutLock<'a>),
    /// Writes to the locked standard error of the process.
    Stderr(StderrLock<'a>),
    /// Writes to an in-memory buffer.
    Local(BufWriter<Vec<u8>>),
}

impl<'a> Writer<'a> {
    /// Wrap `file` in a buffered writer.
    ///
    /// `buf_size` sets the capacity of the buffer; `None` keeps the default
    /// capacity of `BufWriter`.
    pub fn to_file(file: File, buf_size: Option<usize>) -> Writer<'a> {
        let buffered = if let Some(capacity) = buf_size {
            BufWriter::with_capacity(capacity, file)
        } else {
            BufWriter::new(file)
        };
        Writer::File(buffered)
    }

    /// Lock the standard output of the process and write to it.
    pub fn to_stdout() -> Writer<'a> {
        let locked = io::stdout().lock();
        Writer::Stdout(locked)
    }

    /// Lock the standard error of the process and write to it.
    pub fn to_stderr() -> Writer<'a> {
        let locked = io::stderr().lock();
        Writer::Stderr(locked)
    }

    /// Borrow the wrapped writer as a `Write` trait object, whichever variant
    /// this is.
    fn mut_ref(&mut self) -> &mut dyn Write {
        // Matching on `self` binds each payload as `&mut`, which then coerces
        // to the trait object:
        // https://users.rust-lang.org/t/why-ref-mut-and-not-mut-in-enum-matching/95721/8
        match self {
            Writer::File(inner) => inner,
            Writer::Stdout(inner) => inner,
            Writer::Stderr(inner) => inner,
            Writer::Local(inner) => inner,
        }
    }
}
424
/// The Carver struct is responsible for carving JSON strings out of the
/// provided reader, and for writing the carved strings and the status reports
/// to the provided writers.
pub struct Carver<'a> {
    /// Bookkeeping for the JSON string that is currently being carved.
    jt: JsonTracker,
    /// Byte stream that is searched for JSON strings.
    reader: Reader<'a>,
    /// Destination for the carved JSON strings, one per line.
    json_writer: Writer<'a>,
    /// Destination for the status reports.
    report_writer: Writer<'a>,
    /// The minimum size, in bytes, of the JSON string that will be reported.
    pub min_size: usize,
    /// Whether to attempt to fix incomplete JSON strings, by writing them up
    /// to their last bracket and appending the missing closing brackets.
    pub fix_incomplete: bool,
    /// Whether to report every detected JSON, not just corrupted ones.
    pub report_all: bool,
}
439
440impl<'a> Carver<'a> {
441    /// Create a new `Carver` instance from the provided `Reader` and `Writer`
442    /// instances.
443    pub fn new(
444        reader: Reader<'a>,
445        json_writer: Writer<'a>,
446        report_writer: Writer<'a>,
447        max_size: Option<usize>,
448        max_ident_depth: Option<usize>,
449    ) -> Self {
450        Carver {
451            jt: JsonTracker::new(max_size, max_ident_depth),
452            reader: reader,
453            json_writer: json_writer,
454            report_writer: report_writer,
455            min_size: DEFAULT_MIN_JSON_SIZE,
456            fix_incomplete: false,
457            report_all: false,
458        }
459    }
460
461    /// Configure whether to replace newlines in JSON strings or not.
462    pub fn replace_newlines(&mut self, opt: bool) {
463        self.jt.replace_newlines = opt;
464    }
465
466    /// Basically skip_until(), if it could search for two bytes instead of
467    /// one. Here, we mimic its behavior, using the memchr crate, since the
468    /// internal memchr is not stable yet.
469    ///
470    /// The end product of this method is that the next read from the buffer
471    /// should return the character we looked for.
472    fn scout(&mut self) -> Result<Option<(usize, u8)>, io::Error> {
473        let mut read = 0;
474        let mut ch = 0;
475        let r = self.reader.mut_ref();
476        loop {
477            let (done, used) = {
478                let available = match r.fill_buf() {
479                    Ok(n) => n,
480                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
481                    Err(e) => return Err(e),
482                };
483                match memchr::memchr2(CHAR_LEFT_SQUARE_BRACKET, CHAR_LEFT_CURLY_BRACKET, available)
484                {
485                    Some(i) => {
486                        // The only difference from skip_until is that we want
487                        // to retain the last character.
488                        ch = available[i];
489                        (true, i + 1)
490                    }
491                    None => (false, available.len()),
492                }
493            };
494            r.consume(used);
495            read += used;
496            if done {
497                return Ok(Some((read, ch)));
498            }
499            if used == 0 {
500                return Ok(None);
501            }
502        }
503    }
504
505    fn handle_left_square_bracket(&mut self) -> Result<Cause, io::Error> {
506        self.jt.add_ident(CHAR_LEFT_SQUARE_BRACKET);
507        for b in self.reader.mut_ref().bytes() {
508            let b = b?;
509            match b {
510                CHAR_LEFT_SQUARE_BRACKET
511                | CHAR_LEFT_CURLY_BRACKET
512                | CHAR_RIGHT_SQUARE_BRACKET
513                | CHAR_QUOT_MARK
514                | CHAR_MINUS
515                | CHAR_ZERO..=CHAR_NINE
516                | CHAR_START_FALSE
517                | CHAR_START_NULL
518                | CHAR_START_TRUE => return Ok(Cause::Found(b)),
519                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => self.jt.advance(b),
520                _ => return Ok(Cause::Corrupted(b)),
521            }
522        }
523        Ok(Cause::Exhausted)
524    }
525
526    fn handle_left_curly_bracket(&mut self) -> Result<Cause, io::Error> {
527        self.jt.add_ident(CHAR_LEFT_CURLY_BRACKET);
528        for b in self.reader.mut_ref().bytes() {
529            let b = b?;
530            match b {
531                CHAR_QUOT_MARK => {
532                    self.jt.in_key = true;
533                    return Ok(Cause::Found(b));
534                }
535                CHAR_RIGHT_CURLY_BRACKET => return Ok(Cause::Found(b)),
536                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => {
537                    self.jt.advance(b);
538                }
539                _ => return Ok(Cause::Corrupted(b)),
540            }
541        }
542        Ok(Cause::Exhausted)
543    }
544
545    fn handle_right_square_bracket(&mut self) -> Result<Cause, io::Error> {
546        match self.jt.remove_ident(CHAR_LEFT_SQUARE_BRACKET) {
547            Ok(true) => return Ok(Cause::Completed),
548            Ok(false) => (),
549            Err(_) => return Ok(Cause::Corrupted(CHAR_RIGHT_SQUARE_BRACKET)),
550        }
551
552        for b in self.reader.mut_ref().bytes() {
553            let b = b?;
554            match b {
555                CHAR_COMMA | CHAR_RIGHT_SQUARE_BRACKET | CHAR_RIGHT_CURLY_BRACKET => {
556                    return Ok(Cause::Found(b))
557                }
558                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => self.jt.advance(b),
559                _ => return Ok(Cause::Corrupted(b)),
560            }
561        }
562        Ok(Cause::Exhausted)
563    }
564
565    fn handle_right_curly_bracket(&mut self) -> Result<Cause, io::Error> {
566        match self.jt.remove_ident(CHAR_LEFT_CURLY_BRACKET) {
567            Ok(true) => return Ok(Cause::Completed),
568            Ok(false) => (),
569            Err(_) => return Ok(Cause::Corrupted(CHAR_RIGHT_CURLY_BRACKET)),
570        }
571
572        for b in self.reader.mut_ref().bytes() {
573            let b = b?;
574            match b {
575                CHAR_COMMA | CHAR_RIGHT_SQUARE_BRACKET | CHAR_RIGHT_CURLY_BRACKET => {
576                    return Ok(Cause::Found(b))
577                }
578                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => self.jt.advance(b),
579                _ => return Ok(Cause::Corrupted(b)),
580            }
581        }
582        Ok(Cause::Exhausted)
583    }
584
585    fn handle_colon(&mut self) -> Result<Cause, io::Error> {
586        self.jt.in_key = false;
587        self.jt.advance(CHAR_COLON);
588        for b in self.reader.mut_ref().bytes() {
589            let b = b?;
590            match b {
591                CHAR_LEFT_CURLY_BRACKET
592                | CHAR_LEFT_SQUARE_BRACKET
593                | CHAR_MINUS
594                | CHAR_ZERO..=CHAR_NINE
595                | CHAR_QUOT_MARK
596                | CHAR_START_FALSE
597                | CHAR_START_NULL
598                | CHAR_START_TRUE => return Ok(Cause::Found(b)),
599                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => self.jt.advance(b),
600                _ => return Ok(Cause::Corrupted(b)),
601            }
602        }
603        Ok(Cause::Exhausted)
604    }
605
606    fn handle_comma(&mut self) -> Result<Cause, io::Error> {
607        self.jt.advance(CHAR_COMMA);
608        match self.jt.last_ident() {
609            Some(CHAR_LEFT_SQUARE_BRACKET) => {
610                for b in self.reader.mut_ref().bytes() {
611                    let b = b?;
612                    match b {
613                        CHAR_LEFT_CURLY_BRACKET
614                        | CHAR_LEFT_SQUARE_BRACKET
615                        | CHAR_MINUS
616                        | CHAR_ZERO..=CHAR_NINE
617                        | CHAR_QUOT_MARK
618                        | CHAR_START_FALSE
619                        | CHAR_START_NULL
620                        | CHAR_START_TRUE => return Ok(Cause::Found(b)),
621                        CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => {
622                            self.jt.advance(b)
623                        }
624                        _ => return Ok(Cause::Corrupted(b)),
625                    }
626                }
627                Ok(Cause::Exhausted)
628            }
629            Some(CHAR_LEFT_CURLY_BRACKET) => {
630                for b in self.reader.mut_ref().bytes() {
631                    let b = b?;
632                    match b {
633                        CHAR_QUOT_MARK => {
634                            self.jt.in_key = true;
635                            return Ok(Cause::Found(b));
636                        }
637                        CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => {
638                            self.jt.advance(b)
639                        }
640                        _ => return Ok(Cause::Corrupted(b)),
641                    }
642                }
643                Ok(Cause::Exhausted)
644            }
645            Some(_) => unreachable!(), // FIXME: Ensure that this is indeed unreachable.
646            None => unreachable!(),    // FIXME: Ensure that this is indeed unreachable.
647        }
648    }
649
650    fn handle_string(&mut self) -> Result<Cause, io::Error> {
651        self.jt.advance(CHAR_QUOT_MARK);
652        let mut in_string = true;
653        let mut in_escape = false;
654        let mut in_escaped_unicode = 0;
655
656        for b in self.reader.mut_ref().bytes() {
657            let b = b?;
658            // We're at least one indentation level deep when parsing strings,
659            // so we can safely unwrap().
660            let last_ident = self.jt.last_ident().unwrap();
661
662            if !in_string {
663                match (b, last_ident, self.jt.in_key) {
664                    (CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN, _, _) => (),
665                    // Case 1: A string value in a JSON list: ["test", "1"]
666                    (CHAR_COMMA | CHAR_RIGHT_SQUARE_BRACKET, CHAR_LEFT_SQUARE_BRACKET, _) => {
667                        return Ok(Cause::Found(b))
668                    }
669                    // Case 2: A value in a JSON object: {"test": "yes", "pain": "right"}
670                    (CHAR_COMMA | CHAR_RIGHT_CURLY_BRACKET, CHAR_LEFT_CURLY_BRACKET, false) => {
671                        return Ok(Cause::Found(b))
672                    }
673                    // Case 3: A key in a JSON object: {"test": 1, "pain": true}
674                    (CHAR_COLON, CHAR_LEFT_CURLY_BRACKET, true) => return Ok(Cause::Found(b)),
675                    (_, _, _) => return Ok(Cause::Corrupted(b)),
676                }
677            } else {
678                match (b, in_escape, in_escaped_unicode) {
679                    (CHAR_ESCAPE, false, 0) => in_escape = true,
680                    (CHAR_QUOT_MARK, false, 0) => in_string = false,
681                    (0x00..0x1F, _, _) => return Ok(Cause::Corrupted(b)),
682                    (_, false, 0) => {
683                        if byte_needs_escape(b) {
684                            return Ok(Cause::Corrupted(b));
685                        }
686                    }
687                    (CHAR_U, true, 0) => {
688                        in_escaped_unicode = 4;
689                        in_escape = false;
690                    }
691                    (_, true, 0) => {
692                        if byte_can_escape(b) {
693                            in_escape = false;
694                        } else {
695                            return Ok(Cause::Corrupted(b));
696                        }
697                    }
698                    (_, _, 1..=4) => {
699                        if b.is_ascii_hexdigit() {
700                            in_escaped_unicode -= 1;
701                        } else {
702                            return Ok(Cause::Corrupted(b));
703                        }
704                    }
705                    (_, _, _) => return Ok(Cause::Corrupted(b)),
706                }
707            }
708            self.jt.advance(b);
709        }
710        Ok(Cause::Exhausted)
711    }
712
713    fn handle_number(&mut self, start_num: u8) -> Result<Cause, io::Error> {
714        self.jt.advance(start_num);
715        let mut in_frac = false;
716        let mut in_exp = false;
717        let mut in_leading_zero: Option<bool> = None;
718
719        for b in self.reader.mut_ref().bytes() {
720            let b = b?;
721            // We've processed at least two bytes in order to be here, so we
722            // can safely unwrap().
723            let last_byte = self.jt.last_byte().unwrap();
724
725            // Check for leading zeroes.
726            //
727            // A leading zero can be preceeded by a minus sign (-), but cannot
728            // be followed by digits.
729            if in_leading_zero == None {
730                in_leading_zero = match last_byte {
731                    CHAR_MINUS => None,
732                    CHAR_ZERO => Some(true),
733                    _ => Some(false),
734                }
735            }
736            if in_leading_zero == Some(true) {
737                in_leading_zero = match b {
738                    CHAR_ZERO..=CHAR_NINE => return Ok(Cause::Corrupted(b)),
739                    _ => Some(false),
740                }
741            }
742
743            match (last_byte, b) {
744                // Only numbers can follow +/-/..
745                (CHAR_MINUS | CHAR_PLUS | CHAR_DECIMAL, CHAR_ZERO..=CHAR_NINE) => (),
746                // Only numbers or +/- can follow exponent signs.
747                (
748                    CHAR_EXP_LOWER | CHAR_EXP_UPPER,
749                    CHAR_ZERO..=CHAR_NINE | CHAR_MINUS | CHAR_PLUS,
750                ) => (),
751                // Digits, insignificant whitespace, or ,]} can *always* follow
752                // digits.
753                (
754                    CHAR_ZERO..=CHAR_NINE,
755                    CHAR_ZERO..=CHAR_NINE | CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE,
756                ) => (),
757                // Decimal points can follow numbers if we're not in a
758                // fractional/exponent part already.
759                (CHAR_ZERO..=CHAR_NINE, CHAR_DECIMAL) => match (in_frac, in_exp) {
760                    (true, _) | (_, true) => return Ok(Cause::Corrupted(b)),
761                    (false, _) => in_frac = true,
762                },
763                // Exponent signs can follow numbers if we're not in a exponent
764                // part already.
765                (CHAR_ZERO..=CHAR_NINE, CHAR_EXP_LOWER | CHAR_EXP_UPPER) => match in_exp {
766                    true => return Ok(Cause::Corrupted(b)),
767                    false => in_exp = true,
768                },
769                // Numbers are complete only if digits and insignificant
770                // whitespace are followed by ,]}.
771                (
772                    CHAR_SPACE
773                    | CHAR_TAB
774                    | CHAR_NEWLINE
775                    | CHAR_CARRIAGE_RETURN
776                    | CHAR_ZERO..=CHAR_NINE,
777                    CHAR_COMMA | CHAR_RIGHT_SQUARE_BRACKET | CHAR_RIGHT_CURLY_BRACKET,
778                ) => return Ok(Cause::Found(b)),
779                // Everything else is not permitted.
780                (_, _) => return Ok(Cause::Corrupted(b)),
781            }
782            self.jt.advance(b);
783        }
784        Ok(Cause::Exhausted)
785    }
786
787    fn handle_literal(&mut self, start_char: u8) -> Result<Cause, io::Error> {
788        self.jt.advance(start_char);
789        let literal: &[u8] = match start_char {
790            CHAR_START_FALSE => "alse".as_bytes(),
791            CHAR_START_NULL => "ull".as_bytes(),
792            CHAR_START_TRUE => "rue".as_bytes(),
793            _ => unreachable!(),
794        };
795
796        for (i, b) in self.reader.mut_ref().bytes().enumerate() {
797            let b = b?;
798            if literal[i] != b {
799                return Ok(Cause::Corrupted(b));
800            }
801            self.jt.advance(b);
802            if literal.len() == i + 1 {
803                break;
804            }
805        }
806
807        for b in self.reader.mut_ref().bytes() {
808            let b = b?;
809            match b {
810                CHAR_COMMA | CHAR_RIGHT_SQUARE_BRACKET | CHAR_RIGHT_CURLY_BRACKET => {
811                    return Ok(Cause::Found(b));
812                }
813                CHAR_SPACE | CHAR_TAB | CHAR_NEWLINE | CHAR_CARRIAGE_RETURN => self.jt.advance(b),
814                _ => return Ok(Cause::Corrupted(b)),
815            }
816        }
817        Ok(Cause::Exhausted)
818    }
819
820    fn hunt(&mut self, mut ch: u8) -> Result<Cause, ()> {
821        loop {
822            let res = match ch {
823                CHAR_LEFT_SQUARE_BRACKET => self.handle_left_square_bracket(),
824                CHAR_LEFT_CURLY_BRACKET => self.handle_left_curly_bracket(),
825                CHAR_RIGHT_SQUARE_BRACKET => self.handle_right_square_bracket(),
826                CHAR_RIGHT_CURLY_BRACKET => self.handle_right_curly_bracket(),
827                CHAR_COLON => self.handle_colon(),
828                CHAR_COMMA => self.handle_comma(),
829                CHAR_QUOT_MARK => self.handle_string(),
830                CHAR_MINUS | CHAR_ZERO..=CHAR_NINE => self.handle_number(ch),
831                CHAR_START_FALSE | CHAR_START_NULL | CHAR_START_TRUE => self.handle_literal(ch),
832                _ => {
833                    return Err(());
834                }
835            };
836
837            ch = match res {
838                Ok(Cause::Completed) => {
839                    return Ok(Cause::Completed);
840                }
841                Ok(Cause::Found(ch)) => ch,
842                Ok(Cause::Corrupted(ch)) => {
843                    return Ok(Cause::Corrupted(ch));
844                }
845                Ok(Cause::Exhausted) => {
846                    return Ok(Cause::Exhausted);
847                }
848                Err(_) => {
849                    return Err(()); // FIXME: Capture this error
850                }
851            }
852        }
853    }
854
855    fn _print_incomplete(&mut self) -> Result<(), errors::Err> {
856        let w = self.json_writer.mut_ref();
857        w.write_all(&self.jt.processed[..self.jt.partial_close_end + 1])?;
858        for i in (0..self.jt.cur_ident_level).rev() {
859            let closing_ident = _closing_ident(self.jt.ident_levels[i]);
860            w.write_all(&[closing_ident])?;
861        }
862        w.write_all(&[CHAR_NEWLINE])?;
863        Ok(())
864    }
865
866    /// Start carving a stream of data for JSON strings.
867    pub fn parse(&mut self) -> Result<(), errors::Err> {
868        let mut start = 0;
869        let mut lastb: Option<u8> = None;
870
871        loop {
872            let (read, ch) = match lastb {
873                Some(CHAR_LEFT_CURLY_BRACKET) | Some(CHAR_LEFT_SQUARE_BRACKET) => {
874                    // we can safely unwrap() because we're in Some()
875                    (0, lastb.unwrap())
876                }
877                _ => match self.scout() {
878                    Ok(None) => {
879                        break;
880                    }
881                    Ok(Some((read, ch))) => (read, ch),
882                    Err(_) => {
883                        break;
884                    }
885                },
886            };
887            start = start + read - 1;
888            if lastb.is_some() {
889                start += 1;
890            }
891
892            match self.hunt(ch) {
893                Ok(Cause::Completed) => {
894                    let end = start + self.jt.cur - 1;
895                    let w = self.json_writer.mut_ref();
896                    if self.jt.cur >= self.min_size {
897                        w.write_all(&self.jt.processed[..self.jt.cur])?;
898                        w.write_all(&[CHAR_NEWLINE])?;
899                        if self.report_all {
900                            let report = Report {
901                                status: Cause::Completed,
902                                start: start,
903                                end: end,
904                                partial_end: end,
905                            };
906                            report.print(&mut self.report_writer)?;
907                        }
908                    }
909                    start = end + 1;
910                    lastb = None;
911                }
912                Ok(Cause::Corrupted(ch)) => {
913                    let corrupted_end = start + self.jt.cur - 1;
914                    let partial_end = start + self.jt.partial_close_end;
915                    if self.jt.partial_close_end >= self.min_size {
916                        let report = Report {
917                            status: Cause::Corrupted(ch),
918                            start: start,
919                            end: corrupted_end,
920                            partial_end: partial_end,
921                        };
922                        report.print(&mut self.report_writer)?;
923                        if self.fix_incomplete {
924                            self._print_incomplete()?
925                        }
926                    }
927                    start = corrupted_end + 1;
928                    lastb = Some(ch);
929                }
930                Ok(Cause::Exhausted) => {
931                    let corrupted_end = start + self.jt.cur - 1;
932                    let partial_end = start + self.jt.partial_close_end;
933                    if self.jt.partial_close_end >= self.min_size {
934                        let report = Report {
935                            status: Cause::Exhausted,
936                            start: start,
937                            end: corrupted_end,
938                            partial_end: partial_end,
939                        };
940                        report.print(&mut self.report_writer)?;
941                        if self.fix_incomplete {
942                            self._print_incomplete()?
943                        }
944                    }
945                    break;
946                }
947                Ok(Cause::Found(_)) => unreachable!(),
948                Err(_) => {
949                    break;
950                }
951            };
952            self.jt.quick_clean();
953        }
954        Ok(())
955    }
956}
957
958#[cfg(test)]
959mod tests {
960    use rstest::rstest;
961    use std::fs;
962    use std::path::PathBuf;
963
964    use super::*;
965
966    fn create_carver<'a>(buf: &'a [u8]) -> Carver<'a> {
967        let reader = BufReader::new(buf);
968        let json_writer = BufWriter::new(vec![]);
969        let report_writer = BufWriter::new(vec![]);
970        let mut carver = Carver::new(
971            Reader::Local(reader),
972            Writer::Local(json_writer),
973            Writer::Local(report_writer),
974            None,
975            None,
976        );
977        carver.min_size = 0;
978        carver
979    }
980
981    fn get_buf(writer: &Writer) -> Vec<u8> {
982        let mut res_buf = match writer {
983            Writer::Local(w) => w.buffer().to_vec(),
984            _ => unreachable!(),
985        };
986        if res_buf.last() == Some(&CHAR_NEWLINE) {
987            res_buf.pop();
988        }
989        res_buf
990    }
991
992    /// Parse buffer and return the string that is printed.
993    fn parse(buf: &[u8]) -> Vec<u8> {
994        let buf_disp = String::from_utf8_lossy(buf);
995        eprintln!("### Evaluating buffer: {buf_disp}");
996        let mut carver = create_carver(buf);
997        let res = carver.parse();
998        assert!(res.is_ok());
999        let res_buf = get_buf(&carver.json_writer);
1000        let res_buf_disp = String::from_utf8_lossy(&res_buf);
1001        eprintln!("### Result is: {res_buf_disp}");
1002        res_buf
1003    }
1004
1005    fn report_incomplete(buf: &[u8], fix: bool) -> (Vec<u8>, Vec<u8>) {
1006        let buf_disp = String::from_utf8_lossy(buf);
1007        eprintln!("### Evaluating buffer: {buf_disp}");
1008        let mut carver = create_carver(buf);
1009        carver.fix_incomplete = fix;
1010        let res = carver.parse();
1011        assert!(res.is_ok());
1012        let json_buf = get_buf(&carver.json_writer);
1013        let report_buf = get_buf(&carver.report_writer);
1014        let json_buf_disp = String::from_utf8_lossy(&json_buf);
1015        let report_buf_disp = String::from_utf8_lossy(&report_buf);
1016        eprintln!("### Result is: {json_buf_disp}");
1017        eprintln!("### Report is: {report_buf_disp}");
1018        (json_buf, report_buf)
1019    }
1020
1021    /// Parse buffer and return a list of strings that are printed, delimited
1022    /// by newlines.
1023    fn collect(buf: &[u8]) -> Vec<String> {
1024        let buf: Vec<u8> = parse(buf);
1025        let s: String = String::from_utf8(buf)
1026            .unwrap()
1027            .trim_end_matches("\n")
1028            .to_string();
1029        let mut v: Vec<String> = vec![];
1030        for line in s.lines() {
1031            v.push(line.to_owned())
1032        }
1033        v
1034    }
1035
1036    #[test]
1037    fn test_parse_found() {
1038        let buf = "{}";
1039        assert_eq!(collect(buf.as_bytes()), [buf]);
1040        let buf = "[{}]";
1041        assert_eq!(collect(buf.as_bytes()), [buf]);
1042        let buf = "{ {} ]";
1043        assert_eq!(collect(buf.as_bytes()), ["{}"]);
1044        let buf = "{    []";
1045        assert_eq!(collect(buf.as_bytes()), ["[]"]);
1046        let buf = "hey\n{[]}";
1047        assert_eq!(collect(buf.as_bytes()), ["[]"]);
1048        let buf = "hey";
1049        assert_eq!(collect(buf.as_bytes()), vec![] as Vec<String>);
1050        let buf = "[[[[[[[{}]]]]]]]";
1051        assert_eq!(collect(buf.as_bytes()), [buf]);
1052        let buf = "I[{}]want[[]]moar";
1053        assert_eq!(collect(buf.as_bytes()), ["[{}]", "[[]]"]);
1054        let buf = r#"{"hey": "there"}"#;
1055        assert_eq!(collect(buf.as_bytes()), [buf]);
1056        let buf = r#"{"hey": "there"}{"how": "are", "you": "doing?"}"#;
1057        assert_eq!(
1058            collect(buf.as_bytes()),
1059            [r#"{"hey": "there"}"#, r#"{"how": "are", "you": "doing?"}"#,]
1060        );
1061        let buf = r#"["test", ["nested", {"json": "objs"}]]"#;
1062        assert_eq!(collect(buf.as_bytes()), [buf]);
1063        let buf = r#"[1, 2]"#;
1064        assert_eq!(collect(buf.as_bytes()), [buf]);
1065        let buf = r#"[1, {"test": -2}]"#;
1066        assert_eq!(collect(buf.as_bytes()), [buf]);
1067        let buf = r#"[1]{[-9]test: 9}"#;
1068        assert_eq!(collect(buf.as_bytes()), ["[1]", "[-9]"]);
1069        let buf = r#"{"numbers": 9, "literals": true, "lists": ["1", false, {}]}"#;
1070        assert_eq!(collect(buf.as_bytes()), [buf]);
1071        let buf = r#"[trap, [nullify, 1], {"true": true}]"#;
1072        assert_eq!(collect(buf.as_bytes()), [r#"{"true": true}"#]);
1073        let buf = r#"[1]{"key":"val":  [2],[fal[3]]]"#;
1074        assert_eq!(collect(buf.as_bytes()), ["[1]", "[2]", "[3]"]);
1075    }
1076
1077    #[test]
1078    fn test_parse_fail() {
1079        let bad_buffers: Vec<&str> = vec![
1080            "hey",
1081            r#"{"hey", "there"}"#,
1082            "{:}",
1083            "{]}",
1084            r#"{9: "9"}"#,
1085            r#"{"more": "colons": "bad"}"#,
1086            r#"{"test":, "bad"}"#,
1087            r#"[:]"#,
1088            r#"["a", "b",]"#,
1089            r#"["a", "b", {": "test"}]"#,
1090            "999",
1091            r#"{999: "666"}"#,
1092            r#"[999: "666"]"#,
1093            r#"[999   , ]"#,
1094            r#"[trap]"#,
1095            r#"[nullify]"#,
1096            r#"{true: false}"#,
1097            r#"[true"#,
1098            r#"[false"#,
1099            r#"[null"#,
1100            r#"[9"#,
1101            r#"["test"#,
1102            r#"["test""#,
1103            "[{",
1104            "[",
1105            "{",
1106        ];
1107        for buf in bad_buffers {
1108            assert_eq!(collect(buf.as_bytes()), vec![] as Vec<String>);
1109        }
1110    }
1111
1112    #[test]
1113    fn test_report_incomplete() {
1114        let buf = "{";
1115        let buf_expected = "{}";
1116        let report_expected = "exhausted,0,0,0";
1117        let (buf, report) = report_incomplete(buf.as_bytes(), true);
1118        assert_eq!(buf, buf_expected.as_bytes());
1119        assert_eq!(report, report_expected.as_bytes());
1120
1121        let buf = "[{[{[[";
1122        let buf_expected = "[{}]\n\
1123                            [{}]\n\
1124                            [[]]";
1125        let report_expected = "corrupted,0,1,1\n\
1126                               corrupted,2,3,3\n\
1127                               exhausted,4,5,5";
1128        let (buf, report) = report_incomplete(buf.as_bytes(), true);
1129        assert_eq!(buf, buf_expected.as_bytes());
1130        assert_eq!(report, report_expected.as_bytes());
1131
1132        let buf = r#"{"test": {"inside": [1, 2]"#;
1133        let buf_expected = r#"{"test": {"inside": [1, 2]}}"#;
1134        let report_expected = "exhausted,0,25,25";
1135        let (buf, report) = report_incomplete(buf.as_bytes(), true);
1136        assert_eq!(buf, buf_expected.as_bytes());
1137        assert_eq!(report, report_expected.as_bytes());
1138
1139        let buf = r#"[1, 2, 3, {"test"[true, null, far{"key": "value",[9]"#;
1140        let buf_expected = "[1, 2, 3, {}]\n\
1141                            []\n\
1142                            {}\n\
1143                            [9]";
1144        let report_expected = "corrupted,0,16,10\n\
1145                               corrupted,17,31,17\n\
1146                               corrupted,33,48,33";
1147        let (buf, report) = report_incomplete(buf.as_bytes(), true);
1148        assert_eq!(buf, buf_expected.as_bytes());
1149        assert_eq!(report, report_expected.as_bytes());
1150
1151        let buf = r#"[1]{"key":"val":  [2],[fal[3]]]"#;
1152        let buf_expected = "[1]\n\
1153                            {}\n\
1154                            [2]\n\
1155                            []\n\
1156                            [3]";
1157        let report_expected = "corrupted,3,14,3\n\
1158                               corrupted,22,25,22";
1159        let (buf, report) = report_incomplete(buf.as_bytes(), true);
1160        assert_eq!(buf, buf_expected.as_bytes());
1161        assert_eq!(report, report_expected.as_bytes());
1162    }
1163
1164    #[rstest]
1165    fn json_test_suite_success(#[files("tests/JSONTestSuite/**/y_*.json")] path: PathBuf) {
1166        let buf: Vec<u8> = fs::read(path).unwrap();
1167        let mut res_buf = buf.clone();
1168        if res_buf.last() == Some(&CHAR_NEWLINE) {
1169            res_buf.pop();
1170        }
1171        assert_eq!(parse(&buf), res_buf);
1172    }
1173
1174    #[rstest]
1175    fn json_test_suite_impl(#[files("tests/JSONTestSuite/**/i_*.json")] path: PathBuf) {
1176        let buf: Vec<u8> = fs::read(path).unwrap();
1177        parse(&buf);
1178    }
1179
1180    #[rstest]
1181    fn json_test_suite_fail(
1182        #[files("tests/JSONTestSuite/**/n_*.json")]
1183        #[files("tests/test_valid_but_no_brackets/*.json")]
1184        path: PathBuf,
1185    ) {
1186        let buf: Vec<u8> = fs::read(path).unwrap();
1187        assert_eq!(parse(&buf).len(), 0);
1188    }
1189
1190    #[rstest]
1191    fn json_test_suite_partial(#[files("tests/test_partial/*.json")] path: PathBuf) {
1192        let buf: Vec<u8> = fs::read(path).unwrap();
1193        let res = parse(&buf);
1194        assert!(res.len() > 0);
1195        assert_ne!(res, buf);
1196    }
1197}