json_strip_comments/
lib.rs

1//! Replace json comments and trailing commas in place.
2//!
3//! A fork of a fork:
4//!
5//! * <https://github.com/tmccombs/json-comments-rs>
6//! * <https://github.com/parcel-bundler/parcel/pull/9032>
7//!
//! `json-strip-comments` is a library to strip out comments from JSON. By processing text
//! through a [`StripComments`] adapter first, it is possible to use a standard JSON parser (such
//! as [serde_json](https://crates.io/crates/serde_json)) with quasi-JSON input that contains
//! comments.
12//!
13//! In fact, this code makes few assumptions about the input and could probably be used to strip
14//! comments out of other types of code as well, provided that strings use double quotes and
15//! backslashes are used for escapes in strings.
16//!
17//! The following types of comments are supported:
18//!   - C style block comments (`/* ... */`)
19//!   - C style line comments (`// ...`)
20//!   - Shell style line comments (`# ...`)
21//!
22//! ## Example
23//!
24//! ```rust
25#![doc = include_str!("../examples/example.rs")]
26//! ```
27
28use std::io::{ErrorKind, Read, Result};
29
/// Where the scanner currently is within the input.
///
/// The value is kept between buffers, so a string or a comment may start in
/// one buffer and finish in a later one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
enum State {
    /// Outside of any string or comment.
    Top,
    /// Inside a double-quoted string.
    InString,
    /// Right after a backslash inside a string; the next byte is skipped.
    StringEscape,
    /// Right after a top-level `/`; only `*` or `/` may follow.
    InComment,
    /// Inside a `/* ... */` block comment.
    InBlockComment,
    /// Inside a block comment, right after a `*` that may begin the closing `*/`.
    MaybeCommentEnd,
    /// Inside a `//` or `#` line comment, which runs up to the next newline.
    InLineComment,
}
41
42use State::{
43    InBlockComment, InComment, InLineComment, InString, MaybeCommentEnd, StringEscape, Top,
44};
45
46/// A [`Read`] that transforms another [`Read`] so that it changes all comments to spaces so that a downstream json parser
47/// (such as json-serde) doesn't choke on them.
48///
49/// The supported comments are:
50///   - C style block comments (`/* ... */`)
51///   - C style line comments (`// ...`)
52///   - Shell style line comments (`# ...`)
53///
54/// ## Example
55/// ```
56/// use json_strip_comments::StripComments;
57/// use std::io::Read;
58///
59/// let input = r#"{
60/// // c line comment
61/// "a": "comment in string /* a */",
62/// ## shell line comment
63/// } /** end */"#;
64///
65/// let mut stripped = String::new();
66/// StripComments::new(input.as_bytes()).read_to_string(&mut stripped).unwrap();
67///
68/// assert_eq!(stripped, "{
69///                  \n\"a\": \"comment in string /* a */\",
70///                     \n}           ");
71///
72/// ```
73///
74pub struct StripComments<T: Read> {
75    inner: T,
76    state: State,
77}
78
79impl<T> StripComments<T>
80where
81    T: Read,
82{
83    pub fn new(input: T) -> Self {
84        Self { inner: input, state: Top }
85    }
86}
87
88impl<T> Read for StripComments<T>
89where
90    T: Read,
91{
92    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
93        let count = self.inner.read(buf)?;
94        if count > 0 {
95            strip_buf(&mut self.state, &mut buf[..count])?;
96        } else if self.state != Top && self.state != InLineComment {
97            return Err(ErrorKind::InvalidData.into());
98        }
99        Ok(count)
100    }
101}
102
103/// Strips comments from a string in place, replacing it with whitespaces.
104///
105/// /// ## Example
106/// ```
107/// use json_strip_comments::strip_comments_in_place;
108///
109/// let mut string = String::from(r#"{
110/// // c line comment
111/// "a": "comment in string /* a */"
112/// ## shell line comment
113/// } /** end */"#);
114///
115/// strip_comments_in_place(&mut string).unwrap();
116///
117/// assert_eq!(string, "{
118///                  \n\"a\": \"comment in string /* a */\"
119///                     \n}           ");
120///
121/// ```
122pub fn strip_comments_in_place(s: &mut str) -> Result<()> {
123    // Safety: we have made sure the text is UTF-8
124    strip_buf(&mut Top, unsafe { s.as_bytes_mut() })
125}
126
127pub fn strip(s: &mut str) -> Result<()> {
128    strip_comments_in_place(s)
129}
130
131fn consume_comment_whitespace_until_maybe_bracket(
132    state: &mut State,
133    buf: &mut [u8],
134    i: &mut usize,
135) -> Result<bool> {
136    *i += 1;
137    let len = buf.len();
138    while *i < len {
139        let c = &mut buf[*i];
140        *state = match state {
141            Top => {
142                *state = top(c);
143                if c.is_ascii_whitespace() {
144                    *i += 1;
145                    continue;
146                }
147                return Ok(*c == b'}' || *c == b']');
148            }
149            InString => in_string(*c),
150            StringEscape => InString,
151            InComment => in_comment(c)?,
152            InBlockComment => consume_block_comments(buf, i),
153            MaybeCommentEnd => maybe_comment_end(c),
154            InLineComment => consume_line_comments(buf, i),
155        };
156        *i += 1;
157    }
158    Ok(false)
159}
160
161fn strip_buf(state: &mut State, buf: &mut [u8]) -> Result<()> {
162    let mut i = 0;
163    let len = buf.len();
164
165    // Fast path for Top state which is most common
166    while i < len {
167        let c = &mut buf[i];
168
169        match state {
170            Top => {
171                let cur = i;
172                let new_state = top(c);
173                if *c == b',' {
174                    let mut temp_state = new_state;
175                    if consume_comment_whitespace_until_maybe_bracket(&mut temp_state, buf, &mut i)?
176                    {
177                        buf[cur] = b' ';
178                    }
179                    *state = temp_state;
180                } else {
181                    *state = new_state;
182                }
183            }
184            InString => *state = in_string(*c),
185            StringEscape => *state = InString,
186            InComment => *state = in_comment(c)?,
187            InBlockComment => *state = consume_block_comments(buf, &mut i),
188            MaybeCommentEnd => *state = maybe_comment_end(c),
189            InLineComment => *state = consume_line_comments(buf, &mut i),
190        }
191
192        i += 1;
193    }
194    Ok(())
195}
196
197#[inline(always)]
198fn consume_line_comments(buf: &mut [u8], i: &mut usize) -> State {
199    let cur = *i;
200    let remaining = &buf[*i..];
201    match memchr::memchr(b'\n', remaining) {
202        Some(offset) => {
203            *i += offset;
204            buf[cur..*i].fill(b' ');
205            Top
206        }
207        None => {
208            let len = buf.len();
209            *i = len - 1;
210            buf[cur..len].fill(b' ');
211            InLineComment
212        }
213    }
214}
215
216#[inline(always)]
217fn consume_block_comments(buf: &mut [u8], i: &mut usize) -> State {
218    let cur = *i;
219    let remaining = &buf[*i..];
220    match memchr::memchr(b'*', remaining) {
221        Some(offset) => {
222            *i += offset;
223            buf[cur..=*i].fill(b' ');
224            MaybeCommentEnd
225        }
226        None => {
227            let len = buf.len();
228            *i = len - 1;
229            buf[cur..len].fill(b' ');
230            InBlockComment
231        }
232    }
233}
234
235#[inline(always)]
236fn top(c: &mut u8) -> State {
237    match *c {
238        b'"' => InString,
239        b'/' => {
240            *c = b' ';
241            InComment
242        }
243        b'#' => {
244            *c = b' ';
245            InLineComment
246        }
247        _ => Top,
248    }
249}
250
251#[inline(always)]
252fn in_string(c: u8) -> State {
253    match c {
254        b'"' => Top,
255        b'\\' => StringEscape,
256        _ => InString,
257    }
258}
259
260#[inline]
261fn in_comment(c: &mut u8) -> Result<State> {
262    let new_state = match *c {
263        b'*' => InBlockComment,
264        b'/' => InLineComment,
265        _ => return Err(ErrorKind::InvalidData.into()),
266    };
267    *c = b' ';
268    Ok(new_state)
269}
270
271#[inline]
272fn maybe_comment_end(c: &mut u8) -> State {
273    let old = *c;
274    *c = b' ';
275    match old {
276        b'/' => Top,
277        b'*' => MaybeCommentEnd,
278        _ => InBlockComment,
279    }
280}