json_strip_comments/
lib.rs

1//! Replace json comments and trailing commas in place.
2//!
3//! A fork of a fork:
4//!
5//! * <https://github.com/tmccombs/json-comments-rs>
6//! * <https://github.com/parcel-bundler/parcel/pull/9032>
7//!
//! `json-strip-comments` is a library to strip out comments from JSON. By processing text
//! through a [`StripComments`] adapter first, it is possible to use a standard JSON parser (such
//! as [serde_json](https://crates.io/crates/serde_json)) with quasi-json input that contains
//! comments.
12//!
13//! In fact, this code makes few assumptions about the input and could probably be used to strip
14//! comments out of other types of code as well, provided that strings use double quotes and
15//! backslashes are used for escapes in strings.
16//!
17//! The following types of comments are supported:
18//!   - C style block comments (`/* ... */`)
19//!   - C style line comments (`// ...`)
20//!   - Shell style line comments (`# ...`)
21//!
22//! ## Example
23//!
24//! ```rust
25#![doc = include_str!("../examples/example.rs")]
26//! ```
27
28use std::io::{ErrorKind, Read, Result};
29
/// Position of the comment scanner within the input.
///
/// The scanner advances one byte at a time (or one comment run at a time) and remembers here
/// what kind of construct it is currently inside.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// Outside of any string or comment.
    Top,
    /// Inside a double-quoted string.
    InString,
    /// Directly after a backslash inside a string; the next byte is skipped unexamined.
    StringEscape,
    /// Directly after a `/` seen outside a string; the next byte must be `/` or `*`.
    InComment,
    /// Inside a `/* ... */` block comment.
    InBlockComment,
    /// Inside a block comment, directly after a `*` that may start the closing `*/`.
    MaybeCommentEnd,
    /// Inside a `//` or `#` comment that runs until the next newline.
    InLineComment,
}
41
42use State::{
43    InBlockComment, InComment, InLineComment, InString, MaybeCommentEnd, StringEscape, Top,
44};
45
46/// A [`Read`] that transforms another [`Read`] so that it changes all comments to spaces so that a downstream json parser
47/// (such as json-serde) doesn't choke on them.
48///
49/// The supported comments are:
50///   - C style block comments (`/* ... */`)
51///   - C style line comments (`// ...`)
52///   - Shell style line comments (`# ...`)
53///
54/// ## Example
55/// ```
56/// use json_strip_comments::StripComments;
57/// use std::io::Read;
58///
59/// let input = r#"{
60/// // c line comment
61/// "a": "comment in string /* a */",
62/// ## shell line comment
63/// } /** end */"#;
64///
65/// let mut stripped = String::new();
66/// StripComments::new(input.as_bytes()).read_to_string(&mut stripped).unwrap();
67///
68/// assert_eq!(stripped, "{
69///                  \n\"a\": \"comment in string /* a */\",
70///                     \n}           ");
71///
72/// ```
73///
74pub struct StripComments<T: Read> {
75    inner: T,
76    state: State,
77}
78
79impl<T> StripComments<T>
80where
81    T: Read,
82{
83    pub fn new(input: T) -> Self {
84        Self {
85            inner: input,
86            state: Top,
87        }
88    }
89}
90
91impl<T> Read for StripComments<T>
92where
93    T: Read,
94{
95    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
96        let count = self.inner.read(buf)?;
97        if count > 0 {
98            strip_buf(&mut self.state, &mut buf[..count])?;
99        } else if self.state != Top && self.state != InLineComment {
100            return Err(ErrorKind::InvalidData.into());
101        }
102        Ok(count)
103    }
104}
105
106/// Strips comments from a string in place, replacing it with whitespaces.
107///
108/// /// ## Example
109/// ```
110/// use json_strip_comments::strip_comments_in_place;
111///
112/// let mut string = String::from(r#"{
113/// // c line comment
114/// "a": "comment in string /* a */"
115/// ## shell line comment
116/// } /** end */"#);
117///
118/// strip_comments_in_place(&mut string).unwrap();
119///
120/// assert_eq!(string, "{
121///                  \n\"a\": \"comment in string /* a */\"
122///                     \n}           ");
123///
124/// ```
125pub fn strip_comments_in_place(s: &mut str) -> Result<()> {
126    // Safety: we have made sure the text is UTF-8
127    strip_buf(&mut Top, unsafe { s.as_bytes_mut() })
128}
129
/// Shorthand for [`strip_comments_in_place`]: blanks out comments in `s` in place.
///
/// Takes the same argument and returns the same result as [`strip_comments_in_place`], to which
/// it delegates directly.
pub fn strip(s: &mut str) -> Result<()> {
    strip_comments_in_place(s)
}
133
134
135fn consume_comment_whitespace_until_maybe_bracket(
136    state: &mut State,
137    buf: &mut [u8],
138    i: &mut usize,
139) -> Result<bool> {
140    *i += 1;
141    let len = buf.len();
142    while *i < len {
143        let c = &mut buf[*i];
144        *state = match state {
145            Top => {
146                *state = top(c);
147                if c.is_ascii_whitespace() {
148                    *i += 1;
149                    continue;
150                }
151                return Ok(*c == b'}' || *c == b']');
152            }
153            InString => in_string(*c),
154            StringEscape => InString,
155            InComment => in_comment(c)?,
156            InBlockComment => consume_block_comments(buf, i),
157            MaybeCommentEnd => maybe_comment_end(c),
158            InLineComment => consume_line_comments(buf, i),
159        };
160        *i += 1;
161    }
162    Ok(false)
163}
164
165fn strip_buf(state: &mut State, buf: &mut [u8]) -> Result<()> {
166    let mut i = 0;
167    let len = buf.len();
168    
169    // Fast path for Top state which is most common
170    while i < len {
171        let c = &mut buf[i];
172        
173        match state {
174            Top => {
175                let cur = i;
176                let new_state = top(c);
177                if *c == b',' {
178                    let mut temp_state = new_state;
179                    if consume_comment_whitespace_until_maybe_bracket(&mut temp_state, buf, &mut i)? {
180                        buf[cur] = b' ';
181                    }
182                    *state = temp_state;
183                } else {
184                    *state = new_state;
185                }
186            }
187            InString => *state = in_string(*c),
188            StringEscape => *state = InString,
189            InComment => *state = in_comment(c)?,
190            InBlockComment => *state = consume_block_comments(buf, &mut i),
191            MaybeCommentEnd => *state = maybe_comment_end(c),
192            InLineComment => *state = consume_line_comments(buf, &mut i),
193        }
194        
195        i += 1;
196    }
197    Ok(())
198}
199
200#[inline(always)]
201fn consume_line_comments(buf: &mut [u8], i: &mut usize) -> State {
202    let cur = *i;
203    let remaining = &buf[*i..];
204    match memchr::memchr(b'\n', remaining) {
205        Some(offset) => {
206            *i += offset;
207            buf[cur..*i].fill(b' ');
208            Top
209        }
210        None => {
211            let len = buf.len();
212            *i = len - 1;
213            buf[cur..len].fill(b' ');
214            InLineComment
215        }
216    }
217}
218
219#[inline(always)]
220fn consume_block_comments(buf: &mut [u8], i: &mut usize) -> State {
221    let cur = *i;
222    let remaining = &buf[*i..];
223    match memchr::memchr(b'*', remaining) {
224        Some(offset) => {
225            *i += offset;
226            buf[cur..=*i].fill(b' ');
227            MaybeCommentEnd
228        }
229        None => {
230            let len = buf.len();
231            *i = len - 1;
232            buf[cur..len].fill(b' ');
233            InBlockComment
234        }
235    }
236}
237
238#[inline(always)]
239fn top(c: &mut u8) -> State {
240    match *c {
241        b'"' => InString,
242        b'/' => {
243            *c = b' ';
244            InComment
245        }
246        b'#' => {
247            *c = b' ';
248            InLineComment
249        }
250        _ => Top,
251    }
252}
253
254#[inline(always)]
255fn in_string(c: u8) -> State {
256    match c {
257        b'"' => Top,
258        b'\\' => StringEscape,
259        _ => InString,
260    }
261}
262
263#[inline]
264fn in_comment(c: &mut u8) -> Result<State> {
265    let new_state = match *c {
266        b'*' => InBlockComment,
267        b'/' => InLineComment,
268        _ => return Err(ErrorKind::InvalidData.into()),
269    };
270    *c = b' ';
271    Ok(new_state)
272}
273
274#[inline]
275fn maybe_comment_end(c: &mut u8) -> State {
276    let old = *c;
277    *c = b' ';
278    match old {
279        b'/' => Top,
280        b'*' => MaybeCommentEnd,
281        _ => InBlockComment,
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{ErrorKind, Read};

    /// Runs `input` through [`StripComments`] and returns the stripped text, checking that the
    /// adapter reports exactly as many bytes as it was given.
    fn strip_string(input: &str) -> String {
        let mut stripped = String::new();
        let read = StripComments::new(input.as_bytes())
            .read_to_string(&mut stripped)
            .unwrap();
        assert_eq!(read, input.len());
        stripped
    }

    /// Asserts that reading `input` to the end fails with [`ErrorKind::InvalidData`].
    fn assert_invalid_data(input: &str) {
        let mut sink = String::new();
        let err = StripComments::new(input.as_bytes())
            .read_to_string(&mut sink)
            .unwrap_err();
        assert_eq!(err.kind(), ErrorKind::InvalidData);
    }

    /// Returns `text` with every ASCII whitespace character removed.
    fn without_whitespace(text: &str) -> String {
        text.replace(|c: char| c.is_ascii_whitespace(), "")
    }

    #[test]
    fn block_comments() {
        let stripped = strip_string(r#"{/* Comment */"hi": /** abc */ "bye"}"#);
        assert_eq!(stripped, r#"{             "hi":            "bye"}"#);
    }

    #[test]
    fn block_comments_with_possible_end() {
        let stripped = strip_string(r#"{/* Comment*PossibleEnd */"hi": /** abc */ "bye"}"#);
        assert_eq!(
            stripped,
            r#"{                         "hi":            "bye"}"#
        );
    }

    // See https://github.com/tmccombs/json-comments-rs/issues/12
    // A block comment may end with more than one "*".
    #[test]
    fn doc_comment() {
        let stripped = strip_string(r##"/** C **/ { "foo": 123 }"##);
        assert_eq!(stripped, r##"          { "foo": 123 }"##);
    }

    #[test]
    fn line_comments() {
        let json = r#"{
            // line comment
            "a": 4,
            # another
        }"#;

        let expected = "{
                           \n            \"a\": 4,
                     \n        }";

        assert_eq!(strip_string(json), expected);
    }

    #[test]
    fn incomplete_string() {
        assert_invalid_data(r#""foo"#);
    }

    #[test]
    fn incomplete_comment() {
        assert_invalid_data("/* foo ");
    }

    #[test]
    fn incomplete_comment2() {
        assert_invalid_data("/* foo *");
    }

    #[test]
    fn strip_in_place() {
        let mut json = String::from(r#"{/* Comment */"hi": /** abc */ "bye"}"#);
        strip_comments_in_place(&mut json).unwrap();
        assert_eq!(json, r#"{             "hi":            "bye"}"#);
    }

    #[test]
    fn trailing_comma() {
        let mut json = String::from(
            r#"{
            "a1": [1,],
            "a2": [1,/* x */],
            "a3": [
                1, // x
            ],
            "o1": {v:1,},
            "o2": {v:1,/* x */},
            "o3": {
                "v":1, // x
            },
            # another
        }"#,
        );
        strip_comments_in_place(&mut json).unwrap();

        let expected = r#"{
            "a1": [1 ],
            "a2": [1        ],
            "a3": [
                1
            ],
            "o1": {v:1 },
            "o2": {v:1        },
            "o3": {
                "v":1
            }
        }"#;

        // Only the significant characters are compared; blanked regions differ in width.
        assert_eq!(without_whitespace(&json), without_whitespace(expected));
    }
}
418}