Skip to main content

ocpi_tariffs/json/
decode.rs

1//! A `Warning` for warnings that can happen when decoding a JSON `&str`.
2use std::{borrow::Cow, fmt, iter::Peekable};
3
4use crate::{into_caveat, warning, Caveat, IntoCaveat};
5
6use super::Element;
7
8const ESCAPE_CHAR: char = '\\';
9
10/// The kind of `Warning` that can happen when decoding a `&str`.
11#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
12pub enum Warning {
13    /// Control chars were found while parsing a JSON string.
14    ControlCharacterWhileParsingString(usize),
15
16    /// A UTF-16 surrogate pair failed to decode.
17    DecodeUtf16(usize, u16),
18
19    /// A string contains invalid escape chars.
20    InvalidEscape(usize),
21
22    /// The String ended before the parser expected.
23    UnexpectedEndOfString(usize),
24}
25
26impl crate::Warning for Warning {
27    /// A human readable identifier for each `Warning`.
28    fn id(&self) -> warning::Id {
29        match self {
30            Warning::ControlCharacterWhileParsingString(_) => {
31                warning::Id::from_static("control_character_while_parsing_string")
32            }
33            Warning::DecodeUtf16(..) => warning::Id::from_static("decode_utf_1_6"),
34            Warning::InvalidEscape(_) => warning::Id::from_static("invalid_escape"),
35            Warning::UnexpectedEndOfString(_) => {
36                warning::Id::from_static("unexpected_end_of_string")
37            }
38        }
39    }
40}
41
42impl fmt::Display for Warning {
43    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44        match self {
45            Warning::ControlCharacterWhileParsingString(index) => {
46                write!(
47                    f,
48                    "Control chars were found at index `{index}` while decoding a JSON string."
49                )
50            }
51            Warning::DecodeUtf16(index, code) => {
52                write!(
53                    f,
54                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
55                )
56            }
57            Warning::InvalidEscape(index) => {
58                write!(
59                    f,
60                    "String contains an invalid escape char at index: `{index})`."
61                )
62            }
63            Warning::UnexpectedEndOfString(index) => {
64                write!(f, "The String ended prematurely at index: `{index}`.")
65            }
66        }
67    }
68}
69
70/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
71pub(crate) fn analyze<'buf>(
72    s: &'buf str,
73    elem: &Element<'buf>,
74) -> Caveat<PendingStr<'buf>, Warning> {
75    let mut warnings = warning::Set::new();
76
77    // Strings are expected to be small so running over all bytes
78    // with the intent of early exiting is acceptable.
79    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
80        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
81    } else {
82        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
83            warnings.with_elem(Warning::ControlCharacterWhileParsingString(index), elem);
84        }
85
86        PendingStr::NoEscapes(s).into_caveat(warnings)
87    }
88}
89
90/// Marks a `&str` as having escapes or not.
91pub(crate) enum PendingStr<'buf> {
92    /// The `&str` has no escapes and can be used as is.
93    NoEscapes(&'buf str),
94
95    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
96    HasEscapes(EscapeStr<'buf>),
97}
98
99into_caveat!(PendingStr<'buf>);
100
101/// A `&str` with escape chars.
102pub(crate) struct EscapeStr<'buf>(&'buf str);
103
104impl<'buf> EscapeStr<'buf> {
105    pub(crate) fn decode_escapes(&self, elem: &Element<'buf>) -> Caveat<Cow<'buf, str>, Warning> {
106        unescape_str(self.0, elem)
107    }
108
109    /// Consume the `EscapeStr` and return the raw bytes as a str.
110    pub(crate) fn into_raw(self) -> &'buf str {
111        self.0
112    }
113}
114
115/// Return the `str` with escaped chars replaced with the decoded equivalent.
116///
117/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
118/// `Some(Cow::Owned)` containing the input `str`.
119/// Return `None` if the `str` contains invalid or unhandled escape chars.
120pub(crate) fn unescape_str<'buf>(
121    s: &'buf str,
122    elem: &Element<'buf>,
123) -> Caveat<Cow<'buf, str>, Warning> {
124    let mut warnings = warning::Set::new();
125
126    // Strings are expected to be small so running over all bytes to early out is acceptable.
127    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
128        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
129            warnings.with_elem(Warning::ControlCharacterWhileParsingString(index), elem);
130        }
131        return Cow::Borrowed(s).into_caveat(warnings);
132    }
133
134    let mut chars = Chars::from_str(s);
135    let mut buf = Buffer::with_capacity(s.len());
136
137    loop {
138        let Some((index, ch)) = chars.next() else {
139            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
140        };
141
142        if ch == ESCAPE_CHAR {
143            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
144                warnings.with_elem(warn_kind, elem);
145                return Cow::Borrowed(s).into_caveat(warnings);
146            }
147        } else if let Err(warn_kind) = buf.push_char(ch, index) {
148            warnings.with_elem(warn_kind, elem);
149            return Cow::Borrowed(s).into_caveat(warnings);
150        }
151    }
152}
153
154/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
155/// the previous byte read was a backslash.
156///
157/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
158fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
159    let (index, ch) = chars.next_or_eof()?;
160
161    let ch = match ch {
162        '"' => '"',
163        '\\' => '\\',
164        '/' => '/',
165        'b' => '\x08',
166        'f' => '\x0c',
167        'n' => '\n',
168        'r' => '\r',
169        't' => '\t',
170        'u' => return parse_unicode_escape(chars, buf),
171        _ => {
172            return Err(Warning::InvalidEscape(index));
173        }
174    };
175
176    buf.push_char(ch, index)?;
177
178    Ok(())
179}
180
181/// Parses a JSON \u escape and appends it into the buffer.
182/// Assumes `\u` has just been read.
183///
184/// The Unicode escape might be a UTF-16 surrogate pair.
185///
186/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
187fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
188    let n1 = decode_hex_escape(chars)?;
189    let n2 = chars.is_next_escape()?;
190
191    if let Some(n2) = n2 {
192        buf.push_surrogate_pair(n1, n2, chars.index)?;
193    } else {
194        let Some(ch) = char::from_u32(u32::from(n1)) else {
195            return Err(Warning::InvalidEscape(chars.index));
196        };
197
198        buf.push_char(ch, chars.index)?;
199    }
200
201    Ok(())
202}
203
204/// A char iterator that can fail if the next char is a control char.
205struct Chars<'buf> {
206    /// The `char` iterator
207    ///
208    /// This needs to be a `CharIndices` as the `Chars` iterator skips over escaped chars.
209    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
210    /// Unicode literal and treat that as a UTF-16 surrogate pair.
211    char_indices: Peekable<std::str::CharIndices<'buf>>,
212
213    /// The last parsed char index
214    index: usize,
215}
216
217impl<'buf> Chars<'buf> {
218    /// Create a new `Chars` iterator from a `&str`.
219    fn from_str(s: &'buf str) -> Self {
220        Self {
221            char_indices: s.char_indices().peekable(),
222            index: 0,
223        }
224    }
225
226    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
227    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
228    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
229        if let Some((index, ch)) = self.next() {
230            if ch.is_control() {
231                return Err(Warning::ControlCharacterWhileParsingString(index));
232            }
233
234            Ok((index, ch))
235        } else {
236            Err(Warning::UnexpectedEndOfString(self.index))
237        }
238    }
239
240    /// Look ahead in the char stream and if there is another unicode escape return it as a decoded
241    /// hex escape.
242    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
243        {
244            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
245
246            if escape_char.is_none() {
247                return Ok(None);
248            }
249        }
250
251        {
252            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
253
254            if escape_unicode.is_none() {
255                return Ok(None);
256            }
257        }
258
259        let n = decode_hex_escape(self)?;
260        Ok(Some(n))
261    }
262}
263
264impl Iterator for Chars<'_> {
265    type Item = (usize, char);
266
267    fn next(&mut self) -> Option<Self::Item> {
268        if let Some((index, char)) = self.char_indices.next() {
269            self.index = index;
270            Some((index, char))
271        } else {
272            None
273        }
274    }
275}
276
277/// The `String` based buffer where we accumulate the escaped JSON string.
278///
279/// If `fail_on_control` is true and a control char is given to a method,
280/// the method will return `Err(ControlCharacterWhileParsingString)`.
281struct Buffer {
282    /// The `String` to accumulate chars in.
283    buf: String,
284}
285
286impl Buffer {
287    /// Create a new `Buffer`
288    fn with_capacity(capacity: usize) -> Self {
289        Self {
290            buf: String::with_capacity(capacity),
291        }
292    }
293
294    /// Push a char into the `String`.
295    ///
296    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
297    /// Otherwise return `Ok`.
298    fn push_char(&mut self, ch: char, index: usize) -> Result<(), Warning> {
299        if ch.is_control() {
300            return Err(Warning::ControlCharacterWhileParsingString(index));
301        }
302
303        self.buf.push(ch);
304        Ok(())
305    }
306
307    /// Consume the `Buffer` and return the inner `String`.
308    fn into_string(self) -> String {
309        self.buf
310    }
311
312    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
313    /// `char` on to the `Buffer`.
314    ///
315    /// Returns `Ok(char)` if the decoding succeeds.
316    /// Returns `Err(DecodeUtf16)` if the decoding fails.
317    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
318        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
319            return Err(Warning::InvalidEscape(index));
320        };
321
322        let ch = match ch {
323            Ok(ch) => ch,
324            Err(err) => {
325                return Err(Warning::DecodeUtf16(index, err.unpaired_surrogate()));
326            }
327        };
328
329        self.push_char(ch, index)?;
330
331        Ok(ch)
332    }
333}
334
335/// Munch four chars as bytes and try convert into a `char`.
336fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
337    const RADIX: u32 = 16;
338
339    let (_, one) = chars.next_or_eof()?;
340    let (_, two) = chars.next_or_eof()?;
341    let (_, three) = chars.next_or_eof()?;
342    let (index, four) = chars.next_or_eof()?;
343
344    let string = [one, two, three, four].into_iter().collect::<String>();
345    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
346        return Err(Warning::InvalidEscape(index));
347    };
348
349    Ok(n)
350}
351
#[cfg(test)]
mod test_unescape {
    #![allow(
        clippy::indexing_slicing,
        reason = "unwraps are allowed anywhere in tests"
    )]

    use std::{borrow::Cow, sync::Arc};

    use assert_matches::assert_matches;

    use crate::json;

    use super::{unescape_str, Warning};

    /// Build a root `Element` for the warnings to be attached to.
    ///
    /// The tests below look the warnings up under the path `"$"`.
    fn test_elem() -> json::Element<'static> {
        json::Element {
            id: 0.into(),
            path_node: Arc::new(json::PathNode::Root),
            span: json::parser::Span::default(),
            value: json::Value::Null,
        }
    }

    #[test]
    fn should_unescape_empty_str() {
        const INPUT: &str = "";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        // Nothing to decode, so the input is handed back as is.
        assert_matches!(decoded, Cow::Borrowed(""));
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_without_escapes() {
        const INPUT: &str = "ab";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        assert_matches!(decoded, Cow::Borrowed(INPUT));
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_with_forward_slash_escape() {
        const INPUT: &str = r"a\/b";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );

        assert_eq!(owned, "a/b");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_with_many_escapes() {
        const INPUT: &str = r#"a\/\"b\""#;

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );

        assert_eq!(owned, r#"a/"b""#);
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_fail_to_unescape_str_with_invalid_escape() {
        // Invalid escape at the start of the string.
        {
            const INPUT: &str = r"\a/c";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_as_str_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(1)]);
        }

        // Invalid escape in the middle of the string.
        {
            const INPUT: &str = r"a\c";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_as_str_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(2)]);
        }

        // A trailing backslash with nothing after it.
        {
            const INPUT: &str = r"a/c\";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_as_str_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(
                root_warnings.as_slice(),
                [Warning::UnexpectedEndOfString(3)]
            );
        }
    }

    #[test]
    fn should_fail_to_unescape_str_with_control_char() {
        // A real control char in the input (the Rust literal is not a raw string).
        const INPUT: &str = "hello\u{0019}world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_as_str_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            root_warnings.as_slice(),
            [Warning::ControlCharacterWhileParsingString(5)]
        );
    }

    #[test]
    fn should_fail_to_unescape_raw_str_with_rust_unicode_literal_control_char() {
        // Raw string: the decoder sees the Rust style `\u{0019}`, which is not a JSON escape.
        const INPUT: &str = r"hello\u{0019}world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_as_str_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(10)]);
    }

    #[test]
    fn should_fail_to_unescape_json_control_escape() {
        // A well-formed JSON escape that decodes to a control char.
        const INPUT: &str = r"hello\u0019world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_as_str_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            root_warnings.as_slice(),
            [Warning::ControlCharacterWhileParsingString(10)]
        );
    }

    #[test]
    fn should_unescape_unicode_literals() {
        const INPUT: &str = r"hello\u0020world\u0021";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello world!");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_utf_16_surrogate_pair() {
        // This test data is taken from the JSON RFC 8259 spec.
        //
        // * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
        const INPUT: &str = r"hello\uD834\uDD1Eworld";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello\u{1D11E}world");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_unicode_literal_followed_by_simple_escape() {
        const INPUT: &str = r"hello\u0020\/world\u0021";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello /world!");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }
}