ocpi_tariffs/json/
decode.rs

1//! A `WarningKind` for warnings that can happen when decoding a JSON `&str`.
2use std::{borrow::Cow, fmt, iter::Peekable};
3
4use crate::{into_caveat, warning, Caveat, IntoCaveat};
5
6use super::Element;
7
/// The char that introduces an escape sequence in a JSON string.
const ESCAPE_CHAR: char = '\\';
9
/// The kinds of `Warning` that decoding a JSON `&str` can produce.
///
/// The `usize` carried by each variant is the index (as produced by `str::char_indices`)
/// in the source `&str` that the warning refers to.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum WarningKind {
    /// A control char was found while parsing a JSON string.
    ControlCharacterWhileParsingString(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The `u16` is the unpaired surrogate reported by the decoder.
    DecodeUtf16(usize, u16),

    /// The string contains an invalid escape sequence.
    InvalidEscape(usize),

    /// The string ended before the parser expected it to.
    UnexpectedEndOfString(usize),
}
25
26impl warning::Kind for WarningKind {
27    /// A human readable identifier for each `Warning`.
28    fn id(&self) -> Cow<'static, str> {
29        match self {
30            WarningKind::ControlCharacterWhileParsingString(_) => {
31                "control_character_while_parsing_string".into()
32            }
33            WarningKind::DecodeUtf16(..) => "decode_utf_1_6".into(),
34            WarningKind::InvalidEscape(_) => "invalid_escape".into(),
35            WarningKind::UnexpectedEndOfString(_) => "unexpected_end_of_string".into(),
36        }
37    }
38}
39
40impl fmt::Display for WarningKind {
41    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42        match self {
43            WarningKind::ControlCharacterWhileParsingString(index) => {
44                write!(
45                    f,
46                    "Control chars were found at index `{index}` while decoding a JSON string."
47                )
48            }
49            WarningKind::DecodeUtf16(index, code) => {
50                write!(
51                    f,
52                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
53                )
54            }
55            WarningKind::InvalidEscape(index) => {
56                write!(
57                    f,
58                    "String contains an invalid escape char at index: `{index})`."
59                )
60            }
61            WarningKind::UnexpectedEndOfString(index) => {
62                write!(f, "The String ended prematurely at index: `{index}`.")
63            }
64        }
65    }
66}
67
68/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
69pub(crate) fn analyze<'buf>(
70    s: &'buf str,
71    elem: &Element<'buf>,
72) -> Caveat<PendingStr<'buf>, WarningKind> {
73    let mut warnings = warning::Set::new();
74
75    // Strings are expected to be small so running over all bytes
76    // with the intent of early exiting is acceptable.
77    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
78        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
79    } else {
80        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
81            warnings.with_elem(WarningKind::ControlCharacterWhileParsingString(index), elem);
82        }
83
84        PendingStr::NoEscapes(s).into_caveat(warnings)
85    }
86}
87
88/// Marks a `&str` as having escapes or not.
89pub(crate) enum PendingStr<'buf> {
90    /// The `&str` has no escapes and can be used as is.
91    NoEscapes(&'buf str),
92
93    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
94    HasEscapes(EscapeStr<'buf>),
95}
96
97into_caveat!(PendingStr<'buf>);
98
99/// A `&str` with escape chars.
100pub(crate) struct EscapeStr<'buf>(&'buf str);
101
102impl<'buf> EscapeStr<'buf> {
103    pub(crate) fn decode_escapes(
104        &self,
105        elem: &Element<'buf>,
106    ) -> Caveat<Cow<'buf, str>, WarningKind> {
107        unescape_str(self.0, elem)
108    }
109
110    /// Consume the `EscapeStr` and return the raw bytes as a str.
111    pub(crate) fn into_raw(self) -> &'buf str {
112        self.0
113    }
114}
115
116/// Return the `str` with escaped chars replaced with the decoded equivalent.
117///
118/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
119/// `Some(Cow::Owned)` containing the input `str`.
120/// Return `None` if the `str` contains invalid or unhandled escape chars.
121pub(crate) fn unescape_str<'buf>(
122    s: &'buf str,
123    elem: &Element<'buf>,
124) -> Caveat<Cow<'buf, str>, WarningKind> {
125    let mut warnings = warning::Set::new();
126
127    // Strings are expected to be small so running over all bytes to early out is acceptable.
128    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
129        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
130            warnings.with_elem(WarningKind::ControlCharacterWhileParsingString(index), elem);
131        }
132        return Cow::Borrowed(s).into_caveat(warnings);
133    }
134
135    let mut chars = Chars::from_str(s);
136    let mut buf = Buffer::with_capacity(s.len());
137
138    loop {
139        let Some((index, ch)) = chars.next() else {
140            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
141        };
142
143        if ch == ESCAPE_CHAR {
144            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
145                warnings.with_elem(warn_kind, elem);
146                return Cow::Borrowed(s).into_caveat(warnings);
147            }
148        } else if let Err(warn_kind) = buf.push_char(ch, index) {
149            warnings.with_elem(warn_kind, elem);
150            return Cow::Borrowed(s).into_caveat(warnings);
151        }
152    }
153}
154
155/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
156/// the previous byte read was a backslash.
157///
158/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
159fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), WarningKind> {
160    let (index, ch) = chars.next_or_eof()?;
161
162    let ch = match ch {
163        '"' => '"',
164        '\\' => '\\',
165        '/' => '/',
166        'b' => '\x08',
167        'f' => '\x0c',
168        'n' => '\n',
169        'r' => '\r',
170        't' => '\t',
171        'u' => return parse_unicode_escape(chars, buf),
172        _ => {
173            return Err(WarningKind::InvalidEscape(index));
174        }
175    };
176
177    buf.push_char(ch, index)?;
178
179    Ok(())
180}
181
182/// Parses a JSON \u escape and appends it into the buffer.
183/// Assumes `\u` has just been read.
184///
185/// The Unicode escape might be a UTF-16 surrogate pair.
186///
187/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
188fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), WarningKind> {
189    let n1 = decode_hex_escape(chars)?;
190    let n2 = chars.is_next_escape()?;
191
192    if let Some(n2) = n2 {
193        buf.push_surrogate_pair(n1, n2, chars.index)?;
194    } else {
195        let Some(ch) = char::from_u32(u32::from(n1)) else {
196            return Err(WarningKind::InvalidEscape(chars.index));
197        };
198
199        buf.push_char(ch, chars.index)?;
200    }
201
202    Ok(())
203}
204
205/// A char iterator that can fail if the next char is a control char.
206struct Chars<'buf> {
207    /// The `char` iterator
208    ///
209    /// This needs to be a `CharIndices` as the `Chars` iterator skips over escaped chars.
210    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
211    /// Unicode literal and treat that as a UTF-16 surrogate pair.
212    char_indices: Peekable<std::str::CharIndices<'buf>>,
213
214    /// The last parsed char index
215    index: usize,
216}
217
218impl<'buf> Chars<'buf> {
219    /// Create a new `Chars` iterator from a `&str`.
220    fn from_str(s: &'buf str) -> Self {
221        Self {
222            char_indices: s.char_indices().peekable(),
223            index: 0,
224        }
225    }
226
227    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
228    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
229    fn next_or_eof(&mut self) -> Result<(usize, char), WarningKind> {
230        if let Some((index, ch)) = self.next() {
231            if ch.is_control() {
232                return Err(WarningKind::ControlCharacterWhileParsingString(index));
233            }
234
235            Ok((index, ch))
236        } else {
237            Err(WarningKind::UnexpectedEndOfString(self.index))
238        }
239    }
240
241    /// Look ahead in the char stream and if there is another unicode escape return it as a decoded
242    /// hex escape.
243    fn is_next_escape(&mut self) -> Result<Option<u16>, WarningKind> {
244        {
245            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
246
247            if escape_char.is_none() {
248                return Ok(None);
249            }
250        }
251
252        {
253            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
254
255            if escape_unicode.is_none() {
256                return Ok(None);
257            }
258        }
259
260        let n = decode_hex_escape(self)?;
261        Ok(Some(n))
262    }
263}
264
265impl Iterator for Chars<'_> {
266    type Item = (usize, char);
267
268    fn next(&mut self) -> Option<Self::Item> {
269        if let Some((index, char)) = self.char_indices.next() {
270            self.index = index;
271            Some((index, char))
272        } else {
273            None
274        }
275    }
276}
277
278/// The `String` based buffer where we accumulate the escaped JSON string.
279///
280/// If `fail_on_control` is true and a control char is given to a method,
281/// the method will return `Err(ControlCharacterWhileParsingString)`.
282struct Buffer {
283    /// The `String` to accumulate chars in.
284    buf: String,
285}
286
287impl Buffer {
288    /// Create a new `Buffer`
289    fn with_capacity(capacity: usize) -> Self {
290        Self {
291            buf: String::with_capacity(capacity),
292        }
293    }
294
295    /// Push a char into the `String`.
296    ///
297    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
298    /// Otherwise return `Ok`.
299    fn push_char(&mut self, ch: char, index: usize) -> Result<(), WarningKind> {
300        if ch.is_control() {
301            return Err(WarningKind::ControlCharacterWhileParsingString(index));
302        }
303
304        self.buf.push(ch);
305        Ok(())
306    }
307
308    /// Consume the `Buffer` and return the inner `String`.
309    fn into_string(self) -> String {
310        self.buf
311    }
312
313    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
314    /// `char` on to the `Buffer`.
315    ///
316    /// Returns `Ok(char)` if the decoding succeeds.
317    /// Returns `Err(DecodeUtf16)` if the decoding fails.
318    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, WarningKind> {
319        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
320            return Err(WarningKind::InvalidEscape(index));
321        };
322
323        let ch = match ch {
324            Ok(ch) => ch,
325            Err(err) => {
326                return Err(WarningKind::DecodeUtf16(index, err.unpaired_surrogate()));
327            }
328        };
329
330        self.push_char(ch, index)?;
331
332        Ok(ch)
333    }
334}
335
336/// Munch four chars as bytes and try convert into a `char`.
337fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, WarningKind> {
338    const RADIX: u32 = 16;
339
340    let (_, one) = chars.next_or_eof()?;
341    let (_, two) = chars.next_or_eof()?;
342    let (_, three) = chars.next_or_eof()?;
343    let (index, four) = chars.next_or_eof()?;
344
345    let string = [one, two, three, four].into_iter().collect::<String>();
346    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
347        return Err(WarningKind::InvalidEscape(index));
348    };
349
350    Ok(n)
351}
352
#[cfg(test)]
mod test_unescape {
    use std::{borrow::Cow, rc::Rc};

    use assert_matches::assert_matches;

    use crate::json;

    use super::{unescape_str, WarningKind};

    /// Create a root `Element` holding a `null` value for the warnings to be attached to.
    fn test_elem() -> json::Element<'static> {
        json::Element {
            id: 0.into(),
            path_node: Rc::new(json::PathNode::Root),
            span: json::parser::Span::default(),
            value: json::Value::Null,
        }
    }

    /// An empty `str` is returned borrowed and without warnings.
    #[test]
    fn should_unescape_empty_str() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str("", &elem).into_parts();

        assert_matches!(decoded, Cow::Borrowed(""));
        assert_matches!(warnings.as_slice(), []);
    }

    /// A `str` without escapes is returned borrowed and without warnings.
    #[test]
    fn should_unescape_str_without_escapes() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str("ab", &elem).into_parts();

        assert_matches!(decoded, Cow::Borrowed("ab"));
        assert_matches!(warnings.as_slice(), []);
    }

    /// An escaped forward slash is decoded into an owned `String`.
    #[test]
    fn should_unescape_str_with_forward_slash_escape() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"a\/b", &elem).into_parts();

        let owned = assert_matches!(decoded, Cow::Owned(owned) => owned);
        assert_eq!(owned, "a/b");
        assert_matches!(warnings.as_slice(), []);
    }

    /// Several escapes in one `str` are all decoded.
    #[test]
    fn should_unescape_str_with_many_escapes() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r#"a\/\"b\""#, &elem).into_parts();

        let owned = assert_matches!(decoded, Cow::Owned(owned) => owned);
        assert_eq!(owned, r#"a/"b""#);
        assert_matches!(warnings.as_slice(), []);
    }

    /// Invalid escapes at the start, in the middle and at the end of the `str` are reported
    /// and the input is returned borrowed.
    #[test]
    fn should_fail_to_unescape_str_with_invalid_escape() {
        // Invalid escape at the start of the `str`.
        {
            let elem = test_elem();
            let (decoded, warnings) = unescape_str(r"\a/c", &elem).into_parts();
            let kinds = warnings.into_kind_vec();

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(kinds.as_slice(), [WarningKind::InvalidEscape(1)]);
        }

        // Invalid escape in the middle of the `str`.
        {
            let elem = test_elem();
            let (decoded, warnings) = unescape_str(r"a\c", &elem).into_parts();
            let kinds = warnings.into_kind_vec();

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(kinds.as_slice(), [WarningKind::InvalidEscape(2)]);
        }

        // The `str` ends right after the backslash.
        {
            let elem = test_elem();
            let (decoded, warnings) = unescape_str(r"a/c\", &elem).into_parts();
            let kinds = warnings.into_kind_vec();

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(kinds.as_slice(), [WarningKind::UnexpectedEndOfString(3)]);
        }
    }

    /// A literal control char in a `str` without escapes is reported.
    #[test]
    fn should_fail_to_unescape_str_with_control_char() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str("hello\u{0019}world", &elem).into_parts();
        let kinds = warnings.into_kind_vec();

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            kinds.as_slice(),
            [WarningKind::ControlCharacterWhileParsingString(5)]
        );
    }

    /// The Rust `\u{…}` literal syntax is not a valid JSON escape.
    #[test]
    fn should_fail_to_unescape_raw_str_with_rust_unicode_literal_control_char() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"hello\u{0019}world", &elem).into_parts();
        let kinds = warnings.into_kind_vec();

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(kinds.as_slice(), [WarningKind::InvalidEscape(10)]);
    }

    /// A JSON `\u` escape that decodes to a control char is reported.
    #[test]
    fn should_fail_to_unescape_json_control_escape() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"hello\u0019world", &elem).into_parts();
        let kinds = warnings.into_kind_vec();

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            kinds.as_slice(),
            [WarningKind::ControlCharacterWhileParsingString(10)]
        );
    }

    /// JSON `\u` escapes are decoded into the chars they encode.
    #[test]
    fn should_unescape_unicode_literals() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"hello\u0020world\u0021", &elem).into_parts();

        let owned = assert_matches!(decoded, Cow::Owned(owned) => owned);
        assert_eq!(owned, "hello world!");
        assert_matches!(warnings.as_slice(), []);
    }

    /// Two consecutive `\u` escapes that form a UTF-16 surrogate pair decode into one `char`.
    #[test]
    fn should_unescape_utf_16_surrogate_pair() {
        // This test data is taken from the JSON RFC 8259 spec.
        //
        // * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"hello\uD834\uDD1Eworld", &elem).into_parts();

        let owned = assert_matches!(decoded, Cow::Owned(owned) => owned);
        assert_eq!(owned, "hello\u{1D11E}world");
        assert_matches!(warnings.as_slice(), []);
    }

    /// A `\u` escape directly followed by a simple escape decodes both.
    #[test]
    fn should_unescape_unicode_literal_followed_by_simple_escape() {
        let elem = test_elem();
        let (decoded, warnings) = unescape_str(r"hello\u0020\/world\u0021", &elem).into_parts();

        let owned = assert_matches!(decoded, Cow::Owned(owned) => owned);
        assert_eq!(owned, "hello /world!");
        assert_matches!(warnings.as_slice(), []);
    }
}
547}