// ocpi_tariffs/json/decode.rs

1//! A `Warning` for warnings that can happen when decoding a JSON `&str`.
2use std::{borrow::Cow, fmt, iter::Peekable};
3
4use crate::{into_caveat, warning, Caveat, IntoCaveat};
5
6use super::Element;
7
/// The backslash that introduces an escape sequence inside a JSON string.
const ESCAPE_CHAR: char = '\\';
9
/// The kinds of `Warning` that decoding a JSON `&str` can produce.
///
/// The `usize` carried by every variant is the index in the `&str` at which
/// the problem was detected.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// A control char was encountered while parsing a JSON string.
    ControlCharacterWhileParsingString(usize),

    /// A UTF-16 surrogate pair could not be decoded.
    ///
    /// The `u16` is the unpaired surrogate code unit.
    DecodeUtf16(usize, u16),

    /// The string contains an invalid escape sequence.
    InvalidEscape(usize),

    /// The string ended while the parser still expected more chars.
    UnexpectedEndOfString(usize),
}
25
26impl crate::Warning for Warning {
27    /// A human readable identifier for each `Warning`.
28    fn id(&self) -> crate::SmartString {
29        match self {
30            Warning::ControlCharacterWhileParsingString(_) => {
31                "control_character_while_parsing_string".into()
32            }
33            Warning::DecodeUtf16(..) => "decode_utf_1_6".into(),
34            Warning::InvalidEscape(_) => "invalid_escape".into(),
35            Warning::UnexpectedEndOfString(_) => "unexpected_end_of_string".into(),
36        }
37    }
38}
39
40impl fmt::Display for Warning {
41    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42        match self {
43            Warning::ControlCharacterWhileParsingString(index) => {
44                write!(
45                    f,
46                    "Control chars were found at index `{index}` while decoding a JSON string."
47                )
48            }
49            Warning::DecodeUtf16(index, code) => {
50                write!(
51                    f,
52                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
53                )
54            }
55            Warning::InvalidEscape(index) => {
56                write!(
57                    f,
58                    "String contains an invalid escape char at index: `{index})`."
59                )
60            }
61            Warning::UnexpectedEndOfString(index) => {
62                write!(f, "The String ended prematurely at index: `{index}`.")
63            }
64        }
65    }
66}
67
68/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
69pub(crate) fn analyze<'buf>(
70    s: &'buf str,
71    elem: &Element<'buf>,
72) -> Caveat<PendingStr<'buf>, Warning> {
73    let mut warnings = warning::Set::new();
74
75    // Strings are expected to be small so running over all bytes
76    // with the intent of early exiting is acceptable.
77    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
78        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
79    } else {
80        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
81            warnings.with_elem(Warning::ControlCharacterWhileParsingString(index), elem);
82        }
83
84        PendingStr::NoEscapes(s).into_caveat(warnings)
85    }
86}
87
88/// Marks a `&str` as having escapes or not.
89pub(crate) enum PendingStr<'buf> {
90    /// The `&str` has no escapes and can be used as is.
91    NoEscapes(&'buf str),
92
93    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
94    HasEscapes(EscapeStr<'buf>),
95}
96
97into_caveat!(PendingStr<'buf>);
98
99/// A `&str` with escape chars.
100pub(crate) struct EscapeStr<'buf>(&'buf str);
101
102impl<'buf> EscapeStr<'buf> {
103    pub(crate) fn decode_escapes(&self, elem: &Element<'buf>) -> Caveat<Cow<'buf, str>, Warning> {
104        unescape_str(self.0, elem)
105    }
106
107    /// Consume the `EscapeStr` and return the raw bytes as a str.
108    pub(crate) fn into_raw(self) -> &'buf str {
109        self.0
110    }
111}
112
113/// Return the `str` with escaped chars replaced with the decoded equivalent.
114///
115/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
116/// `Some(Cow::Owned)` containing the input `str`.
117/// Return `None` if the `str` contains invalid or unhandled escape chars.
118pub(crate) fn unescape_str<'buf>(
119    s: &'buf str,
120    elem: &Element<'buf>,
121) -> Caveat<Cow<'buf, str>, Warning> {
122    let mut warnings = warning::Set::new();
123
124    // Strings are expected to be small so running over all bytes to early out is acceptable.
125    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
126        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
127            warnings.with_elem(Warning::ControlCharacterWhileParsingString(index), elem);
128        }
129        return Cow::Borrowed(s).into_caveat(warnings);
130    }
131
132    let mut chars = Chars::from_str(s);
133    let mut buf = Buffer::with_capacity(s.len());
134
135    loop {
136        let Some((index, ch)) = chars.next() else {
137            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
138        };
139
140        if ch == ESCAPE_CHAR {
141            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
142                warnings.with_elem(warn_kind, elem);
143                return Cow::Borrowed(s).into_caveat(warnings);
144            }
145        } else if let Err(warn_kind) = buf.push_char(ch, index) {
146            warnings.with_elem(warn_kind, elem);
147            return Cow::Borrowed(s).into_caveat(warnings);
148        }
149    }
150}
151
152/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
153/// the previous byte read was a backslash.
154///
155/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
156fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
157    let (index, ch) = chars.next_or_eof()?;
158
159    let ch = match ch {
160        '"' => '"',
161        '\\' => '\\',
162        '/' => '/',
163        'b' => '\x08',
164        'f' => '\x0c',
165        'n' => '\n',
166        'r' => '\r',
167        't' => '\t',
168        'u' => return parse_unicode_escape(chars, buf),
169        _ => {
170            return Err(Warning::InvalidEscape(index));
171        }
172    };
173
174    buf.push_char(ch, index)?;
175
176    Ok(())
177}
178
179/// Parses a JSON \u escape and appends it into the buffer.
180/// Assumes `\u` has just been read.
181///
182/// The Unicode escape might be a UTF-16 surrogate pair.
183///
184/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
185fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
186    let n1 = decode_hex_escape(chars)?;
187    let n2 = chars.is_next_escape()?;
188
189    if let Some(n2) = n2 {
190        buf.push_surrogate_pair(n1, n2, chars.index)?;
191    } else {
192        let Some(ch) = char::from_u32(u32::from(n1)) else {
193            return Err(Warning::InvalidEscape(chars.index));
194        };
195
196        buf.push_char(ch, chars.index)?;
197    }
198
199    Ok(())
200}
201
202/// A char iterator that can fail if the next char is a control char.
203struct Chars<'buf> {
204    /// The `char` iterator
205    ///
206    /// This needs to be a `CharIndices` as the `Chars` iterator skips over escaped chars.
207    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
208    /// Unicode literal and treat that as a UTF-16 surrogate pair.
209    char_indices: Peekable<std::str::CharIndices<'buf>>,
210
211    /// The last parsed char index
212    index: usize,
213}
214
215impl<'buf> Chars<'buf> {
216    /// Create a new `Chars` iterator from a `&str`.
217    fn from_str(s: &'buf str) -> Self {
218        Self {
219            char_indices: s.char_indices().peekable(),
220            index: 0,
221        }
222    }
223
224    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
225    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
226    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
227        if let Some((index, ch)) = self.next() {
228            if ch.is_control() {
229                return Err(Warning::ControlCharacterWhileParsingString(index));
230            }
231
232            Ok((index, ch))
233        } else {
234            Err(Warning::UnexpectedEndOfString(self.index))
235        }
236    }
237
238    /// Look ahead in the char stream and if there is another unicode escape return it as a decoded
239    /// hex escape.
240    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
241        {
242            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
243
244            if escape_char.is_none() {
245                return Ok(None);
246            }
247        }
248
249        {
250            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
251
252            if escape_unicode.is_none() {
253                return Ok(None);
254            }
255        }
256
257        let n = decode_hex_escape(self)?;
258        Ok(Some(n))
259    }
260}
261
262impl Iterator for Chars<'_> {
263    type Item = (usize, char);
264
265    fn next(&mut self) -> Option<Self::Item> {
266        if let Some((index, char)) = self.char_indices.next() {
267            self.index = index;
268            Some((index, char))
269        } else {
270            None
271        }
272    }
273}
274
275/// The `String` based buffer where we accumulate the escaped JSON string.
276///
277/// If `fail_on_control` is true and a control char is given to a method,
278/// the method will return `Err(ControlCharacterWhileParsingString)`.
279struct Buffer {
280    /// The `String` to accumulate chars in.
281    buf: String,
282}
283
284impl Buffer {
285    /// Create a new `Buffer`
286    fn with_capacity(capacity: usize) -> Self {
287        Self {
288            buf: String::with_capacity(capacity),
289        }
290    }
291
292    /// Push a char into the `String`.
293    ///
294    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
295    /// Otherwise return `Ok`.
296    fn push_char(&mut self, ch: char, index: usize) -> Result<(), Warning> {
297        if ch.is_control() {
298            return Err(Warning::ControlCharacterWhileParsingString(index));
299        }
300
301        self.buf.push(ch);
302        Ok(())
303    }
304
305    /// Consume the `Buffer` and return the inner `String`.
306    fn into_string(self) -> String {
307        self.buf
308    }
309
310    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
311    /// `char` on to the `Buffer`.
312    ///
313    /// Returns `Ok(char)` if the decoding succeeds.
314    /// Returns `Err(DecodeUtf16)` if the decoding fails.
315    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
316        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
317            return Err(Warning::InvalidEscape(index));
318        };
319
320        let ch = match ch {
321            Ok(ch) => ch,
322            Err(err) => {
323                return Err(Warning::DecodeUtf16(index, err.unpaired_surrogate()));
324            }
325        };
326
327        self.push_char(ch, index)?;
328
329        Ok(ch)
330    }
331}
332
333/// Munch four chars as bytes and try convert into a `char`.
334fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
335    const RADIX: u32 = 16;
336
337    let (_, one) = chars.next_or_eof()?;
338    let (_, two) = chars.next_or_eof()?;
339    let (_, three) = chars.next_or_eof()?;
340    let (index, four) = chars.next_or_eof()?;
341
342    let string = [one, two, three, four].into_iter().collect::<String>();
343    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
344        return Err(Warning::InvalidEscape(index));
345    };
346
347    Ok(n)
348}
349
#[cfg(test)]
mod test_unescape {
    #![allow(
        clippy::indexing_slicing,
        reason = "unwraps are allowed anywhere in tests"
    )]

    use std::{borrow::Cow, sync::Arc};

    use assert_matches::assert_matches;

    use crate::json;

    use super::{unescape_str, Warning};

    /// Build a root-level `Element` that the warnings of a test can be attached to.
    fn test_elem() -> json::Element<'static> {
        json::Element {
            id: 0.into(),
            path_node: Arc::new(json::PathNode::Root),
            span: json::parser::Span::default(),
            value: json::Value::Null,
        }
    }

    #[test]
    fn should_unescape_empty_str() {
        const INPUT: &str = "";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        assert_matches!(decoded, Cow::Borrowed(""));
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_without_escapes() {
        const INPUT: &str = "ab";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        // Without escapes the input is handed back without being copied.
        assert_matches!(decoded, Cow::Borrowed(INPUT));
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_with_forward_slash_escape() {
        const INPUT: &str = r"a\/b";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );

        assert_eq!(owned, "a/b");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_str_with_many_escapes() {
        const INPUT: &str = r#"a\/\"b\""#;

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );

        assert_eq!(owned, r#"a/"b""#);
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_fail_to_unescape_str_with_invalid_escape() {
        // The unknown escape `\a` is at the very start: the `a` sits at index 1.
        {
            const INPUT: &str = r"\a/c";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(1)]);
        }

        // The unknown escape `\c` follows a regular char: the `c` sits at index 2.
        {
            const INPUT: &str = r"a\c";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(2)]);
        }

        // The input ends directly after the backslash at index 3.
        {
            const INPUT: &str = r"a/c\";

            let element = test_elem();
            let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
            let by_path = warnings.into_path_map();
            let root_warnings = &by_path["$"];

            assert_matches!(decoded, Cow::Borrowed(_));
            assert_matches!(
                root_warnings.as_slice(),
                [Warning::UnexpectedEndOfString(3)]
            );
        }
    }

    #[test]
    fn should_fail_to_unescape_str_with_control_char() {
        // A non-raw literal: the `&str` contains the actual control char at index 5.
        const INPUT: &str = "hello\u{0019}world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            root_warnings.as_slice(),
            [Warning::ControlCharacterWhileParsingString(5)]
        );
    }

    #[test]
    fn should_fail_to_unescape_raw_str_with_rust_unicode_literal_control_char() {
        // A raw literal: the braces are part of the `&str` and are not valid hex digits.
        const INPUT: &str = r"hello\u{0019}world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(root_warnings.as_slice(), [Warning::InvalidEscape(10)]);
    }

    #[test]
    fn should_fail_to_unescape_json_control_escape() {
        // A well-formed JSON escape that decodes to a control char.
        const INPUT: &str = r"hello\u0019world";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();
        let by_path = warnings.into_path_map();
        let root_warnings = &by_path["$"];

        assert_matches!(decoded, Cow::Borrowed(_));
        assert_matches!(
            root_warnings.as_slice(),
            [Warning::ControlCharacterWhileParsingString(10)]
        );
    }

    #[test]
    fn should_unescape_unicode_literals() {
        const INPUT: &str = r"hello\u0020world\u0021";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello world!");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_utf_16_surrogate_pair() {
        // This test data is taken from the JSON RFC 8259 spec.
        //
        // * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
        const INPUT: &str = r"hello\uD834\uDD1Eworld";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello\u{1D11E}world");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }

    #[test]
    fn should_unescape_unicode_literal_followed_by_simple_escape() {
        const INPUT: &str = r"hello\u0020\/world\u0021";

        let element = test_elem();
        let (decoded, warnings) = unescape_str(INPUT, &element).into_parts();

        let owned = assert_matches!(
            decoded,
            Cow::Owned(owned) => owned
        );
        assert_eq!(owned, "hello /world!");
        assert!(warnings.is_empty(), "{warnings:#?}");
    }
}