ocpi_tariffs/json/
decode.rs

1use std::{borrow::Cow, fmt, iter::Peekable};
2
3use crate::{warning, Caveat, IntoCaveat, IntoWarning};
4
5use super::Element;
6
7const ESCAPE_CHAR: char = '\\';
8
/// The kind of `Warning` that can happen when decoding a `&str`.
///
/// The first field of every variant is the index (as produced by `str::char_indices`)
/// at which the problem was detected.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum WarningKind {
    /// Control chars were found while parsing a JSON string.
    ///
    /// Raised both for raw control chars in the input and for escapes that decode
    /// to a control char.
    ControlCharacterWhileParsingString(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The second field is the unpaired surrogate code unit.
    DecodeUtf16(usize, u16),

    /// A string contains invalid escape chars.
    InvalidEscape(usize),

    /// The String ended before the parser expected.
    ///
    /// The field is the index of the last char that was successfully read.
    UnexpectedEndOfString(usize),
}
24
25impl warning::Kind for WarningKind {
26    /// A human readable identifier for each `Warning`.
27    fn id(&self) -> Cow<'static, str> {
28        match self {
29            WarningKind::ControlCharacterWhileParsingString(_) => {
30                "control_character_while_parsing_string".into()
31            }
32            WarningKind::DecodeUtf16(..) => "decode_utf_1_6".into(),
33            WarningKind::InvalidEscape(_) => "invalid_escape".into(),
34            WarningKind::UnexpectedEndOfString(_) => "unexpected_end_of_string".into(),
35        }
36    }
37}
38
39impl fmt::Display for WarningKind {
40    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41        match self {
42            WarningKind::ControlCharacterWhileParsingString(index) => {
43                write!(
44                    f,
45                    "Control chars were found at index `{index}` while decoding a JSON string."
46                )
47            }
48            WarningKind::DecodeUtf16(index, code) => {
49                write!(
50                    f,
51                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
52                )
53            }
54            WarningKind::InvalidEscape(index) => {
55                write!(
56                    f,
57                    "String contains an invalid escape char at index: `{index})`."
58                )
59            }
60            WarningKind::UnexpectedEndOfString(index) => {
61                write!(f, "The String ended prematurely at index: `{index}`.")
62            }
63        }
64    }
65}
66
67/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
68pub(crate) fn analyze<'buf>(
69    s: &'buf str,
70    elem: &Element<'buf>,
71) -> Caveat<PendingStr<'buf>, WarningKind> {
72    let mut warnings = warning::Set::new();
73
74    // Strings are expected to be small so running over all bytes
75    // with the intent of early exiting is acceptable.
76    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
77        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
78    } else {
79        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
80            warnings
81                .push(WarningKind::ControlCharacterWhileParsingString(index).into_warning(elem));
82        }
83
84        PendingStr::NoEscapes(s).into_caveat(warnings)
85    }
86}
87
/// Marks a `&str` as having escapes or not.
///
/// Produced by `analyze`, which only checks for the presence of a backslash;
/// the escapes themselves are not validated until they are decoded.
pub(crate) enum PendingStr<'buf> {
    /// The `&str` has no escapes and can be used as is.
    NoEscapes(&'buf str),

    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
    #[allow(dead_code, reason = "pending use in `tariff::lint`")]
    HasEscapes(EscapeStr<'buf>),
}
97
impl IntoCaveat for PendingStr<'_> {
    /// Pair this `PendingStr` with the given set of warnings.
    fn into_caveat<W: warning::Kind>(self, warnings: warning::Set<W>) -> Caveat<Self, W> {
        Caveat::new(self, warnings)
    }
}
103
/// A `&str` with escape chars.
///
/// Wraps the raw, still-escaped slice; use `decode_escapes` to obtain the decoded text.
pub(crate) struct EscapeStr<'buf>(&'buf str);
106
impl<'buf> EscapeStr<'buf> {
    /// Decode the escape sequences of the wrapped `&str`.
    ///
    /// Delegates to `unescape_str`; any warnings raised while decoding are
    /// associated with `elem`.
    #[allow(dead_code, reason = "pending use in `tariff::lint`")]
    pub(crate) fn decode_escapes(
        &self,
        elem: &Element<'buf>,
    ) -> Caveat<Cow<'buf, str>, WarningKind> {
        unescape_str(self.0, elem)
    }
}
116
117/// Return the `str` with escaped chars replaced with the decoded equivalent.
118///
119/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
120/// `Some(Cow::Owned)` containing the input `str`.
121/// Return `None` if the `str` contains invalid or unhandled escape chars.
122pub(crate) fn unescape_str<'buf>(
123    s: &'buf str,
124    elem: &Element<'buf>,
125) -> Caveat<Cow<'buf, str>, WarningKind> {
126    let mut warnings = warning::Set::new();
127
128    // Strings are expected to be small so running over all bytes to early out is acceptable.
129    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
130        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
131            warnings
132                .push(WarningKind::ControlCharacterWhileParsingString(index).into_warning(elem));
133        }
134        return Cow::Borrowed(s).into_caveat(warnings);
135    }
136
137    let mut chars = Chars::from_str(s);
138    let mut buf = Buffer::with_capacity(s.len());
139
140    loop {
141        let Some((index, ch)) = chars.next() else {
142            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
143        };
144
145        if ch == ESCAPE_CHAR {
146            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
147                warnings.push(warn_kind.into_warning(elem));
148                return Cow::Borrowed(s).into_caveat(warnings);
149            }
150        } else if let Err(warn_kind) = buf.push_char(ch, index) {
151            warnings.push(warn_kind.into_warning(elem));
152            return Cow::Borrowed(s).into_caveat(warnings);
153        }
154    }
155}
156
157/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
158/// the previous byte read was a backslash.
159///
160/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
161fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), WarningKind> {
162    let (index, ch) = chars.next_or_eof()?;
163
164    let ch = match ch {
165        '"' => '"',
166        '\\' => '\\',
167        '/' => '/',
168        'b' => '\x08',
169        'f' => '\x0c',
170        'n' => '\n',
171        'r' => '\r',
172        't' => '\t',
173        'u' => return parse_unicode_escape(chars, buf),
174        _ => {
175            return Err(WarningKind::InvalidEscape(index));
176        }
177    };
178
179    buf.push_char(ch, index)?;
180
181    Ok(())
182}
183
184/// Parses a JSON \u escape and appends it into the buffer.
185/// Assumes `\u` has just been read.
186///
187/// The Unicode escape might be a UTF-16 surrogate pair.
188///
189/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
190fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), WarningKind> {
191    let n1 = decode_hex_escape(chars)?;
192    let n2 = chars.is_next_escape()?;
193
194    if let Some(n2) = n2 {
195        buf.push_surrogate_pair(n1, n2, chars.index)?;
196    } else {
197        let Some(ch) = char::from_u32(u32::from(n1)) else {
198            return Err(WarningKind::InvalidEscape(chars.index));
199        };
200
201        buf.push_char(ch, chars.index)?;
202    }
203
204    Ok(())
205}
206
/// A char iterator that can fail if the next char is a control char.
///
/// Plain iteration (via `Iterator::next`) never fails; the failing variant is `next_or_eof`.
struct Chars<'buf> {
    /// The `char` iterator
    ///
    /// `CharIndices` is used so that every char comes with the index that warnings report.
    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
    /// Unicode literal and treat that as a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// The last parsed char index
    ///
    /// Updated by `Iterator::next`; it is `0` until the first char has been read.
    index: usize,
}
219
220impl<'buf> Chars<'buf> {
221    /// Create a new `Chars` iterator from a `&str`.
222    fn from_str(s: &'buf str) -> Self {
223        Self {
224            char_indices: s.char_indices().peekable(),
225            index: 0,
226        }
227    }
228
229    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
230    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
231    fn next_or_eof(&mut self) -> Result<(usize, char), WarningKind> {
232        if let Some((index, ch)) = self.next() {
233            if ch.is_control() {
234                return Err(WarningKind::ControlCharacterWhileParsingString(index));
235            }
236
237            Ok((index, ch))
238        } else {
239            Err(WarningKind::UnexpectedEndOfString(self.index))
240        }
241    }
242
243    /// Look ahead in the char stream and if there is another unicode escape return it as a decoded
244    /// hex escape.
245    fn is_next_escape(&mut self) -> Result<Option<u16>, WarningKind> {
246        {
247            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
248
249            if escape_char.is_none() {
250                return Ok(None);
251            }
252        }
253
254        {
255            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
256
257            if escape_unicode.is_none() {
258                return Ok(None);
259            }
260        }
261
262        let n = decode_hex_escape(self)?;
263        Ok(Some(n))
264    }
265}
266
267impl Iterator for Chars<'_> {
268    type Item = (usize, char);
269
270    fn next(&mut self) -> Option<Self::Item> {
271        if let Some((index, char)) = self.char_indices.next() {
272            self.index = index;
273            Some((index, char))
274        } else {
275            None
276        }
277    }
278}
279
/// The `String` based buffer where we accumulate the unescaped JSON string.
///
/// Control chars are never accepted: if one is given to a method,
/// the method will return `Err(ControlCharacterWhileParsingString)`.
struct Buffer {
    /// The `String` to accumulate chars in.
    buf: String,
}
288
289impl Buffer {
290    /// Create a new `Buffer`
291    fn with_capacity(capacity: usize) -> Self {
292        Self {
293            buf: String::with_capacity(capacity),
294        }
295    }
296
297    /// Push a char into the `String`.
298    ///
299    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
300    /// Otherwise return `Ok`.
301    fn push_char(&mut self, ch: char, index: usize) -> Result<(), WarningKind> {
302        if ch.is_control() {
303            return Err(WarningKind::ControlCharacterWhileParsingString(index));
304        }
305
306        self.buf.push(ch);
307        Ok(())
308    }
309
310    /// Consume the `Buffer` and return the inner `String`.
311    fn into_string(self) -> String {
312        self.buf
313    }
314
315    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
316    /// `char` on to the `Buffer`.
317    ///
318    /// Returns `Ok(char)` if the decoding succeeds.
319    /// Returns `Err(DecodeUtf16)` if the decoding fails.
320    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, WarningKind> {
321        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
322            return Err(WarningKind::InvalidEscape(index));
323        };
324
325        let ch = match ch {
326            Ok(ch) => ch,
327            Err(err) => {
328                return Err(WarningKind::DecodeUtf16(index, err.unpaired_surrogate()));
329            }
330        };
331
332        self.push_char(ch, index)?;
333
334        Ok(ch)
335    }
336}
337
338/// Munch four chars as bytes and try convert into a `char`.
339fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, WarningKind> {
340    const RADIX: u32 = 16;
341
342    let (_, one) = chars.next_or_eof()?;
343    let (_, two) = chars.next_or_eof()?;
344    let (_, three) = chars.next_or_eof()?;
345    let (index, four) = chars.next_or_eof()?;
346
347    let string = [one, two, three, four].into_iter().collect::<String>();
348    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
349        return Err(WarningKind::InvalidEscape(index));
350    };
351
352    Ok(n)
353}
354
/// Tests for `unescape_str`.
///
/// Successful decoding of a string with escapes is expected to yield `Cow::Owned`; strings
/// without escapes and strings that fail to decode are expected to yield `Cow::Borrowed`.
#[cfg(test)]
mod test_unescape {
    use std::{borrow::Cow, rc::Rc};

    use assert_matches::assert_matches;

    use crate::{json, Warning};

    use super::{unescape_str, WarningKind};

    /// Build a minimal root `Element` for the warnings to be attached to.
    fn test_elem() -> json::Element<'static> {
        json::Element {
            id: 0.into(),
            path_node: Rc::new(json::PathNode::Root),
            span: json::parser::Span::default(),
            value: json::Value::Null,
        }
    }

    #[test]
    fn should_unescape_empty_str() {
        const INPUT: &str = "";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();
        assert_matches!(string, Cow::Borrowed(""));
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_unescape_str_without_escapes() {
        const INPUT: &str = "ab";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();
        assert_matches!(string, Cow::Borrowed(INPUT));
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_unescape_str_with_forward_slash_escape() {
        const INPUT: &str = r"a\/b";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();
        let s = assert_matches!(
            string,
            Cow::Owned(s) => s
        );

        assert_eq!(s, "a/b");
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_unescape_str_with_many_escapes() {
        const INPUT: &str = r#"a\/\"b\""#;

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();
        let s = assert_matches!(
            string,
            Cow::Owned(s) => s
        );

        assert_eq!(s, r#"a/"b""#);
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_fail_to_unescape_str_with_invalid_escape() {
        // Invalid escape at the start of the string: the reported index is that of the
        // char following the backslash.
        {
            const INPUT: &str = r"\a/c";

            let elem = test_elem();
            let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

            assert_matches!(string, Cow::Borrowed(_));
            assert_matches!(
                warnings.as_slice(),
                [Warning {
                    kind: WarningKind::InvalidEscape(1),
                    ..
                }]
            );
        }

        // Invalid escape at the end of the string.
        {
            const INPUT: &str = r"a\c";

            let elem = test_elem();
            let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

            assert_matches!(string, Cow::Borrowed(_));
            assert_matches!(
                warnings.as_slice(),
                [Warning {
                    kind: WarningKind::InvalidEscape(2),
                    ..
                }]
            );
        }

        // A trailing backslash: the reported index is that of the backslash itself, as it
        // is the last char that was read.
        {
            const INPUT: &str = r"a/c\";

            let elem = test_elem();
            let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

            assert_matches!(string, Cow::Borrowed(_));
            assert_matches!(
                warnings.as_slice(),
                [Warning {
                    kind: WarningKind::UnexpectedEndOfString(3),
                    ..
                }]
            );
        }
    }

    #[test]
    fn should_fail_to_unescape_str_with_control_char() {
        // Not a raw string: the Rust compiler turns `\u{0019}` into an actual control char,
        // so the input contains no backslash.
        const INPUT: &str = "hello\u{0019}world";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        assert_matches!(string, Cow::Borrowed(_));
        assert_matches!(
            warnings.as_slice(),
            [Warning {
                kind: WarningKind::ControlCharacterWhileParsingString(5),
                ..
            }]
        );
    }

    #[test]
    fn should_fail_to_unescape_raw_str_with_rust_unicode_literal_control_char() {
        // A raw string: the input contains the literal chars `\u{0019}`. The four chars read
        // after `\u` are `{001`, which are not all hex digits; index 10 is the fourth of them.
        const INPUT: &str = r"hello\u{0019}world";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        assert_matches!(string, Cow::Borrowed(_));
        assert_matches!(
            warnings.as_slice(),
            [Warning {
                kind: WarningKind::InvalidEscape(10),
                ..
            }]
        );
    }

    #[test]
    fn should_fail_to_unescape_json_control_escape() {
        // A valid JSON escape that decodes to a control char; index 10 is the last hex digit.
        const INPUT: &str = r"hello\u0019world";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        assert_matches!(string, Cow::Borrowed(_));
        assert_matches!(
            warnings.as_slice(),
            [Warning {
                kind: WarningKind::ControlCharacterWhileParsingString(10),
                ..
            }]
        );
    }

    #[test]
    fn should_unescape_unicode_literals() {
        const INPUT: &str = r"hello\u0020world\u0021";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        let s = assert_matches!(
            string,
            Cow::Owned(s) => s
        );
        assert_eq!(s, "hello world!");
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_unescape_utf_16_surrogate_pair() {
        // This test data is taken from the JSON RFC 8259 spec.
        //
        // * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
        const INPUT: &str = r"hello\uD834\uDD1Eworld";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        let s = assert_matches!(
            string,
            Cow::Owned(s) => s
        );
        assert_eq!(s, "hello\u{1D11E}world");
        assert_matches!(warnings.as_slice(), []);
    }

    #[test]
    fn should_unescape_unicode_literal_followed_by_simple_escape() {
        // The backslash after the Unicode escape starts a `\/` escape, not a second `\u`.
        const INPUT: &str = r"hello\u0020\/world\u0021";

        let elem = test_elem();
        let (string, warnings) = unescape_str(INPUT, &elem).into_parts();

        let s = assert_matches!(
            string,
            Cow::Owned(s) => s
        );
        assert_eq!(s, "hello /world!");
        assert_matches!(warnings.as_slice(), []);
    }
}
573}