Skip to main content

ocpi_tariffs/json/
decode.rs

1//! A `Warning` for warnings that can happen when decoding a JSON `&str`.
2#[cfg(test)]
3mod test_unescape;
4
5use std::{borrow::Cow, fmt, iter::Peekable};
6
7use crate::{warning, Caveat, IntoCaveat};
8
9use super::Element;
10
/// The char that introduces an escape sequence inside a JSON string: a single backslash.
const ESCAPE_CHAR: char = '\\';
12
/// The problems that can be reported while decoding the contents of a JSON string.
///
/// The `usize` carried by every variant is the index into the source `&str` at which
/// the decoder reported the problem.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// A control char was encountered in the string.
    ControlCharacterWhileParsingString(usize),

    /// A pair of `\u` escapes could not be decoded as UTF-16.
    ///
    /// The `u16` is the unpaired surrogate reported by the decoder.
    DecodeUtf16(usize, u16),

    /// An escape sequence in the string is not valid.
    InvalidEscape(usize),

    /// The string ended while the decoder still expected more chars.
    UnexpectedEndOfString(usize),
}
28
29impl crate::Warning for Warning {
30    /// A human readable identifier for each `Warning`.
31    fn id(&self) -> warning::Id {
32        match self {
33            Self::ControlCharacterWhileParsingString(_) => {
34                warning::Id::from_static("control_character_while_parsing_string")
35            }
36            Self::DecodeUtf16(..) => warning::Id::from_static("decode_utf_1_6"),
37            Self::InvalidEscape(_) => warning::Id::from_static("invalid_escape"),
38            Self::UnexpectedEndOfString(_) => warning::Id::from_static("unexpected_end_of_string"),
39        }
40    }
41}
42
43impl fmt::Display for Warning {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        match self {
46            Self::ControlCharacterWhileParsingString(index) => {
47                write!(
48                    f,
49                    "Control chars were found at index `{index}` while decoding a JSON string."
50                )
51            }
52            Self::DecodeUtf16(index, code) => {
53                write!(
54                    f,
55                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
56                )
57            }
58            Self::InvalidEscape(index) => {
59                write!(
60                    f,
61                    "String contains an invalid escape char at index: `{index})`."
62                )
63            }
64            Self::UnexpectedEndOfString(index) => {
65                write!(f, "The String ended prematurely at index: `{index}`.")
66            }
67        }
68    }
69}
70
71/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
72pub(crate) fn analyze<'buf>(
73    s: &'buf str,
74    elem: &Element<'buf>,
75) -> Caveat<PendingStr<'buf>, Warning> {
76    let mut warnings = warning::Set::new();
77
78    // Strings are expected to be small so running over all bytes
79    // with the intent of early exiting is acceptable.
80    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
81        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
82    } else {
83        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
84            warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
85        }
86
87        PendingStr::NoEscapes(s).into_caveat(warnings)
88    }
89}
90
91/// Marks a `&str` as having escapes or not.
92pub(crate) enum PendingStr<'buf> {
93    /// The `&str` has no escapes and can be used as is.
94    NoEscapes(&'buf str),
95
96    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
97    HasEscapes(EscapeStr<'buf>),
98}
99
/// A borrowed `&str` that is known to contain at least one escape char.
pub(crate) struct EscapeStr<'buf>(&'buf str);
102
103impl<'buf> EscapeStr<'buf> {
104    pub(crate) fn decode_escapes(&self, elem: &Element<'buf>) -> Caveat<Cow<'buf, str>, Warning> {
105        unescape_str(self.0, elem)
106    }
107
108    /// Consume the `EscapeStr` and return the raw bytes as a str.
109    pub(crate) fn into_raw(self) -> &'buf str {
110        self.0
111    }
112}
113
114/// Return the `str` with escaped chars replaced with the decoded equivalent.
115///
116/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
117/// `Some(Cow::Owned)` containing the input `str`.
118/// Return `None` if the `str` contains invalid or unhandled escape chars.
119pub(crate) fn unescape_str<'buf>(
120    s: &'buf str,
121    elem: &Element<'buf>,
122) -> Caveat<Cow<'buf, str>, Warning> {
123    let mut warnings = warning::Set::new();
124
125    // Strings are expected to be small so running over all
126    // bytes to early out is acceptable.
127    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
128        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
129            warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
130        }
131        return Cow::Borrowed(s).into_caveat(warnings);
132    }
133
134    let mut chars = Chars::from_str(s);
135    let mut buf = Buffer::with_capacity(s.len());
136
137    loop {
138        let Some((index, ch)) = chars.next() else {
139            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
140        };
141
142        if ch == ESCAPE_CHAR {
143            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
144                warnings.insert(warn_kind, elem);
145                return Cow::Borrowed(s).into_caveat(warnings);
146            }
147        } else if let Err(warn_kind) = buf.push_char(ch, index) {
148            warnings.insert(warn_kind, elem);
149            return Cow::Borrowed(s).into_caveat(warnings);
150        }
151    }
152}
153
154/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
155/// the previous byte read was a backslash.
156///
157/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
158fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
159    let (index, ch) = chars.next_or_eof()?;
160
161    let ch = match ch {
162        '"' => '"',
163        '\\' => '\\',
164        '/' => '/',
165        'b' => '\x08',
166        'f' => '\x0c',
167        'n' => '\n',
168        'r' => '\r',
169        't' => '\t',
170        'u' => return parse_unicode_escape(chars, buf),
171        _ => {
172            return Err(Warning::InvalidEscape(index));
173        }
174    };
175
176    buf.push_char(ch, index)?;
177
178    Ok(())
179}
180
181/// Parses a JSON `\u` escape and appends it into the buffer.
182/// Assumes `\u` has just been read.
183///
184/// The Unicode escape might be a UTF-16 surrogate pair.
185///
186/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
187fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
188    let n1 = decode_hex_escape(chars)?;
189    let n2 = chars.is_next_escape()?;
190
191    if let Some(n2) = n2 {
192        buf.push_surrogate_pair(n1, n2, chars.index)?;
193    } else {
194        let Some(ch) = char::from_u32(u32::from(n1)) else {
195            return Err(Warning::InvalidEscape(chars.index));
196        };
197
198        buf.push_char(ch, chars.index)?;
199    }
200
201    Ok(())
202}
203
/// A cursor over the chars of a `&str` that remembers the index of the last char it yielded.
struct Chars<'buf> {
    /// The underlying iterator of `(index, char)` pairs.
    ///
    /// It is `Peekable` so the decoder can look ahead for a second Unicode escape and
    /// treat the two as a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// The index of the char most recently yielded through `Iterator::next`.
    index: usize,
}
216
217impl<'buf> Chars<'buf> {
218    /// Create a new `Chars` iterator from a `&str`.
219    fn from_str(s: &'buf str) -> Self {
220        Self {
221            char_indices: s.char_indices().peekable(),
222            index: 0,
223        }
224    }
225
226    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
227    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
228    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
229        if let Some((index, ch)) = self.next() {
230            if ch.is_control() {
231                return Err(Warning::ControlCharacterWhileParsingString(index));
232            }
233
234            Ok((index, ch))
235        } else {
236            Err(Warning::UnexpectedEndOfString(self.index))
237        }
238    }
239
240    /// Look ahead in the char stream and if there is another Unicode escape return it as a decoded
241    /// hex escape.
242    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
243        {
244            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
245
246            if escape_char.is_none() {
247                return Ok(None);
248            }
249        }
250
251        {
252            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
253
254            if escape_unicode.is_none() {
255                return Ok(None);
256            }
257        }
258
259        let n = decode_hex_escape(self)?;
260        Ok(Some(n))
261    }
262}
263
264impl Iterator for Chars<'_> {
265    type Item = (usize, char);
266
267    fn next(&mut self) -> Option<Self::Item> {
268        if let Some((index, char)) = self.char_indices.next() {
269            self.index = index;
270            Some((index, char))
271        } else {
272            None
273        }
274    }
275}
276
/// The `String`-backed buffer in which the decoded JSON string is accumulated.
struct Buffer {
    /// The chars accepted so far.
    buf: String,
}
285
286impl Buffer {
287    /// Create a new `Buffer`
288    fn with_capacity(capacity: usize) -> Self {
289        Self {
290            buf: String::with_capacity(capacity),
291        }
292    }
293
294    /// Push a char into the `String`.
295    ///
296    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
297    /// Otherwise, return `Ok`.
298    fn push_char(&mut self, ch: char, index: usize) -> Result<(), Warning> {
299        if ch.is_control() {
300            return Err(Warning::ControlCharacterWhileParsingString(index));
301        }
302
303        self.buf.push(ch);
304        Ok(())
305    }
306
307    /// Consume the `Buffer` and return the inner `String`.
308    fn into_string(self) -> String {
309        self.buf
310    }
311
312    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
313    /// `char` on to the `Buffer`.
314    ///
315    /// Returns `Ok(char)` if the decoding succeeds.
316    /// Returns `Err(DecodeUtf16)` if the decoding fails.
317    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
318        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
319            return Err(Warning::InvalidEscape(index));
320        };
321
322        let ch = match ch {
323            Ok(ch) => ch,
324            Err(err) => {
325                return Err(Warning::DecodeUtf16(index, err.unpaired_surrogate()));
326            }
327        };
328
329        self.push_char(ch, index)?;
330
331        Ok(ch)
332    }
333}
334
335/// Munch four chars as bytes and try to convert them into a `char`.
336fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
337    const RADIX: u32 = 16;
338
339    let (_, one) = chars.next_or_eof()?;
340    let (_, two) = chars.next_or_eof()?;
341    let (_, three) = chars.next_or_eof()?;
342    let (index, four) = chars.next_or_eof()?;
343
344    let string = [one, two, three, four].into_iter().collect::<String>();
345    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
346        return Err(Warning::InvalidEscape(index));
347    };
348
349    Ok(n)
350}