Skip to main content

ocpi_tariffs/json/
decode.rs

1//! A `Warning` for warnings that can happen when decoding a JSON `&str`.
2#[cfg(test)]
3mod test_unescape;
4
5use std::{borrow::Cow, fmt, iter::Peekable};
6
7use crate::{into_caveat, warning, Caveat, IntoCaveat};
8
9use super::Element;
10
11const ESCAPE_CHAR: char = '\\';
12
/// The kind of `Warning` that can happen when decoding a `&str`.
///
/// The first field of every variant is an index into the `&str` being decoded,
/// as produced by `str::char_indices`.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// A control char was found while parsing a JSON string.
    ControlCharacterWhileParsingString(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The second field is the unpaired surrogate reported by the UTF-16 decoder.
    DecodeUtf16(usize, u16),

    /// A string contains an invalid escape sequence.
    InvalidEscape(usize),

    /// The string ended before the parser expected.
    UnexpectedEndOfString(usize),
}
28
29impl crate::Warning for Warning {
30    /// A human readable identifier for each `Warning`.
31    fn id(&self) -> warning::Id {
32        match self {
33            Self::ControlCharacterWhileParsingString(_) => {
34                warning::Id::from_static("control_character_while_parsing_string")
35            }
36            Self::DecodeUtf16(..) => warning::Id::from_static("decode_utf_1_6"),
37            Self::InvalidEscape(_) => warning::Id::from_static("invalid_escape"),
38            Self::UnexpectedEndOfString(_) => warning::Id::from_static("unexpected_end_of_string"),
39        }
40    }
41}
42
43impl fmt::Display for Warning {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        match self {
46            Self::ControlCharacterWhileParsingString(index) => {
47                write!(
48                    f,
49                    "Control chars were found at index `{index}` while decoding a JSON string."
50                )
51            }
52            Self::DecodeUtf16(index, code) => {
53                write!(
54                    f,
55                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
56                )
57            }
58            Self::InvalidEscape(index) => {
59                write!(
60                    f,
61                    "String contains an invalid escape char at index: `{index})`."
62                )
63            }
64            Self::UnexpectedEndOfString(index) => {
65                write!(f, "The String ended prematurely at index: `{index}`.")
66            }
67        }
68    }
69}
70
71/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
72pub(crate) fn analyze<'buf>(
73    s: &'buf str,
74    elem: &Element<'buf>,
75) -> Caveat<PendingStr<'buf>, Warning> {
76    let mut warnings = warning::Set::new();
77
78    // Strings are expected to be small so running over all bytes
79    // with the intent of early exiting is acceptable.
80    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
81        PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings)
82    } else {
83        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
84            warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
85        }
86
87        PendingStr::NoEscapes(s).into_caveat(warnings)
88    }
89}
90
/// Marks a `&str` as having escapes or not.
///
/// Produced by `analyze`, which decides the variant by checking for a backslash.
pub(crate) enum PendingStr<'buf> {
    /// The `&str` has no escapes and can be used as is.
    NoEscapes(&'buf str),

    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
    HasEscapes(EscapeStr<'buf>),
}

// Lets a `PendingStr` be wrapped in a `Caveat` via `into_caveat`, as `analyze` does.
into_caveat!(PendingStr<'buf>);
101
/// A `&str` with escape chars.
///
/// The wrapped `&str` is still in its escaped form.
pub(crate) struct EscapeStr<'buf>(&'buf str);

impl<'buf> EscapeStr<'buf> {
    /// Decode the escape sequences in the wrapped `&str` by delegating to `unescape_str`.
    ///
    /// Any `Warning` raised while decoding is recorded against `elem`.
    pub(crate) fn decode_escapes(&self, elem: &Element<'buf>) -> Caveat<Cow<'buf, str>, Warning> {
        unescape_str(self.0, elem)
    }

    /// Consume the `EscapeStr` and return the raw, still escaped, `&str`.
    pub(crate) fn into_raw(self) -> &'buf str {
        self.0
    }
}
115
116/// Return the `str` with escaped chars replaced with the decoded equivalent.
117///
118/// Return `Some(Cow::Owned)` if there are escape chars in the `str` otherwise return
119/// `Some(Cow::Owned)` containing the input `str`.
120/// Return `None` if the `str` contains invalid or unhandled escape chars.
121pub(crate) fn unescape_str<'buf>(
122    s: &'buf str,
123    elem: &Element<'buf>,
124) -> Caveat<Cow<'buf, str>, Warning> {
125    let mut warnings = warning::Set::new();
126
127    // Strings are expected to be small so running over all bytes to early out is acceptable.
128    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
129        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
130            warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
131        }
132        return Cow::Borrowed(s).into_caveat(warnings);
133    }
134
135    let mut chars = Chars::from_str(s);
136    let mut buf = Buffer::with_capacity(s.len());
137
138    loop {
139        let Some((index, ch)) = chars.next() else {
140            return Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings);
141        };
142
143        if ch == ESCAPE_CHAR {
144            if let Err(warn_kind) = parse_escape(&mut chars, &mut buf) {
145                warnings.insert(warn_kind, elem);
146                return Cow::Borrowed(s).into_caveat(warnings);
147            }
148        } else if let Err(warn_kind) = buf.push_char(ch, index) {
149            warnings.insert(warn_kind, elem);
150            return Cow::Borrowed(s).into_caveat(warnings);
151        }
152    }
153}
154
155/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
156/// the previous byte read was a backslash.
157///
158/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
159fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
160    let (index, ch) = chars.next_or_eof()?;
161
162    let ch = match ch {
163        '"' => '"',
164        '\\' => '\\',
165        '/' => '/',
166        'b' => '\x08',
167        'f' => '\x0c',
168        'n' => '\n',
169        'r' => '\r',
170        't' => '\t',
171        'u' => return parse_unicode_escape(chars, buf),
172        _ => {
173            return Err(Warning::InvalidEscape(index));
174        }
175    };
176
177    buf.push_char(ch, index)?;
178
179    Ok(())
180}
181
182/// Parses a JSON \u escape and appends it into the buffer.
183/// Assumes `\u` has just been read.
184///
185/// The Unicode escape might be a UTF-16 surrogate pair.
186///
187/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
188fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
189    let n1 = decode_hex_escape(chars)?;
190    let n2 = chars.is_next_escape()?;
191
192    if let Some(n2) = n2 {
193        buf.push_surrogate_pair(n1, n2, chars.index)?;
194    } else {
195        let Some(ch) = char::from_u32(u32::from(n1)) else {
196            return Err(Warning::InvalidEscape(chars.index));
197        };
198
199        buf.push_char(ch, chars.index)?;
200    }
201
202    Ok(())
203}
204
/// A char iterator over a `&str` that remembers the index of the last char it yielded.
///
/// `next_or_eof` additionally fails if the next char is a control char.
struct Chars<'buf> {
    /// The `char` iterator
    ///
    /// A `CharIndices` is used so that every char comes with its index in the source `&str`;
    /// the indices are reported in the warnings.
    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
    /// Unicode literal and treat that as a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// The last parsed char index
    ///
    /// Starts at `0` and is updated every time `Iterator::next` yields a char.
    index: usize,
}
217
218impl<'buf> Chars<'buf> {
219    /// Create a new `Chars` iterator from a `&str`.
220    fn from_str(s: &'buf str) -> Self {
221        Self {
222            char_indices: s.char_indices().peekable(),
223            index: 0,
224        }
225    }
226
227    /// Return the next char as `Ok` or return `Err(UnexpectedEOF)` if there is no char
228    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
229    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
230        if let Some((index, ch)) = self.next() {
231            if ch.is_control() {
232                return Err(Warning::ControlCharacterWhileParsingString(index));
233            }
234
235            Ok((index, ch))
236        } else {
237            Err(Warning::UnexpectedEndOfString(self.index))
238        }
239    }
240
241    /// Look ahead in the char stream and if there is another unicode escape return it as a decoded
242    /// hex escape.
243    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
244        {
245            let escape_char = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR);
246
247            if escape_char.is_none() {
248                return Ok(None);
249            }
250        }
251
252        {
253            let escape_unicode = self.char_indices.next_if(|(_, ch)| *ch == 'u');
254
255            if escape_unicode.is_none() {
256                return Ok(None);
257            }
258        }
259
260        let n = decode_hex_escape(self)?;
261        Ok(Some(n))
262    }
263}
264
265impl Iterator for Chars<'_> {
266    type Item = (usize, char);
267
268    fn next(&mut self) -> Option<Self::Item> {
269        if let Some((index, char)) = self.char_indices.next() {
270            self.index = index;
271            Some((index, char))
272        } else {
273            None
274        }
275    }
276}
277
/// The `String` based buffer where we accumulate the unescaped JSON string.
///
/// If a control char is given to `push_char`,
/// the method will return `Err(ControlCharacterWhileParsingString)`.
struct Buffer {
    /// The `String` to accumulate chars in.
    buf: String,
}
286
287impl Buffer {
288    /// Create a new `Buffer`
289    fn with_capacity(capacity: usize) -> Self {
290        Self {
291            buf: String::with_capacity(capacity),
292        }
293    }
294
295    /// Push a char into the `String`.
296    ///
297    /// Return `Err` if the char is a control char and the `fail_on_control` is true.
298    /// Otherwise return `Ok`.
299    fn push_char(&mut self, ch: char, index: usize) -> Result<(), Warning> {
300        if ch.is_control() {
301            return Err(Warning::ControlCharacterWhileParsingString(index));
302        }
303
304        self.buf.push(ch);
305        Ok(())
306    }
307
308    /// Consume the `Buffer` and return the inner `String`.
309    fn into_string(self) -> String {
310        self.buf
311    }
312
313    /// Decodes the high and low parts of a UTF-16 surrogate pair and pushes the resulting
314    /// `char` on to the `Buffer`.
315    ///
316    /// Returns `Ok(char)` if the decoding succeeds.
317    /// Returns `Err(DecodeUtf16)` if the decoding fails.
318    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
319        let Some(ch) = char::decode_utf16([n1, n2]).next() else {
320            return Err(Warning::InvalidEscape(index));
321        };
322
323        let ch = match ch {
324            Ok(ch) => ch,
325            Err(err) => {
326                return Err(Warning::DecodeUtf16(index, err.unpaired_surrogate()));
327            }
328        };
329
330        self.push_char(ch, index)?;
331
332        Ok(ch)
333    }
334}
335
336/// Munch four chars as bytes and try convert into a `char`.
337fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
338    const RADIX: u32 = 16;
339
340    let (_, one) = chars.next_or_eof()?;
341    let (_, two) = chars.next_or_eof()?;
342    let (_, three) = chars.next_or_eof()?;
343    let (index, four) = chars.next_or_eof()?;
344
345    let string = [one, two, three, four].into_iter().collect::<String>();
346    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
347        return Err(Warning::InvalidEscape(index));
348    };
349
350    Ok(n)
351}