Skip to main content

ocpi_tariffs/json/
decode.rs

1//! String decoding for JSON values parsed by [`super`].
2//!
3//! # Parsing vs decoding
4//!
5//! The JSON parser ([`super::parser`]) is intentionally lenient: it accepts
6//! any structurally valid JSON string, leaving escape sequences and control
7//! characters verbatim inside a [`super::RawStr`]. Decoding is a separate
8//! step performed by this module.
9//!
10//! Keeping these concerns apart means the linter can treat string-encoding
11//! problems as warnings rather than fatal parse errors. The caller receives
12//! both a best-effort string value and a list of [`Warning`]s describing what
13//! was wrong and where, which is enough information to suggest a corrected
14//! encoding to the user.
15#[cfg(test)]
16mod test_from_str;
17
18use std::{borrow::Cow, fmt, iter::Peekable, ops::RangeInclusive};
19
20use super::Element;
21use crate::{
22    warning::{self, CaveatDeferred, IntoCaveatDeferred as _},
23    Caveat, IntoCaveat as _,
24};
25
/// The character that introduces an escape sequence inside a JSON string body.
const ESCAPE_CHAR: char = '\\';
27
/// The kind of `Warning` that can happen when decoding a `&str`.
///
/// Every variant carries a `usize` index. The indices are produced by
/// `str::char_indices`, so they are byte offsets into the string being decoded.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// Control chars were found while parsing a JSON string.
    ///
    /// The payload is the index at which the control char was detected.
    ControlCharacter(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The payload is the index at which decoding failed, followed by the
    /// unpaired surrogate code unit that could not be decoded.
    DecodeUtf16(usize, u16),

    /// A string contains invalid escape chars.
    ///
    /// The payload is the index at which the invalid escape was detected.
    InvalidEscape(usize),

    /// The String ended before the parser expected.
    ///
    /// The payload is the index of the last char that was read before the
    /// string ran out.
    UnexpectedEndOfString(usize),
}
43
44impl crate::Warning for Warning {
45    /// A human readable identifier for each `Warning`.
46    fn id(&self) -> warning::Id {
47        match self {
48            Self::ControlCharacter(_) => {
49                warning::Id::from_static("control_character_while_parsing_string")
50            }
51            Self::DecodeUtf16(..) => warning::Id::from_static("decode_utf_1_6"),
52            Self::InvalidEscape(_) => warning::Id::from_static("invalid_escape"),
53            Self::UnexpectedEndOfString(_) => warning::Id::from_static("unexpected_end_of_string"),
54        }
55    }
56}
57
58impl fmt::Display for Warning {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        match self {
61            Self::ControlCharacter(index) => {
62                write!(
63                    f,
64                    "Control chars were found at index `{index}` while decoding a JSON string."
65                )
66            }
67            Self::DecodeUtf16(index, code) => {
68                write!(
69                    f,
70                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
71                )
72            }
73            Self::InvalidEscape(index) => {
74                write!(
75                    f,
76                    "String contains an invalid escape char at index: `{index})`."
77                )
78            }
79            Self::UnexpectedEndOfString(index) => {
80                write!(f, "The String ended prematurely at index: `{index}`.")
81            }
82        }
83    }
84}
85
86/// Return a `PendingStr` that marks the inner `&str` is containing escape codes or not.
87pub(super) fn analyze<'buf>(
88    s: &'buf str,
89    elem: &Element<'buf>,
90) -> Caveat<super::PendingStr<'buf>, Warning> {
91    let mut warnings = warning::Set::new();
92
93    // Strings are expected to be small so running over all bytes
94    // with the intent of early exiting is acceptable.
95    if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
96        warnings.insert(elem, Warning::ControlCharacter(index));
97    }
98
99    if s.chars().any(|ch| ch == ESCAPE_CHAR) {
100        super::PendingStr::HasEscapes(super::EscapeStr(s)).into_caveat(warnings)
101    } else {
102        super::PendingStr::NoEscapes(s).into_caveat(warnings)
103    }
104}
105
106/// Return the `str` with escaped chars replaced with the decoded equivalent.
107///
108/// Returns `Cow::Owned` when all escapes decoded successfully.
109/// Returns `Cow::Borrowed` is the raw string doesn't contain any escapes
110/// or if there is any issues/warnings with the source `&str`.
111pub(super) fn from_raw<'buf>(s: &'buf str) -> CaveatDeferred<Cow<'buf, str>, Warning> {
112    let mut warnings = warning::SetDeferred::new();
113
114    // Strings are expected to be small so running over all
115    // bytes to early out is acceptable.
116    if !s.chars().any(|ch| ch == ESCAPE_CHAR) {
117        if let Some((index, _)) = s.char_indices().find(|(_, ch)| ch.is_control()) {
118            warnings.insert(Warning::ControlCharacter(index));
119        }
120        return Cow::Borrowed(s).into_caveat_deferred(warnings);
121    }
122
123    let mut buf = Buffer::with_capacity(s.len());
124    for decoded in Decoded::from_str(s) {
125        match decoded {
126            Ok(ch) => buf.push(ch),
127            Err(warn_kind) => {
128                warnings.insert(warn_kind);
129                return Cow::Borrowed(s).into_caveat_deferred(warnings);
130            }
131        }
132    }
133
134    Cow::<'buf, str>::Owned(buf.into_string()).into_caveat_deferred(warnings)
135}
136
137/// Compare `other` against the decoded form of the raw JSON string body `raw`,
138/// decoding escape sequences on the fly. Does not allocate.
139///
140/// Returns `Ok(true)` / `Ok(false)` for the comparison, or `Err` if `raw` has a
141/// decoding problem (invalid escape, control character, ...) at or before the
142/// first differing character. Once the two strings are known to differ, decoding
143/// stops, so a decode problem beyond that point is not reported.
144pub(super) fn eq(raw: &str, other: &str) -> Result<bool, Warning> {
145    let mut decoded = Decoded::from_str(raw);
146    let mut expected = other.chars();
147
148    loop {
149        match decoded.next() {
150            Some(Err(warn_kind)) => return Err(warn_kind),
151            Some(Ok(actual)) => {
152                if expected.next() != Some(actual) {
153                    return Ok(false);
154                }
155            }
156            // `raw` is exhausted; the strings are equal only if `other` is too.
157            None => return Ok(expected.next().is_none()),
158        }
159    }
160}
161
162/// Like [`eq`], but compares ASCII letters case-insensitively.
163pub(super) fn eq_ignore_ascii_case(raw: &str, other: &str) -> Result<bool, Warning> {
164    let mut decoded = Decoded::from_str(raw);
165    let mut expected = other.chars();
166
167    loop {
168        match decoded.next() {
169            Some(Err(warn_kind)) => return Err(warn_kind),
170            Some(Ok(actual)) => match expected.next() {
171                Some(expected) if expected.eq_ignore_ascii_case(&actual) => {}
172                _ => return Ok(false),
173            },
174            // `raw` is exhausted; the strings are equal only if `other` is too.
175            None => return Ok(expected.next().is_none()),
176        }
177    }
178}
179
180/// Parses a JSON escape sequence and appends it into the `Buffer`. Assumes
181/// the previous byte read was a backslash.
182///
183/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
184fn parse_escape(chars: &mut Chars<'_>) -> Result<char, Warning> {
185    let (index, ch) = chars.next_or_eof()?;
186
187    let ch = match ch {
188        '"' => '"',
189        '\\' => '\\',
190        '/' => '/',
191        'b' => '\x08',
192        'f' => '\x0c',
193        'n' => '\n',
194        'r' => '\r',
195        't' => '\t',
196        'u' => return parse_unicode_escape(chars),
197        _ => {
198            return Err(Warning::InvalidEscape(index));
199        }
200    };
201
202    if ch.is_control() {
203        return Err(Warning::ControlCharacter(index));
204    }
205
206    Ok(ch)
207}
208
209/// Parses a JSON `\u` escape and appends it into the buffer.
210/// Assumes `\u` has just been read.
211///
212/// The Unicode escape might be a UTF-16 surrogate pair.
213///
214/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
215fn parse_unicode_escape(chars: &mut Chars<'_>) -> Result<char, Warning> {
216    // High surrogates occupy `U+D800..=U+DBFF`. A high surrogate is the first
217    // code unit of a UTF-16 surrogate pair; it must be followed by a low
218    // surrogate (`U+DC00..=U+DFFF`) to form a valid scalar value. Lone high
219    // surrogates and all low surrogates are not valid Unicode scalar values.
220    const HIGH_SURROGATE: RangeInclusive<u16> = 0xD800..=0xDBFF;
221
222    let n1 = decode_hex_escape(chars)?;
223
224    let ch = if HIGH_SURROGATE.contains(&n1) {
225        // Only look for a surrogate-pair continuation when N1 is a high surrogate.
226        // Calling `is_next_escape` unconditionally would consume the `\` of any
227        // following escape and pass two unrelated BMP codepoints to
228        // decode_surrogate_pair, which only decodes the first and silently drops the second.
229        let Some(n2) = chars.is_next_escape()? else {
230            return Err(Warning::InvalidEscape(chars.index));
231        };
232        decode_surrogate_pair(n1, n2, chars.index)?
233    } else {
234        let Some(ch) = char::from_u32(u32::from(n1)) else {
235            return Err(Warning::InvalidEscape(chars.index));
236        };
237        ch
238    };
239
240    if ch.is_control() {
241        return Err(Warning::ControlCharacter(chars.index));
242    }
243
244    Ok(ch)
245}
246
/// A char iterator over a JSON string body that tracks the position of the
/// last char it handed out, and whose `next_or_eof` fails if the next char is
/// a control char.
struct Chars<'buf> {
    /// The `char` iterator.
    ///
    /// This is a `CharIndices` so that every char comes with its position in
    /// the source string, which the warnings report.
    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
    /// Unicode literal and treat that as a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// A single character pushed back by `is_next_escape` when it consumed a `\`
    /// speculatively but found it was not the start of a `\uXXXX` sequence.
    ///
    /// It is handed out by the next call to `next` before anything else.
    push_back: Option<(usize, char)>,

    /// The index of the last char returned by `next`; `0` until then.
    index: usize,
}
263
264impl<'buf> Chars<'buf> {
265    /// Create a new `Chars` iterator from a `&str`.
266    fn from_str(s: &'buf str) -> Self {
267        Self {
268            char_indices: s.char_indices().peekable(),
269            push_back: None,
270            index: 0,
271        }
272    }
273
274    /// Return the next char as `Ok` or return `Err(UnexpectedEndOfString)` if there is no char
275    /// or return `Err(ControlCharacter)` if the next char is a control char.
276    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
277        if let Some((index, ch)) = self.next() {
278            if ch.is_control() {
279                return Err(Warning::ControlCharacter(index));
280            }
281
282            Ok((index, ch))
283        } else {
284            Err(Warning::UnexpectedEndOfString(self.index))
285        }
286    }
287
288    /// Look ahead in the char stream and if there is another `\uXXXX` sequence
289    /// return it as a decoded hex value.
290    ///
291    /// If a `\` is found but not followed by `u`, the `\` is pushed back so the
292    /// outer loop can process it as the start of a different escape sequence.
293    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
294        let Some(backslash) = self.char_indices.next_if(|(_, ch)| *ch == ESCAPE_CHAR) else {
295            return Ok(None);
296        };
297
298        if self.char_indices.next_if(|(_, ch)| *ch == 'u').is_none() {
299            self.push_back = Some(backslash);
300            return Ok(None);
301        }
302
303        let n = decode_hex_escape(self)?;
304        Ok(Some(n))
305    }
306}
307
308impl Iterator for Chars<'_> {
309    type Item = (usize, char);
310
311    fn next(&mut self) -> Option<Self::Item> {
312        if let Some(item) = self.push_back.take() {
313            self.index = item.0;
314            return Some(item);
315        }
316        if let Some((index, char)) = self.char_indices.next() {
317            self.index = index;
318            Some((index, char))
319        } else {
320            None
321        }
322    }
323}
324
/// Iterator over the decoded `char`s of a JSON string body (quotes removed).
///
/// Escape sequences are decoded on the fly; control characters and malformed
/// escapes are returned as `Err`. Callers typically stop at the first `Err`.
struct Decoded<'buf> {
    /// The underlying position-tracking char stream that escapes are read from.
    chars: Chars<'buf>,
}
332
333impl<'buf> Decoded<'buf> {
334    /// Create a `Decoded` iterator from a JSON string body.
335    fn from_str(s: &'buf str) -> Self {
336        Self {
337            chars: Chars::from_str(s),
338        }
339    }
340}
341
342impl Iterator for Decoded<'_> {
343    type Item = Result<char, Warning>;
344
345    fn next(&mut self) -> Option<Self::Item> {
346        let (index, ch) = self.chars.next()?;
347
348        if ch == ESCAPE_CHAR {
349            Some(parse_escape(&mut self.chars))
350        } else if ch.is_control() {
351            Some(Err(Warning::ControlCharacter(index)))
352        } else {
353            Some(Ok(ch))
354        }
355    }
356}
357
/// A thin wrapper around a `String` in which the decoded JSON string is
/// accumulated one `char` at a time.
struct Buffer {
    /// The `String` that decoded chars are appended to.
    buf: String,
}

impl Buffer {
    /// Create an empty `Buffer` whose `String` is pre-allocated to hold at
    /// least `capacity` bytes.
    fn with_capacity(capacity: usize) -> Self {
        let buf = String::with_capacity(capacity);

        Self { buf }
    }

    /// Append an already-decoded char to the end of the `String`.
    fn push(&mut self, ch: char) {
        self.buf.push(ch);
    }

    /// Consume the `Buffer` and hand back the accumulated `String`.
    fn into_string(self) -> String {
        let Self { buf } = self;

        buf
    }
}
382
383/// Decode the high and low code units of a UTF-16 surrogate pair into a `char`.
384///
385/// Returns `Err(DecodeUtf16)` if the two code units do not form a valid pair.
386fn decode_surrogate_pair(n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
387    let Some(ch) = char::decode_utf16([n1, n2]).next() else {
388        return Err(Warning::InvalidEscape(index));
389    };
390
391    match ch {
392        Ok(ch) => Ok(ch),
393        Err(err) => Err(Warning::DecodeUtf16(index, err.unpaired_surrogate())),
394    }
395}
396
397/// Munch four chars as bytes and try to convert them into a `char`.
398fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
399    const RADIX: u32 = 16;
400
401    let (_, one) = chars.next_or_eof()?;
402    let (_, two) = chars.next_or_eof()?;
403    let (_, three) = chars.next_or_eof()?;
404    let (index, four) = chars.next_or_eof()?;
405
406    let string = [one, two, three, four].into_iter().collect::<String>();
407    let Ok(n) = u16::from_str_radix(&string, RADIX) else {
408        return Err(Warning::InvalidEscape(index));
409    };
410
411    Ok(n)
412}