ocpi-tariffs 0.49.1

//! String decoding for JSON values parsed by [`super`].
//!
//! # Parsing vs decoding
//!
//! The JSON parser ([`super::parser`]) is intentionally lenient: it accepts
//! any structurally valid JSON string, leaving escape sequences and control
//! characters verbatim inside a [`super::RawStr`]. Decoding is a separate
//! step performed by this module.
//!
//! Keeping these concerns apart means the linter can treat string-encoding
//! problems as warnings rather than fatal parse errors. The caller receives
//! both a best-effort string value and a list of [`Warning`]s describing what
//! was wrong and where, which is enough information to suggest a corrected
//! encoding to the user.
#[cfg(test)]
mod test_from_str;

use std::{borrow::Cow, fmt, iter::Peekable, ops::RangeInclusive};

use super::Element;
use crate::{
    warning::{self, CaveatDeferred, IntoCaveatDeferred as _},
    Caveat, IntoCaveat as _,
};

/// The backslash that introduces an escape sequence inside a JSON string.
const ESCAPE_CHAR: char = '\\';

/// The kind of `Warning` that can happen when decoding a `&str`.
///
/// The `usize` carried by every variant is the index, as yielded by
/// `str::char_indices` over the string being decoded, that the warning refers to.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// A control char was found in a JSON string, either written literally
    /// or produced by decoding an escape sequence.
    ControlCharacter(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The `u16` is the unpaired surrogate code unit reported by the decoder.
    DecodeUtf16(usize, u16),

    /// A string contains an invalid escape sequence.
    InvalidEscape(usize),

    /// The string ended before the decoder expected.
    UnexpectedEndOfString(usize),
}

impl crate::Warning for Warning {
    /// Map each `Warning` variant to its human readable identifier.
    fn id(&self) -> warning::Id {
        // Pick the identifier text first so the `Id` is constructed in one place.
        let id = match self {
            Self::ControlCharacter(_) => "control_character_while_parsing_string",
            Self::DecodeUtf16(..) => "decode_utf_1_6",
            Self::InvalidEscape(_) => "invalid_escape",
            Self::UnexpectedEndOfString(_) => "unexpected_end_of_string",
        };

        warning::Id::from_static(id)
    }
}

impl fmt::Display for Warning {
    /// Write the user-facing description of the warning, including the index
    /// in the decoded string that it refers to.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ControlCharacter(index) => {
                write!(
                    f,
                    "Control chars were found at index `{index}` while decoding a JSON string."
                )
            }
            Self::DecodeUtf16(index, code) => {
                write!(
                    f,
                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
                )
            }
            Self::InvalidEscape(index) => {
                // The index is wrapped in backticks only, matching the other
                // variants; there is no trailing parenthesis.
                write!(
                    f,
                    "String contains an invalid escape char at index: `{index}`."
                )
            }
            Self::UnexpectedEndOfString(index) => {
                write!(f, "The String ended prematurely at index: `{index}`.")
            }
        }
    }
}

/// Classify the raw JSON string body `s` as containing escape sequences or not.
///
/// The returned `PendingStr` is `HasEscapes` when `s` contains at least one
/// backslash and `NoEscapes` otherwise. If `s` contains a control char, a
/// [`Warning::ControlCharacter`] for the first one is recorded against `elem`.
pub(super) fn analyze<'buf>(
    s: &'buf str,
    elem: &Element<'buf>,
) -> Caveat<super::PendingStr<'buf>, Warning> {
    let mut warnings = warning::Set::new();

    // Strings are expected to be small, so a full scan that stops at the
    // first hit is acceptable. Only the first control char is reported.
    let first_control = s.char_indices().find(|(_, ch)| ch.is_control());
    if let Some((index, _)) = first_control {
        warnings.insert(elem, Warning::ControlCharacter(index));
    }

    let has_escapes = s.chars().any(|ch| ch == ESCAPE_CHAR);
    let pending = if has_escapes {
        super::PendingStr::HasEscapes(super::EscapeStr(s))
    } else {
        super::PendingStr::NoEscapes(s)
    };

    pending.into_caveat(warnings)
}

/// Return the `str` with escaped chars replaced with the decoded equivalent.
///
/// Returns `Cow::Owned` when `s` contains escapes and all of them decoded
/// successfully.
/// Returns `Cow::Borrowed` (the untouched input) if `s` doesn't contain any
/// escapes, or if decoding hit a problem; in the latter case the first
/// problem found is attached as a [`Warning`].
pub(super) fn from_raw<'buf>(s: &'buf str) -> CaveatDeferred<Cow<'buf, str>, Warning> {
    let mut warnings = warning::SetDeferred::new();

    // Strings are expected to be small, so scanning the whole input up front
    // to decide whether any decoding is needed is acceptable.
    let has_escapes = s.chars().any(|ch| ch == ESCAPE_CHAR);

    if !has_escapes {
        // Nothing to decode, but a literal control char is still worth a warning.
        let first_control = s.char_indices().find(|(_, ch)| ch.is_control());
        if let Some((index, _)) = first_control {
            warnings.insert(Warning::ControlCharacter(index));
        }
        return Cow::Borrowed(s).into_caveat_deferred(warnings);
    }

    let mut buf = Buffer::with_capacity(s.len());
    for decoded in Decoded::from_str(s) {
        let ch = match decoded {
            Ok(ch) => ch,
            Err(warn_kind) => {
                // Give up on the first problem and hand back the raw input.
                warnings.insert(warn_kind);
                return Cow::Borrowed(s).into_caveat_deferred(warnings);
            }
        };
        buf.push(ch);
    }

    let decoded: Cow<'buf, str> = Cow::Owned(buf.into_string());
    decoded.into_caveat_deferred(warnings)
}

/// Check whether the decoded form of the raw JSON string body `raw` equals
/// `other`, decoding escape sequences on the fly. Does not allocate.
///
/// Returns `Ok(true)` / `Ok(false)` for the comparison, or `Err` if `raw` has a
/// decoding problem (invalid escape, control character, ...) at or before the
/// first differing character. Decoding stops as soon as the two strings are
/// known to differ, so a decode problem beyond that point is not reported.
pub(super) fn eq(raw: &str, other: &str) -> Result<bool, Warning> {
    let mut expected = other.chars();

    for decoded in Decoded::from_str(raw) {
        let actual = decoded?;
        // A mismatch, or `other` running out first, means the strings differ.
        if expected.next() != Some(actual) {
            return Ok(false);
        }
    }

    // `raw` is exhausted; the strings are equal only if `other` is too.
    Ok(expected.next().is_none())
}

/// Same contract as [`eq`], except that ASCII letters are compared
/// case-insensitively.
pub(super) fn eq_ignore_ascii_case(raw: &str, other: &str) -> Result<bool, Warning> {
    let mut expected = other.chars();

    for decoded in Decoded::from_str(raw) {
        let actual = decoded?;

        // `other` ran out before `raw` did.
        let Some(wanted) = expected.next() else {
            return Ok(false);
        };

        if !wanted.eq_ignore_ascii_case(&actual) {
            return Ok(false);
        }
    }

    // `raw` is exhausted; the strings are equal only if `other` is too.
    Ok(expected.next().is_none())
}

/// Decode a single JSON escape sequence and return the resulting `char`.
/// Assumes the previous char read was a backslash.
///
/// Returns `Err(InvalidEscape)` for an unknown escape letter and
/// `Err(ControlCharacter)` when the decoded char is a control char, which
/// includes the results of `\b`, `\f`, `\n`, `\r` and `\t`.
///
/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
fn parse_escape(chars: &mut Chars<'_>) -> Result<char, Warning> {
    let (index, marker) = chars.next_or_eof()?;

    let decoded = match marker {
        // `\uXXXX` has its own decoder, including surrogate-pair handling.
        'u' => return parse_unicode_escape(chars),
        // These three escapes stand for themselves.
        '"' | '\\' | '/' => marker,
        'b' => '\x08',
        'f' => '\x0c',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        _ => return Err(Warning::InvalidEscape(index)),
    };

    if decoded.is_control() {
        Err(Warning::ControlCharacter(index))
    } else {
        Ok(decoded)
    }
}

/// Decode a JSON `\u` escape and return the resulting `char`.
/// Assumes `\u` has just been read.
///
/// The Unicode escape might be the first half of a UTF-16 surrogate pair, in
/// which case a second `\uXXXX` escape is expected to follow immediately.
///
/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
fn parse_unicode_escape(chars: &mut Chars<'_>) -> Result<char, Warning> {
    // High surrogates occupy `U+D800..=U+DBFF`. A high surrogate is the first
    // code unit of a UTF-16 surrogate pair; it must be followed by a low
    // surrogate (`U+DC00..=U+DFFF`) to form a valid scalar value. Lone high
    // surrogates and all low surrogates are not valid Unicode scalar values.
    const HIGH_SURROGATE: RangeInclusive<u16> = 0xD800..=0xDBFF;

    let first = decode_hex_escape(chars)?;

    let decoded = if HIGH_SURROGATE.contains(&first) {
        // The continuation is only looked for after a high surrogate.
        // Looking unconditionally would swallow the `\` of any following
        // escape and pair two unrelated BMP code points, of which only the
        // first would be decoded and the second silently dropped.
        match chars.is_next_escape()? {
            Some(second) => decode_surrogate_pair(first, second, chars.index)?,
            None => return Err(Warning::InvalidEscape(chars.index)),
        }
    } else {
        // Not a high surrogate: the code unit must be a scalar value on its own.
        match char::from_u32(u32::from(first)) {
            Some(ch) => ch,
            None => return Err(Warning::InvalidEscape(chars.index)),
        }
    };

    if decoded.is_control() {
        Err(Warning::ControlCharacter(chars.index))
    } else {
        Ok(decoded)
    }
}

/// A char iterator over a JSON string body that tracks the index of the last
/// char it returned and supports pushing a single char back.
///
/// Its `next_or_eof` method fails if the next char is a control char or if
/// the input has run out.
struct Chars<'buf> {
    /// The `char` iterator.
    ///
    /// This is a `CharIndices` so that every char comes with the index that is
    /// reported in warnings. It is `Peekable` because `is_next_escape` needs to
    /// look ahead to detect a potential second Unicode literal and treat that
    /// as the second half of a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// A single character pushed back by `is_next_escape` when it consumed a `\`
    /// speculatively but found it was not the start of a `\uXXXX` sequence.
    ///
    /// `next` returns this item before reading from `char_indices` again.
    push_back: Option<(usize, char)>,

    /// The index of the char most recently returned by `next`; `0` before
    /// anything has been read.
    index: usize,
}

impl<'buf> Chars<'buf> {
    /// Build a `Chars` iterator positioned at the start of `s`.
    fn from_str(s: &'buf str) -> Self {
        let char_indices = s.char_indices().peekable();

        Self {
            char_indices,
            push_back: None,
            index: 0,
        }
    }

    /// Return the next `(index, char)` pair.
    ///
    /// Fails with `Err(UnexpectedEndOfString)` if the input is exhausted, or
    /// with `Err(ControlCharacter)` if the next char is a control char.
    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
        match self.next() {
            None => Err(Warning::UnexpectedEndOfString(self.index)),
            Some((index, ch)) if ch.is_control() => Err(Warning::ControlCharacter(index)),
            Some(item) => Ok(item),
        }
    }

    /// Look ahead in the char stream and, if another `\uXXXX` sequence follows
    /// immediately, consume it and return its decoded hex value.
    ///
    /// Returns `Ok(None)` when the next char is not a `\`. If a `\` is found but
    /// is not followed by `u`, the `\` is pushed back so the outer loop can
    /// process it as the start of a different escape sequence.
    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
        let Some(backslash) = self.char_indices.next_if(|&(_, ch)| ch == ESCAPE_CHAR) else {
            return Ok(None);
        };

        let followed_by_u = self.char_indices.next_if(|&(_, ch)| ch == 'u').is_some();
        if followed_by_u {
            let code_unit = decode_hex_escape(self)?;
            Ok(Some(code_unit))
        } else {
            self.push_back = Some(backslash);
            Ok(None)
        }
    }
}

impl Iterator for Chars<'_> {
    type Item = (usize, char);

    /// Yield the pushed-back item if there is one, otherwise the next item of
    /// the underlying iterator, recording its index in `self.index`.
    fn next(&mut self) -> Option<Self::Item> {
        let item = match self.push_back.take() {
            Some(pushed_back) => pushed_back,
            // An exhausted input leaves `self.index` untouched.
            None => self.char_indices.next()?,
        };

        self.index = item.0;
        Some(item)
    }
}

/// Iterator over the decoded `char`s of a JSON string body (quotes removed).
///
/// Escape sequences are decoded on the fly; control characters and malformed
/// escapes are returned as `Err`. Callers typically stop at the first `Err`.
struct Decoded<'buf> {
    /// The underlying index-tracking char stream that escapes are read from.
    chars: Chars<'buf>,
}

impl<'buf> Decoded<'buf> {
    /// Build a `Decoded` iterator that starts at the beginning of the JSON
    /// string body `s`.
    fn from_str(s: &'buf str) -> Self {
        let chars = Chars::from_str(s);
        Self { chars }
    }
}

impl Iterator for Decoded<'_> {
    type Item = Result<char, Warning>;

    /// Yield the next decoded char, or the `Warning` describing why it could
    /// not be decoded. Returns `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let (index, ch) = self.chars.next()?;

        let item = match ch {
            // A backslash starts an escape; the rest is read from the stream.
            ESCAPE_CHAR => parse_escape(&mut self.chars),
            literal if literal.is_control() => Err(Warning::ControlCharacter(index)),
            literal => Ok(literal),
        };

        Some(item)
    }
}

/// Accumulator for the decoded chars of a JSON string, backed by a `String`.
struct Buffer {
    /// The decoded text collected so far.
    buf: String,
}

impl Buffer {
    /// Create an empty `Buffer` whose backing `String` is allocated with the
    /// given `capacity`.
    fn with_capacity(capacity: usize) -> Self {
        let buf = String::with_capacity(capacity);
        Self { buf }
    }

    /// Append a single, already-decoded char.
    fn push(&mut self, ch: char) {
        let Self { buf } = self;
        buf.push(ch);
    }

    /// Consume the `Buffer`, yielding the accumulated `String`.
    fn into_string(self) -> String {
        let Self { buf } = self;
        buf
    }
}

/// Decode the high (`n1`) and low (`n2`) code units of a UTF-16 surrogate pair
/// into a `char`.
///
/// Returns `Err(DecodeUtf16)` carrying the unpaired surrogate if the two code
/// units do not form a valid pair, and `Err(InvalidEscape)` if the decoder
/// yields nothing at all. `index` is the position attached to either warning.
fn decode_surrogate_pair(n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
    // Only the first decoded item matters: a valid pair decodes to exactly one char.
    match char::decode_utf16([n1, n2]).next() {
        Some(Ok(ch)) => Ok(ch),
        Some(Err(err)) => Err(Warning::DecodeUtf16(index, err.unpaired_surrogate())),
        None => Err(Warning::InvalidEscape(index)),
    }
}

/// Consume the four hex digits of a `\uXXXX` escape and return them as a
/// single UTF-16 code unit.
///
/// Assumes `\u` has just been read. All four chars are consumed before any of
/// them is validated.
///
/// Returns `Err(UnexpectedEndOfString)` or `Err(ControlCharacter)` if fewer
/// than four usable chars remain, and `Err(InvalidEscape)`, carrying the index
/// of the fourth char, if any of the four chars is not a hex digit.
fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
    const RADIX: u32 = 16;

    let (_, one) = chars.next_or_eof()?;
    let (_, two) = chars.next_or_eof()?;
    let (_, three) = chars.next_or_eof()?;
    let (index, four) = chars.next_or_eof()?;

    // Each char is validated on its own rather than handing the four chars to
    // `u16::from_str_radix`, which also accepts a leading `+` sign and would
    // therefore decode an invalid escape such as `\u+041` as U+0041.
    // This also avoids allocating a temporary `String` per escape.
    let mut code_unit: u32 = 0;
    for ch in [one, two, three, four] {
        let Some(digit) = ch.to_digit(RADIX) else {
            return Err(Warning::InvalidEscape(index));
        };
        code_unit = code_unit * RADIX + digit;
    }

    // Four hex digits are at most `0xFFFF`, so this conversion cannot fail;
    // the checked form is used to avoid a lossy `as` cast.
    let Ok(n) = u16::try_from(code_unit) else {
        return Err(Warning::InvalidEscape(index));
    };

    Ok(n)
}