ocpi-tariffs 0.45.0

//! A `Warning` for warnings that can happen when decoding a JSON `&str`.
#[cfg(test)]
mod test_unescape;

use std::{borrow::Cow, fmt, iter::Peekable};

use crate::{warning, Caveat, IntoCaveat};

use super::Element;

/// The backslash char; when the decoder meets it in a string the following char(s) are
/// parsed as an escape sequence.
const ESCAPE_CHAR: char = '\\';

/// The kind of `Warning` that can happen when decoding a `&str`.
///
/// The `usize` carried by every variant is the index the decoder reports for the problem.
/// It is taken from `str::char_indices`, so it is a byte offset into the raw `&str`.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd)]
pub enum Warning {
    /// Control chars were found while parsing a JSON string.
    ControlCharacterWhileParsingString(usize),

    /// A UTF-16 surrogate pair failed to decode.
    ///
    /// The `u16` is the unpaired surrogate reported by the UTF-16 decoder.
    DecodeUtf16(usize, u16),

    /// A string contains invalid escape chars.
    InvalidEscape(usize),

    /// The String ended before the parser expected.
    UnexpectedEndOfString(usize),
}

impl crate::Warning for Warning {
    /// The stable identifier of this `Warning` kind.
    ///
    /// The identifier depends only on the variant; the index/code payload is ignored.
    fn id(&self) -> warning::Id {
        let name = match self {
            Self::ControlCharacterWhileParsingString(_) => "control_character_while_parsing_string",
            Self::DecodeUtf16(..) => "decode_utf_1_6",
            Self::InvalidEscape(_) => "invalid_escape",
            Self::UnexpectedEndOfString(_) => "unexpected_end_of_string",
        };

        warning::Id::from_static(name)
    }
}

impl fmt::Display for Warning {
    /// Write a one-sentence, human readable description of the warning.
    ///
    /// Every message quotes the index (and, for `DecodeUtf16`, the offending code unit)
    /// between backticks.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ControlCharacterWhileParsingString(index) => {
                write!(
                    f,
                    "Control chars were found at index `{index}` while decoding a JSON string."
                )
            }
            Self::DecodeUtf16(index, code) => {
                write!(
                    f,
                    "A UTF-16 surrogate pair `{code}` failed to decode at index: `{index}`."
                )
            }
            Self::InvalidEscape(index) => {
                // The index is wrapped in a balanced pair of backticks, like the other messages.
                write!(
                    f,
                    "String contains an invalid escape char at index: `{index}`."
                )
            }
            Self::UnexpectedEndOfString(index) => {
                write!(f, "The String ended prematurely at index: `{index}`.")
            }
        }
    }
}

/// Classify a raw JSON `&str` by whether it contains escape sequences.
///
/// * If `s` contains a backslash it is returned as `PendingStr::HasEscapes` without any
///   further checks and without warnings.
/// * Otherwise it is returned as `PendingStr::NoEscapes`, with a
///   `ControlCharacterWhileParsingString` warning recorded against `elem` for the first
///   control char found, if any.
pub(crate) fn analyze<'buf>(
    s: &'buf str,
    elem: &Element<'buf>,
) -> Caveat<PendingStr<'buf>, Warning> {
    let mut warnings = warning::Set::new();

    // Strings are expected to be small, so scanning the whole string for a backslash
    // is acceptable.
    if s.contains(ESCAPE_CHAR) {
        return PendingStr::HasEscapes(EscapeStr(s)).into_caveat(warnings);
    }

    let first_control = s
        .char_indices()
        .find_map(|(index, ch)| ch.is_control().then_some(index));

    if let Some(index) = first_control {
        warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
    }

    PendingStr::NoEscapes(s).into_caveat(warnings)
}

/// Marks a `&str` as having escapes or not.
///
/// Produced by `analyze`, which picks the variant by checking the string for a backslash.
pub(crate) enum PendingStr<'buf> {
    /// The `&str` has no escapes and can be used as is.
    NoEscapes(&'buf str),

    /// The `&str` has escape chars and needs to be unescaped before trying to parse into another form.
    ///
    /// The escapes have not been validated or decoded yet.
    HasEscapes(EscapeStr<'buf>),
}

/// A `&str` with escape chars.
///
/// The wrapped slice is the raw, still-escaped text; `analyze` constructs this only when the
/// slice contains a backslash.
pub(crate) struct EscapeStr<'buf>(&'buf str);

impl<'buf> EscapeStr<'buf> {
    pub(crate) fn decode_escapes(&self, elem: &Element<'buf>) -> Caveat<Cow<'buf, str>, Warning> {
        unescape_str(self.0, elem)
    }

    /// Consume the `EscapeStr` and return the raw bytes as a str.
    pub(crate) fn into_raw(self) -> &'buf str {
        self.0
    }
}

/// Return the `str` with escaped chars replaced with the decoded equivalent.
///
/// * If `s` contains no backslash it is returned as `Cow::Borrowed`, with a
///   `ControlCharacterWhileParsingString` warning if it contains a control char.
/// * If every char and escape sequence is accepted, the decoded text is returned as
///   `Cow::Owned`.
/// * If a char or escape sequence is rejected, decoding stops, the warning describing the
///   failure is recorded against `elem` and the original `s` is returned unchanged as
///   `Cow::Borrowed`.
pub(crate) fn unescape_str<'buf>(
    s: &'buf str,
    elem: &Element<'buf>,
) -> Caveat<Cow<'buf, str>, Warning> {
    let mut warnings = warning::Set::new();

    // Strings are expected to be small, so scanning the whole string to take the
    // allocation-free path is acceptable.
    if !s.contains(ESCAPE_CHAR) {
        let first_control = s.char_indices().find(|(_, ch)| ch.is_control());

        if let Some((index, _)) = first_control {
            warnings.insert(Warning::ControlCharacterWhileParsingString(index), elem);
        }

        return Cow::Borrowed(s).into_caveat(warnings);
    }

    let mut chars = Chars::from_str(s);
    let mut buf = Buffer::with_capacity(s.len());

    // `parse_escape` advances `chars` itself, so the iterator is driven by hand.
    while let Some((index, ch)) = chars.next() {
        let outcome = if ch == ESCAPE_CHAR {
            parse_escape(&mut chars, &mut buf)
        } else {
            buf.push_char(ch, index)
        };

        if let Err(warn_kind) = outcome {
            // Give up on the first problem and hand back the untouched input.
            warnings.insert(warn_kind, elem);
            return Cow::Borrowed(s).into_caveat(warnings);
        }
    }

    Cow::<'buf, str>::Owned(buf.into_string()).into_caveat(warnings)
}

/// Decode one JSON escape sequence and append the result to the `Buffer`.
///
/// The backslash must already have been consumed; this reads the char that follows it.
/// A `u` is handed over to `parse_unicode_escape`. Any char that is not a JSON escape
/// yields `Err(InvalidEscape)`.
///
/// Note that the decoded char goes through `Buffer::push_char`, which rejects control
/// chars, so `\b`, `\f`, `\n`, `\r` and `\t` end in `Err(ControlCharacterWhileParsingString)`.
///
/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
fn parse_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
    let (index, escaped) = chars.next_or_eof()?;

    let decoded = match escaped {
        'u' => return parse_unicode_escape(chars, buf),
        // These three escapes stand for themselves.
        '"' | '\\' | '/' => escaped,
        'b' => '\u{0008}',
        'f' => '\u{000C}',
        'n' => '\u{000A}',
        'r' => '\u{000D}',
        't' => '\u{0009}',
        _ => return Err(Warning::InvalidEscape(index)),
    };

    buf.push_char(decoded, index)
}

/// Parses a JSON `\u` escape and appends it into the buffer.
/// Assumes `\u` has just been read.
///
/// If the four hex digits are a UTF-16 surrogate, a second `\uXXXX` escape is expected to
/// follow directly and both are decoded as a surrogate pair. Otherwise the code unit is a
/// char on its own and nothing beyond the four hex digits is consumed, so a following
/// escape is decoded separately by the caller.
///
/// Returns `Err(InvalidEscape)` if a surrogate is not followed by a second `\u` escape and
/// `Err(DecodeUtf16)` if the two code units do not form a valid pair. Errors from reading the
/// hex digits or pushing the char are passed through.
///
/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-7>
/// * See: <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
fn parse_unicode_escape(chars: &mut Chars<'_>, buf: &mut Buffer) -> Result<(), Warning> {
    /// The UTF-16 code units reserved for the high and low surrogates.
    const SURROGATES: std::ops::RangeInclusive<u16> = 0xD800..=0xDFFF;

    let n1 = decode_hex_escape(chars)?;

    if !SURROGATES.contains(&n1) {
        // A non-surrogate code unit is a complete char. Looking ahead here would swallow
        // the next escape: `\u0041\u0042` must decode to "AB", not "A".
        let Some(ch) = char::from_u32(u32::from(n1)) else {
            return Err(Warning::InvalidEscape(chars.index));
        };

        return buf.push_char(ch, chars.index);
    }

    // A surrogate cannot stand alone; it needs the other half of the pair.
    let Some(n2) = chars.is_next_escape()? else {
        return Err(Warning::InvalidEscape(chars.index));
    };

    buf.push_surrogate_pair(n1, n2, chars.index)?;

    Ok(())
}

/// A char iterator over the raw string that also tracks the position of the last char read.
///
/// Plain iteration never fails; `next_or_eof` is the fallible read that rejects control
/// chars and a premature end of the string.
struct Chars<'buf> {
    /// The `char` iterator
    ///
    /// This is a `CharIndices` so that every char comes with its position in the string,
    /// which is what the warnings report.
    /// And this needs to be `Peekable` as we need to look ahead to detect a potential second
    /// Unicode literal and treat that as a UTF-16 surrogate pair.
    char_indices: Peekable<std::str::CharIndices<'buf>>,

    /// The last parsed char index
    ///
    /// Starts at `0` and is overwritten each time `Iterator::next` yields a char.
    index: usize,
}

impl<'buf> Chars<'buf> {
    /// Create a new `Chars` iterator from a `&str`.
    fn from_str(s: &'buf str) -> Self {
        Self {
            char_indices: s.char_indices().peekable(),
            index: 0,
        }
    }

    /// Return the next char as `Ok` or return `Err(UnexpectedEndOfString)` if there is no char
    /// or return `Err(ControlCharacterWhileParsingString)` if the next char is a control char.
    fn next_or_eof(&mut self) -> Result<(usize, char), Warning> {
        if let Some((index, ch)) = self.next() {
            if ch.is_control() {
                return Err(Warning::ControlCharacterWhileParsingString(index));
            }

            Ok((index, ch))
        } else {
            Err(Warning::UnexpectedEndOfString(self.index))
        }
    }

    /// Look ahead in the char stream and if there is another Unicode escape return it as a decoded
    /// hex escape.
    ///
    /// Returns `Ok(None)` and consumes nothing unless the next two chars are exactly `\u`.
    /// When they are, the `\u` and the four chars after it are consumed and decoded.
    fn is_next_escape(&mut self) -> Result<Option<u16>, Warning> {
        // Probe on a clone: `Peekable` can only look one char ahead, so consuming the
        // backslash before knowing that a `u` follows would lose it for escapes such as
        // `\"` or `\\`.
        let mut lookahead = self.char_indices.clone();
        let is_unicode_escape = matches!(lookahead.next(), Some((_, ESCAPE_CHAR)))
            && matches!(lookahead.next(), Some((_, 'u')));

        if !is_unicode_escape {
            return Ok(None);
        }

        // Commit the two chars that were probed. As before, skipping `\u` does not touch
        // `self.index`; it is updated again while the hex digits are read.
        self.char_indices = lookahead;

        decode_hex_escape(self).map(Some)
    }
}

impl Iterator for Chars<'_> {
    type Item = (usize, char);

    /// Yield the next `(index, char)` pair and remember its index as the last parsed one.
    ///
    /// When the string is exhausted `None` is returned and the remembered index is kept.
    fn next(&mut self) -> Option<Self::Item> {
        let (index, ch) = self.char_indices.next()?;
        self.index = index;
        Some((index, ch))
    }
}

/// The `String` based buffer where we accumulate the unescaped JSON string.
///
/// Control chars are never accepted: a method that is given a control char returns
/// `Err(ControlCharacterWhileParsingString)` and leaves the buffer as it was.
struct Buffer {
    /// The `String` to accumulate chars in.
    buf: String,
}

impl Buffer {
    /// Create an empty `Buffer` whose `String` is preallocated for `capacity` bytes.
    fn with_capacity(capacity: usize) -> Self {
        let buf = String::with_capacity(capacity);
        Self { buf }
    }

    /// Append a char to the `String`.
    ///
    /// Returns `Err(ControlCharacterWhileParsingString)` carrying `index` if the char is a
    /// control char; nothing is appended in that case. Otherwise, returns `Ok`.
    fn push_char(&mut self, ch: char, index: usize) -> Result<(), Warning> {
        if ch.is_control() {
            Err(Warning::ControlCharacterWhileParsingString(index))
        } else {
            self.buf.push(ch);
            Ok(())
        }
    }

    /// Consume the `Buffer` and return the accumulated `String`.
    fn into_string(self) -> String {
        let Self { buf } = self;
        buf
    }

    /// Decode the two code units as UTF-16 and append the first decoded `char` to the `Buffer`.
    ///
    /// Returns `Ok(char)` with the char that was appended.
    /// Returns `Err(DecodeUtf16)` with the unpaired surrogate if the decoding fails.
    /// Returns `Err(ControlCharacterWhileParsingString)` if the decoded char is a control char.
    fn push_surrogate_pair(&mut self, n1: u16, n2: u16, index: usize) -> Result<char, Warning> {
        let decoded = char::decode_utf16([n1, n2])
            .next()
            .ok_or(Warning::InvalidEscape(index))?;

        let ch = decoded.map_err(|err| Warning::DecodeUtf16(index, err.unpaired_surrogate()))?;

        self.push_char(ch, index)?;

        Ok(ch)
    }
}

/// Munch four chars and decode them as the four hex digits of a `\uXXXX` escape.
///
/// Returns the decoded UTF-16 code unit.
/// Returns `Err(InvalidEscape)`, carrying the index of the fourth char, if any of the four
/// chars is not a hex digit. Errors from reading the chars (end of string, control char)
/// are passed through.
fn decode_hex_escape(chars: &mut Chars<'_>) -> Result<u16, Warning> {
    const RADIX: u32 = 16;

    // All four chars are read before any of them is validated, so a truncated escape is
    // reported as such even if an earlier char is not a hex digit.
    let (_, one) = chars.next_or_eof()?;
    let (_, two) = chars.next_or_eof()?;
    let (_, three) = chars.next_or_eof()?;
    let (index, four) = chars.next_or_eof()?;

    // Each char is checked on its own: `u16::from_str_radix` would also accept a leading
    // `+` sign, which is not valid in a JSON escape.
    let mut n: u32 = 0;
    for ch in [one, two, three, four] {
        let Some(digit) = ch.to_digit(RADIX) else {
            return Err(Warning::InvalidEscape(index));
        };
        n = n * RADIX + digit;
    }

    // Four hex digits always fit; the checked conversion avoids a truncating cast.
    u16::try_from(n).map_err(|_| Warning::InvalidEscape(index))
}