saphyr-parser-bw 0.0.612

use crate::{
    char_traits::{is_blank_or_breakz, is_breakz, is_flow},
    input::{BorrowedInput, Input, SkipTabs},
};
use alloc::string::String;

/// A parser input that uses a `&str` as source.
#[allow(clippy::module_name_repetitions)]
pub struct StrInput<'a> {
    /// The full, original input string.
    ///
    /// This is kept to support O(1) byte-offset capture and zero-copy slicing via the optional
    /// [`Input::byte_offset`] / [`Input::slice_bytes`] APIs.
    original: &'a str,
    /// The input str buffer.
    ///
    /// This is a moving window into [`Self::original`]. All consuming operations advance this
    /// slice.
    buffer: &'a str,
    /// The number of characters we have looked ahead.
    ///
    /// We must however keep track of how many characters the parser asked us to look ahead for so
    /// that we can return the correct value in [`Self::buflen`].
    lookahead: usize,
}

impl<'a> StrInput<'a> {
    /// Create a new [`StrInput`] with the given str.
    #[must_use]
    pub fn new(input: &'a str) -> Self {
        Self {
            original: input,
            buffer: input,
            lookahead: 0,
        }
    }

    /// Return the number of bytes consumed from the original input.
    ///
    /// This is an O(1) operation derived from the invariant that [`Self::buffer`] is always a
    /// suffix of [`Self::original`].
    #[inline]
    #[must_use]
    fn consumed_bytes(&self) -> usize {
        self.original.len() - self.buffer.len()
    }
}

impl Input for StrInput<'_> {
    #[inline]
    fn lookahead(&mut self, x: usize) {
        // We already have all characters that we need.
        // We cannot add '\0's to the buffer should we prematurely reach EOF.
        // Returning '\0's befalls the character-retrieving functions.
        self.lookahead = self.lookahead.max(x);
    }

    #[inline]
    fn buflen(&self) -> usize {
        self.lookahead
    }

    #[inline]
    fn bufmaxlen(&self) -> usize {
        BUFFER_LEN
    }

    fn buf_is_empty(&self) -> bool {
        self.buflen() == 0
    }

    #[inline]
    fn raw_read_ch(&mut self) -> char {
        let mut chars = self.buffer.chars();
        if let Some(c) = chars.next() {
            self.buffer = chars.as_str();
            c
        } else {
            '\0'
        }
    }

    #[inline]
    fn raw_read_non_breakz_ch(&mut self) -> Option<char> {
        if let Some((c, sub_str)) = split_first_char(self.buffer) {
            if is_breakz(c) {
                None
            } else {
                self.buffer = sub_str;
                Some(c)
            }
        } else {
            None
        }
    }

    #[inline]
    fn skip(&mut self) {
        if !self.buffer.is_empty() {
            let b = self.buffer.as_bytes()[0];
            if b < 0x80 {
                self.buffer = &self.buffer[1..];
            } else {
                let mut chars = self.buffer.chars();
                chars.next();
                self.buffer = chars.as_str();
            }
        }
    }

    #[inline]
    fn skip_n(&mut self, count: usize) {
        let mut chars = self.buffer.chars();
        for _ in 0..count {
            if chars.next().is_none() {
                break;
            }
        }
        self.buffer = chars.as_str();
    }

    #[inline]
    fn peek(&self) -> char {
        if self.buffer.is_empty() {
            return '\0';
        }
        let b = self.buffer.as_bytes()[0];
        if b < 0x80 {
            b as char
        } else {
            self.buffer.chars().next().unwrap()
        }
    }

    #[inline]
    fn peek_nth(&self, n: usize) -> char {
        if n == 0 {
            return self.peek();
        }
        let bytes = self.buffer.as_bytes();
        if n == 1 && bytes.len() >= 2 && bytes[0] < 0x80 && bytes[1] < 0x80 {
            return bytes[1] as char;
        }
        let mut chars = self.buffer.chars();
        for _ in 0..n {
            if chars.next().is_none() {
                return '\0';
            }
        }
        chars.next().unwrap_or('\0')
    }

    #[inline]
    fn byte_offset(&self) -> Option<usize> {
        Some(self.consumed_bytes())
    }

    #[inline]
    fn slice_bytes(&self, start: usize, end: usize) -> Option<&str> {
        debug_assert!(start <= end);
        debug_assert!(end <= self.original.len());
        self.original.get(start..end)
    }

    #[inline]
    fn look_ch(&mut self) -> char {
        self.lookahead(1);
        self.peek()
    }

    #[inline]
    fn next_char_is(&self, c: char) -> bool {
        self.peek() == c
    }

    #[inline]
    fn nth_char_is(&self, n: usize, c: char) -> bool {
        self.peek_nth(n) == c
    }

    #[inline]
    fn next_2_are(&self, c1: char, c2: char) -> bool {
        let mut chars = self.buffer.chars();
        chars.next() == Some(c1) && chars.next() == Some(c2)
    }

    #[inline]
    fn next_3_are(&self, c1: char, c2: char, c3: char) -> bool {
        let mut chars = self.buffer.chars();
        chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3)
    }

    #[inline]
    fn next_is_document_indicator(&self) -> bool {
        if self.buffer.len() < 3 {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
            let bytes = self.buffer.as_bytes();
            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
                && (bytes[0] == b'.' || bytes[0] == b'-')
                && bytes[0] == bytes[1]
                && bytes[1] == bytes[2]
        }
    }

    #[inline]
    fn next_is_document_start(&self) -> bool {
        if self.buffer.len() < 3 {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
            let bytes = self.buffer.as_bytes();
            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
                && bytes[0] == b'-'
                && bytes[1] == b'-'
                && bytes[2] == b'-'
        }
    }

    #[inline]
    fn next_is_document_end(&self) -> bool {
        if self.buffer.len() < 3 {
            false
        } else {
            // Since all characters we look for are ascii, we can directly use the byte API of str.
            let bytes = self.buffer.as_bytes();
            (bytes.len() == 3 || matches!(bytes[3], b' ' | b'\t' | 0 | b'\n' | b'\r'))
                && bytes[0] == b'.'
                && bytes[1] == b'.'
                && bytes[2] == b'.'
        }
    }

    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> (usize, Result<SkipTabs, &'static str>) {
        assert!(!matches!(skip_tabs, SkipTabs::Result(..)));

        let mut new_str = self.buffer;
        let mut has_yaml_ws = false;
        let mut encountered_tab = false;

        // This ugly pair of loops is the fastest way of trimming spaces (and maybe tabs) I found
        // while keeping track of whether we encountered spaces and/or tabs.
        if skip_tabs == SkipTabs::Yes {
            loop {
                if let Some(sub_str) = new_str.strip_prefix(' ') {
                    has_yaml_ws = true;
                    new_str = sub_str;
                } else if let Some(sub_str) = new_str.strip_prefix('\t') {
                    encountered_tab = true;
                    new_str = sub_str;
                } else {
                    break;
                }
            }
        } else {
            while let Some(sub_str) = new_str.strip_prefix(' ') {
                has_yaml_ws = true;
                new_str = sub_str;
            }
        }

        // All characters consumed were ascii. We can use the byte length difference to count the
        // number of whitespace ignored.
        let mut chars_consumed = self.buffer.len() - new_str.len();

        if !new_str.is_empty() && new_str.as_bytes()[0] == b'#' {
            if !encountered_tab && !has_yaml_ws {
                return (
                    chars_consumed,
                    Err("comments must be separated from other tokens by whitespace"),
                );
            }

            // Skip remaining characters until we hit a breakz.
            while let Some((c, sub_str)) = split_first_char(new_str) {
                if is_breakz(c) {
                    break;
                }
                new_str = sub_str;
                chars_consumed += 1;
            }
        }

        self.buffer = new_str;

        (
            chars_consumed,
            Ok(SkipTabs::Result(encountered_tab, has_yaml_ws)),
        )
    }

    #[allow(clippy::inline_always)]
    #[inline(always)]
    fn next_can_be_plain_scalar(&self, in_flow: bool) -> bool {
        let nc = self.peek_nth(1);
        match self.peek() {
            // indicators can end a plain scalar, see 7.3.3. Plain Style
            ':' if is_blank_or_breakz(nc) || (in_flow && is_flow(nc)) => false,
            c if in_flow && is_flow(c) => false,
            _ => true,
        }
    }

    #[inline]
    fn next_is_blank_or_break(&self) -> bool {
        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn next_is_blank_or_breakz(&self) -> bool {
        self.buffer.is_empty()
            || matches!(self.buffer.as_bytes()[0], b' ' | b'\t' | 0 | b'\n' | b'\r')
    }

    #[inline]
    fn next_is_blank(&self) -> bool {
        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b' ' | b'\t')
    }

    #[inline]
    fn next_is_break(&self) -> bool {
        !self.buffer.is_empty() && matches!(self.buffer.as_bytes()[0], b'\n' | b'\r')
    }

    #[inline]
    fn next_is_breakz(&self) -> bool {
        self.buffer.is_empty() || matches!(self.buffer.as_bytes()[0], 0 | b'\n' | b'\r')
    }

    #[inline]
    fn next_is_z(&self) -> bool {
        self.buffer.is_empty() || self.buffer.as_bytes()[0] == 0
    }

    #[inline]
    fn next_is_flow(&self) -> bool {
        !self.buffer.is_empty()
            && matches!(self.buffer.as_bytes()[0], b',' | b'[' | b']' | b'{' | b'}')
    }

    #[inline]
    fn next_is_digit(&self) -> bool {
        !self.buffer.is_empty() && self.buffer.as_bytes()[0].is_ascii_digit()
    }

    /// Check if the next character is an ASCII alphanumeric, `_`, or `-`.
    ///
    /// This is used as a heuristic for error detection (e.g., when `:` is followed
    /// by tab and then a potential value character). The ASCII-only check is intentional:
    /// it catches common cases like `key:\tvalue` while avoiding false positives for
    /// valid YAML constructs. Unicode value starters (e.g., `äöü`) are not detected,
    /// but such cases will still fail to parse (with a less specific error message).
    #[inline]
    fn next_is_alpha(&self) -> bool {
        !self.buffer.is_empty()
            && matches!(self.buffer.as_bytes()[0], b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-')
    }

    fn skip_while_non_breakz(&mut self) -> usize {
        let mut byte_pos = 0;
        let mut chars_consumed = 0;

        for (i, c) in self.buffer.char_indices() {
            if is_breakz(c) {
                break;
            }
            byte_pos = i + c.len_utf8();
            chars_consumed += 1;
        }

        self.buffer = &self.buffer[byte_pos..];
        chars_consumed
    }

    #[inline]
    fn skip_while_blank(&mut self) -> usize {
        let bytes = self.buffer.as_bytes();

        let mut i = 0;
        while i < bytes.len() {
            match bytes[i] {
                b' ' | b'\t' => i += 1,
                _ => break,
            }
        }

        self.buffer = &self.buffer[i..];
        i
    }

    /// Fetch characters matching `is_alpha` (ASCII alphanumeric, `_`, `-`).
    ///
    /// This is used for scanning tag handles (e.g., `!foo!`). Per YAML 1.2 spec,
    /// tag handles use `ns-word-char` which is `[0-9a-zA-Z-]`. Our implementation
    /// is slightly more permissive by also accepting `_`, but this is harmless
    /// and matches common practice. Unicode characters like `ä` or `π` are NOT
    /// valid in tag handles per spec, so the ASCII-only byte-based scanning here
    /// is both correct and efficient.
    fn fetch_while_is_alpha(&mut self, out: &mut String) -> usize {
        let bytes = self.buffer.as_bytes();
        let mut i = 0;

        // All target characters are ASCII, so we can scan bytes directly.
        while i < bytes.len() {
            match bytes[i] {
                b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-' => i += 1,
                _ => break,
            }
        }

        // All matched characters are ASCII, so we can safely slice and convert.
        out.push_str(&self.buffer[..i]);
        self.buffer = &self.buffer[i..];

        i
    }

    fn fetch_while_is_yaml_non_space(&mut self, out: &mut String) -> usize {
        let byte_pos = self
            .buffer
            .chars()
            .take_while(|c| crate::char_traits::is_yaml_non_space(*c))
            .map(char::len_utf8)
            .sum();

        out.push_str(&self.buffer[..byte_pos]);

        self.buffer = &self.buffer[byte_pos..];

        byte_pos
    }

    fn fetch_plain_scalar_chunk(
        &mut self,
        out: &mut String,
        _count: usize,
        flow_level_gt_0: bool,
    ) -> (bool, usize) {
        let bytes = self.buffer.as_bytes();
        let len = bytes.len();
        let mut byte_pos = 0;
        let mut chars_consumed = 0;

        while byte_pos < len {
            let b = bytes[byte_pos];
            if b < 0x80 {
                let c = b as char;
                if crate::char_traits::is_blank_or_breakz(c) {
                    out.push_str(&self.buffer[..byte_pos]);
                    self.buffer = &self.buffer[byte_pos..];
                    return (true, chars_consumed);
                }
                if flow_level_gt_0 && crate::char_traits::is_flow(c) {
                    out.push_str(&self.buffer[..byte_pos]);
                    self.buffer = &self.buffer[byte_pos..];
                    return (true, chars_consumed);
                }
                if c == ':' {
                    let next_byte = if byte_pos + 1 < len {
                        bytes[byte_pos + 1]
                    } else {
                        0
                    };
                    // ASCII optimization: if next_byte >= 0x80, it is not blank/breakz/flow
                    let is_stop = if next_byte < 0x80 {
                        let nc = next_byte as char;
                        crate::char_traits::is_blank_or_breakz(nc)
                            || (flow_level_gt_0 && crate::char_traits::is_flow(nc))
                    } else {
                        false
                    };

                    if is_stop {
                        out.push_str(&self.buffer[..byte_pos]);
                        self.buffer = &self.buffer[byte_pos..];
                        return (true, chars_consumed);
                    }
                }
                byte_pos += 1;
                chars_consumed += 1;
            } else {
                let mut chars = self.buffer[byte_pos..].chars();
                let c = chars.next().unwrap();
                byte_pos += c.len_utf8();
                chars_consumed += 1;
            }
        }

        out.push_str(&self.buffer[..byte_pos]);
        self.buffer = &self.buffer[byte_pos..];
        // If we reached here, we consumed the whole string (EOF).
        // EOF is effectively a stop condition (breakz).
        (true, chars_consumed)
    }
}

impl<'a> BorrowedInput<'a> for StrInput<'a> {
    #[inline]
    fn slice_borrowed(&self, start: usize, end: usize) -> Option<&'a str> {
        debug_assert!(start <= end);
        debug_assert!(end <= self.original.len());
        self.original.get(start..end)
    }
}

/// The buffer size we return to the scanner.
///
/// This does not correspond to any allocated buffer size. In practice, the scanner can withdraw
/// any character they want. If it's within the input buffer, the given character is returned,
/// otherwise `\0` is returned.
///
/// The number of characters we are asked to retrieve in [`lookahead`] depends on the buffer size
/// of the input. Our buffer here is virtually unlimited, but the scanner cannot work with that. It
/// may allocate buffers of its own of the size we return in [`bufmaxlen`] (so we can't return
/// [`usize::MAX`]). We can't always return the number of characters left either, as the scanner
/// expects [`buflen`] to return the same value that was given to [`lookahead`] right after its
/// call.
///
/// This create a complex situation where [`bufmaxlen`] influences what value [`lookahead`] is
/// called with, which in turns dictates what [`buflen`] returns. In order to avoid breaking any
/// function, we return this constant in [`bufmaxlen`] which, since the input is processed one line
/// at a time, should fit what we expect to be a good balance between memory consumption and what
/// we expect the maximum line length to be.
///
/// [`lookahead`]: `StrInput::lookahead`
/// [`bufmaxlen`]: `StrInput::bufmaxlen`
/// [`buflen`]: `StrInput::buflen`
const BUFFER_LEN: usize = 128;

/// Splits the first character of the given string and returns it along with the rest of the
/// string.
#[inline]
fn split_first_char(s: &str) -> Option<(char, &str)> {
    let mut chars = s.chars();
    let c = chars.next()?;
    Some((c, chars.as_str()))
}

#[cfg(test)]
mod test {
    use crate::input::Input;

    use super::StrInput;

    #[test]
    pub fn is_document_start() {
        let input = StrInput::new("---\n");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("---");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("...\n");
        assert!(!input.next_is_document_start());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("--- ");
        assert!(input.next_is_document_start());
        assert!(input.next_is_document_indicator());
    }

    #[test]
    pub fn is_document_end() {
        let input = StrInput::new("...\n");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("...");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("---\n");
        assert!(!input.next_is_document_end());
        assert!(input.next_is_document_indicator());
        let input = StrInput::new("... ");
        assert!(input.next_is_document_end());
        assert!(input.next_is_document_indicator());
    }
}