// alkale 2.0.0 - Docs.rs

//! Sub-module of [`common`][crate::common] used for string and character-related methods.

mod error;

use crate::{
    span::{Spannable, Spanned},
    SourceCodeScanner,
};

pub use error::CharTokenError;
pub use error::ParseCharError;
pub use error::StringErrorList;
pub use error::StringTokenError;

impl<'src> SourceCodeScanner<'src> {
    /// Attempt to read a string-like token from source code.
    ///
    /// The very next character is consumed unconditionally and becomes the delimiter.
    /// `char_consumer` is then called repeatedly — with the scanner, the output buffer
    /// and the delimiter — until the delimiter is seen again (it is consumed and parsing
    /// stops) or the source code runs out.
    ///
    /// Every [`Err`] returned by `char_consumer` is wrapped in
    /// [`StringTokenError::CharError`] and collected into a [`Vec`]. Running out of
    /// input before a closing delimiter appends [`StringTokenError::NoClosingDelimiter`].
    /// If at least one error was collected, the errors are returned instead of the string.
    ///
    /// The returned span starts at the opening delimiter and extends up to the
    /// scanner's position once parsing stops.
    ///
    /// `char_consumer` must advance the scanner on every call: this method only
    /// consumes the two delimiters itself, so a consumer that never advances will
    /// keep the loop running forever.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// // Our super basic character consumer
    /// fn consumer(
    ///     scanner: &SourceCodeScanner,
    ///     accumulator: &mut String,
    ///     delimiter: char,
    /// ) -> Result<(), ()> {
    ///     accumulator.push(scanner.next().unwrap());
    ///
    ///     Ok(())
    /// }
    ///
    /// let scanner = SourceCodeScanner::new(r#"'a string' ..."#);
    ///
    /// # unsafe {
    /// assert_eq!(
    ///     scanner.parse_string(consumer),
    ///     Span::new(0, 10).wrap(Ok(String::from("a string")))
    /// );
    ///
    /// let scanner = SourceCodeScanner::new(r#"!also a string! ..."#);
    ///
    /// assert_eq!(
    ///     scanner.parse_string(consumer),
    ///     Span::new(0, 15).wrap(Ok(String::from("also a string")))
    /// );
    /// # }
    /// ```
    ///
    /// # Panics
    /// This method will panic if the [`SourceCodeScanner`] has no characters
    /// before invoking this method.
    #[inline]
    pub fn parse_string<E, F: Fn(&Self, &mut String, char) -> Result<(), E>>(
        &self,
        char_consumer: F,
    ) -> Spanned<Result<String, Vec<StringTokenError<E>>>> {
        // Consume character, store its value and position for later.
        let Some(Spanned { span, data: delim }) = self.next_span() else {
            panic!(
                "parse_string should only be called if the source code iterator cannot be empty."
            )
        };

        let mut result = String::new();
        // `Vec::new` does not allocate, so well-formed strings never pay for an
        // error list they will not use.
        let mut errors = Vec::new();

        // Loop and consume characters until delimiter is found
        loop {
            // If no characters exist, we have reached EOF.
            if !self.has_next() {
                errors.push(StringTokenError::NoClosingDelimiter);
                break;
            }

            // Check for ending delimiter - consume and stop if found.
            if self.peek_is(delim) {
                self.skip();
                break;
            }

            // Consume next character and append error if necessary.
            // Parsing continues after an error so that all of them are reported.
            if let Err(e) = char_consumer(self, &mut result, delim) {
                errors.push(StringTokenError::CharError(e));
            }
        }

        // Span from the opening delimiter up to wherever the scanner stopped.
        span.up_to(&self.span()).wrap(if errors.is_empty() {
            Ok(result)
        } else {
            Err(errors)
        })
    }

    /// Lexes a simple, rust-like string delimited by either `'` or `"`.
    ///
    /// Each character inside the string is handled by [`parse_simple_character`],
    /// which should be enough for most languages.
    ///
    /// When the upcoming character is neither `'` nor `"`, nothing is consumed and
    /// [`None`] is returned.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new(r#"
    ///     "string one"
    ///     'string \t two'
    /// "#);
    ///
    /// # unsafe {
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_simple_string(),
    ///     Some(
    ///         Span::new(5, 12).wrap(Ok(String::from("string one")))
    ///     )
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_simple_string(),
    ///     Some(
    ///         Span::new(22, 15).wrap(Ok(String::from("string \t two")))
    ///     )
    /// );
    ///
    /// assert_eq!(
    ///     scanner.try_parse_simple_string(),
    ///     None
    /// );
    /// # }
    /// ```
    ///
    /// # See Also
    /// For languages that have both string and character tokens,
    /// [`try_parse_strict_string`](Self::try_parse_strict_string) may be a better fit.
    ///
    /// Use [`parse_string`](Self::parse_string) when finer control over parsing is needed.
    #[inline]
    pub fn try_parse_simple_string(&self) -> Option<Spanned<Result<String, StringErrorList>>> {
        // Only commit to parsing when a quote of either kind is up next.
        matches!(self.peek(), Some('"' | '\''))
            .then(|| self.parse_string(parse_simple_character))
    }

    /// Lexes a simple, rust-like string that must be delimited by `"`.
    ///
    /// Each character inside the string is handled by [`parse_simple_character`],
    /// which should be enough for most languages.
    ///
    /// When the upcoming character is not `"`, nothing is consumed and [`None`]
    /// is returned.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new(r#"
    ///     "string one"
    ///     "string \t two"
    ///     'not a string'
    /// "#);
    ///
    /// # unsafe {
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_strict_string(),
    ///     Some(
    ///         Span::new(5, 12).wrap(Ok(String::from("string one")))
    ///     )
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_strict_string(),
    ///     Some(
    ///         Span::new(22, 15).wrap(Ok(String::from("string \t two")))
    ///     )
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_strict_string(),
    ///     None
    /// );
    /// # }
    /// ```
    ///
    /// # See Also
    /// For languages without a character token,
    /// [`try_parse_simple_string`](Self::try_parse_simple_string) may be preferred.
    ///
    /// Use [`parse_string`](Self::parse_string) when finer control over parsing is needed.
    #[inline]
    pub fn try_parse_strict_string(&self) -> Option<Spanned<Result<String, StringErrorList>>> {
        match self.peek() {
            // A double quote is up next: hand over to the generic string parser.
            Some('"') => Some(self.parse_string(parse_simple_character)),
            _ => None,
        }
    }

    /// Parses a simple character token. If the next character is a `'`, it will use [`parse_simple_character`]
    /// to find the "character," and then parse the closing `'` delimiter.
    ///
    /// This function will return [`None`] if no `'` was found.
    /// This function will return an [`Err`] variant if the character token couldn't be parsed.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new(r#"'a' 'b' '\t' "test""#);
    ///
    /// # unsafe {
    /// assert_eq!(
    ///     scanner.try_parse_character_token(),
    ///     Some(Span::new(0, 3).wrap(Ok('a')))
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_character_token(),
    ///     Some(Span::new(4, 3).wrap(Ok('b')))
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_character_token(),
    ///     Some(Span::new(8, 4).wrap(Ok('\t')))
    /// );
    ///
    /// scanner.skip_whitespace();
    ///
    /// assert_eq!(
    ///     scanner.try_parse_character_token(),
    ///     None
    /// );
    /// # }
    /// ```
    #[inline]
    pub fn try_parse_character_token(&self) -> Option<Spanned<Result<char, CharTokenError>>> {
        if self.peek() == Some('\'') {
            let start_span = self.span();

            // Skip the '
            self.skip();

            let mut container = String::with_capacity(1);

            if let Err(err) = parse_simple_character(self, &mut container, '\'') {
                return Some(
                    Err(CharTokenError::CharError(err)).spanned(start_span.up_to(&self.span())),
                );
            }

            match self.next() {
                Some('\'') => Some(
                    // SAFETY: Container must have 1 character to reach this part of the
                    // code.
                    Ok(unsafe { container.chars().next().unwrap_unchecked() })
                        .spanned(start_span.up_to(&self.span())),
                ),
                Some(x) => Some(
                    Err(CharTokenError::UnclosedCharError(x))
                        .spanned(start_span.up_to(&self.span())),
                ),
                None => Some(
                    Err(CharTokenError::UnclosedCharErrorEOF)
                        .spanned(start_span.up_to(&self.span())),
                ),
            }
        } else {
            None
        }
    }
}

/// Consumes a string-like character and appends it to a [`String`] buffer.
///
/// If the consumed character is `\`, it indicates an escape code. If this occurs,
/// an extra character is consumed. The following is a list of escape codes and
/// what characters will be pushed to the buffer when encountered:
///
/// - `\t` becomes a tab character
/// - `\n` becomes a newline character
/// - `\r` becomes a carriage return character
/// - `\\` becomes a backslash
/// - `\0` becomes a null
/// - `\` followed by a literal newline pushes nothing
/// - `\DEL` becomes `DEL`
///
/// For that last one, `DEL` is the delimiter argument, typically
/// `\"` or `\'`. The fixed escapes above take precedence over the delimiter.
///
/// Notably, this method does not have an analog for Rust's `\x` or `\u`.
///
/// This method is used as a base character parser for built-in string methods.
///
/// # Errors
/// - [`ParseCharError::NoCharFound`] if there is no character left to read.
/// - [`ParseCharError::UnescapedDelimiter`] if the character is the delimiter
///   itself; carries the span of that character.
/// - [`ParseCharError::IllegalEscape`] if the character after a backslash is not
///   a recognised escape; carries that character and its span.
/// - [`ParseCharError::NoEscape`] if the source ends right after a backslash;
///   carries the span of the backslash.
///
/// In all error cases the offending characters have already been consumed and
/// nothing is pushed to the buffer.
///
/// # Examples
/// ```rust
/// # use alkale::SourceCodeScanner;
/// # use alkale::span::Span;
/// # use alkale::common::string::parse_simple_character;
/// let scanner = SourceCodeScanner::new(r#"abc\te"#);
/// let mut accumulator = String::new();
///
/// // Repeatedly read chars as long as more remain, ignore errs.
/// while scanner.has_next() {
///     let _ = parse_simple_character(&scanner, &mut accumulator, '"');
/// }
///
/// assert_eq!(accumulator.as_str(), "abc\te");
/// ```
#[inline]
pub fn parse_simple_character(
    context: &SourceCodeScanner,
    accumulator: &mut String,
    delimiter: char,
) -> Result<(), ParseCharError> {
    // Read next character, error if no next character exists.
    let Some(Spanned {
        data: next_char,
        span,
    }) = context.next_span()
    else {
        return Err(ParseCharError::NoCharFound);
    };

    // Anything that isn't a backslash is taken literally...
    if next_char != '\\' {
        // ...except a bare delimiter, which is an illegal state. Error.
        if next_char == delimiter {
            return Err(ParseCharError::UnescapedDelimiter(span));
        }

        accumulator.push(next_char);
        return Ok(());
    }

    // From here on `span` is the span of the backslash itself. The scanner has
    // already moved past it, so `context.span()` would point at what follows.
    let Some(Spanned {
        data: escape,
        span: escape_span,
    }) = context.next_span()
    else {
        return Err(ParseCharError::NoEscape(span));
    };

    // Match an escape sequence. Arm order matters: the fixed escapes win over
    // the delimiter should the two ever coincide.
    match escape {
        't' => accumulator.push('\t'),
        'n' => accumulator.push('\n'),
        'r' => accumulator.push('\r'),
        '0' => accumulator.push('\0'),
        '\\' => accumulator.push('\\'),
        // Backslash followed by a real newline contributes no character.
        '\n' => {}
        _ if escape == delimiter => accumulator.push(delimiter),
        _ => return Err(ParseCharError::IllegalEscape(escape, escape_span)),
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::{
        common::string::{CharTokenError, ParseCharError},
        span::{Span, Spannable},
        SourceCodeScanner,
    };

    /// Test helper: builds a [`SourceCodeScanner`] over `code`.
    fn ctx(code: &str) -> SourceCodeScanner {
        SourceCodeScanner::new(code)
    }

    /// Two `%`-delimited strings separated by a single space.
    #[test]
    pub fn parse_string() {
        let scanner = ctx(r#"%Awesome!% %Meowmeow \n uwu%"#);

        // SAFETY: Spans are all valid.
        let (first_span, second_span) = unsafe { (Span::new(0, 10), Span::new(11, 17)) };

        let first = scanner.parse_string(super::parse_simple_character);
        assert_eq!(first, Ok(String::from("Awesome!")).spanned(first_span));

        // Step over the space between the two strings.
        scanner.skip();

        let second = scanner.parse_string(super::parse_simple_character);
        assert_eq!(
            second,
            Ok(String::from("Meowmeow \n uwu")).spanned(second_span)
        );

        // Both strings together make up the entire input.
        assert!(!scanner.has_next());
    }

    /// Both quote styles are accepted; an exhausted scanner yields `None`.
    #[test]
    pub fn try_parse_simple_string() {
        let scanner = ctx(r#"'single quotes' "double quotes!""#);

        // SAFETY: Spans are all valid.
        let (single_span, double_span) = unsafe { (Span::new(0, 15), Span::new(16, 16)) };

        let single = scanner.try_parse_simple_string();
        assert_eq!(
            single,
            Some(Ok(String::from("single quotes")).spanned(single_span))
        );

        // Step over the space between the two strings.
        scanner.skip();

        let double = scanner.try_parse_simple_string();
        assert_eq!(
            double,
            Some(Ok(String::from("double quotes!")).spanned(double_span))
        );

        assert!(!scanner.has_next());

        // Nothing left to peek at, so there is no string to parse.
        assert_eq!(scanner.try_parse_simple_string(), None);
    }

    /// Single quotes are not string delimiters in strict mode; double quotes are.
    #[test]
    pub fn try_parse_strict_string() {
        let scanner = ctx(r#"'h' "double quotes!""#);

        // A leading `'` must be rejected.
        assert_eq!(scanner.try_parse_strict_string(), None);

        // The rejection consumed nothing, so the character token is still
        // there and can be read one character at a time.
        for expected in ['\'', 'h', '\''] {
            assert_eq!(scanner.next(), Some(expected));
        }

        // Step over the space before the real string.
        scanner.skip();

        // SAFETY: Span is valid.
        let string_span = unsafe { Span::new(4, 16) };

        assert_eq!(
            scanner.try_parse_strict_string(),
            Some(Ok(String::from("double quotes!")).spanned(string_span))
        );

        assert!(!scanner.has_next());

        assert_eq!(scanner.try_parse_strict_string(), None);
    }

    /// Five well-formed character tokens followed by an empty `''` token.
    #[test]
    pub fn try_parse_character_token() {
        let scanner = ctx(r#"'p' 'q' '\t' '\'' ' ' ''"#);

        // SAFETY: Spans are all valid.
        let well_formed = unsafe {
            [
                ('p', Span::new(0, 3)),
                ('q', Span::new(4, 3)),
                ('\t', Span::new(8, 4)),
                ('\'', Span::new(13, 4)),
                (' ', Span::new(18, 3)),
            ]
        };

        for (character, span) in well_formed {
            assert_eq!(
                scanner.try_parse_character_token(),
                Some(Ok(character).spanned(span))
            );

            // Step over the space that follows every well-formed token.
            scanner.skip();
        }

        // SAFETY: Spans are all valid.
        let (delimiter_span, token_span) = unsafe { (Span::new(23, 1), Span::new(22, 2)) };

        // `''` holds no character: the second quote is read as an unescaped delimiter.
        let unescaped = CharTokenError::CharError(ParseCharError::UnescapedDelimiter(
            delimiter_span,
        ));

        assert_eq!(
            scanner.try_parse_character_token(),
            Some(Err(unescaped).spanned(token_span))
        );

        assert!(!scanner.has_next());
    }

    /// Walks an input mixing plain characters, escapes and one bare delimiter.
    #[test]
    pub fn parse_simple_character() {
        // The scanner is only ever used through `&self`, so no `mut` is needed.
        let code = ctx(r#"ab2\tx\\8*"'\""#);
        let mut acc = String::new();
        let mut next = || super::parse_simple_character(&code, &mut acc, '"');

        // `a`, `b`, `2`, `\t`, `x`, `\\`, `8`, `*`
        for _ in 0..8 {
            assert_eq!(next(), Ok(()));
        }

        // SAFETY: Span is valid.
        let bare_delimiter = unsafe { Span::new(10, 1) };

        // The bare `"` is consumed but rejected.
        assert_eq!(
            next(),
            Err(ParseCharError::UnescapedDelimiter(bare_delimiter))
        );

        // `'` is an ordinary character here, and `\"` is the escaped delimiter.
        assert_eq!(next(), Ok(()));
        assert_eq!(next(), Ok(()));

        assert_eq!(&acc, "ab2\tx\\8*'\"");

        assert!(!code.has_next());
    }
}