// oak-ini 0.0.11 - Docs.rs

#![doc = include_str!("readme.md")]
use oak_core::{
    Lexer, LexerCache, LexerState, OakError, Source,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
};
pub mod token_type;

use crate::{language::IniLanguage, lexer::token_type::IniTokenType};

/// Shorthand for the `oak_core` [`LexerState`] specialised to [`IniLanguage`].
///
/// `'a` and `S` are forwarded unchanged to `LexerState`; every lexing method in this
/// module takes `S: Source + ?Sized`.
pub(crate) type State<'a, S> = LexerState<'a, S, IniLanguage>;

// Declarative `oak_core` lexer settings for INI. The hand-written lexing methods visible
// in this file do not reference any of these statics.
// NOTE(review): confirm whether they are consumed elsewhere or can be removed.

/// Whitespace configuration with `unicode_whitespace` enabled.
static _INI_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: true };
/// Comment configuration: `;` as the line marker, empty block delimiters, no nesting.
///
/// NOTE(review): `skip_comment` in this file also treats `#` as a comment start, which
/// this configuration does not list.
static _INI_COMMENT: CommentConfig = CommentConfig { line_marker: ";", block_start: "", block_end: "", nested_blocks: false };
/// String configuration: `"` and `'` as quote characters, `\` as the escape character.
static _INI_STRING: StringConfig = StringConfig { quotes: &['"', '\''], escape: Some('\\') };

/// Lexer for INI source text.
///
/// The lexer borrows an [`IniLanguage`] configuration for the lifetime `'config` and is
/// driven through its [`Lexer<IniLanguage>`] implementation. It is cheap to clone because
/// it only holds a shared reference.
#[derive(Clone, Debug)]
pub struct IniLexer<'config> {
    /// The borrowed INI language configuration supplied to [`IniLexer::new`].
    ///
    /// NOTE(review): no method visible in this file reads this field — confirm whether
    /// it is used elsewhere.
    config: &'config IniLanguage,
}

impl<'config> Lexer<IniLanguage> for IniLexer<'config> {
    /// Tokenizes `source` and hands the result to `cache`.
    ///
    /// A fresh lexer state is created over `source` and the main loop is run on it. An
    /// end-of-file token is appended only when the loop reports success; the outcome is
    /// then finalized through `finish_with_cache`.
    ///
    /// The `_edits` argument is not consulted by this implementation.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<IniLanguage>) -> LexOutput<IniLanguage> {
        let mut lexer_state: State<'_, S> = State::new(source);
        let outcome = self.run(&mut lexer_state);

        // Only a successful run gets a terminating EOF token.
        if outcome.is_ok() {
            lexer_state.add_eof();
        }

        lexer_state.finish_with_cache(outcome, cache)
    }
}

impl<'config> IniLexer<'config> {
    /// Creates a new `IniLexer` with the given configuration.
    pub fn new(config: &'config IniLanguage) -> Self {
        Self { config }
    }

    /// Drives tokenization until the state reports the end of input.
    ///
    /// On every iteration the sub-lexers are tried in a fixed order — whitespace, newline,
    /// comment, string, number, identifier, punctuation — and the first one that returns
    /// `true` ends the iteration. When none of them matches, `advance_if_dead_lock` is
    /// called with the position recorded at the start of the iteration (presumably to
    /// force progress past an unrecognized character — confirm in `oak_core`).
    ///
    /// This function always returns `Ok(())`.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let checkpoint = state.get_position();

            // `||` short-circuits, so later sub-lexers run only if earlier ones declined.
            let consumed = self.skip_whitespace(state)
                || self.lex_newline(state)
                || self.skip_comment(state)
                || self.lex_string_literal(state)
                || self.lex_number_literal(state)
                || self.lex_identifier(state)
                || self.lex_punctuation(state);

            if !consumed {
                state.advance_if_dead_lock(checkpoint);
            }
        }

        Ok(())
    }

    /// Consumes a run of spaces, tabs and carriage returns (newlines are handled separately).
    ///
    /// Emits one `Whitespace` token spanning the run and returns `true` when at least one
    /// character was consumed; otherwise leaves the state untouched and returns `false`.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let begin = state.get_position();

        while let Some(c) = state.peek() {
            if !matches!(c, ' ' | '\t' | '\r') {
                break;
            }
            state.advance(c.len_utf8());
        }

        let end = state.get_position();
        if end > begin {
            state.add_token(IniTokenType::Whitespace, begin, end);
            true
        }
        else {
            false
        }
    }

    /// Lexes a single `\n` as a `Newline` token.
    ///
    /// Returns `false` without consuming anything when the current character is not `\n`.
    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let begin = state.get_position();

        if state.current() != Some('\n') {
            return false;
        }

        state.advance(1);
        let end = state.get_position();
        state.add_token(IniTokenType::Newline, begin, end);
        true
    }

    /// Lexes a line comment introduced by `;` or `#`.
    ///
    /// The comment runs up to, but not including, the next `\n` (or to the point where no
    /// more characters are available) and is emitted as one `Comment` token. Returns
    /// `false` without consuming anything when the current character is not a comment
    /// marker.
    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let begin = state.get_position();

        if !matches!(state.current(), Some(';' | '#')) {
            return false;
        }

        // Step over the marker itself; both markers are one byte wide.
        state.advance(1);

        // Consume the rest of the line, leaving the newline for `lex_newline`.
        while let Some(c) = state.peek() {
            if c == '\n' {
                break;
            }
            state.advance(c.len_utf8());
        }

        let end = state.get_position();
        state.add_token(IniTokenType::Comment, begin, end);
        true
    }

    /// Lexes a quoted string literal opened by `"` or `'`.
    ///
    /// The literal ends at the next unescaped occurrence of the same quote character. A
    /// backslash escapes the character that follows it, whatever that character is.
    /// Newlines do not terminate the literal; if no closing quote is found, the token
    /// covers everything consumed up to the point where no more characters are available.
    ///
    /// Emits a `String` token and returns `true` when the current character is a quote;
    /// otherwise returns `false` without consuming anything.
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        let quote_char = match state.current() {
            Some(c @ ('"' | '\'')) => c,
            _ => return false,
        };

        // Skip the opening quote (one byte for both quote characters).
        state.advance(1);

        while let Some(ch) = state.peek() {
            if ch == quote_char {
                // Closing quote: consume it and stop.
                state.advance(1);
                break;
            }

            if ch == '\\' {
                // Consume the backslash, then the escaped character by its full UTF-8
                // width so the cursor never stops inside a multi-byte character.
                state.advance(1);
                if let Some(escaped) = state.peek() {
                    state.advance(escaped.len_utf8());
                }
            }
            else {
                state.advance(ch.len_utf8());
            }
        }

        let end = state.get_position();
        state.add_token(IniTokenType::String, start, end);
        true
    }

    /// Lexes an integer or floating-point literal.
    ///
    /// Accepted shape: an optional `+`/`-` sign that must be followed directly by a digit,
    /// then digits, at most one `.` (only before any exponent), and at most one exponent
    /// of the form `e`/`E`, optional sign, digits. An `e`/`E` that is not followed by a
    /// digit (optionally after a sign) is not part of the number and is left for the
    /// next sub-lexer.
    ///
    /// Emits `Float` when a dot or exponent was consumed and `Integer` otherwise, and
    /// returns `true`. Returns `false` without consuming anything when the input does not
    /// start a number.
    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let first = match state.current() {
            Some(c) => c,
            None => return false,
        };

        if first == '-' || first == '+' {
            // A sign only starts a number when a digit follows immediately.
            match state.peek_next_n(1) {
                Some(next) if next.is_ascii_digit() => {}
                _ => return false,
            }
        }
        else if !first.is_ascii_digit() {
            return false;
        }

        // From here on the token is guaranteed to contain at least one digit, so no
        // after-the-fact "just a sign / just a dot" validation is needed.
        state.advance(1);
        let mut has_dot = false;
        let mut has_exp = false;

        while let Some(ch) = state.peek() {
            if ch.is_ascii_digit() {
                state.advance(1);
            }
            else if ch == '.' && !has_dot && !has_exp {
                has_dot = true;
                state.advance(1);
            }
            else if (ch == 'e' || ch == 'E') && !has_exp {
                // Look ahead before committing: the exponent marker must be followed by
                // a digit, optionally preceded by a sign. All characters inspected here
                // are ASCII, so offsets of 1 and 2 are unambiguous.
                let signed = matches!(state.peek_next_n(1), Some('+') | Some('-'));
                let digit_after = if signed { state.peek_next_n(2) } else { state.peek_next_n(1) };
                let has_digits = match digit_after {
                    Some(d) => d.is_ascii_digit(),
                    None => false,
                };
                if !has_digits {
                    break;
                }

                has_exp = true;
                // Consume the marker and, if present, the sign; digits follow in the loop.
                state.advance(if signed { 2 } else { 1 });
            }
            else {
                break;
            }
        }

        // Determine if it's an integer or a float
        let kind = if has_dot || has_exp { IniTokenType::Float } else { IniTokenType::Integer };

        let end = state.get_position();
        state.add_token(kind, start, end);
        true
    }

    /// Lexes an identifier, boolean, or date-time-like word.
    ///
    /// A word starts with an ASCII letter or `_` and continues with ASCII letters, digits,
    /// `_` or `-`. The words `true` and `false` (compared ASCII case-insensitively) are
    /// emitted as `Boolean`; a word accepted by `is_datetime_like` is emitted as
    /// `DateTime`; everything else is emitted as `Identifier`.
    ///
    /// Returns `false` without consuming anything when the current character cannot start
    /// a word.
    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();

        // Identifiers must start with a letter or underscore
        match state.current() {
            Some(c) if c.is_ascii_alphabetic() || c == '_' => {}
            _ => return false,
        }

        state.advance(1);
        while let Some(c) = state.current() {
            if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
                state.advance(1);
            }
            else {
                break;
            }
        }

        let end = state.get_position();
        let text = state.get_text_in((start..end).into());
        let word: &str = text.as_ref();

        // The word is ASCII by construction, so an ASCII case-insensitive comparison is
        // equivalent to lowercasing it first and avoids allocating a `String`.
        let kind = if word.eq_ignore_ascii_case("true") || word.eq_ignore_ascii_case("false") {
            IniTokenType::Boolean
        }
        // NOTE(review): `is_datetime_like` requires a ':' in the text, but ':' is never
        // accepted as a word character above, so this branch cannot currently match.
        else if self.is_datetime_like(word) {
            IniTokenType::DateTime
        }
        else {
            IniTokenType::Identifier
        };

        state.add_token(kind, start, end);
        true
    }

    /// Lexes a punctuation token.
    ///
    /// The two-character symbols `[[` and `]]` are checked before the single-character
    /// ones (`{`, `}`, `[`, `]`, `,`, `.`, `=`) so the longer match wins. Returns `false`
    /// without consuming anything when the current character is none of these.
    fn lex_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let begin = state.get_position();

        // Resolve the token kind together with its width in bytes.
        let (kind, width) = if state.starts_with("[[") {
            (IniTokenType::DoubleLeftBracket, 2)
        }
        else if state.starts_with("]]") {
            (IniTokenType::DoubleRightBracket, 2)
        }
        else {
            match state.current() {
                Some('{') => (IniTokenType::LeftBrace, 1),
                Some('}') => (IniTokenType::RightBrace, 1),
                Some('[') => (IniTokenType::LeftBracket, 1),
                Some(']') => (IniTokenType::RightBracket, 1),
                Some(',') => (IniTokenType::Comma, 1),
                Some('.') => (IniTokenType::Dot, 1),
                Some('=') => (IniTokenType::Equal, 1),
                _ => return false,
            }
        };

        state.advance(width);
        let end = state.get_position();
        state.add_token(kind, begin, end);
        true
    }

    /// Heuristic date-time check: `true` when `text` contains both a `-` and a `:`.
    fn is_datetime_like(&self, text: &str) -> bool {
        ['-', ':'].iter().all(|&marker| text.contains(marker))
    }
}