// parlex-calc 0.4.1

//! # Calculator Lexer
//!
//! This module wires up the calculator’s lexer using data generated by
//! [`parlex-gen`]'s **`alex`** tool. It exposes:
//!
//! - the generated DFA tables and rule IDs (in [`lexer_data`]),
//! - a small, stateful driver [`CalcLexerDriver`] that reacts to rule matches,
//! - the ergonomic adapter [`CalcLexer`] that yields typed [`CalcToken`]s.
//!
//! The lexer pulls bytes from any input implementing [`TryNextWithContext`]
//! with `Item = u8` and a [`SymTab`] context, which lets rule actions
//! read/mutate the external symbol table (e.g., to intern identifiers).
//!
//! ## Notes
//! - *Modes vs. rules:* generated rules are **applicable in specific modes**;
//!   the driver may switch modes (`Expr` ⇄ `Comment`) but rules themselves do
//!   not encode transitions.
//! - *End-of-input:* the lexer typically emits a final [`TokenID::End`] token
//!   in `Expr` mode; if input ends while still in `Comment` mode, the driver
//!   emits a [`TokenID::Error`] token to surface the unterminated comment.
//!
//! [`parlex-gen`]: https://crates.io/crates/parlex-gen

use crate::{CalcToken, SymTab, TokenID, TokenValue};
use lexer_data::{LexData, Mode, Rule};
use parlex::{Lexer, LexerData, LexerDriver, LexerStats, ParlexError};
use std::marker::PhantomData;
use try_next::TryNextWithContext;

/// Generated lexer definition produced by **`parlex-gen`**’s
/// [`alex`](https://crates.io/crates/parlex-gen) tool.
///
/// The included file provides:
/// - DFA tables and mode enumeration ([`Mode`]),
/// - rule identifiers ([`Rule`]),
/// - aggregate metadata ([`LexData`]) consumed by the runtime [`Lexer`].
///
/// The included file (`lexer_data.rs`) is read from Cargo's `OUT_DIR`; it is
/// generated at build time by the project’s `build.rs` script and must exist
/// before this module can compile.
pub mod lexer_data {
    // Textually splice the generated source into this module. The path is
    // resolved at compile time from the `OUT_DIR` environment variable.
    include!(concat!(env!("OUT_DIR"), "/lexer_data.rs"));
}

/// Stateful driver that handles rule matches from the generated DFA.
///
/// `CalcLexerDriver` receives a callback whenever a rule matches. It can:
/// - **emit tokens** (e.g., identifiers, numbers, operators),
/// - **adjust internal bookkeeping** (nested comment depth),
/// - **switch modes** (on comment boundaries).
///
/// The driver is parameterized by an input type `I` that yields bytes and
/// supports contextual access to a [`SymTab`].
///
/// # Internal State
/// - [`comment_level`](#structfield.comment_level): current nesting depth of
///   block comments; positive values mean we’re inside a comment.
/// - [`_marker`](#structfield._marker): binds the generic `I` without storage.
///
/// # Associated Types (via `LexerDriver`)
/// - `LexerData = LexData`
/// - `Token = CalcToken`
/// - `Lexer = Lexer<I, Self, SymTab>`
/// - `Context = SymTab`
///
/// # Errors
/// Rule actions report failures as [`ParlexError`]:
/// - a numeric literal rejected by `i64` parsing is wrapped together with the
///   span of the offending match,
/// - a failure to take the matched text as a string is propagated unchanged.
#[derive(Debug)]
pub struct CalcLexerDriver<I> {
    /// Current nesting depth of block comments.
    ///
    /// - Incremented on comment open (`/*`).
    /// - Decremented on comment close (`*/`); when it returns to zero the
    ///   driver switches back to expression mode and emits the comment token.
    /// - Never expected to go negative, because the closing rule is only
    ///   applicable while in comment mode.
    comment_level: i32,

    /// Marker to bind the driver to the input type `I` without storing it.
    _marker: PhantomData<I>,
}

impl<I> LexerDriver for CalcLexerDriver<I>
where
    I: TryNextWithContext<SymTab, Item = u8, Error: std::fmt::Display + 'static>,
{
    /// Rule identifiers and metadata produced by the lexer generator.
    type LexerData = LexData;

    /// Concrete token type emitted by the driver.
    type Token = CalcToken;

    /// Concrete lexer type parameterized by input, driver and context.
    type Lexer = Lexer<I, Self, Self::Context>;

    /// Externally supplied context available to actions (symbol table).
    type Context = SymTab;

    /// Handles a single lexer rule match.
    ///
    /// Called by the lexer when a DFA rule in [`Lexer`] fires. The implementation
    /// typically inspects `rule`, reads the matched span from `lexer`, and either:
    ///
    /// - emits a [`CalcToken`] (e.g., identifiers, numbers, operators),
    /// - updates internal state (e.g., `comment_level`),
    /// - or returns an error if the match is invalid.
    ///
    /// Implementations may also use `context` (a [`SymTab`]) to intern identifiers
    /// and store indices in [`TokenValue::Ident`].
    ///
    /// # Errors
    /// Propagates any lexical, parsing, UTF-8 decoding, or symbol-table errors as
    /// [`CalcError`].
    fn action(
        &mut self,
        lexer: &mut Self::Lexer,
        context: &mut Self::Context,
        rule: <Self::LexerData as LexerData>::LexerRule,
    ) -> Result<(), ParlexError> {
        match rule {
            Rule::Empty => {
                unreachable!()
            }
            Rule::Ident => {
                // <Expr> (?:[a-z_][a-z_A-Z0-9]*)
                let index = context.intern(lexer.take_str()?);
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Ident,
                    span: Some(lexer.span()),
                    value: TokenValue::Ident(index),
                });
            }
            Rule::Number => {
                // <Expr> (?:[0-9]+)
                let s = lexer.take_str()?;
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Number,
                    span: Some(lexer.span()),
                    value: TokenValue::Number(
                        s.as_str()
                            .parse::<i64>()
                            .map_err(|e| ParlexError::from_err(e, Some(lexer.span())))?,
                    ),
                });
            }
            Rule::Semicolon => {
                // <Expr> ;
                lexer.yield_token(CalcToken {
                    token_id: TokenID::End,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::Equals => {
                // <Expr> =
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Equals,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::Plus => {
                // <Expr> \+
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Plus,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::Minus => {
                // <Expr> -
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Minus,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::Asterisk => {
                // <Expr> \*
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Asterisk,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::Slash => {
                // <Expr> /
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Slash,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::LeftParen => {
                // <Expr> \(
                lexer.yield_token(CalcToken {
                    token_id: TokenID::LeftParen,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::RightParen => {
                // <Expr> \)
                lexer.yield_token(CalcToken {
                    token_id: TokenID::RightParen,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::CommentBegin => {
                // <Expr,Comment> /\*

                // Accumulate fragments and span data from multiple
                // regex matches into a single comment token
                lexer.accum();

                lexer.begin(Mode::Comment);
                self.comment_level += 1;
            }
            Rule::CommentEnd => {
                // <Comment> \*/
                self.comment_level -= 1;
                if self.comment_level == 0 {
                    lexer.begin(Mode::Expr);
                    let s = lexer.take_str()?;
                    lexer.yield_token(CalcToken {
                        token_id: TokenID::Comment,
                        span: Some(lexer.span()),
                        value: TokenValue::Comment(s),
                    });
                }
            }
            Rule::CommentChar => { // <Comment> .+
            }
            Rule::NewLine => {
                // <*> (?:\n)
            }
            Rule::WhiteSpace => { // <Expr> (?:[ \t])+
            }
            Rule::Error => {
                // <*> .
                lexer.yield_token(CalcToken {
                    token_id: TokenID::Error,
                    span: Some(lexer.span()),
                    value: TokenValue::None,
                });
            }
            Rule::End => {
                if lexer.mode() == Mode::Expr {
                    lexer.yield_token(CalcToken {
                        token_id: TokenID::End,
                        span: Some(lexer.span()),
                        value: TokenValue::None,
                    });
                } else {
                    lexer.yield_token(CalcToken {
                        token_id: TokenID::Error,
                        span: Some(lexer.span()),
                        value: TokenValue::None,
                    });
                }
            }
        }
        Ok(())
    }
}

/// The calculator lexer.
///
/// `CalcLexer<I>` adapts a byte-oriented input stream `I` (that supports
/// contextual access to a [`SymTab`]) into an iterator-like interface that
/// yields [`CalcToken`]s. Internally, it owns a lower-level [`Lexer`] driven by
/// [`CalcLexerDriver`], which handles rule actions (e.g., interning identifiers,
/// parsing numbers, collecting comments, skipping whitespace).
///
/// The generic parameter `I` must implement [`TryNextWithContext`] with
/// `Item = u8` and a [`SymTab`] context, allowing the lexer to pull bytes and
/// mutate/read the external symbol table while tokenizing.
///
/// # Output
///
/// Each successful step yields a [`CalcToken`], which carries:
/// - a token kind ([`TokenID`]) in `token_id`,
/// - an optional source span in `span`,
/// - a payload ([`TokenValue`]) in `value`.
///
/// # Errors
///
/// Fallible methods return a [`ParlexError`]; this covers failures reported
/// while constructing or advancing the underlying [`Lexer`] as well as errors
/// raised by the driver's rule actions (e.g., an unparsable number).
///
/// # Example
///
/// ```rust
/// # use parlex_calc::{CalcToken, CalcLexer, SymTab, TokenID, TokenValue};
/// # use try_next::{IterInput, TryNextWithContext};
/// let mut symtab = SymTab::new();
/// let input = IterInput::from("hello\n +\n world\n\n123".bytes());
/// let mut lexer = CalcLexer::try_new(input).unwrap();
/// let vs = lexer.try_collect_with_context(&mut symtab).unwrap();
/// assert_eq!(vs.len(), 5);
/// assert_eq!(symtab.len(), 2);
/// ```
pub struct CalcLexer<I>
where
    I: TryNextWithContext<SymTab, Item = u8, Error: std::fmt::Display + 'static>,
{
    /// The underlying lexer engine that drives tokenization, parameterized by
    /// the input `I`, the driver that executes rule actions, and the
    /// [`SymTab`] context.
    lexer: Lexer<I, CalcLexerDriver<I>, SymTab>,
}

impl<I> CalcLexer<I>
where
    I: TryNextWithContext<SymTab, Item = u8, Error: std::fmt::Display + 'static>,
{
    /// Builds a calculator lexer that reads from `input`.
    ///
    /// The lexer is backed by an internal [`Lexer`] paired with a fresh
    /// [`CalcLexerDriver`] (comment nesting depth `0`). The driver's rule
    /// actions later take care of:
    /// - interning identifiers into the [`SymTab`] passed as context,
    /// - turning matched text into numbers/identifiers,
    /// - tracking comment nesting.
    ///
    /// # Errors
    ///
    /// Returns a [`ParlexError`] if the underlying [`Lexer`] cannot be
    /// constructed from the given input.
    pub fn try_new(input: I) -> Result<Self, ParlexError> {
        Ok(Self {
            lexer: Lexer::try_new(
                input,
                // A new driver always starts outside of any comment.
                CalcLexerDriver {
                    comment_level: 0,
                    _marker: PhantomData,
                },
            )?,
        })
    }
}
impl<I> TryNextWithContext<SymTab, LexerStats> for CalcLexer<I>
where
    I: TryNextWithContext<SymTab, Item = u8, Error: std::fmt::Display + 'static>,
{
    /// The lexer produces calculator tokens.
    type Item = CalcToken;

    /// Single error type for input and lexical failures.
    type Error = ParlexError;

    /// Produces the next token, or `None` once the input is exhausted.
    ///
    /// The call is forwarded to the wrapped [`Lexer`]. Rule actions may
    /// mutate `context` (the [`SymTab`]) along the way, for example to intern
    /// identifiers. Any failure is reported as [`Self::Error`].
    ///
    /// # End of Input
    ///
    /// On reaching the end of the input stream the lexer typically emits one
    /// last [`TokenID::End`] token and only then returns `None`.
    ///
    /// That explicit *End* token is what the **Parlex parser** expects as the
    /// signal that a complete parsing unit has terminated successfully;
    /// consumers should read it as a logical *end-of-sentence* or
    /// *end-of-expression* marker, depending on the grammar.
    ///
    /// Input holding **several independent sentences or expressions** may
    /// therefore yield several `End` tokens, one per completed unit, and a
    /// parser can restart or resume after each of them to obtain multiple
    /// parse results from a single stream.
    ///
    /// After all input has been consumed, `None` is returned.
    fn try_next_with_context(
        &mut self,
        context: &mut SymTab,
    ) -> Result<Option<CalcToken>, ParlexError> {
        let Self { lexer } = self;
        lexer.try_next_with_context(context)
    }

    /// Reports the statistics gathered by the wrapped [`Lexer`].
    fn stats(&self) -> LexerStats {
        let Self { lexer } = self;
        lexer.stats()
    }
}

#[cfg(test)]
mod tests {
    use crate::{CalcLexer, CalcToken, SymTab, TokenID, TokenValue};
    use parlex::span;
    use try_next::{IterInput, TryNextWithContext};

    /// Identifiers, an operator and a number spread over many lines are
    /// tokenized with the expected spans, followed by a single `End` at EOF.
    ///
    /// NOTE(review): the `span!` arguments look like zero-based
    /// (start line, start column, end line, end column) — confirm against the
    /// `parlex` documentation.
    #[test]
    fn lex_ident_plus_ident_number_end() {
        let _ = env_logger::builder().is_test(true).try_init();
        let mut symtab = SymTab::new();
        let input = IterInput::from("hello\n +\n\n\n\n\n\n\n\n\n\n world\n\n123".bytes());
        let mut lexer = CalcLexer::try_new(input).unwrap();

        // hello
        assert!(matches!(
            lexer.try_next_with_context(&mut symtab).unwrap(),
            Some(CalcToken {
                token_id: TokenID::Ident,
                span: span!(0, 0, 0, 5),
                value: TokenValue::Ident(0)
            }),
        ));

        // +
        assert!(matches!(
            lexer.try_next_with_context(&mut symtab).unwrap(),
            Some(CalcToken {
                token_id: TokenID::Plus,
                span: span!(1, 1, 1, 2),
                value: TokenValue::None
            }),
        ));

        // world
        assert!(matches!(
            lexer.try_next_with_context(&mut symtab).unwrap(),
            Some(CalcToken {
                token_id: TokenID::Ident,
                span: span!(11, 1, 11, 6),
                value: TokenValue::Ident(1)
            }),
        ));

        // 123
        assert!(matches!(
            lexer.try_next_with_context(&mut symtab).unwrap(),
            Some(CalcToken {
                token_id: TokenID::Number,
                span: span!(13, 0, 13, 3),
                value: TokenValue::Number(123)
            }),
        ));

        // End from EOF
        assert!(matches!(
            lexer.try_next_with_context(&mut symtab).unwrap(),
            Some(CalcToken {
                token_id: TokenID::End,
                span: span!(13, 3, 13, 3),
                value: TokenValue::None
            }),
        ));

        // Finish
        assert!(lexer.try_next_with_context(&mut symtab).unwrap().is_none());
    }

    /// A nested block comment is collected into one `Comment` token whose
    /// text and span cover the whole comment; lexing then resumes normally.
    #[test]
    fn nested_block_comments_are_skipped() {
        let _ = env_logger::builder().is_test(true).try_init();
        let mut symtab = SymTab::new();
        let src = "a /* outer /* inner\n */ still\n comment */ + b;";
        let input = IterInput::from(src.bytes());
        let mut lexer = CalcLexer::try_new(input).unwrap();

        // a
        let t1 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t1,
            CalcToken {
                token_id: TokenID::Ident,
                span: span!(0, 0, 0, 1),
                value: TokenValue::Ident(0),
            }
        ));

        // comment
        let t2 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t2,
            CalcToken {
                token_id: TokenID::Comment,
                span: span!(0, 2, 2, 11),
                value: TokenValue::Comment(s),
            } if s == "/* outer /* inner\n */ still\n comment */"
        ));

        // +
        let t3 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t3,
            CalcToken {
                token_id: TokenID::Plus,
                span: span!(2, 12, 2, 13),
                value: TokenValue::None,
            }
        ));

        // b
        let t4 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t4,
            CalcToken {
                token_id: TokenID::Ident,
                span: span!(2, 14, 2, 15),
                value: TokenValue::Ident(1),
            }
        ));

        // End from ';'
        let t5 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t5,
            CalcToken {
                token_id: TokenID::End,
                span: span!(2, 15, 2, 16),
                value: TokenValue::None,
            }
        ));

        // End from EOF
        let t6 = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t6,
            CalcToken {
                token_id: TokenID::End,
                span: span!(2, 16, 2, 16),
                value: TokenValue::None,
            }
        ));

        // Finish
        assert!(lexer.try_next_with_context(&mut symtab).unwrap().is_none());

        // Only the two idents were interned
        assert_eq!(symtab.len(), 2);
    }

    /// Unterminated comment at EOF yields an `Error` token instead of `End`.
    #[test]
    fn unterminated_block_comment_emits_error_at_eof() {
        let _ = env_logger::builder().is_test(true).try_init();
        let mut symtab = SymTab::new();
        let input = IterInput::from("/* unclosed".bytes());
        let mut lexer = CalcLexer::try_new(input).unwrap();

        // First token at EOF must be Error (still in Comment mode)
        let t = lexer.try_next_with_context(&mut symtab).unwrap().unwrap();
        assert!(matches!(
            t,
            CalcToken {
                token_id: TokenID::Error,
                ..
            }
        ));

        // Then finish
        assert!(lexer.try_next_with_context(&mut symtab).unwrap().is_none());
    }
}