rustcc 0.1.1

A little C compiler (currently a work in progress :) )
//! # RustCC Lexical Scanner
//! A robust lexical scanner for a simple expression language that supports:
//! - Basic arithmetic operators (+, -, *, /)
//! - Integer literals (e.g., 123, 4567)
//! - Whitespace skipping
//! - Line number tracking
//!
//! ## Usage Example
//! ```
//! use jacc::scanner::{Scanner, TokenType};
//!
//! fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let mut scanner = Scanner::new("input.txt")?;
//!     while let Some(token) = scanner.scan()? {
//!         match token.token_type {
//!             TokenType::IntLit => println!("Integer: {}", token.int_value.unwrap()),
//!             TokenType::Plus => println!("Plus operator"),
//!             // Handle other token types
//!             _ => {}
//!         }
//!     }
//!     Ok(())
//! }
//! ```
use std::fs::File;
use std::io::{self, Read, BufReader};
use std::error::Error;

/// Enumeration of all token types recognized by the scanner.
///
/// Each variant represents a distinct lexical element in the language.
/// The enum is a plain, fieldless tag, so it is cheap to copy, compare and
/// use as a key in hashed collections.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum TokenType {
    /// Addition operator `+`.
    Plus,
    /// Subtraction operator `-`.
    Minus,
    /// Multiplication operator `*`.
    Star,
    /// Division operator `/`.
    Slash,
    /// Integer literal: a run of decimal digits (e.g., `123`, `4567`).
    ///
    /// A leading sign is not part of the literal; the scanner emits `-` as a
    /// separate [`TokenType::Minus`] token. The numeric value is carried in
    /// the `int_value` field of the token.
    IntLit,
}

/// A token representing a single lexical element from the input stream.
/// Tokens are produced by the scanner and consumed by the parser.
/// Each token has a type and an optional integer value (for IntLit tokens).
#[derive(Debug)]
pub struct Token {
    /// The semantic type of this token (e.g., operator, integer literal).
    pub token_type: TokenType,
    /// Optional integer value associated with this token.
    /// Only populated for tokens of type TokenType::IntLit.
    pub int_value: Option<i32>,
}

/// A lexical scanner that reads input from a file and produces a stream of tokens.
///
/// The scanner pulls one byte at a time from a buffered file reader, skips
/// whitespace, tracks line numbers, and recognizes the lexical elements
/// defined in the `TokenType` enum.
#[derive(Debug)]
pub struct Scanner {
    /// Buffered handle on the input file.
    reader: BufReader<File>,
    /// Line counter; starts at 1 and is bumped each time a newline is read
    /// from `reader`.
    line: u32,
    /// One-character lookahead slot: a character that was read but must be
    /// returned again by the next call to `next()`.
    putback: Option<char>,
}

impl Scanner {
    /// Creates a new scanner instance for the specified file.
    ///
    /// # Arguments
    /// * `filename` - Path to the input file to be scanned
    ///
    /// # Errors
    /// Returns an error if the file cannot be opened.
    ///
    /// # Example
    /// ```ignore
    /// let mut scanner = Scanner::new("input.txt")?;
    /// ```
    pub fn new(filename: &str) -> Result<Self, Box<dyn Error>> {
        let file = File::open(filename)?;
        Ok(Scanner {
            reader: BufReader::new(file),
            line: 1,
            // Initialize with newline to simulate C code behavior. The pending
            // newline is plain whitespace for `skip()` and, coming from the
            // putback slot, does not advance the line counter.
            putback: Some('\n'),
        })
    }

    /// Tells whether `err` is the end-of-input signal produced by `next()`,
    /// i.e. an `io::Error` of kind `UnexpectedEof`.
    fn is_eof(err: &(dyn Error + 'static)) -> bool {
        err.downcast_ref::<io::Error>()
            .is_some_and(|ioe| ioe.kind() == io::ErrorKind::UnexpectedEof)
    }

    /// Retrieves the next character from the input stream.
    ///
    /// A pending putback character is returned first. Otherwise one byte is
    /// read from the file and converted to a `char` one-to-one; the line
    /// counter is advanced when that byte is a newline.
    ///
    /// # Errors
    /// * An `io::Error` of kind `UnexpectedEof` when the input is exhausted
    /// * Any other I/O error reported by the reader
    fn next(&mut self) -> Result<char, Box<dyn Error>> {
        if let Some(c) = self.putback.take() {
            return Ok(c);
        }

        let mut buf = [0u8; 1];
        // `read_exact` retries reads interrupted by a signal and reports end of
        // input as `UnexpectedEof`, which is exactly what `is_eof` looks for.
        self.reader.read_exact(&mut buf)?;

        let c = char::from(buf[0]);
        if c == '\n' {
            self.line += 1;
        }
        Ok(c)
    }

    /// Puts back an unwanted character into the input stream so that it is
    /// returned by the next call to `next()`.
    ///
    /// # Arguments
    /// * `c` - The character to put back into the input stream
    fn putback(&mut self, c: char) {
        self.putback = Some(c);
    }

    /// Skips over whitespace characters and returns the first non-whitespace
    /// character.
    ///
    /// # Errors
    /// Propagates any error from `next()`, including the end-of-input error.
    fn skip(&mut self) -> Result<char, Box<dyn Error>> {
        loop {
            let c = self.next()?;
            if !c.is_whitespace() {
                return Ok(c);
            }
        }
    }

    /// Scans an integer literal starting with the provided character.
    ///
    /// Digits are accumulated until a non-digit character is encountered or
    /// the end of the input is reached.
    ///
    /// # Arguments
    /// * `first_char` - The first character of the integer literal (a digit)
    ///
    /// # Errors
    /// * If the literal does not fit in an `i32`
    /// * If reading the input fails for a reason other than end of input
    ///
    /// # Notes
    /// - The non-digit character that terminates the literal is put back
    /// - If the input ends mid-literal, the value accumulated so far is returned
    fn scan_int(&mut self, first_char: char) -> Result<i32, Box<dyn Error>> {
        let mut val: i32 = 0;
        let mut c = first_char;

        while let Some(digit) = c.to_digit(10) {
            // `digit` is in 0..=9, so the cast is lossless. Checked arithmetic
            // turns an oversized literal into an error instead of a debug
            // panic or a silently wrapped value in release builds.
            val = val
                .checked_mul(10)
                .and_then(|v| v.checked_add(digit as i32))
                .ok_or_else(|| format!("Integer literal too large on line {}", self.line))?;

            c = match self.next() {
                Ok(ch) => ch,
                // End of file, return the value we have
                Err(e) if Self::is_eof(&*e) => return Ok(val),
                Err(e) => return Err(e),
            };
        }

        // Put back the non-digit character
        self.putback(c);
        Ok(val)
    }

    /// Returns the scanner's current line counter.
    ///
    /// This is useful for error reporting in the parser.
    pub fn line(&self) -> u32 {
        self.line
    }

    /// Scans the next token from the input stream.
    ///
    /// This is the main entry point for the scanner and is typically called in
    /// a loop until it returns `Ok(None)` (end of file).
    ///
    /// # Returns
    /// * `Ok(Some(Token))` - The next token recognized
    /// * `Ok(None)` - End of file reached
    ///
    /// # Errors
    /// * If an unrecognized character is found
    /// * If an integer literal does not fit in an `i32`
    /// * If reading the input fails
    ///
    /// # Example
    /// ```ignore
    /// while let Some(token) = scanner.scan()? {
    ///     // Process token
    /// }
    /// ```
    pub fn scan(&mut self) -> Result<Option<Token>, Box<dyn Error>> {
        let c = match self.skip() {
            Ok(ch) => ch,
            // Running out of input between tokens is the normal way to finish.
            Err(e) if Self::is_eof(&*e) => return Ok(None),
            Err(e) => return Err(e),
        };

        let token = match c {
            '+' => Token { token_type: TokenType::Plus, int_value: None },
            '-' => Token { token_type: TokenType::Minus, int_value: None },
            '*' => Token { token_type: TokenType::Star, int_value: None },
            '/' => Token { token_type: TokenType::Slash, int_value: None },
            '0'..='9' => {
                let int_value = self.scan_int(c)?;
                Token { token_type: TokenType::IntLit, int_value: Some(int_value) }
            }
            _ => {
                return Err(format!("Unrecognised character '{}' on line {}", c, self.line).into());
            }
        };
        Ok(Some(token))
    }
}