ksl 0.1.30

KSL core library and interpreter
Documentation
//! # ksl::token
//!
//! Defines token-related types and functions in KSL.

/// All token types.
///
/// Each variant's doc comment gives the shape of the source text that
/// produces it. The patterns are descriptive: the lexer in this module is
/// hand-written and tests ASCII punctuation (`char::is_ascii_punctuation`)
/// where the patterns say `\p{P}`.
#[derive(Clone)]
pub enum TokenType {
    /// `^([^\s\p{P}0-9]|_)([^\s\p{P}]|_|')*$`
    ///
    /// Holds the characters of the identifier.
    Symbol(std::sync::Arc<[char]>),
    /// `^#[^\s\p{P}[0-9]]([^\s\p{P}]|_|')*$`
    ///
    /// Holds the characters after the leading `#`; the `#` itself is not stored.
    Atom(std::sync::Arc<[char]>),
    /// `^\u{22}[^\u{22}]*\u{22}$`
    ///
    /// Holds the characters between the quotes. No escape sequences are
    /// processed: every character up to the next `"` is taken verbatim.
    String(std::sync::Arc<[char]>),
    /// `^#[0-9]+$`
    ///
    /// `#` followed by a decimal code point. The digits must parse as a `u32`
    /// and name a valid `char`, otherwise lexing fails.
    Char(char),
    /// Roughly `^[+-]?[0-9]+(\.[0-9]*)?(e[+-]?[0-9]+)?$`
    ///
    /// The lexer takes a maximal run of ASCII digits, `+`, `-`, `.` and `e`
    /// and accepts it only if `str::parse::<f64>` succeeds.
    Number(f64),
    /// `^,$`
    Seperator,
    /// `^;$`
    SentenceSeperator,
    /// `^\[$`
    FuncListOpen,
    /// `^\]$`
    FuncListClose,
    /// `^\{$`
    ListOpen,
    /// `^\}$`
    ListClose,
    // Comments `(* ... *)` have no variant: the lexer skips them (nesting
    // included) and never emits a token for them.
}

impl std::fmt::Debug for TokenType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenType::Symbol(sym) => write!(f, "Id({})", sym.iter().collect::<String>()),
            TokenType::Atom(a) => write!(f, "Atom({})", a.iter().collect::<String>()),
            TokenType::String(s) => write!(f, "Str({})", s.iter().collect::<String>()),
            TokenType::Char(c) => write!(f, "Ch({c})"),
            TokenType::Number(n) => write!(f, "Num({n})"),
            TokenType::Seperator => write!(f, "S"),
            TokenType::SentenceSeperator => write!(f, "SS"),
            TokenType::FuncListOpen => write!(f, "FnO"),
            TokenType::FuncListClose => write!(f, "FnC"),
            TokenType::ListOpen => write!(f, "LstO"),
            TokenType::ListClose => write!(f, "LstC"),
        }
    }
}

/// Represents a token with its value and span.
#[derive(Clone)]
pub struct Token {
    /// Value of token.
    pub value: TokenType,
    /// Token location in the source code, as
    /// `((start_line, start_col), (end_line, end_col))`.
    ///
    /// As produced by the lexer in this module, lines and columns are
    /// 1-based, columns count characters (not bytes), and the end position
    /// is the position of the token's last character (inclusive).
    pub location: ((usize, usize), (usize, usize)),
}

/// Debug rendering of a token: `Token<value, [(line, col), (line, col)]>`.
impl std::fmt::Debug for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Unpack the span once instead of chaining tuple indices.
        let ((start_line, start_col), (end_line, end_col)) = self.location;
        write!(
            f,
            "Token<{:?}, [({}, {}), ({}, {})]>",
            self.value, start_line, start_col, end_line, end_col,
        )
    }
}

/// Lexer state tracker.
///
/// Wraps the source characters and keeps the line/column of the most
/// recently consumed character so tokens can be given a span.
struct Lexer<'a> {
    /// Remaining source characters, with one-character lookahead.
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    /// Current line, starting at 1; incremented when a `'\n'` is consumed.
    line: usize,
    /// Number of characters consumed on the current line (0 right after a
    /// newline or at the start of input).
    col: usize,
}

impl<'a> Lexer<'a> {
    /// Build a lexer positioned before the first character of `input`
    /// (line 1, no characters consumed yet).
    fn new(input: &'a str) -> Self {
        let chars = input.chars().peekable();
        Self { chars, line: 1, col: 0 }
    }

    /// Return the upcoming character while leaving it in the stream.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().cloned()
    }

    /// Consume the upcoming character and update the line/column counters.
    ///
    /// A newline moves to the next line and resets the column; any other
    /// character advances the column by one. Returns `None` at end of input.
    fn next_char(&mut self) -> Option<char> {
        let consumed = self.chars.next();
        match consumed {
            Some('\n') => {
                self.line += 1;
                self.col = 0;
            }
            Some(_) => self.col += 1,
            None => {}
        }
        consumed
    }

    /// Position the next character will occupy; used as a token's start.
    fn current_pos(&self) -> (usize, usize) {
        let upcoming_col = self.col + 1;
        (self.line, upcoming_col)
    }

    /// Position of the last consumed character; used as a token's end.
    fn end_pos(&self) -> (usize, usize) {
        (self.line, self.col)
    }
}

/// Convert source code to a vector of tokens.
///
/// Whitespace (space, tab, `\r`, `\n`) separates tokens and is dropped.
/// Comments written as `(* ... *)` may nest and never produce a token.
///
/// # Errors
///
/// Returns a formatted error message when:
///
/// - a character that cannot start any token is found outside a comment,
/// - one of the token lexers fails (unclosed string, malformed number,
///   invalid atom or character code),
/// - the input ends while a comment is still open.
///
/// # Examples
///
/// ```rust
/// use std::sync::Arc;
///
/// use ksl::token::{TokenType, source_to_token};
///
/// let source = r#" Let[a, 10];
///  Let[b, 20];"#;
/// assert_eq!(
///     source_to_token(source).map(|v| v[2].location),
///     Ok(((1, 6), (1, 6)))
/// );
/// assert_eq!(
///     source_to_token(source)
///         .map(|v| v[7].value.clone())
///         .and_then(|v| match v {
///             TokenType::Symbol(sym) => Ok(sym),
///             _ => unreachable!(),
///         }),
///     Ok(Arc::from(['L', 'e', 't']))
/// );
/// ```
pub fn source_to_token(source: &str) -> Result<Vec<Token>, std::sync::Arc<str>> {
    let mut lexer = Lexer::new(source);
    let mut tokens = Vec::new();
    // Number of `(*` openers not yet matched by a `*)`.
    let mut comment_depth: usize = 0;

    while let Some(ch) = lexer.peek() {
        // Captured before anything is consumed so errors point at `ch`.
        let start = lexer.current_pos();

        // Handle Comments
        // An opener is recognised at any depth, which is what makes nesting work.
        if ch == '(' && lexer.peek_second() == Some('*') {
            let _ = lexer.next_char();
            let _ = lexer.next_char(); // skip '(*'
            comment_depth += 1;
            continue;
        }
        // A closer only counts inside a comment; outside, `*` falls through
        // to the regular match below and is rejected there.
        if ch == '*' && comment_depth > 0 && lexer.peek_second() == Some(')') {
            let _ = lexer.next_char();
            let _ = lexer.next_char(); // skip '*)'
            comment_depth -= 1;
            continue;
        }

        // Skip content if inside comments
        if comment_depth > 0 {
            lexer.next_char();
            continue;
        }

        // Match regular tokens
        match ch {
            ' ' | '\t' | '\r' | '\n' => {
                lexer.next_char();
            }
            '"' => tokens.push(lex_string(&mut lexer)?),
            '#' => tokens.push(lex_atom_or_char(&mut lexer)?),
            ',' => tokens.push(make_token(&mut lexer, TokenType::Seperator)),
            ';' => tokens.push(make_token(&mut lexer, TokenType::SentenceSeperator)),
            '[' => tokens.push(make_token(&mut lexer, TokenType::FuncListOpen)),
            ']' => tokens.push(make_token(&mut lexer, TokenType::FuncListClose)),
            '{' => tokens.push(make_token(&mut lexer, TokenType::ListOpen)),
            '}' => tokens.push(make_token(&mut lexer, TokenType::ListClose)),
            // A leading sign is always treated as the start of a number.
            c if c.is_ascii_digit() || c == '+' || c == '-' => tokens.push(lex_number(&mut lexer)?),
            c if is_symbol_start(c) => tokens.push(lex_symbol(&mut lexer)?),
            _ => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::source_to_token]: Invalid token `{}` at `({}, {})`.",
                    ch, start.0, start.1
                )));
            }
        }
    }

    // Reaching end of input inside a comment is an error.
    if comment_depth == 0 {
        Ok(tokens)
    } else {
        Err(std::sync::Arc::from(
            "Error[ksl::token::source_to_token]: Unclosed comment.",
        ))
    }
}

impl<'a> Lexer<'a> {
    /// Two-character lookahead: the character that follows the upcoming one.
    ///
    /// Works on a clone of the iterator, so nothing is consumed and the
    /// line/column counters are untouched.
    fn peek_second(&self) -> Option<char> {
        // Index 0 is the upcoming character, index 1 the one after it.
        self.chars.clone().nth(1)
    }
}

/// Consume exactly one character and wrap `val` in a token spanning it.
fn make_token(lexer: &mut Lexer, val: TokenType) -> Token {
    let begin = lexer.current_pos();
    let _ = lexer.next_char();
    let finish = lexer.end_pos();
    Token {
        value: val,
        location: (begin, finish),
    }
}

/// Whether `c` may begin a symbol: `_`, or any character that is not ASCII
/// punctuation, not whitespace and not an ASCII digit.
fn is_symbol_start(c: char) -> bool {
    if c == '_' {
        return true;
    }
    !(c.is_ascii_punctuation() || c.is_whitespace() || c.is_ascii_digit())
}

/// Whether `c` may continue a symbol: `_`, `'`, or any character that is
/// neither ASCII punctuation nor whitespace (digits are allowed here).
fn is_symbol_cont(c: char) -> bool {
    match c {
        '_' | '\'' => true,
        _ => !(c.is_ascii_punctuation() || c.is_whitespace()),
    }
}

/// Lex a string literal starting at the opening `"`.
///
/// Every character up to the next `"` is taken verbatim (no escapes); the
/// quotes themselves are not stored. Fails if the input ends first.
fn lex_string(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    lexer.next_char(); // opening quote

    let mut contents: Vec<char> = Vec::new();
    loop {
        match lexer.next_char() {
            // Closing quote: the token spans both quotes.
            Some('"') => {
                let value = TokenType::String(std::sync::Arc::from(contents));
                return Ok(Token {
                    value,
                    location: (start, lexer.end_pos()),
                });
            }
            Some(other) => contents.push(other),
            // Ran out of input before the closing quote.
            None => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::lex_string]: Unclosed string at `({}, {})`.",
                    start.0, start.1
                )));
            }
        }
    }
}

/// Lex a numeric literal.
///
/// Takes the longest run of ASCII digits, `+`, `-`, `.` and `e`, then lets
/// `str::parse::<f64>` decide whether that run is a valid number.
fn lex_number(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    let mut text = String::new();

    // Gather every character that could belong to a number; validation is
    // deferred to the parse below.
    while let Some(c) = lexer.peek() {
        let belongs = c.is_ascii_digit() || matches!(c, '+' | '-' | '.' | 'e');
        if !belongs {
            break;
        }
        lexer.next_char();
        text.push(c);
    }

    match text.parse::<f64>() {
        Ok(n) => Ok(Token {
            value: TokenType::Number(n),
            location: (start, lexer.end_pos()),
        }),
        Err(_) => Err(std::sync::Arc::from(format!(
            "Error[ksl::token::lex_number]: Invalid number string `{}` at `({}, {})`.",
            text, start.0, start.1
        ))),
    }
}

/// Lex a token introduced by `#`.
///
/// - `#` followed by ASCII digits is a character literal: the digits are a
///   decimal code point that must fit in a `u32` and be a valid `char`.
/// - `#` followed by any other non-punctuation, non-whitespace character is
///   an atom; its name is the run of symbol-continuation characters.
/// - Anything else (including end of input) is an error.
fn lex_atom_or_char(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();
    lexer.next_char(); // the leading '#'

    // Guard: the character after '#' must be able to start either form.
    let first = match lexer.peek() {
        Some(c) if !c.is_ascii_punctuation() && !c.is_whitespace() => c,
        _ => {
            return Err(std::sync::Arc::from(format!(
                "Error[ksl::token::lex_atom_or_char]: Invalid atom at `({}, {})`.",
                start.0, start.1
            )));
        }
    };

    if first.is_ascii_digit() {
        // Character literal, e.g. `#123`.
        let mut digits = String::new();
        while let Some(d) = lexer.peek() {
            if !d.is_ascii_digit() {
                break;
            }
            lexer.next_char();
            digits.push(d);
        }

        let code = match digits.parse::<u32>() {
            Ok(code) => code,
            Err(_) => {
                return Err(std::sync::Arc::from(format!(
                    "Error[ksl::token::lex_atom_or_char]: Invalid number string `{}` at `({}, {})`.",
                    digits, start.0, start.1
                )));
            }
        };

        return match char::from_u32(code) {
            Some(ch) => Ok(Token {
                value: TokenType::Char(ch),
                location: (start, lexer.end_pos()),
            }),
            None => Err(std::sync::Arc::from(format!(
                "Error[ksl::token::lex_atom_or_char]: Invalid unicode `{}` at `({}, {})`.",
                code, start.0, start.1
            ))),
        };
    }

    // Atom, e.g. `#tag`. The '#' is part of the span but not of the name.
    let mut name = Vec::new();
    while let Some(c) = lexer.peek() {
        if !is_symbol_cont(c) {
            break;
        }
        lexer.next_char();
        name.push(c);
    }
    Ok(Token {
        value: TokenType::Atom(std::sync::Arc::from(name)),
        location: (start, lexer.end_pos()),
    })
}

/// Lex a symbol: the longest run of symbol-continuation characters starting
/// at the current position. Never fails; the `Result` matches the signature
/// of the other lexing helpers.
fn lex_symbol(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
    let start = lexer.current_pos();

    let mut name = Vec::new();
    while let Some(c) = lexer.peek() {
        if !is_symbol_cont(c) {
            break;
        }
        lexer.next_char();
        name.push(c);
    }

    let value = TokenType::Symbol(std::sync::Arc::from(name));
    Ok(Token {
        value,
        location: (start, lexer.end_pos()),
    })
}