ksl/
token.rs

//! # ksl::token
//!
//! Defines token-related types and functions in KSL.

/// All token types.
///
/// The doc comment on each variant gives the regular expression that the
/// lexer's recognizer for that token accepts.
#[derive(Clone)]
pub enum TokenType {
    /// `^([^\s\p{P}0-9]|_)([^\s\p{P}]|_|')*$`
    Symbol(std::sync::Arc<[char]>),
    /// `^#[^\s\p{P}0-9]([^\s\p{P}]|_|')*$`
    Atom(std::sync::Arc<[char]>),
    /// `^\u{22}[^\u{22}]*\u{22}$`
    String(std::sync::Arc<[char]>),
    /// `^#[0-9]+$` — the digits are parsed as a `u32` and must form a
    /// valid Unicode scalar value (see `lex_atom_or_char`).
    Char(char),
    /// `^[+-]?[0-9]+(\.[0-9]*)?(e[+-]?[0-9]+)?$` — validation is delegated
    /// to `str::parse::<f64>` in `lex_number`.
    Number(f64),
    /// `^,$`
    // NOTE(review): "Seperator" spelling is kept — renaming the public
    // variants would break every caller.
    Seperator,
    /// `^;$`
    SentenceSeperator,
    /// `^\[$`
    FuncListOpen,
    /// `^\]$`
    FuncListClose,
    /// `^\{$`
    ListOpen,
    /// `^\}$`
    ListClose,
    // Comments `(* ... *)` are consumed by the lexer and never become tokens.
}

impl std::fmt::Debug for TokenType {
    /// Compact single-token rendering, embedded in `Token`'s `Debug` output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenType::Symbol(sym) => write!(f, "Id({})", sym.iter().collect::<String>()),
            TokenType::Atom(a) => write!(f, "Atom({})", a.iter().collect::<String>()),
            TokenType::String(s) => write!(f, "Str({})", s.iter().collect::<String>()),
            TokenType::Char(c) => write!(f, "Ch({c})"),
            TokenType::Number(n) => write!(f, "Num({n})"),
            TokenType::Seperator => write!(f, "S"),
            TokenType::SentenceSeperator => write!(f, "SS"),
            TokenType::FuncListOpen => write!(f, "FnO"),
            TokenType::FuncListClose => write!(f, "FnC"),
            TokenType::ListOpen => write!(f, "LstO"),
            TokenType::ListClose => write!(f, "LstC"),
        }
    }
}
50
51/// Represents a token with its value and span.
52#[derive(Clone)]
53pub struct Token {
54    pub value: TokenType,
55    pub position: ((usize, usize), (usize, usize)),
56}
57
58impl std::fmt::Debug for Token {
59    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60        write!(
61            f,
62            "Token<{:?}, [({}, {}), ({}, {})]>",
63            self.value, self.position.0.0, self.position.0.1, self.position.1.0, self.position.1.1,
64        )
65    }
66}
67
/// Lexer state tracker.
///
/// Wraps a peekable character stream and tracks the (line, column) of the
/// most recently consumed character.
struct Lexer<'a> {
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    // 1-based line of the cursor.
    line: usize,
    // Count of characters consumed so far on the current line.
    col: usize,
}

impl<'a> Lexer<'a> {
    /// Build a lexer positioned before the first character of `input`.
    fn new(input: &'a str) -> Self {
        Self {
            chars: input.chars().peekable(),
            line: 1,
            col: 0,
        }
    }

    /// Look at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    /// Advance and return the next character, updating the line/column
    /// bookkeeping (a newline resets the column and bumps the line).
    fn next_char(&mut self) -> Option<char> {
        let ch = self.chars.next()?;
        match ch {
            '\n' => {
                self.line += 1;
                self.col = 0;
            }
            _ => self.col += 1,
        }
        Some(ch)
    }

    /// Position the NEXT character will occupy — used for token starts.
    fn current_pos(&self) -> (usize, usize) {
        (self.line, self.col + 1)
    }

    /// Position of the last consumed character — used for token ends.
    fn end_pos(&self) -> (usize, usize) {
        (self.line, self.col)
    }
}
105
106/// Convert source code to a vector of tokens.
107///
108/// ```rust
109/// use std::sync::Arc;
110///
111/// use ksl::token::{TokenType, source_to_token};
112///
113/// let source = r#" Let[a, 10];
114///  Let[b, 20];"#;
115/// assert_eq!(
116///     source_to_token(source).map(|v| v[2].position),
117///     Ok(((1, 6), (1, 6)))
118/// );
119/// assert_eq!(
120///     source_to_token(source)
121///         .map(|v| v[7].value.clone())
122///         .and_then(|v| match v {
123///             TokenType::Symbol(sym) => Ok(sym),
124///             _ => unreachable!(),
125///         }),
126///     Ok(Arc::from(['L', 'e', 't']))
127/// );
128/// ```
129pub fn source_to_token(source: &str) -> Result<Vec<Token>, std::sync::Arc<str>> {
130    let mut lexer = Lexer::new(source);
131    let mut tokens = Vec::new();
132    let mut comment_depth = 0;
133
134    while let Some(ch) = lexer.peek() {
135        let start = lexer.current_pos();
136
137        // Handle Comments
138        if ch == '(' && lexer.peek_second() == Some('*') {
139            lexer.next_char();
140            lexer.next_char(); // skip '(*'
141            comment_depth += 1;
142            continue;
143        }
144        if ch == '*' && comment_depth > 0 && lexer.peek_second() == Some(')') {
145            lexer.next_char();
146            lexer.next_char(); // skip '*)'
147            comment_depth -= 1;
148            continue;
149        }
150
151        // Skip content if inside comments
152        if comment_depth > 0 {
153            lexer.next_char();
154            continue;
155        }
156
157        // Match regular tokens
158        match ch {
159            ' ' | '\t' | '\r' | '\n' => {
160                lexer.next_char();
161            }
162            '"' => tokens.push(lex_string(&mut lexer)?),
163            '#' => tokens.push(lex_atom_or_char(&mut lexer)?),
164            ',' => tokens.push(make_token(&mut lexer, TokenType::Seperator)),
165            ';' => tokens.push(make_token(&mut lexer, TokenType::SentenceSeperator)),
166            '[' => tokens.push(make_token(&mut lexer, TokenType::FuncListOpen)),
167            ']' => tokens.push(make_token(&mut lexer, TokenType::FuncListClose)),
168            '{' => tokens.push(make_token(&mut lexer, TokenType::ListOpen)),
169            '}' => tokens.push(make_token(&mut lexer, TokenType::ListClose)),
170            c if c.is_ascii_digit() || c == '+' || c == '-' => tokens.push(lex_number(&mut lexer)?),
171            c if is_symbol_start(c) => tokens.push(lex_symbol(&mut lexer)?),
172            _ => {
173                return Err(std::sync::Arc::from(format!(
174                    "Error[ksl::token::source_to_token]: Invalid token `{}` at `({}, {})`.",
175                    ch, start.0, start.1
176                )));
177            }
178        }
179    }
180
181    if comment_depth == 0 {
182        Ok(tokens)
183    } else {
184        Err(std::sync::Arc::from(
185            "Error[ksl::token::source_to_token]: Unclosed comment.",
186        ))
187    }
188}
189
190impl<'a> Lexer<'a> {
191    /// Helper to see the character after the next one.
192    fn peek_second(&self) -> Option<char> {
193        let mut it = self.chars.clone();
194        it.next();
195        it.next()
196    }
197}
198
199/// Create a single-character token.
200fn make_token(lexer: &mut Lexer, val: TokenType) -> Token {
201    let start = lexer.current_pos();
202    lexer.next_char();
203    Token {
204        value: val,
205        position: (start, lexer.end_pos()),
206    }
207}
208
/// Predicate for symbol start characters: `_`, or anything that is not
/// ASCII punctuation, whitespace, or an ASCII digit.
fn is_symbol_start(c: char) -> bool {
    if c == '_' {
        return true;
    }
    !(c.is_ascii_punctuation() || c.is_whitespace() || c.is_ascii_digit())
}
211
/// Predicate for subsequent symbol characters: like `is_symbol_start` but
/// digits and `'` are also allowed.
fn is_symbol_cont(c: char) -> bool {
    matches!(c, '_' | '\'') || !(c.is_ascii_punctuation() || c.is_whitespace())
}
214
215fn lex_string(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
216    let start = lexer.current_pos();
217    lexer.next_char(); // skip opening quote
218    let mut buf = Vec::new();
219    while let Some(c) = lexer.next_char() {
220        if c == '"' {
221            return Ok(Token {
222                value: TokenType::String(std::sync::Arc::from(buf)),
223                position: (start, lexer.end_pos()),
224            });
225        }
226        buf.push(c);
227    }
228    Err(std::sync::Arc::from(format!(
229        "Error[ksl::token::lex_string]: Unclosed string at `({}, {})`.",
230        start.0, start.1
231    )))
232}
233
234fn lex_number(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
235    let start = lexer.current_pos();
236    let mut s = String::new();
237
238    // Read numeric sequence including sign, dots, and exponent
239    while let Some(c) = lexer.peek() {
240        if c.is_ascii_digit() || "+-.e".contains(c) {
241            s.push(lexer.next_char().unwrap());
242        } else {
243            break;
244        }
245    }
246
247    s.parse::<f64>()
248        .map(|n| Token {
249            value: TokenType::Number(n),
250            position: (start, lexer.end_pos()),
251        })
252        .map_err(|_| {
253            std::sync::Arc::from(format!(
254                concat!(
255                    "Error[ksl::token::lex_number]: ",
256                    "Invalid number string `{}` at `({}, {})`."
257                ),
258                s, start.0, start.1
259            ))
260        })
261}
262
263fn lex_atom_or_char(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
264    let start = lexer.current_pos();
265    lexer.next_char(); // consume '#'
266
267    match lexer.peek() {
268        Some(c) if c.is_ascii_digit() => {
269            // Lex Unicode Character: #123
270            let mut s = String::new();
271            while let Some(digit) = lexer.peek() {
272                if digit.is_ascii_digit() {
273                    s.push(lexer.next_char().unwrap());
274                } else {
275                    break;
276                }
277            }
278            let code = s.parse::<u32>().map_err(|_| {
279                std::sync::Arc::from(format!(
280                    "Error[ksl::token::lex_atom_or_char]: Invalid number string `{}` at `({}, {})`.",
281                    s, start.0, start.1
282                ))
283            })?;
284            let ch = char::from_u32(code).ok_or_else(|| {
285                std::sync::Arc::from(format!(
286                    "Error[ksl::token::lex_atom_or_char]: Invalid unicode `{}` at `({}, {})`.",
287                    code, start.0, start.1
288                ))
289            })?;
290            Ok(Token {
291                value: TokenType::Char(ch),
292                position: (start, lexer.end_pos()),
293            })
294        }
295        Some(c) if !c.is_ascii_punctuation() && !c.is_whitespace() => {
296            // Lex Atom: #tag
297            let mut buf = Vec::new();
298            while let Some(cont) = lexer.peek() {
299                if is_symbol_cont(cont) {
300                    buf.push(lexer.next_char().unwrap());
301                } else {
302                    break;
303                }
304            }
305            Ok(Token {
306                value: TokenType::Atom(std::sync::Arc::from(buf)),
307                position: (start, lexer.end_pos()),
308            })
309        }
310        _ => Err(std::sync::Arc::from(format!(
311            "Error[ksl::token::lex_atom_or_char]: Invalid atom at `({}, {})`.",
312            start.0, start.1
313        ))),
314    }
315}
316
317fn lex_symbol(lexer: &mut Lexer) -> Result<Token, std::sync::Arc<str>> {
318    let start = lexer.current_pos();
319    let mut buf = Vec::new();
320    while let Some(c) = lexer.peek() {
321        if is_symbol_cont(c) {
322            buf.push(lexer.next_char().unwrap());
323        } else {
324            break;
325        }
326    }
327    Ok(Token {
328        value: TokenType::Symbol(std::sync::Arc::from(buf)),
329        position: (start, lexer.end_pos()),
330    })
331}