uni_core/
tokenizer.rs

1// Temporary new tokenizer implementation with complete position tracking
2use crate::compat::{fmt, String, ToString, Vec};
3
/// A location inside the tokenizer input, attached to tokens so that
/// later stages can produce rich error messages.
#[derive(Debug, Clone, PartialEq)]
pub struct SourcePos {
    /// Line number (`tokenize` starts counting at 1).
    pub line: usize,
    /// Column number within the line (`tokenize` starts counting at 1 and
    /// advances by one per character).
    pub column: usize,
    /// Byte offset from the start of the input.
    pub offset: usize,
}

impl SourcePos {
    /// Builds a position from its line, column and byte offset.
    pub fn new(line: usize, column: usize, offset: usize) -> Self {
        SourcePos { line, column, offset }
    }
}
21
22// RUST CONCEPT: Token with embedded source position
23#[derive(Debug, Clone, PartialEq)]
24pub struct Token {
25    pub kind: TokenKind,
26    pub pos: SourcePos,
27    pub end_pos: SourcePos,
28}
29
30impl Token {
31    pub fn new(kind: TokenKind, pos: SourcePos, end_pos: SourcePos) -> Self {
32        Self { kind, pos, end_pos }
33    }
34
35    // TODO: Method for calculating token span length - uncomment when implementing syntax highlighting or error spans
36    // pub fn span_len(&self) -> usize {
37    //     self.end_pos.offset - self.pos.offset
38    // }
39
40    // TODO: Simple token factory for tests - uncomment when needed for simplified test token creation
41    // pub fn simple(kind: TokenKind) -> Self {
42    //     Self::new(kind, SourcePos::new(1, 1, 0), SourcePos::new(1, 1, 0))
43    // }
44}
45
/// The lexical category of a token, together with any literal payload.
///
/// Integer-like literals keep their source text so that no precision is lost
/// before a later stage decides how to represent them.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// Float literal (has a decimal point or scientific notation).
    Number(f64),
    /// Integer literal (no decimal point), kept as its source text.
    Integer(String),
    /// Explicit BigInt with `n` suffix (e.g. `123n`); the suffix is not stored.
    BigInt(String),
    /// Rational literal (e.g. `3/4` -> `("3", "4")`).
    Rational(String, String),
    /// Gaussian integer. The tokenizer stores the imaginary part together
    /// with its sign (e.g. `3+4i` -> `("3", "+4")`, `5i` -> `("0", "5")`).
    #[cfg(feature = "complex_numbers")]
    GaussianInt(String, String),
    /// Complex float. The tokenizer stores the imaginary part together with
    /// its sign (e.g. `3.0-4.0i` -> `("3.0", "-4.0")`).
    #[cfg(feature = "complex_numbers")]
    Complex(String, String),
    /// Any other bare word or operator.
    Atom(String),
    /// Quoted string - not interned.
    String(String),
    /// Boolean literals: `true`, `false`.
    Boolean(bool),
    /// Null literal.
    Null,
    /// `[`
    LeftBracket,
    /// `#[`
    ArrayLeftBracket,
    /// `]`
    RightBracket,
    /// `'`
    Quote,
    /// `|`, for cons pair notation like `[1 | rest]`.
    Pipe,
}

impl fmt::Display for TokenKind {
    /// Renders the token roughly as it would be written in source.
    ///
    /// String payloads are wrapped in quotes without re-escaping their
    /// contents. For complex literals a `+` separator is inserted only when
    /// the imaginary part does not already carry its own sign, so a payload
    /// of `("3", "+4")` prints as `3+4i` rather than `3++4i`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenKind::Number(value) => write!(f, "{}", value),
            TokenKind::Integer(text) => write!(f, "{}", text),
            TokenKind::BigInt(text) => write!(f, "{}n", text),
            TokenKind::Rational(numer, denom) => write!(f, "{}/{}", numer, denom),
            #[cfg(feature = "complex_numbers")]
            TokenKind::GaussianInt(re, im) | TokenKind::Complex(re, im) => {
                if im.starts_with('+') || im.starts_with('-') {
                    write!(f, "{}{}i", re, im)
                } else {
                    write!(f, "{}+{}i", re, im)
                }
            }
            TokenKind::Atom(name) => write!(f, "{}", name),
            TokenKind::String(text) => write!(f, "\"{}\"", text),
            TokenKind::Boolean(flag) => write!(f, "{}", flag),
            TokenKind::Null => write!(f, "null"),
            TokenKind::LeftBracket => write!(f, "["),
            TokenKind::ArrayLeftBracket => write!(f, "#["),
            TokenKind::RightBracket => write!(f, "]"),
            TokenKind::Quote => write!(f, "'"),
            TokenKind::Pipe => write!(f, "|"),
        }
    }
}
90
91impl fmt::Display for Token {
92    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
93        write!(f, "{}", self.kind)
94    }
95}
96
97pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
98    let mut tokens = Vec::new();
99    let mut chars = input.chars().peekable();
100    let mut line = 1;
101    let mut column = 1;
102    let mut offset = 0;
103
104    // Helper function to advance position tracking
105    fn advance_pos(ch: char, line: &mut usize, column: &mut usize, offset: &mut usize) {
106        if ch == '\n' {
107            *line += 1;
108            *column = 1;
109        } else {
110            *column += 1;
111        }
112        *offset += ch.len_utf8();
113    }
114
115    // Helper function to classify atom-like strings into appropriate token types
116    fn classify_atom(s: String) -> TokenKind {
117        // Check for BigInt suffix (e.g., 123n, -456n, 123456789012345678901234567890n)
118        if s.ends_with('n') && s.len() > 1 {
119            let num_part = &s[..s.len() - 1];
120            // Check if it looks like a valid integer (all digits, optionally with leading -)
121            let is_integer = if let Some(stripped) = num_part.strip_prefix('-') {
122                num_part.len() > 1 && stripped.chars().all(|c| c.is_ascii_digit())
123            } else {
124                num_part.chars().all(|c| c.is_ascii_digit())
125            };
126
127            if is_integer {
128                return TokenKind::BigInt(num_part.to_string());
129            }
130        }
131
132        // Check for rational (e.g., 3/4)
133        if s.contains('/') {
134            let parts: Vec<&str> = s.split('/').collect();
135            if parts.len() == 2
136                && let (Ok(_), Ok(_)) = (parts[0].parse::<i64>(), parts[1].parse::<i64>())
137            {
138                return TokenKind::Rational(parts[0].to_string(), parts[1].to_string());
139            }
140        }
141
142        // Check for complex/gaussian (e.g., 3+4i, 3.0+4.0i, 5i)
143        #[cfg(feature = "complex_numbers")]
144        if s.ends_with('i') && s.len() > 1 {
145            let num_part = &s[..s.len() - 1];
146            // Find the last + or - that's not at the start
147            if let Some(op_pos) = num_part.char_indices().skip(1).find(|(_, c)| *c == '+' || *c == '-').map(|(pos, _)| pos) {
148                let real_part = &num_part[..op_pos];
149                let imag_part = &num_part[op_pos..];
150
151                // Check if both parts are integers (Gaussian)
152                if let (Ok(_), Ok(_)) = (real_part.parse::<i64>(), imag_part.parse::<i64>()) {
153                    return TokenKind::GaussianInt(real_part.to_string(), imag_part.to_string());
154                }
155
156                // Check if either part is a float (Complex)
157                if real_part.parse::<f64>().is_ok() && imag_part.parse::<f64>().is_ok() {
158                    return TokenKind::Complex(real_part.to_string(), imag_part.to_string());
159                }
160            } else {
161                // Pure imaginary (e.g., 5i, -5i, 3.5i)
162                // Try integer first
163                if num_part.parse::<i64>().is_ok() {
164                    return TokenKind::GaussianInt("0".to_string(), num_part.to_string());
165                }
166                // Try float
167                if num_part.parse::<f64>().is_ok() {
168                    return TokenKind::Complex("0".to_string(), num_part.to_string());
169                }
170            }
171        }
172
173        // Default: it's just an atom
174        TokenKind::Atom(s)
175    }
176
177    while let Some(&ch) = chars.peek() {
178        let start_line = line;
179        let start_column = column;
180        let start_offset = offset;
181
182        match ch {
183            ' ' | '\t' | '\n' | '\r' => {
184                let consumed = chars.next().unwrap();
185                advance_pos(consumed, &mut line, &mut column, &mut offset);
186            }
187
188            '[' => {
189                let consumed = chars.next().unwrap();
190                advance_pos(consumed, &mut line, &mut column, &mut offset);
191                tokens.push(Token::new(
192                    TokenKind::LeftBracket,
193                    SourcePos::new(start_line, start_column, start_offset),
194                    SourcePos::new(line, column, offset),
195                ));
196            }
197
198            '#' => {
199                let consumed = chars.next().unwrap();
200                advance_pos(consumed, &mut line, &mut column, &mut offset);
201
202                match chars.peek() {
203                    Some(&'[') => {
204                        let consumed_bracket = chars.next().unwrap();
205                        advance_pos(consumed_bracket, &mut line, &mut column, &mut offset);
206                        tokens.push(Token::new(
207                            TokenKind::ArrayLeftBracket,
208                            SourcePos::new(start_line, start_column, start_offset),
209                            SourcePos::new(line, column, offset),
210                        ));
211                    }
212                    Some(_) => {
213                        let mut atom = String::from("#");
214                        while let Some(&ch) = chars.peek() {
215                            if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
216                                break;
217                            }
218                            atom.push(ch);
219                            let consumed = chars.next().unwrap();
220                            advance_pos(consumed, &mut line, &mut column, &mut offset);
221                        }
222                        tokens.push(Token::new(
223                            classify_atom(atom),
224                            SourcePos::new(start_line, start_column, start_offset),
225                            SourcePos::new(line, column, offset),
226                        ));
227                    }
228                    None => {
229                        tokens.push(Token::new(
230                            TokenKind::Atom("#".to_string()),
231                            SourcePos::new(start_line, start_column, start_offset),
232                            SourcePos::new(line, column, offset),
233                        ));
234                    }
235                }
236            }
237
238            ']' => {
239                let consumed = chars.next().unwrap();
240                advance_pos(consumed, &mut line, &mut column, &mut offset);
241                tokens.push(Token::new(
242                    TokenKind::RightBracket,
243                    SourcePos::new(start_line, start_column, start_offset),
244                    SourcePos::new(line, column, offset),
245                ));
246            }
247
248            '\'' => {
249                let consumed = chars.next().unwrap();
250                advance_pos(consumed, &mut line, &mut column, &mut offset);
251                tokens.push(Token::new(
252                    TokenKind::Quote,
253                    SourcePos::new(start_line, start_column, start_offset),
254                    SourcePos::new(line, column, offset),
255                ));
256            }
257
258            '\\' => {
259                // Skip comments - consume everything until newline
260                let consumed = chars.next().unwrap(); // consume the backslash
261                advance_pos(consumed, &mut line, &mut column, &mut offset);
262                for ch in chars.by_ref() {
263                    advance_pos(ch, &mut line, &mut column, &mut offset);
264                    if ch == '\n' {
265                        break;
266                    }
267                }
268                // Continue tokenizing after the comment
269            }
270
271            '|' => {
272                let consumed = chars.next().unwrap();
273                advance_pos(consumed, &mut line, &mut column, &mut offset);
274                tokens.push(Token::new(
275                    TokenKind::Pipe,
276                    SourcePos::new(start_line, start_column, start_offset),
277                    SourcePos::new(line, column, offset),
278                ));
279            }
280
281            '"' => {
282                let consumed = chars.next().unwrap();
283                advance_pos(consumed, &mut line, &mut column, &mut offset);
284                let mut string = String::new();
285                let mut escaped = false;
286
287                for ch in chars.by_ref() {
288                    advance_pos(ch, &mut line, &mut column, &mut offset);
289                    if escaped {
290                        match ch {
291                            'n' => string.push('\n'),
292                            't' => string.push('\t'),
293                            '\\' => string.push('\\'),
294                            '"' => string.push('"'),
295                            _ => {
296                                string.push('\\');
297                                string.push(ch);
298                            }
299                        }
300                        escaped = false;
301                    } else if ch == '\\' {
302                        escaped = true;
303                    } else if ch == '"' {
304                        break;
305                    } else {
306                        string.push(ch);
307                    }
308                }
309
310                tokens.push(Token::new(
311                    TokenKind::String(string),
312                    SourcePos::new(start_line, start_column, start_offset),
313                    SourcePos::new(line, column, offset),
314                ));
315            }
316
317            '+' | '-' if chars.clone().nth(1).is_some_and(|c| c.is_ascii_digit()) => {
318                // Handle signed numbers
319                let mut num_str = String::new();
320                num_str.push(ch);
321                let consumed = chars.next().unwrap();
322                advance_pos(consumed, &mut line, &mut column, &mut offset);
323
324                while let Some(&ch) = chars.peek() {
325                    if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' {
326                        num_str.push(ch);
327                        let consumed = chars.next().unwrap();
328                        advance_pos(consumed, &mut line, &mut column, &mut offset);
329                        // Allow + or - after e/E for scientific notation
330                        if (ch == 'e' || ch == 'E')
331                            && chars.peek().is_some_and(|&c| c == '+' || c == '-')
332                        {
333                            let sign = chars.next().unwrap();
334                            num_str.push(sign);
335                            advance_pos(sign, &mut line, &mut column, &mut offset);
336                        }
337                    } else {
338                        break;
339                    }
340                }
341
342                // RUST CONCEPT: Check for extended number suffixes (same as unsigned numbers)
343                let has_suffix = chars.peek().is_some_and(|&c| {
344                    c == 'n' // BigInt suffix
345                        || c == 'i' // Complex imaginary unit
346                        || c == '/' // Rational fraction
347                        || c == '+' || c == '-' // Complex with separate real/imaginary
348                });
349
350                if has_suffix {
351                    // Continue collecting as an atom-like string (extended number literal)
352                    // Don't break on '.' since we might have decimal complex numbers like "-1.5+2.5i"
353                    while let Some(&ch) = chars.peek() {
354                        if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
355                            break;
356                        }
357                        num_str.push(ch);
358                        let consumed = chars.next().unwrap();
359                        advance_pos(consumed, &mut line, &mut column, &mut offset);
360                    }
361                    tokens.push(Token::new(
362                        classify_atom(num_str),
363                        SourcePos::new(start_line, start_column, start_offset),
364                        SourcePos::new(line, column, offset),
365                    ));
366                } else {
367                    // Regular signed number - check if it's an integer or float based on syntax
368                    let has_decimal = num_str.contains('.') || num_str.contains('e') || num_str.contains('E');
369
370                    if !has_decimal {
371                        // Integer literal - use dedicated Integer token type
372                        tokens.push(Token::new(
373                            TokenKind::Integer(num_str),
374                            SourcePos::new(start_line, start_column, start_offset),
375                            SourcePos::new(line, column, offset),
376                        ));
377                    } else {
378                        // Floating point number
379                        match num_str.parse::<f64>() {
380                            Ok(num) => tokens.push(Token::new(
381                                TokenKind::Number(num),
382                                SourcePos::new(start_line, start_column, start_offset),
383                                SourcePos::new(line, column, offset),
384                            )),
385                            Err(_) => {
386                                // If it's not a valid number, treat it as an atom
387                                // Continue collecting non-whitespace chars
388                                while let Some(&ch) = chars.peek() {
389                                    if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
390                                        break;
391                                    }
392                                    num_str.push(ch);
393                                    let consumed = chars.next().unwrap();
394                                    advance_pos(consumed, &mut line, &mut column, &mut offset);
395                                }
396                                tokens.push(Token::new(
397                                    TokenKind::Atom(num_str),
398                                    SourcePos::new(start_line, start_column, start_offset),
399                                    SourcePos::new(line, column, offset),
400                                ));
401                            }
402                        }
403                    }
404                }
405            }
406
407            '0'..='9' => {
408                let mut num_str = String::new();
409
410                while let Some(&ch) = chars.peek() {
411                    if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' {
412                        num_str.push(ch);
413                        let consumed = chars.next().unwrap();
414                        advance_pos(consumed, &mut line, &mut column, &mut offset);
415                        // Allow + or - after e/E for scientific notation
416                        if (ch == 'e' || ch == 'E')
417                            && chars.peek().is_some_and(|&c| c == '+' || c == '-')
418                        {
419                            let sign = chars.next().unwrap();
420                            num_str.push(sign);
421                            advance_pos(sign, &mut line, &mut column, &mut offset);
422                        }
423                    } else {
424                        break;
425                    }
426                }
427
428                // RUST CONCEPT: Check for extended number suffixes (n, i, /, +, -)
429                // In postfix languages, operators need spaces, so "+"/"-" immediately
430                // after a number can only mean complex number syntax (e.g., 1.5+2.5i)
431                let has_suffix = chars.peek().is_some_and(|&c| {
432                    c == 'n' // BigInt suffix
433                        || c == 'i' // Complex imaginary unit
434                        || c == '/' // Rational fraction
435                        || c == '+' || c == '-' // Complex with separate real/imaginary
436                });
437
438                if has_suffix {
439                    // Continue collecting as an atom-like string (extended number literal)
440                    // Don't break on '.' since we might have decimal complex numbers like "1.5+2.5i"
441                    while let Some(&ch) = chars.peek() {
442                        if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
443                            break;
444                        }
445                        num_str.push(ch);
446                        let consumed = chars.next().unwrap();
447                        advance_pos(consumed, &mut line, &mut column, &mut offset);
448                    }
449                    tokens.push(Token::new(
450                        classify_atom(num_str),
451                        SourcePos::new(start_line, start_column, start_offset),
452                        SourcePos::new(line, column, offset),
453                    ));
454                } else {
455                    // Regular number - check if it's an integer or float based on syntax
456                    // If no decimal point and no scientific notation, treat as integer
457                    let has_decimal = num_str.contains('.') || num_str.contains('e') || num_str.contains('E');
458
459                    if !has_decimal {
460                        // Integer literal - use dedicated Integer token type
461                        tokens.push(Token::new(
462                            TokenKind::Integer(num_str),
463                            SourcePos::new(start_line, start_column, start_offset),
464                            SourcePos::new(line, column, offset),
465                        ));
466                    } else {
467                        // Floating point number
468                        match num_str.parse::<f64>() {
469                            Ok(num) => tokens.push(Token::new(
470                                TokenKind::Number(num),
471                                SourcePos::new(start_line, start_column, start_offset),
472                                SourcePos::new(line, column, offset),
473                            )),
474                            Err(_) => {
475                                // If it's not a valid number, treat it as an atom
476                                // Continue collecting non-whitespace chars
477                                while let Some(&ch) = chars.peek() {
478                                    if ch.is_whitespace() || "[]|\'\"\\\\".contains(ch) {
479                                        break;
480                                    }
481                                    num_str.push(ch);
482                                    let consumed = chars.next().unwrap();
483                                    advance_pos(consumed, &mut line, &mut column, &mut offset);
484                                }
485                                tokens.push(Token::new(
486                                    TokenKind::Atom(num_str),
487                                    SourcePos::new(start_line, start_column, start_offset),
488                                    SourcePos::new(line, column, offset),
489                                ));
490                            }
491                        }
492                    }
493                }
494            }
495
496            _ => {
497                let mut atom = String::new();
498
499                while let Some(&ch) = chars.peek() {
500                    if ch.is_whitespace() || "[]|\\'\"\\\\".contains(ch) {
501                        break;
502                    }
503                    atom.push(ch);
504                    let consumed = chars.next().unwrap();
505                    advance_pos(consumed, &mut line, &mut column, &mut offset);
506                }
507
508                if !atom.is_empty() {
509                    // RUST CONCEPT: Pattern matching on string literals
510                    // Check for special boolean and null literals
511                    let token_kind = match atom.as_str() {
512                        "true" => TokenKind::Boolean(true),
513                        "false" => TokenKind::Boolean(false),
514                        "null" => TokenKind::Null,
515                        _ => TokenKind::Atom(atom),
516                    };
517                    tokens.push(Token::new(
518                        token_kind,
519                        SourcePos::new(start_line, start_column, start_offset),
520                        SourcePos::new(line, column, offset),
521                    ));
522                }
523            }
524        }
525    }
526
527    Ok(tokens)
528}
529
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `kind` is an integer literal whose text equals `digits`.
    fn is_integer(kind: &TokenKind, digits: &str) -> bool {
        matches!(kind, TokenKind::Integer(text) if text == digits)
    }

    /// True when `kind` is an atom named `name`.
    fn is_atom(kind: &TokenKind, name: &str) -> bool {
        matches!(kind, TokenKind::Atom(text) if text == name)
    }

    #[test]
    fn test_tokenize_numbers() {
        // Integer literal, including its reported start position.
        let integer = tokenize("42").unwrap();
        assert_eq!(integer.len(), 1);
        assert!(is_integer(&integer[0].kind, "42"));
        assert_eq!((integer[0].pos.line, integer[0].pos.column), (1, 1));

        // Float literal.
        let float = tokenize("3.14").unwrap();
        assert_eq!(float.len(), 1);
        assert!(matches!(float[0].kind, TokenKind::Number(n) if n == 3.14));

        // Negative integer keeps its sign in the payload.
        let negative = tokenize("-17").unwrap();
        assert_eq!(negative.len(), 1);
        assert!(is_integer(&negative[0].kind, "-17"));
    }

    #[test]
    fn test_tokenize_atoms() {
        for source in ["hello", "+"] {
            let tokens = tokenize(source).unwrap();
            assert_eq!(tokens.len(), 1);
            assert!(is_atom(&tokens[0].kind, source));
        }
    }

    #[test]
    fn test_tokenize_strings() {
        let tokens = tokenize("\"hello world\"").unwrap();
        assert_eq!(tokens.len(), 1);
        assert!(matches!(&tokens[0].kind, TokenKind::String(s) if s == "hello world"));
    }

    #[test]
    fn test_tokenize_brackets() {
        let tokens = tokenize("[1 2 3]").unwrap();
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].kind, TokenKind::LeftBracket);
        for (token, digits) in tokens[1..4].iter().zip(["1", "2", "3"]) {
            assert!(is_integer(&token.kind, digits));
        }
        assert_eq!(tokens[4].kind, TokenKind::RightBracket);
    }

    #[test]
    fn test_tokenize_array_literals() {
        let tokens = tokenize("#[1 2]").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::ArrayLeftBracket);
        for (token, digits) in tokens[1..3].iter().zip(["1", "2"]) {
            assert!(is_integer(&token.kind, digits));
        }
        assert_eq!(tokens[3].kind, TokenKind::RightBracket);
    }

    #[test]
    fn test_tokenize_position_tracking() {
        let tokens = tokenize("hello\nworld").unwrap();
        assert_eq!(tokens.len(), 2);

        // "hello" starts at line 1, column 1; "world" at line 2, column 1.
        let expected = [("hello", 1, 1), ("world", 2, 1)];
        for (token, (name, line, column)) in tokens.iter().zip(expected) {
            assert!(is_atom(&token.kind, name));
            assert_eq!(token.pos.line, line);
            assert_eq!(token.pos.column, column);
        }
    }

    #[test]
    fn test_tokenize_booleans_and_null() {
        let tokens = tokenize("true false null").unwrap();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].kind, TokenKind::Boolean(true));
        assert_eq!(tokens[1].kind, TokenKind::Boolean(false));
        assert_eq!(tokens[2].kind, TokenKind::Null);
    }

    #[test]
    fn test_simple_token_helper() {
        let origin = SourcePos::new(1, 1, 0);
        let token = Token::new(TokenKind::Number(42.0), origin.clone(), origin);
        assert!(matches!(token.kind, TokenKind::Number(n) if n == 42.0));
        assert_eq!((token.pos.line, token.pos.column), (1, 1));
    }
}