Skip to main content

cobble/parser/
tokenizer.rs

/// A lexical token produced by the indentation-aware tokenizer.
///
/// Every payload is kept as a `String`, which is what allows the enum to derive
/// `Eq` and `Hash` (a numeric payload such as `f64` would not).
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token {
    // ---- Keywords -------------------------------------------------------
    /// `import`
    Import,
    /// `from`
    From,
    /// `def`
    Def,
    /// `if`
    If,
    /// `elif`
    Elif,
    /// `else`
    Else,
    /// `for`
    For,
    /// `while`
    While,
    /// `return`
    Return,
    /// `pass`
    Pass,
    /// `in`
    In,
    /// `global`
    Global,
    /// `as`
    As,
    /// `at`
    At,
    /// `asat`
    Asat,
    /// `and`
    And,
    /// `or`
    Or,
    /// `not`
    Not,
    /// `unless`
    Unless,
    /// `match`
    Match,
    /// `case`
    Case,
    /// `const`
    Const,
    /// `define`
    Define,
    /// `create`
    Create,
    /// `end`
    End,
    /// `to`
    To,
    /// `by`
    By,
    /// A lone `_`.
    Underscore,

    // ---- Literals -------------------------------------------------------
    /// Numeric literal, kept as its source text to avoid `f64` `Eq`/`Hash` issues.
    Number(String),
    /// Contents of a quoted string literal, without the surrounding quotes.
    String(String),
    /// `True`
    True_,
    /// `False`
    False_,
    /// `None`
    None_,

    // ---- Identifiers ----------------------------------------------------
    /// A name. The tokenizer also uses this variant for namespaced names
    /// (`minecraft:stone`), selectors (`@a[...]`) and coordinates (`~1`, `^2`).
    Ident(String),

    // ---- Symbols --------------------------------------------------------
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `:`
    Colon,
    /// `;`
    SemiColon,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `=`
    Equals,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `^`
    Caret,

    // ---- Comparison -----------------------------------------------------
    /// `==`
    EqEq,
    /// `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,

    // ---- Special --------------------------------------------------------
    /// Text of a raw Minecraft command; the leading `/` is not stored.
    MinecraftCommand(String),
    /// End of a logical line.
    Newline,
    /// Indentation increased relative to the enclosing block.
    Indent,
    /// One indentation level was closed.
    Dedent,
    /// End of input.
    Eof,
}
78
79impl std::fmt::Display for Token {
80    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
81        match self {
82            Token::Ident(s) => write!(f, "{}", s),
83            Token::String(s) => write!(
84                f,
85                "{}",
86                serde_json::to_string(s).unwrap_or_else(|_| format!("\"{}\"", s))
87            ),
88            Token::Number(n) => write!(f, "{}", n),
89            Token::MinecraftCommand(s) => write!(f, "/{}", s),
90            Token::Dot => write!(f, "."),
91            Token::Colon => write!(f, ":"),
92            Token::SemiColon => write!(f, ";"),
93            Token::Comma => write!(f, ","),
94            Token::LParen => write!(f, "("),
95            Token::RParen => write!(f, ")"),
96            Token::LBracket => write!(f, "["),
97            Token::RBracket => write!(f, "]"),
98            Token::LBrace => write!(f, "{{"),
99            Token::RBrace => write!(f, "}}"),
100            Token::Plus => write!(f, "+"),
101            Token::Minus => write!(f, "-"),
102            Token::Star => write!(f, "*"),
103            Token::Slash => write!(f, "/"),
104            Token::Percent => write!(f, "%"),
105            Token::Caret => write!(f, "^"),
106            Token::Equals => write!(f, "="),
107            Token::EqEq => write!(f, "=="),
108            Token::NotEq => write!(f, "!="),
109            Token::Lt => write!(f, "<"),
110            Token::LtEq => write!(f, "<="),
111            Token::Gt => write!(f, ">"),
112            Token::GtEq => write!(f, ">="),
113            // Keywords - must be lowercase for Minecraft compatibility
114            Token::If => write!(f, "if"),
115            Token::Unless => write!(f, "unless"),
116            Token::As => write!(f, "as"),
117            Token::At => write!(f, "at"),
118            Token::And => write!(f, "and"),
119            Token::Or => write!(f, "or"),
120            Token::Not => write!(f, "not"),
121            Token::In => write!(f, "in"),
122            Token::For => write!(f, "for"),
123            Token::While => write!(f, "while"),
124            Token::Elif => write!(f, "elif"),
125            Token::Else => write!(f, "else"),
126            Token::Def => write!(f, "def"),
127            Token::Return => write!(f, "return"),
128            Token::Pass => write!(f, "pass"),
129            Token::Global => write!(f, "global"),
130            Token::Import => write!(f, "import"),
131            Token::From => write!(f, "from"),
132            Token::Asat => write!(f, "asat"),
133            Token::Match => write!(f, "match"),
134            Token::Case => write!(f, "case"),
135            Token::Const => write!(f, "const"),
136            Token::Define => write!(f, "define"),
137            Token::Create => write!(f, "create"),
138            Token::End => write!(f, "end"),
139            Token::To => write!(f, "to"),
140            Token::By => write!(f, "by"),
141            Token::Underscore => write!(f, "_"),
142            Token::True_ => write!(f, "True"),
143            Token::False_ => write!(f, "False"),
144            Token::None_ => write!(f, "None"),
145            _ => write!(f, "{:?}", self),
146        }
147    }
148}
149
150/// Manual tokenizer that handles indentation
151pub fn tokenize(source: &str) -> Result<Vec<Token>, String> {
152    let mut tokens = Vec::new();
153    let mut indent_stack: Vec<usize> = vec![0];
154    let mut paren_depth = 0;
155
156    for (line_idx, line) in source.lines().enumerate() {
157        // Skip empty lines and comments
158        let trimmed = line.trim();
159        if trimmed.is_empty() || trimmed.starts_with('#') {
160            continue;
161        }
162
163        // Only handle indentation if we are not inside parentheses/brackets/braces
164        if paren_depth == 0 {
165            // Calculate indentation
166            let indent_level = line.len() - line.trim_start().len();
167            let current_indent = *indent_stack.last().unwrap();
168
169            // Handle indentation changes
170            if indent_level > current_indent {
171                indent_stack.push(indent_level);
172                tokens.push(Token::Indent);
173            } else if indent_level < current_indent {
174                while indent_stack.len() > 1 && *indent_stack.last().unwrap() > indent_level {
175                    indent_stack.pop();
176                    tokens.push(Token::Dedent);
177                }
178                if *indent_stack.last().unwrap() != indent_level {
179                    return Err(format!("Indentation error at line {}", line_idx + 1));
180                }
181            }
182        }
183
184        // Tokenize the line content
185        let line_content = line.trim();
186        tokenize_line(line_content, &mut tokens, &mut paren_depth)?;
187
188        // Only emit Newline if not inside parentheses/brackets/braces
189        if paren_depth == 0 {
190            tokens.push(Token::Newline);
191        }
192    }
193
194    // Add remaining dedents
195    while indent_stack.len() > 1 {
196        indent_stack.pop();
197        tokens.push(Token::Dedent);
198    }
199
200    tokens.push(Token::Eof);
201    Ok(tokens)
202}
203
204/// Check if the minus sign should be treated as a binary operator
205/// based on the previous token context
206fn should_be_binary_minus(tokens: &[Token]) -> bool {
207    // If previous token is one of these, minus is a binary operator:
208    // Number, Ident, RParen, RBracket, True_, False_, None_
209    if let Some(last_token) = tokens.last() {
210        matches!(
211            last_token,
212            Token::Number(_)
213                | Token::Ident(_)
214                | Token::RParen
215                | Token::RBracket
216                | Token::True_
217                | Token::False_
218                | Token::None_
219        )
220    } else {
221        // At start of line or after operators/keywords, it's unary
222        false
223    }
224}
225
226/// Check if the caret should be treated as a power operator
227/// based on the previous token context
228fn should_be_power_operator(tokens: &[Token]) -> bool {
229    // Similar to should_be_binary_minus - if previous token can be an operand,
230    // then ^ is the power operator, not a coordinate marker
231    if let Some(last_token) = tokens.last() {
232        matches!(
233            last_token,
234            Token::Number(_)
235                | Token::Ident(_)
236                | Token::RParen
237                | Token::RBracket
238                | Token::True_
239                | Token::False_
240                | Token::None_
241        )
242    } else {
243        false
244    }
245}
246
247/// Tokenize a single line
248fn tokenize_line(line: &str, tokens: &mut Vec<Token>, paren_depth: &mut i32) -> Result<(), String> {
249    let mut chars = line.chars().peekable();
250
251    while let Some(&ch) = chars.peek() {
252        match ch {
253            ' ' | '\t' => {
254                chars.next();
255            }
256            '/' => {
257                // Check if this is a Minecraft command (starts with / followed by letter)
258                // or a division operator
259                chars.next();
260                if let Some(&next_ch) = chars.peek() {
261                    // Minecraft command only if followed immediately by a letter (no space)
262                    if next_ch.is_alphabetic() {
263                        // Minecraft command - consume rest of line
264                        let mut cmd: String = chars.collect();
265                        cmd = strip_minecraft_inline_comment(&cmd).to_string();
266                        cmd = cmd.trim_end().to_string();
267
268                        tokens.push(Token::MinecraftCommand(cmd));
269                        break;
270                    } else {
271                        // Division operator or other use
272                        tokens.push(Token::Slash);
273                    }
274                } else {
275                    // End of line after /, treat as Slash
276                    tokens.push(Token::Slash);
277                }
278            }
279            '"' | '\'' => {
280                // String literal
281                let quote = chars.next().unwrap();
282                let mut s = String::new();
283                let mut escaped = false;
284                for ch in chars.by_ref() {
285                    if escaped {
286                        s.push(ch);
287                        escaped = false;
288                    } else if ch == '\\' {
289                        escaped = true;
290                    } else if ch == quote {
291                        break;
292                    } else {
293                        s.push(ch);
294                    }
295                }
296                tokens.push(Token::String(s));
297            }
298            '0'..='9' => {
299                // Number
300                let mut num = String::new();
301                while let Some(&ch) = chars.peek() {
302                    if ch.is_ascii_digit() {
303                        num.push(chars.next().unwrap());
304                    } else if ch == '.' {
305                        // Check if this is a range operator (..)
306                        let mut temp_chars = chars.clone();
307                        temp_chars.next(); // skip first dot
308                        if let Some(&next_ch) = temp_chars.peek() {
309                            if next_ch == '.' {
310                                // This is "..", stop parsing number
311                                break;
312                            }
313                        }
314                        // Single dot, part of decimal number
315                        num.push(chars.next().unwrap());
316                    } else {
317                        break;
318                    }
319                }
320                // Validate that the number can be parsed
321                if num.parse::<f64>().is_err() {
322                    return Err(format!(
323                        "Invalid number literal: '{}' at line {}",
324                        num, line
325                    ));
326                }
327                tokens.push(Token::Number(num));
328            }
329            'a'..='z' | 'A'..='Z' | '_' => {
330                // Identifier or keyword (may include namespace like minecraft:stone)
331                let mut ident = String::new();
332                while let Some(&ch) = chars.peek() {
333                    if ch.is_alphanumeric() || ch == '_' {
334                        ident.push(chars.next().unwrap());
335                    } else if ch == ':' {
336                        // Check if this is a namespace separator (followed by identifier)
337                        let mut temp_chars = chars.clone();
338                        temp_chars.next(); // skip the colon
339                        if let Some(&next_ch) = temp_chars.peek() {
340                            if next_ch.is_alphabetic() || next_ch == '_' {
341                                // This is a namespace separator
342                                ident.push(chars.next().unwrap()); // add the colon
343                                continue;
344                            }
345                        }
346                        // Not a namespace separator, stop here
347                        break;
348                    } else {
349                        break;
350                    }
351                }
352                let token = match ident.as_str() {
353                    "import" => Token::Import,
354                    "from" => Token::From,
355                    "def" => Token::Def,
356                    "if" => Token::If,
357                    "elif" => Token::Elif,
358                    "else" => Token::Else,
359                    "for" => Token::For,
360                    "while" => Token::While,
361                    "return" => Token::Return,
362                    "pass" => Token::Pass,
363                    "in" => Token::In,
364                    "global" => Token::Global,
365                    "as" => Token::As,
366                    "at" => Token::At,
367                    "asat" => Token::Asat,
368                    "and" => Token::And,
369                    "or" => Token::Or,
370                    "not" => Token::Not,
371                    "unless" => Token::Unless,
372                    "match" => Token::Match,
373                    "case" => Token::Case,
374                    "const" => Token::Const,
375                    "define" => Token::Define,
376                    "create" => Token::Create,
377                    "end" => Token::End,
378                    "to" => Token::To,
379                    "by" => Token::By,
380                    "_" => Token::Underscore,
381                    "True" => Token::True_,
382                    "False" => Token::False_,
383                    "None" => Token::None_,
384                    _ => Token::Ident(ident),
385                };
386                tokens.push(token);
387            }
388            '@' => {
389                // Selector (e.g., @a, @p, @s, @e[...], @Player)
390                let mut selector = String::new();
391                selector.push(chars.next().unwrap()); // @
392                                                      // Collect all alphanumeric characters (for @Player, @Boss, etc.)
393                while let Some(&ch) = chars.peek() {
394                    if ch.is_alphanumeric() || ch == '_' {
395                        selector.push(chars.next().unwrap());
396                    } else {
397                        break;
398                    }
399                }
400                // Handle selector arguments
401                if chars.peek() == Some(&'[') {
402                    let mut bracket_depth = 0;
403                    while let Some(ch) = chars.peek() {
404                        selector.push(*ch);
405                        if *ch == '[' {
406                            bracket_depth += 1;
407                        } else if *ch == ']' {
408                            bracket_depth -= 1;
409                            chars.next();
410                            if bracket_depth == 0 {
411                                break;
412                            }
413                            continue;
414                        }
415                        chars.next();
416                    }
417                }
418                tokens.push(Token::Ident(selector));
419            }
420            '~' => {
421                // Coordinate marker
422                let mut coord = String::new();
423                coord.push(chars.next().unwrap());
424                while let Some(&ch) = chars.peek() {
425                    if ch.is_ascii_digit() || ch == '.' || ch == '-' {
426                        coord.push(chars.next().unwrap());
427                    } else {
428                        break;
429                    }
430                }
431                tokens.push(Token::Ident(coord));
432            }
433            '^' => {
434                chars.next();
435                // Context-aware: check if it's a coordinate (^number) or power operator (^)
436                // If previous token suggests binary operator context, it's power operator
437                if should_be_power_operator(tokens) {
438                    // It's a power operator
439                    tokens.push(Token::Caret);
440                } else if let Some(&ch) = chars.peek() {
441                    if ch.is_ascii_digit() || ch == '.' || ch == '-' {
442                        // It's a coordinate marker (in execute commands)
443                        let mut coord = String::from("^");
444                        while let Some(&ch) = chars.peek() {
445                            if ch.is_ascii_digit() || ch == '.' || ch == '-' {
446                                coord.push(chars.next().unwrap());
447                            } else {
448                                break;
449                            }
450                        }
451                        tokens.push(Token::Ident(coord));
452                    } else {
453                        // It's a power operator
454                        tokens.push(Token::Caret);
455                    }
456                } else {
457                    // End of input, it's a power operator
458                    tokens.push(Token::Caret);
459                }
460            }
461            '=' => {
462                chars.next();
463                if chars.peek() == Some(&'=') {
464                    chars.next();
465                    tokens.push(Token::EqEq);
466                } else {
467                    tokens.push(Token::Equals);
468                }
469            }
470            '!' => {
471                chars.next();
472                if chars.peek() == Some(&'=') {
473                    chars.next();
474                    tokens.push(Token::NotEq);
475                } else {
476                    return Err("Unexpected '!' character".to_string());
477                }
478            }
479            '<' => {
480                chars.next();
481                if chars.peek() == Some(&'=') {
482                    chars.next();
483                    tokens.push(Token::LtEq);
484                } else {
485                    tokens.push(Token::Lt);
486                }
487            }
488            '>' => {
489                chars.next();
490                if chars.peek() == Some(&'=') {
491                    chars.next();
492                    tokens.push(Token::GtEq);
493                } else {
494                    tokens.push(Token::Gt);
495                }
496            }
497            '(' => {
498                chars.next();
499                tokens.push(Token::LParen);
500                *paren_depth += 1;
501            }
502            ')' => {
503                chars.next();
504                tokens.push(Token::RParen);
505                *paren_depth -= 1;
506            }
507            '[' => {
508                chars.next();
509                tokens.push(Token::LBracket);
510                *paren_depth += 1;
511            }
512            ']' => {
513                chars.next();
514                tokens.push(Token::RBracket);
515                *paren_depth -= 1;
516            }
517            ':' => {
518                chars.next();
519                tokens.push(Token::Colon);
520            }
521            ';' => {
522                chars.next();
523                tokens.push(Token::SemiColon);
524            }
525            ',' => {
526                chars.next();
527                tokens.push(Token::Comma);
528            }
529            '.' => {
530                chars.next();
531                tokens.push(Token::Dot);
532            }
533            '+' => {
534                chars.next();
535                tokens.push(Token::Plus);
536            }
537            '-' => {
538                chars.next();
539                // Context-aware parsing: check if this should be binary minus or unary negative
540                if let Some(&next_ch) = chars.peek() {
541                    // Only treat as negative number if:
542                    // 1. Next char is a digit
543                    // 2. Previous token suggests unary context (not a binary operator context)
544                    if next_ch.is_ascii_digit() && !should_be_binary_minus(tokens) {
545                        let mut num = String::from("-");
546                        while let Some(&ch) = chars.peek() {
547                            if ch.is_ascii_digit() {
548                                num.push(chars.next().unwrap());
549                            } else if ch == '.' {
550                                // Check if this is a range operator (..)
551                                let mut temp_chars = chars.clone();
552                                temp_chars.next(); // skip first dot
553                                if let Some(&next_ch) = temp_chars.peek() {
554                                    if next_ch == '.' {
555                                        // This is "..", stop parsing number
556                                        break;
557                                    }
558                                }
559                                // Single dot, part of decimal number
560                                num.push(chars.next().unwrap());
561                            } else {
562                                break;
563                            }
564                        }
565                        // Validate that the number can be parsed
566                        if num.parse::<f64>().is_err() {
567                            return Err(format!(
568                                "Invalid number literal: '{}' at line {}",
569                                num, line
570                            ));
571                        }
572                        tokens.push(Token::Number(num));
573                    } else {
574                        // Binary minus operator
575                        tokens.push(Token::Minus);
576                    }
577                } else {
578                    tokens.push(Token::Minus);
579                }
580            }
581            '*' => {
582                chars.next();
583                tokens.push(Token::Star);
584            }
585            '%' => {
586                chars.next();
587                tokens.push(Token::Percent);
588            }
589            '{' => {
590                chars.next();
591                tokens.push(Token::LBrace);
592                *paren_depth += 1;
593            }
594            '}' => {
595                chars.next();
596                tokens.push(Token::RBrace);
597                *paren_depth -= 1;
598            }
599            '#' => {
600                // Comment - ignore rest of line
601                break;
602            }
603            _ => {
604                return Err(format!("Unexpected character: {}", ch));
605            }
606        }
607    }
608
609    Ok(())
610}
611
/// Cuts a trailing `# comment` off the text of a Minecraft command.
///
/// A `#` starts a comment only when it is outside quotes, not preceded by a
/// backslash, sits at the start of the text or after whitespace, and is
/// followed by whitespace or the end of the text. This leaves things like
/// `#tag` and `a#b` alone. The returned slice has trailing whitespace removed
/// when a comment was cut; otherwise `command` is returned unchanged.
fn strip_minecraft_inline_comment(command: &str) -> &str {
    let mut open_quote: Option<char> = None;
    let mut skip_next = false;
    // The character seen on the previous iteration, whatever state it was in.
    let mut previous: Option<char> = None;
    let mut cursor = command.char_indices().peekable();

    while let Some((offset, current)) = cursor.next() {
        let before = previous.replace(current);

        // The character after a backslash is never interpreted.
        if skip_next {
            skip_next = false;
            continue;
        }

        match (current, open_quote) {
            ('\\', _) => skip_next = true,
            // Inside quotes only the matching quote character is significant.
            (c, Some(quote)) if c == quote => open_quote = None,
            (_, Some(_)) => {}
            ('"', None) | ('\'', None) => open_quote = Some(current),
            ('#', None) => {
                let space_before = before.map_or(true, char::is_whitespace);
                let space_after = cursor
                    .peek()
                    .map_or(true, |&(_, next)| next.is_whitespace());
                if space_before && space_after {
                    return command[..offset].trim_end();
                }
            }
            _ => {}
        }
    }

    command
}
657}