xdl_parser/
lexer.rs

1//! XDL Lexer implementation
2
3use nom::{
4    branch::alt,
5    bytes::complete::{tag, take_while, take_while1},
6    character::complete::{char, digit1, none_of},
7    combinator::{map, map_res, opt, recognize, value},
8    multi::many0,
9    sequence::{delimited, pair, preceded},
10    IResult,
11};
12
13/// Skip whitespace but NOT newlines (unlike multispace0)
14/// This is critical for procedure call detection which relies on newline tokens
15fn ws0(input: &str) -> IResult<&str, &str> {
16    take_while(|c: char| c == ' ' || c == '\t' || c == '\r')(input)
17}
18use xdl_core::XdlResult;
19
/// A token together with the source position it was lexed from.
///
/// NOTE(review): nothing in this file constructs a `TokenSpan` — `tokenize`
/// returns plain `Token`s. Presumably a caller attaches positions; confirm
/// there whether `line`/`column` are 0- or 1-based.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenSpan {
    pub token: Token,
    // Source line of the token's first character.
    pub line: usize,
    // Source column of the token's first character.
    pub column: usize,
}
26
/// All token kinds produced by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Literals
    Integer(i64),
    Float(f64),
    String(String),

    // Keywords
    If,
    Then,
    Else,
    Endif,
    For,
    Endfor,
    Foreach,
    While,
    Endwhile,
    Repeat,
    Until,
    Break,
    Continue,
    Function,
    Endfunction,
    // NOTE(review): never emitted by this lexer — both "PROCEDURE" and "PRO"
    // are mapped to `Pro` in parse_identifier_or_keyword.
    Procedure,
    Pro,
    Endpro,
    Return,
    Goto,
    Common,
    CompileOpt,
    Begin,
    End,
    Case,
    Of,
    Endcase,
    Switch,
    Endswitch,

    // Operators
    Plus,           // +
    Minus,          // -
    Multiply,       // *
    Divide,         // /
    Modulo,         // MOD
    Power,          // ^
    MatrixMultiply, // #

    // Assignment
    Assign,         // =
    PlusAssign,     // +=
    MinusAssign,    // -=
    MultiplyAssign, // *=
    DivideAssign,   // /=

    // Comparison (word-form operators in the source language)
    Equal,        // EQ
    NotEqual,     // NE
    Less,         // LT
    Greater,      // GT
    LessEqual,    // LE
    GreaterEqual, // GE

    // Logical
    And, // AND
    Or,  // OR
    Not, // NOT
    Xor, // XOR

    // Bitwise (rarely used but part of IDL)
    // NOTE(review): none of these four are emitted by this lexer — the word
    // forms AND/OR/NOT/XOR map to the logical variants above.
    BitwiseAnd, // AND (bitwise)
    BitwiseOr,  // OR (bitwise)
    BitwiseXor, // XOR (bitwise)
    BitwiseNot, // NOT (bitwise)

    // Delimiters
    LeftParen,    // (
    RightParen,   // )
    LeftBracket,  // [
    RightBracket, // ]
    LeftBrace,    // {
    RightBrace,   // }
    Comma,        // ,
    // NOTE(review): unreachable in practice — parse_token tries parse_comment
    // before parse_delimiter, so ';' always starts a Comment.
    Semicolon,    // ;
    Colon,        // :
    DoubleColon,  // ::
    Dot,          // .
    Arrow,        // ->
    QuestionMark, // ?

    // Special
    Identifier(String),
    SystemVariable(String), // !PI, !X, etc. (name stored upper-cased)
    Label(String),          // label:
    Comment(String),        // ; comment (recognized but dropped by tokenize)
    Newline,                // significant: one per '\n'
    EOF,                    // appended exactly once at end of input
}
124
/// Convenience alias: a nom parse result over `&str` input.
type ParseResult<'a, T> = IResult<&'a str, T>;
126
// True for any character allowed inside an identifier (after the first char).
fn is_identifier_char(c: char) -> bool {
    c == '_' || c.is_alphanumeric()
}
131
// True for characters that may begin an identifier (no digits allowed).
fn is_identifier_start(c: char) -> bool {
    c == '_' || c.is_alphabetic()
}
135
136// Parse integers
137fn parse_integer(input: &str) -> ParseResult<'_, Token> {
138    map_res(digit1, |s: &str| s.parse::<i64>().map(Token::Integer))(input)
139}
140
141// Parse floating point numbers
142fn parse_float(input: &str) -> ParseResult<'_, Token> {
143    map_res(
144        recognize(pair(digit1, pair(char('.'), opt(digit1)))),
145        |s: &str| s.parse::<f64>().map(Token::Float),
146    )(input)
147}
148
149// Parse numbers (float or integer)
150fn parse_number(input: &str) -> ParseResult<'_, Token> {
151    alt((parse_float, parse_integer))(input)
152}
153
154// Parse string literals
155fn parse_string(input: &str) -> ParseResult<'_, Token> {
156    alt((
157        // Double quoted strings
158        delimited(
159            char('"'),
160            map(many0(none_of("\"")), |chars| {
161                Token::String(chars.into_iter().collect())
162            }),
163            char('"'),
164        ),
165        // Single quoted strings
166        delimited(
167            char('\''),
168            map(many0(none_of("'")), |chars| {
169                Token::String(chars.into_iter().collect())
170            }),
171            char('\''),
172        ),
173    ))(input)
174}
175
176// Parse labels (identifier followed by colon, but not :: and not keywords)
177fn parse_label(input: &str) -> ParseResult<'_, Token> {
178    let (remaining, name) = recognize(pair(
179        take_while1(is_identifier_start),
180        take_while(is_identifier_char),
181    ))(input)?;
182
183    // Check for colon after identifier (but not ::)
184    if remaining.starts_with(':') && !remaining.starts_with("::") {
185        // Check if this is a keyword - keywords should not be treated as labels
186        let is_keyword = matches!(
187            name.to_uppercase().as_str(),
188            "IF" | "THEN" | "ELSE" | "ENDIF" | "FOR" | "ENDFOR" | "FOREACH" | "WHILE"
189                | "ENDWHILE" | "REPEAT" | "UNTIL" | "BREAK" | "CONTINUE" | "FUNCTION"
190                | "ENDFUNCTION" | "PROCEDURE" | "PRO" | "ENDPRO" | "RETURN" | "GOTO"
191                | "COMMON" | "COMPILE_OPT" | "BEGIN" | "END" | "CASE" | "OF" | "ENDCASE"
192                | "SWITCH" | "ENDSWITCH" | "MOD" | "EQ" | "NE" | "LT" | "GT" | "LE"
193                | "GE" | "AND" | "OR" | "NOT" | "XOR"
194        );
195
196        if is_keyword {
197            // Not a label, it's a keyword followed by colon
198            Err(nom::Err::Error(nom::error::Error::new(
199                input,
200                nom::error::ErrorKind::Tag,
201            )))
202        } else {
203            // Skip the colon
204            let remaining = &remaining[1..];
205            Ok((remaining, Token::Label(name.to_string())))
206        }
207    } else {
208        // Not a label
209        Err(nom::Err::Error(nom::error::Error::new(
210            input,
211            nom::error::ErrorKind::Tag,
212        )))
213    }
214}
215
216// Parse identifiers and keywords
217fn parse_identifier_or_keyword(input: &str) -> ParseResult<'_, Token> {
218    let (input, name) = recognize(pair(
219        take_while1(is_identifier_start),
220        take_while(is_identifier_char),
221    ))(input)?;
222
223    let token = match name.to_uppercase().as_str() {
224        // Control flow keywords
225        "IF" => Token::If,
226        "THEN" => Token::Then,
227        "ELSE" => Token::Else,
228        "ENDIF" => Token::Endif,
229        "FOR" => Token::For,
230        "ENDFOR" => Token::Endfor,
231        "FOREACH" => Token::Foreach,
232        "WHILE" => Token::While,
233        "ENDWHILE" => Token::Endwhile,
234        "REPEAT" => Token::Repeat,
235        "UNTIL" => Token::Until,
236        "BREAK" => Token::Break,
237        "CONTINUE" => Token::Continue,
238
239        // Function/procedure keywords
240        "FUNCTION" => Token::Function,
241        "ENDFUNCTION" => Token::Endfunction,
242        "PROCEDURE" | "PRO" => Token::Pro,
243        "ENDPRO" => Token::Endpro,
244        "RETURN" => Token::Return,
245        "GOTO" => Token::Goto,
246
247        // Other keywords
248        "COMMON" => Token::Common,
249        "COMPILE_OPT" => Token::CompileOpt,
250        "BEGIN" => Token::Begin,
251        "END" => Token::End,
252        "CASE" => Token::Case,
253        "OF" => Token::Of,
254        "ENDCASE" => Token::Endcase,
255        "SWITCH" => Token::Switch,
256        "ENDSWITCH" => Token::Endswitch,
257
258        // Operators (word forms)
259        "MOD" => Token::Modulo,
260        "EQ" => Token::Equal,
261        "NE" => Token::NotEqual,
262        "LT" => Token::Less,
263        "GT" => Token::Greater,
264        "LE" => Token::LessEqual,
265        "GE" => Token::GreaterEqual,
266        "AND" => Token::And,
267        "OR" => Token::Or,
268        "NOT" => Token::Not,
269        "XOR" => Token::Xor,
270
271        // Regular identifier
272        _ => Token::Identifier(name.to_string()),
273    };
274
275    Ok((input, token))
276}
277
278// Parse system variables (!PI, !X, etc.)
279fn parse_system_variable(input: &str) -> ParseResult<'_, Token> {
280    preceded(
281        char('!'),
282        map(take_while1(is_identifier_char), |s: &str| {
283            Token::SystemVariable(s.to_uppercase())
284        }),
285    )(input)
286}
287
288// Parse comments
289fn parse_comment(input: &str) -> ParseResult<'_, Token> {
290    preceded(
291        char(';'),
292        map(take_while(|c| c != '\n'), |s: &str| {
293            Token::Comment(s.to_string())
294        }),
295    )(input)
296}
297
298// Parse operators
299fn parse_operator(input: &str) -> ParseResult<'_, Token> {
300    alt((
301        value(Token::PlusAssign, tag("+=")),
302        value(Token::MinusAssign, tag("-=")),
303        value(Token::MultiplyAssign, tag("*=")),
304        value(Token::DivideAssign, tag("/=")),
305        value(Token::Arrow, tag("->")),
306        value(Token::MatrixMultiply, char('#')),
307        value(Token::Power, char('^')),
308        value(Token::Plus, char('+')),
309        value(Token::Minus, char('-')),
310        value(Token::Multiply, char('*')),
311        value(Token::Divide, char('/')),
312        value(Token::Assign, char('=')),
313        value(Token::QuestionMark, char('?')),
314    ))(input)
315}
316
317// Parse delimiters
318fn parse_delimiter(input: &str) -> ParseResult<'_, Token> {
319    alt((
320        value(Token::LeftParen, char('(')),
321        value(Token::RightParen, char(')')),
322        value(Token::LeftBracket, char('[')),
323        value(Token::RightBracket, char(']')),
324        value(Token::LeftBrace, char('{')),
325        value(Token::RightBrace, char('}')),
326        value(Token::Comma, char(',')),
327        value(Token::Semicolon, char(';')),
328        value(Token::DoubleColon, tag("::")),
329        value(Token::Colon, char(':')),
330        value(Token::Dot, char('.')),
331    ))(input)
332}
333
334// Parse a single token
335fn parse_token(input: &str) -> ParseResult<'_, Token> {
336    preceded(
337        ws0, // Use ws0 instead of multispace0 to preserve newlines as tokens
338        alt((
339            parse_comment,
340            parse_string,
341            parse_number,
342            parse_system_variable,
343            parse_label, // Try label before identifier (label: vs identifier)
344            parse_identifier_or_keyword,
345            parse_operator,
346            parse_delimiter,
347            value(Token::Newline, char('\n')),
348        )),
349    )(input)
350}
351
352// Main tokenizer function
353pub fn tokenize(input: &str) -> XdlResult<Vec<Token>> {
354    let mut remaining = input;
355    let mut tokens = Vec::new();
356
357    while !remaining.is_empty() {
358        // Handle line continuation: $ followed by optional whitespace and newline
359        if remaining.starts_with('$') {
360            let after_dollar = &remaining[1..];
361            // Skip whitespace after $
362            let trimmed = after_dollar.trim_start_matches([' ', '\t', '\r']);
363            // If followed by newline or end of input, it's a line continuation
364            if trimmed.is_empty() || trimmed.starts_with('\n') {
365                if let Some(stripped) = trimmed.strip_prefix('\n') {
366                    remaining = stripped;
367                } else {
368                    remaining = trimmed;
369                }
370                continue;
371            }
372            // Otherwise, skip the $ as unknown character
373            remaining = after_dollar;
374            continue;
375        }
376
377        match parse_token(remaining) {
378            Ok((rest, token)) => {
379                // Skip comments for now, but keep them for potential use
380                match token {
381                    Token::Comment(_) => {} // Skip comments
382                    _ => tokens.push(token),
383                }
384                remaining = rest;
385            }
386            Err(_) => {
387                // Skip unknown characters
388                remaining = &remaining[1..];
389            }
390        }
391    }
392
393    tokens.push(Token::EOF);
394    Ok(tokens)
395}
396
397#[cfg(test)]
398mod tests {
399    use super::*;
400
401    #[test]
402    fn test_tokenize_simple() {
403        let input = "x = 42";
404        let tokens = tokenize(input).unwrap();
405        assert_eq!(
406            tokens,
407            vec![
408                Token::Identifier("x".to_string()),
409                Token::Assign,
410                Token::Integer(42),
411                Token::EOF
412            ]
413        );
414    }
415
416    #[test]
417    fn test_tokenize_string() {
418        let input = r#"print, "Hello, World!""#;
419        let tokens = tokenize(input).unwrap();
420        assert_eq!(
421            tokens,
422            vec![
423                Token::Identifier("print".to_string()),
424                Token::Comma,
425                Token::String("Hello, World!".to_string()),
426                Token::EOF
427            ]
428        );
429    }
430
431    #[test]
432    fn test_tokenize_keywords() {
433        let input = "if x eq 42 then";
434        let tokens = tokenize(input).unwrap();
435        assert_eq!(
436            tokens,
437            vec![
438                Token::If,
439                Token::Identifier("x".to_string()),
440                Token::Equal,
441                Token::Integer(42),
442                Token::Then,
443                Token::EOF
444            ]
445        );
446    }
447
448    #[test]
449    fn test_tokenize_system_variable() {
450        let input = "!PI";
451        let tokens = tokenize(input).unwrap();
452        assert_eq!(
453            tokens,
454            vec![Token::SystemVariable("PI".to_string()), Token::EOF]
455        );
456    }
457}