// variable_core/lexer.rs
/// A source position: absolute byte `offset` plus 1-based `line` and
/// `column` for human-readable diagnostics.
///
/// All fields are `usize`, so the struct is trivially `Copy`, `Eq`, and
/// `Hash` in addition to the original derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Span {
    pub offset: usize,
    pub line: usize,
    pub column: usize,
}
7
/// A lexed token together with the span of its first byte in the input.
///
/// Note: cannot derive `Eq` because `Token::NumberLit` holds an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    pub token: Token,
    pub span: Span,
}
13
/// The token alphabet of the language: block keywords, built-in type
/// names, literals, punctuation, and identifiers.
///
/// All numbers are lexed as `f64` (`NumberLit`); integer/float
/// distinction is deferred to a later phase.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Feature,
    Struct,

    // Type keywords
    BooleanType,
    IntegerType,
    FloatType,
    StringType,

    // Literals
    BoolLit(bool),
    NumberLit(f64),
    StringLit(String),

    // Symbols
    LBrace,
    RBrace,
    Equals,
    Colon,

    // Identifiers
    Ident(String),
}
40
41#[derive(Debug, Clone, PartialEq)]
42pub struct LexError {
43    pub message: String,
44    pub span: Span,
45}
46
47fn compute_span(full_input: &str, offset: usize) -> Span {
48    let consumed = &full_input[..offset];
49    let line = consumed.chars().filter(|&c| c == '\n').count() + 1;
50    let column = match consumed.rfind('\n') {
51        Some(pos) => offset - pos,
52        None => offset + 1,
53    };
54    Span {
55        offset,
56        line,
57        column,
58    }
59}
60
/// Advance past any run of ASCII whitespace (space, tab, LF, CR)
/// starting at `pos`, returning the index of the first byte that is not
/// whitespace (or the input length if only whitespace remains).
///
/// Deliberately matches exactly these four bytes — e.g. form feed is
/// NOT skipped.
fn skip_whitespace(input: &str, pos: usize) -> usize {
    pos + input
        .bytes()
        .skip(pos)
        .take_while(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .count()
}
71
/// Scan a word (`[A-Za-z0-9_]*`) starting at `pos`.
///
/// Returns the end index and the matched slice; the slice is empty when
/// the byte at `pos` is not a word character.
fn lex_word(input: &str, pos: usize) -> (usize, &str) {
    let len = input
        .bytes()
        .skip(pos)
        .take_while(|&b| b.is_ascii_alphanumeric() || b == b'_')
        .count();
    (pos + len, &input[pos..pos + len])
}
81
/// Count consecutive ASCII digits in `bytes` starting at `from`.
fn count_digits(bytes: &[u8], from: usize) -> usize {
    bytes[from..].iter().take_while(|b| b.is_ascii_digit()).count()
}

/// Lex a number literal (`-?[0-9]+(\.[0-9]+)?`) starting at `pos`.
///
/// Returns the end index and the parsed `f64`, or an error message when
/// the digits required before/after the decimal point are missing.
fn lex_number_token(input: &str, pos: usize) -> Result<(usize, f64), String> {
    let bytes = input.as_bytes();
    let mut end = pos;

    // Optional leading minus sign.
    if bytes.get(end) == Some(&b'-') {
        end += 1;
    }

    // Integer part: at least one digit is mandatory.
    let int_digits = count_digits(bytes, end);
    if int_digits == 0 {
        return Err("expected digit".to_string());
    }
    end += int_digits;

    // Optional fractional part: '.' must be followed by at least one digit.
    if bytes.get(end) == Some(&b'.') {
        end += 1;
        let frac_digits = count_digits(bytes, end);
        if frac_digits == 0 {
            return Err("expected digit after decimal point".to_string());
        }
        end += frac_digits;
    }

    let value: f64 = input[pos..end].parse().map_err(|e| format!("{}", e))?;
    Ok((end, value))
}
113
/// Lex a double-quoted string literal whose opening quote is at `pos`.
///
/// Supported escapes: `\n`, `\t`, `\\`, `\"`. Any other escape is kept
/// verbatim as a backslash followed by the escaped character.
///
/// Returns the index just past the closing quote plus the decoded
/// string, or `Err("unterminated string")` when input runs out first.
fn lex_string_token(input: &str, pos: usize) -> Result<(usize, String), String> {
    // pos should be at the opening quote
    let mut i = pos + 1; // skip opening "
    let mut result = String::new();
    let bytes = input.as_bytes();

    loop {
        if i >= bytes.len() {
            return Err("unterminated string".to_string());
        }
        match bytes[i] {
            b'"' => {
                return Ok((i + 1, result));
            }
            b'\\' => {
                i += 1;
                if i >= bytes.len() {
                    return Err("unterminated string".to_string());
                }
                match bytes[i] {
                    b'n' => {
                        result.push('\n');
                        i += 1;
                    }
                    b't' => {
                        result.push('\t');
                        i += 1;
                    }
                    b'\\' => {
                        result.push('\\');
                        i += 1;
                    }
                    b'"' => {
                        result.push('"');
                        i += 1;
                    }
                    _ => {
                        // Unknown escape: keep it verbatim. Decode the whole
                        // character — it may be multi-byte UTF-8. The old code
                        // did `bytes[i] as char` and `i += 1`, which left `i`
                        // inside a multi-byte character; the next iteration
                        // then panicked slicing `input[i..]` off a boundary.
                        let ch = input[i..].chars().next().unwrap();
                        result.push('\\');
                        result.push(ch);
                        i += ch.len_utf8();
                    }
                }
            }
            _ => {
                // Copy an arbitrary (possibly multi-byte) character as-is.
                let ch = input[i..].chars().next().unwrap();
                result.push(ch);
                i += ch.len_utf8();
            }
        }
    }
}
154
155pub fn lex(input: &str) -> Result<Vec<SpannedToken>, LexError> {
156    let mut tokens = Vec::new();
157    let mut pos = 0;
158
159    loop {
160        pos = skip_whitespace(input, pos);
161        if pos >= input.len() {
162            break;
163        }
164
165        let span = compute_span(input, pos);
166        let byte = input.as_bytes()[pos];
167
168        match byte {
169            b'{' => {
170                tokens.push(SpannedToken {
171                    token: Token::LBrace,
172                    span,
173                });
174                pos += 1;
175            }
176            b'}' => {
177                tokens.push(SpannedToken {
178                    token: Token::RBrace,
179                    span,
180                });
181                pos += 1;
182            }
183            b'=' => {
184                tokens.push(SpannedToken {
185                    token: Token::Equals,
186                    span,
187                });
188                pos += 1;
189            }
190            b':' => {
191                tokens.push(SpannedToken {
192                    token: Token::Colon,
193                    span,
194                });
195                pos += 1;
196            }
197            b'"' => match lex_string_token(input, pos) {
198                Ok((new_pos, s)) => {
199                    tokens.push(SpannedToken {
200                        token: Token::StringLit(s),
201                        span,
202                    });
203                    pos = new_pos;
204                }
205                Err(msg) => {
206                    return Err(LexError { message: msg, span });
207                }
208            },
209            b'0'..=b'9' => match lex_number_token(input, pos) {
210                Ok((new_pos, n)) => {
211                    tokens.push(SpannedToken {
212                        token: Token::NumberLit(n),
213                        span,
214                    });
215                    pos = new_pos;
216                }
217                Err(msg) => {
218                    return Err(LexError { message: msg, span });
219                }
220            },
221            c if c.is_ascii_alphabetic() || c == b'_' => {
222                let (new_pos, word) = lex_word(input, pos);
223                let token = match word {
224                    "Feature" => Token::Feature,
225                    "Struct" => Token::Struct,
226                    "Boolean" => Token::BooleanType,
227                    "Integer" => Token::IntegerType,
228                    "Float" => Token::FloatType,
229                    "String" => Token::StringType,
230                    "true" => Token::BoolLit(true),
231                    "false" => Token::BoolLit(false),
232                    _ => Token::Ident(word.to_string()),
233                };
234                tokens.push(SpannedToken { token, span });
235                pos = new_pos;
236            }
237            _ => {
238                return Err(LexError {
239                    message: format!("unexpected character: {:?}", byte as char),
240                    span,
241                });
242            }
243        }
244    }
245
246    Ok(tokens)
247}
248
/// Unit tests for the lexer: keywords, literals, spans, and error paths.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lex_feature_keyword() {
        let tokens = lex("Feature").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::Feature);
        // First token of the input sits at line 1, column 1.
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
    }

    #[test]
    fn lex_type_keywords() {
        let tokens = lex("Boolean Integer Float String").unwrap();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].token, Token::BooleanType);
        assert_eq!(tokens[1].token, Token::IntegerType);
        assert_eq!(tokens[2].token, Token::FloatType);
        assert_eq!(tokens[3].token, Token::StringType);
    }

    #[test]
    fn lex_bool_literals() {
        let tokens = lex("true false").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::BoolLit(true));
        assert_eq!(tokens[1].token, Token::BoolLit(false));
    }

    #[test]
    fn lex_number_literals() {
        let tokens = lex("42 3.14").unwrap();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].token, Token::NumberLit(42.0));
        assert_eq!(tokens[1].token, Token::NumberLit(3.14));
    }

    #[test]
    fn lex_string_literal() {
        let tokens = lex(r#""hello""#).unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::StringLit("hello".to_string()));
    }

    #[test]
    fn lex_string_with_escapes() {
        // The source escape `\n` must decode to a real newline.
        let tokens = lex(r#""hello\nworld""#).unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(
            tokens[0].token,
            Token::StringLit("hello\nworld".to_string())
        );
    }

    #[test]
    fn lex_complete_feature_block() {
        let input = r#"1: Feature Checkout = {
    1: enabled Boolean = true
    2: max_items Integer = 50
    3: header_text String = "Complete your purchase"
}"#;
        let tokens = lex(input).unwrap();
        // Header "1: Feature Checkout = {" -> 6 tokens
        // + 3 field lines, each "id : name Type = value" -> 3 * 6 = 18
        // + closing "}" -> 1; total 6 + 18 + 1 = 25
        assert_eq!(tokens.len(), 25);
        assert_eq!(tokens[0].token, Token::NumberLit(1.0));
        assert_eq!(tokens[1].token, Token::Colon);
        assert_eq!(tokens[2].token, Token::Feature);
        assert_eq!(tokens[3].token, Token::Ident("Checkout".to_string()));
        assert_eq!(tokens[4].token, Token::Equals);
        assert_eq!(tokens[5].token, Token::LBrace);
        assert_eq!(tokens[6].token, Token::NumberLit(1.0));
        assert_eq!(tokens[7].token, Token::Colon);
        assert_eq!(tokens[8].token, Token::Ident("enabled".to_string()));
        assert_eq!(tokens[9].token, Token::BooleanType);
        assert_eq!(tokens[10].token, Token::Equals);
        assert_eq!(tokens[11].token, Token::BoolLit(true));
        assert_eq!(tokens[24].token, Token::RBrace);
    }

    #[test]
    fn lex_error_unterminated_string() {
        let result = lex(r#""hello"#);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.message, "unterminated string");
    }

    #[test]
    fn lex_error_invalid_character() {
        let result = lex("@");
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.message.contains("unexpected character"));
    }

    #[test]
    fn lex_struct_keyword() {
        let tokens = lex("Struct").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::Struct);
    }

    #[test]
    fn lex_struct_block() {
        let input = r#"1: Struct Theme = {
    1: dark_mode Boolean = false
}"#;
        let tokens = lex(input).unwrap();
        assert_eq!(tokens[0].token, Token::NumberLit(1.0));
        assert_eq!(tokens[1].token, Token::Colon);
        assert_eq!(tokens[2].token, Token::Struct);
        assert_eq!(tokens[3].token, Token::Ident("Theme".to_string()));
        assert_eq!(tokens[4].token, Token::Equals);
        assert_eq!(tokens[5].token, Token::LBrace);
        assert_eq!(tokens[6].token, Token::NumberLit(1.0));
        assert_eq!(tokens[7].token, Token::Colon);
    }

    #[test]
    fn lex_span_info_multiline() {
        // "Checkout" starts at byte 10: after "Feature" (7), '\n' (1),
        // and two leading spaces -> line 2, column 3.
        let input = "Feature\n  Checkout";
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
        assert_eq!(
            tokens[1].span,
            Span {
                offset: 10,
                line: 2,
                column: 3
            }
        );
    }
}