// variable_core/lexer.rs

/// Source location of a token within the lexed input.
#[derive(Debug, Clone, PartialEq)]
pub struct Span {
    // Byte offset of the token's first byte in the original input.
    pub offset: usize,
    // 1-based line number.
    pub line: usize,
    // 1-based column number within the line.
    pub column: usize,
}
7
/// A token paired with the source location where it starts.
#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    pub token: Token,
    pub span: Span,
}
13
/// All token kinds produced by [`lex`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Feature,
    Variable,

    // Type keywords (the words `Boolean`, `Number`, `String`)
    BooleanType,
    NumberType,
    StringType,

    // Literals (`true`/`false`, decimal numbers, double-quoted strings)
    BoolLit(bool),
    NumberLit(f64),
    StringLit(String),

    // Symbols
    LBrace,
    RBrace,
    Equals,
    Colon,

    // Identifiers: any other ASCII word of [A-Za-z0-9_] starting with a letter or `_`
    Ident(String),
}
39
/// Lexing failure: a human-readable message plus the span where it occurred.
#[derive(Debug, Clone, PartialEq)]
pub struct LexError {
    pub message: String,
    pub span: Span,
}
45
46fn compute_span(full_input: &str, offset: usize) -> Span {
47    let consumed = &full_input[..offset];
48    let line = consumed.chars().filter(|&c| c == '\n').count() + 1;
49    let column = match consumed.rfind('\n') {
50        Some(pos) => offset - pos,
51        None => offset + 1,
52    };
53    Span {
54        offset,
55        line,
56        column,
57    }
58}
59
/// Return the index of the first non-whitespace byte at or after `pos`
/// (space, tab, CR, LF count as whitespace), or `input.len()` if none.
fn skip_whitespace(input: &str, pos: usize) -> usize {
    input.as_bytes()[pos..]
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .map_or(input.len(), |skipped| pos + skipped)
}
70
/// Consume an ASCII word (`[A-Za-z0-9_]+`) starting at `pos`.
/// Returns the end index and the word as a slice of `input`.
fn lex_word(input: &str, pos: usize) -> (usize, &str) {
    let end = input.as_bytes()[pos..]
        .iter()
        .position(|&b| !(b.is_ascii_alphanumeric() || b == b'_'))
        .map_or(input.len(), |len| pos + len);
    (end, &input[pos..end])
}
80
/// Consume a decimal number at `pos`: an optional leading `-`, one or more
/// digits, and an optional fractional part (`.` followed by one or more
/// digits). Returns the end index and the parsed value, or an error message.
fn lex_number_token(input: &str, pos: usize) -> Result<(usize, f64), String> {
    let bytes = input.as_bytes();
    let mut end = pos;

    // Optional sign.
    if bytes.get(end) == Some(&b'-') {
        end += 1;
    }

    // Integer part: at least one digit is required.
    let int_digits = bytes[end..].iter().take_while(|b| b.is_ascii_digit()).count();
    if int_digits == 0 {
        return Err("expected digit".to_string());
    }
    end += int_digits;

    // Optional fractional part: a '.' must be followed by at least one digit.
    if bytes.get(end) == Some(&b'.') {
        end += 1;
        let frac_digits = bytes[end..].iter().take_while(|b| b.is_ascii_digit()).count();
        if frac_digits == 0 {
            return Err("expected digit after decimal point".to_string());
        }
        end += frac_digits;
    }

    let value: f64 = input[pos..end].parse().map_err(|e| format!("{}", e))?;
    Ok((end, value))
}
112
/// Consume a double-quoted string literal whose opening `"` is at `pos`.
///
/// Recognized escapes: `\n`, `\t`, `\\`, `\"`. Any other escape is kept
/// verbatim (backslash plus the following character).
///
/// Returns the index just past the closing quote and the unescaped contents.
///
/// # Errors
/// Returns `"unterminated string"` if the input ends before a closing quote.
fn lex_string_token(input: &str, pos: usize) -> Result<(usize, String), String> {
    // pos is at the opening quote; start scanning just past it.
    let mut i = pos + 1;
    let mut result = String::new();
    let bytes = input.as_bytes();

    loop {
        if i >= bytes.len() {
            return Err("unterminated string".to_string());
        }
        match bytes[i] {
            b'"' => {
                return Ok((i + 1, result));
            }
            b'\\' => {
                i += 1;
                if i >= bytes.len() {
                    return Err("unterminated string".to_string());
                }
                match bytes[i] {
                    b'n' => {
                        result.push('\n');
                        i += 1;
                    }
                    b't' => {
                        result.push('\t');
                        i += 1;
                    }
                    b'\\' => {
                        result.push('\\');
                        i += 1;
                    }
                    b'"' => {
                        result.push('"');
                        i += 1;
                    }
                    _ => {
                        // Unknown escape: keep the backslash and the full
                        // character. Decoding via chars() fixes the old
                        // `byte as char` bug, which mangled non-ASCII bytes
                        // and advanced `i` mid-codepoint — the subsequent
                        // `input[i..]` slice then panicked on a non-char
                        // boundary.
                        let ch = input[i..].chars().next().unwrap();
                        result.push('\\');
                        result.push(ch);
                        i += ch.len_utf8();
                    }
                }
            }
            _ => {
                // Ordinary character: copy it whole (handles UTF-8 properly).
                let ch = input[i..].chars().next().unwrap();
                result.push(ch);
                i += ch.len_utf8();
            }
        }
    }
}
153
154pub fn lex(input: &str) -> Result<Vec<SpannedToken>, LexError> {
155    let mut tokens = Vec::new();
156    let mut pos = 0;
157
158    loop {
159        pos = skip_whitespace(input, pos);
160        if pos >= input.len() {
161            break;
162        }
163
164        let span = compute_span(input, pos);
165        let byte = input.as_bytes()[pos];
166
167        match byte {
168            b'{' => {
169                tokens.push(SpannedToken {
170                    token: Token::LBrace,
171                    span,
172                });
173                pos += 1;
174            }
175            b'}' => {
176                tokens.push(SpannedToken {
177                    token: Token::RBrace,
178                    span,
179                });
180                pos += 1;
181            }
182            b'=' => {
183                tokens.push(SpannedToken {
184                    token: Token::Equals,
185                    span,
186                });
187                pos += 1;
188            }
189            b':' => {
190                tokens.push(SpannedToken {
191                    token: Token::Colon,
192                    span,
193                });
194                pos += 1;
195            }
196            b'"' => match lex_string_token(input, pos) {
197                Ok((new_pos, s)) => {
198                    tokens.push(SpannedToken {
199                        token: Token::StringLit(s),
200                        span,
201                    });
202                    pos = new_pos;
203                }
204                Err(msg) => {
205                    return Err(LexError { message: msg, span });
206                }
207            },
208            b'0'..=b'9' => match lex_number_token(input, pos) {
209                Ok((new_pos, n)) => {
210                    tokens.push(SpannedToken {
211                        token: Token::NumberLit(n),
212                        span,
213                    });
214                    pos = new_pos;
215                }
216                Err(msg) => {
217                    return Err(LexError { message: msg, span });
218                }
219            },
220            c if c.is_ascii_alphabetic() || c == b'_' => {
221                let (new_pos, word) = lex_word(input, pos);
222                let token = match word {
223                    "Feature" => Token::Feature,
224                    "Variable" => Token::Variable,
225                    "Boolean" => Token::BooleanType,
226                    "Number" => Token::NumberType,
227                    "String" => Token::StringType,
228                    "true" => Token::BoolLit(true),
229                    "false" => Token::BoolLit(false),
230                    _ => Token::Ident(word.to_string()),
231                };
232                tokens.push(SpannedToken { token, span });
233                pos = new_pos;
234            }
235            _ => {
236                return Err(LexError {
237                    message: format!("unexpected character: {:?}", byte as char),
238                    span,
239                });
240            }
241        }
242    }
243
244    Ok(tokens)
245}
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `src` and strip spans, keeping only the token values.
    fn tokens_of(src: &str) -> Vec<Token> {
        lex(src).unwrap().into_iter().map(|st| st.token).collect()
    }

    #[test]
    fn lex_feature_keyword() {
        let tokens = lex("Feature").unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token, Token::Feature);
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
    }

    #[test]
    fn lex_variable_keyword() {
        assert_eq!(tokens_of("Variable"), vec![Token::Variable]);
    }

    #[test]
    fn lex_type_keywords() {
        assert_eq!(
            tokens_of("Boolean Number String"),
            vec![Token::BooleanType, Token::NumberType, Token::StringType]
        );
    }

    #[test]
    fn lex_bool_literals() {
        assert_eq!(
            tokens_of("true false"),
            vec![Token::BoolLit(true), Token::BoolLit(false)]
        );
    }

    #[test]
    fn lex_number_literals() {
        assert_eq!(
            tokens_of("42 3.14"),
            vec![Token::NumberLit(42.0), Token::NumberLit(3.14)]
        );
    }

    #[test]
    fn lex_string_literal() {
        assert_eq!(
            tokens_of(r#""hello""#),
            vec![Token::StringLit("hello".to_string())]
        );
    }

    #[test]
    fn lex_string_with_escapes() {
        assert_eq!(
            tokens_of(r#""hello\nworld""#),
            vec![Token::StringLit("hello\nworld".to_string())]
        );
    }

    #[test]
    fn lex_complete_feature_block() {
        let input = r#"1: Feature Checkout = {
    1: Variable enabled Boolean = true
    2: Variable max_items Number = 50
    3: Variable header_text String = "Complete your purchase"
}"#;
        let tokens = tokens_of(input);
        // 6 header tokens + 3 variables * 7 tokens each + closing brace = 28.
        assert_eq!(tokens.len(), 28);
        let expected_prefix = [
            Token::NumberLit(1.0),
            Token::Colon,
            Token::Feature,
            Token::Ident("Checkout".to_string()),
            Token::Equals,
            Token::LBrace,
            Token::NumberLit(1.0),
            Token::Colon,
            Token::Variable,
            Token::Ident("enabled".to_string()),
            Token::BooleanType,
            Token::Equals,
            Token::BoolLit(true),
        ];
        assert_eq!(&tokens[..expected_prefix.len()], &expected_prefix[..]);
        assert_eq!(tokens[27], Token::RBrace);
    }

    #[test]
    fn lex_error_unterminated_string() {
        let err = lex(r#""hello"#).unwrap_err();
        assert_eq!(err.message, "unterminated string");
    }

    #[test]
    fn lex_error_invalid_character() {
        let err = lex("@").unwrap_err();
        assert!(err.message.contains("unexpected character"));
    }

    #[test]
    fn lex_span_info_multiline() {
        let tokens = lex("Feature\n  Checkout").unwrap();
        assert_eq!(
            tokens[0].span,
            Span {
                offset: 0,
                line: 1,
                column: 1
            }
        );
        assert_eq!(
            tokens[1].span,
            Span {
                offset: 10,
                line: 2,
                column: 3
            }
        );
    }
}