// pdforg_sheets/lexer.rs

//! Formula lexer — tokenizes formula strings like "=SUM(A1:B10, 3.14)".
2
use thiserror::Error;
4
/// A single lexical unit of a spreadsheet formula.
///
/// Payload-carrying variants hold the text or value of the token; the
/// remaining variants are fixed operators and punctuation. `Number` holds an
/// `f64`, which is why the enum derives `PartialEq` but not `Eq`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Literals ----
    /// Numeric literal.
    Number(f64),
    /// String literal (contents only, without the surrounding quotes).
    StringLit(String),
    /// Boolean literal.
    Bool(bool),
    /// Error value such as `#DIV/0!` or `#N/A`.
    Error(String),

    // ---- Identifiers and references ----
    /// Function name or named range.
    Ident(String),
    /// Single-cell reference, e.g. `A1`, `$B$2`, `Sheet1!C3`.
    CellRef(String),
    /// Range reference such as `A1:B10`, stored as its two endpoints.
    RangeRef(String, String),

    // ---- Operators ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^` (power)
    Caret,
    /// `&` (concatenate)
    Ampersand,
    /// `%` (percentage as postfix)
    Percent,
    /// `=`
    Equal,
    /// `<>`
    NotEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterEqual,

    // ---- Punctuation ----
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{` — opens an array literal.
    LBrace,
    /// `}` — closes an array literal.
    RBrace,
    /// `,`
    Comma,
    /// `;` — sometimes used as argument separator.
    Semicolon,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `!` — used for sheet references.
    Exclamation,

    // ---- Special ----
    /// End of input.
    EOF,
}
47
48#[derive(Debug, Error)]
49pub enum LexError {
50    #[error("Unexpected character '{0}' at position {1}")]
51    UnexpectedChar(char, usize),
52    #[error("Unterminated string literal")]
53    UnterminatedString,
54}
55
/// Cursor-based tokenizer over the characters of one formula.
pub struct Lexer {
    /// Characters of the formula being lexed.
    input: Vec<char>,
    /// Index into `input` of the next unread character.
    pos: usize,
}
60
61impl Lexer {
62    pub fn new(input: &str) -> Self {
63        // Skip leading '='
64        let s = input.trim();
65        let chars: Vec<char> = if s.starts_with('=') {
66            s[1..].chars().collect()
67        } else {
68            s.chars().collect()
69        };
70        Lexer { input: chars, pos: 0 }
71    }
72
73    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
74        let mut tokens = vec![];
75        loop {
76            let tok = self.next_token()?;
77            let done = tok == Token::EOF;
78            tokens.push(tok);
79            if done { break; }
80        }
81        Ok(tokens)
82    }
83
84    fn peek(&self) -> Option<char> { self.input.get(self.pos).copied() }
85    fn peek2(&self) -> Option<char> { self.input.get(self.pos + 1).copied() }
86    fn advance(&mut self) -> Option<char> {
87        let c = self.input.get(self.pos).copied();
88        self.pos += 1;
89        c
90    }
91
92    fn skip_whitespace(&mut self) {
93        while matches!(self.peek(), Some(' ' | '\t' | '\n' | '\r')) {
94            self.pos += 1;
95        }
96    }
97
98    fn next_token(&mut self) -> Result<Token, LexError> {
99        self.skip_whitespace();
100
101        match self.peek() {
102            None => Ok(Token::EOF),
103            Some(c) => match c {
104                '+' => { self.advance(); Ok(Token::Plus) }
105                '-' => { self.advance(); Ok(Token::Minus) }
106                '*' => { self.advance(); Ok(Token::Star) }
107                '/' => { self.advance(); Ok(Token::Slash) }
108                '^' => { self.advance(); Ok(Token::Caret) }
109                '&' => { self.advance(); Ok(Token::Ampersand) }
110                '%' => { self.advance(); Ok(Token::Percent) }
111                '(' => { self.advance(); Ok(Token::LParen) }
112                ')' => { self.advance(); Ok(Token::RParen) }
113                '{' => { self.advance(); Ok(Token::LBrace) }
114                '}' => { self.advance(); Ok(Token::RBrace) }
115                ',' => { self.advance(); Ok(Token::Comma) }
116                ';' => { self.advance(); Ok(Token::Semicolon) }
117                '!' => { self.advance(); Ok(Token::Exclamation) }
118                '$' => { self.advance(); Ok(Token::Dollar) }
119
120                '=' => { self.advance(); Ok(Token::Equal) }
121                '<' => {
122                    self.advance();
123                    if self.peek() == Some('>') { self.advance(); Ok(Token::NotEqual) }
124                    else if self.peek() == Some('=') { self.advance(); Ok(Token::LessEqual) }
125                    else { Ok(Token::LessThan) }
126                }
127                '>' => {
128                    self.advance();
129                    if self.peek() == Some('=') { self.advance(); Ok(Token::GreaterEqual) }
130                    else { Ok(Token::GreaterThan) }
131                }
132
133                '"' => self.lex_string(),
134                '#' => self.lex_error_val(),
135
136                '0'..='9' | '.' => self.lex_number(),
137                'A'..='Z' | 'a'..='z' | '_' => self.lex_ident_or_cellref(),
138                ':' => { self.advance(); Ok(Token::Colon) }
139
140                other => {
141                    let pos = self.pos;
142                    self.advance();
143                    Err(LexError::UnexpectedChar(other, pos))
144                }
145            }
146        }
147    }
148
149    fn lex_string(&mut self) -> Result<Token, LexError> {
150        self.advance(); // consume opening "
151        let mut s = String::new();
152        loop {
153            match self.advance() {
154                None => return Err(LexError::UnterminatedString),
155                Some('"') => {
156                    // Handle escaped quote: ""
157                    if self.peek() == Some('"') {
158                        self.advance();
159                        s.push('"');
160                    } else {
161                        break;
162                    }
163                }
164                Some(c) => s.push(c),
165            }
166        }
167        Ok(Token::StringLit(s))
168    }
169
170    fn lex_error_val(&mut self) -> Result<Token, LexError> {
171        let start = self.pos;
172        while matches!(self.peek(), Some(c) if !c.is_whitespace() && c != ')' && c != ',') {
173            self.advance();
174        }
175        let s: String = self.input[start..self.pos].iter().collect();
176        Ok(Token::Error(s))
177    }
178
179    fn lex_number(&mut self) -> Result<Token, LexError> {
180        let start = self.pos;
181        while matches!(self.peek(), Some('0'..='9' | '.')) {
182            self.advance();
183        }
184        // Scientific notation: 1e10, 1E-5
185        if matches!(self.peek(), Some('e' | 'E')) {
186            self.advance();
187            if matches!(self.peek(), Some('+' | '-')) { self.advance(); }
188            while matches!(self.peek(), Some('0'..='9')) { self.advance(); }
189        }
190        let s: String = self.input[start..self.pos].iter().collect();
191        let n: f64 = s.parse().unwrap_or(0.0);
192        Ok(Token::Number(n))
193    }
194
195    fn lex_ident_or_cellref(&mut self) -> Result<Token, LexError> {
196        let start = self.pos;
197
198        // Cell references can start with $, so handle here
199        let has_dollar_col = self.peek() == Some('$');
200        if has_dollar_col { self.advance(); }
201
202        // Read column letters
203        let col_start = self.pos;
204        while matches!(self.peek(), Some('A'..='Z' | 'a'..='z')) {
205            self.advance();
206        }
207        let col_part: String = self.input[col_start..self.pos].iter().collect();
208
209        // Check for optional $ before row number
210        let has_dollar_row = self.peek() == Some('$');
211        if has_dollar_row { self.advance(); }
212
213        // Check if followed by row digits
214        let row_start = self.pos;
215        while matches!(self.peek(), Some('0'..='9')) {
216            self.advance();
217        }
218        let row_part: String = self.input[row_start..self.pos].iter().collect();
219
220        if !col_part.is_empty() && !row_part.is_empty() {
221            // It's a cell reference like A1, $B$3
222            let cell_ref: String = self.input[start..self.pos].iter().collect();
223
224            // Check if followed by : for range
225            if self.peek() == Some(':') {
226                self.advance(); // consume ':'
227
228                // Parse second cell ref
229                let start2 = self.pos;
230                if self.peek() == Some('$') { self.advance(); }
231                while matches!(self.peek(), Some('A'..='Z' | 'a'..='z')) { self.advance(); }
232                if self.peek() == Some('$') { self.advance(); }
233                while matches!(self.peek(), Some('0'..='9')) { self.advance(); }
234                let cell_ref2: String = self.input[start2..self.pos].iter().collect();
235                return Ok(Token::RangeRef(cell_ref, cell_ref2));
236            }
237            return Ok(Token::CellRef(cell_ref));
238        }
239
240        // Also consume more chars for identifiers (function names, named ranges)
241        while matches!(self.peek(), Some('A'..='Z' | 'a'..='z' | '0'..='9' | '_' | '.')) {
242            self.advance();
243        }
244        let ident: String = self.input[start..self.pos].iter().collect();
245
246        // Check for sheet reference: Sheet1!A1
247        if self.peek() == Some('!') {
248            self.advance();
249            let ref_start = self.pos;
250            // Read rest of cell ref
251            if self.peek() == Some('$') { self.advance(); }
252            while matches!(self.peek(), Some('A'..='Z' | 'a'..='z')) { self.advance(); }
253            if self.peek() == Some('$') { self.advance(); }
254            while matches!(self.peek(), Some('0'..='9')) { self.advance(); }
255            let cell_part: String = self.input[ref_start..self.pos].iter().collect();
256            return Ok(Token::CellRef(format!("{}!{}", ident, cell_part)));
257        }
258
259        // Check for boolean keywords
260        match ident.to_uppercase().as_str() {
261            "TRUE" => return Ok(Token::Bool(true)),
262            "FALSE" => return Ok(Token::Bool(false)),
263            _ => {}
264        }
265
266        Ok(Token::Ident(ident))
267    }
268}
269
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` and returns every token, including the trailing `EOF`.
    fn lex(src: &str) -> Vec<Token> {
        Lexer::new(src).tokenize().expect("formula should lex")
    }

    // Each test compares the complete token sequence, so stray or missing
    // tokens around the one of interest are caught as well.

    #[test]
    fn test_lex_sum() {
        assert_eq!(
            lex("=SUM(A1:B10)"),
            vec![
                Token::Ident("SUM".into()),
                Token::LParen,
                Token::RangeRef("A1".into(), "B10".into()),
                Token::RParen,
                Token::EOF,
            ]
        );
    }

    #[test]
    fn test_lex_number() {
        // 2.5 rather than 3.14, which clippy's `approx_constant` lint rejects
        // as an approximation of PI.
        assert_eq!(lex("=2.5"), vec![Token::Number(2.5), Token::EOF]);
    }

    #[test]
    fn test_lex_string() {
        assert_eq!(
            lex(r#"="Hello World""#),
            vec![Token::StringLit("Hello World".into()), Token::EOF]
        );
    }

    #[test]
    fn test_lex_range() {
        assert_eq!(
            lex("=A1:B10"),
            vec![Token::RangeRef("A1".into(), "B10".into()), Token::EOF]
        );
    }
}
301}