//! S-expression tokenizer.

use std::fmt;

/// Token types produced by the lexer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// End of input. Also emitted for unrecognized characters
    /// (see the catch-all arm in `Lexer::next_token`).
    Eof,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `->`
    Arrow,
    /// `:name` — literal retains the leading `:`.
    Keyword,
    /// Bare identifier, e.g. `schema` or `ERC-020`.
    Symbol,
    /// Double-quoted string; literal holds the unescaped contents.
    Str,
    /// Integer literal, optionally negative (e.g. `-456`).
    Number,
    /// `{ ... }` guard; literal is the raw text between balanced braces.
    Guard,
}

19impl fmt::Display for TokenType {
20    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
21        match self {
22            TokenType::Eof => write!(f, "EOF"),
23            TokenType::LParen => write!(f, "("),
24            TokenType::RParen => write!(f, ")"),
25            TokenType::Arrow => write!(f, "->"),
26            TokenType::Keyword => write!(f, "keyword"),
27            TokenType::Symbol => write!(f, "symbol"),
28            TokenType::Str => write!(f, "string"),
29            TokenType::Number => write!(f, "number"),
30            TokenType::Guard => write!(f, "guard"),
31        }
32    }
33}
34
/// A single token from the lexer.
#[derive(Debug, Clone)]
pub struct Token {
    /// The token's category.
    pub typ: TokenType,
    /// Token text: unescaped contents for strings, raw guard body for
    /// guards, `:`-prefixed name for keywords, empty for EOF.
    pub literal: String,
    /// Byte offset of the token's first character in the input.
    pub pos: usize,
}

43impl fmt::Display for Token {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        write!(f, "Token({}, {:?}, {})", self.typ, self.literal, self.pos)
46    }
47}
48
/// Tokenizes S-expression DSL input.
pub struct Lexer {
    /// Input bytes being scanned.
    input: Vec<u8>,
    /// Byte offset of the current character `ch`.
    pos: usize,
    /// Byte offset of the next character to be read.
    read_pos: usize,
    /// Current character under examination; 0 past end of input.
    ch: u8,
}

impl Lexer {
    /// Creates a lexer over `input`, priming `ch` with the first byte.
    pub fn new(input: &str) -> Self {
        let mut l = Self {
            input: input.as_bytes().to_vec(),
            pos: 0,
            read_pos: 0,
            ch: 0,
        };
        // Load the first character so next_token() starts at input[0].
        l.read_char();
        l
    }

    /// Advances one byte: `ch` becomes the byte at `read_pos` (0 at end
    /// of input), `pos` catches up to `read_pos`, and `read_pos` moves on.
    fn read_char(&mut self) {
        if self.read_pos >= self.input.len() {
            self.ch = 0;
        } else {
            self.ch = self.input[self.read_pos];
        }
        self.pos = self.read_pos;
        self.read_pos += 1;
    }

    /// Returns the byte after `ch` without consuming it (0 at end of input).
    fn peek_char(&self) -> u8 {
        if self.read_pos >= self.input.len() {
            0
        } else {
            self.input[self.read_pos]
        }
    }

    /// Consumes spaces, tabs, and line breaks.
    fn skip_whitespace(&mut self) {
        while self.ch == b' ' || self.ch == b'\t' || self.ch == b'\n' || self.ch == b'\r' {
            self.read_char();
        }
    }

    /// Consumes the rest of the current line; used for `;` comments.
    fn skip_comment(&mut self) {
        while self.ch != 0 && self.ch != b'\n' {
            self.read_char();
        }
    }

    /// Returns the next token, skipping any leading whitespace and `;`
    /// line comments. The returned `pos` is the byte offset of the token's
    /// first character (the `-` of a negative number, the `:` of a keyword).
    pub fn next_token(&mut self) -> Token {
        // Whitespace and comments can alternate; loop until neither applies.
        loop {
            self.skip_whitespace();
            if self.ch == b';' {
                self.skip_comment();
                continue;
            }
            break;
        }

        let pos = self.pos;

        match self.ch {
            0 => Token {
                typ: TokenType::Eof,
                literal: String::new(),
                pos,
            },
            b'(' => {
                self.read_char();
                Token {
                    typ: TokenType::LParen,
                    literal: "(".into(),
                    pos,
                }
            }
            b')' => {
                self.read_char();
                Token {
                    typ: TokenType::RParen,
                    literal: ")".into(),
                    pos,
                }
            }
            // `-` is three-way ambiguous: arrow `->`, negative number,
            // or the start of a symbol (`-` is a valid symbol char).
            b'-' => {
                if self.peek_char() == b'>' {
                    // Consume both `-` and `>`.
                    self.read_char();
                    self.read_char();
                    Token {
                        typ: TokenType::Arrow,
                        literal: "->".into(),
                        pos,
                    }
                } else if is_digit(self.peek_char()) {
                    // Consume the `-`, then lex the digits and re-attach the sign.
                    self.read_char();
                    let num = self.read_number();
                    Token {
                        typ: TokenType::Number,
                        literal: format!("-{}", num),
                        pos,
                    }
                } else {
                    // read_symbol starts at the `-` itself, so the literal
                    // includes it (e.g. `-foo`).
                    let sym = self.read_symbol();
                    Token {
                        typ: TokenType::Symbol,
                        literal: sym,
                        pos,
                    }
                }
            }
            b':' => {
                self.read_char();
                let kw = self.read_symbol();
                Token {
                    typ: TokenType::Keyword,
                    // The keyword literal keeps its leading `:`.
                    literal: format!(":{}", kw),
                    pos,
                }
            }
            b'"' => {
                // Opening quote consumed here; read_string consumes the closer.
                self.read_char();
                let s = self.read_string(b'"');
                Token {
                    typ: TokenType::Str,
                    literal: s,
                    pos,
                }
            }
            b'{' => {
                // Opening brace consumed here; read_guard consumes the closer.
                self.read_char();
                let g = self.read_guard();
                Token {
                    typ: TokenType::Guard,
                    literal: g,
                    pos,
                }
            }
            ch if is_digit(ch) => {
                let num = self.read_number();
                Token {
                    typ: TokenType::Number,
                    literal: num,
                    pos,
                }
            }
            ch if is_symbol_start(ch) => {
                let sym = self.read_symbol();
                Token {
                    typ: TokenType::Symbol,
                    literal: sym,
                    pos,
                }
            }
            _ => {
                // NOTE(review): an unrecognized character is consumed and
                // reported as Eof, which makes tokenize() stop early and
                // silently drop the rest of the input — consider a distinct
                // Illegal token type instead.
                self.read_char();
                Token {
                    typ: TokenType::Eof,
                    literal: String::new(),
                    pos,
                }
            }
        }
    }

    /// Reads a maximal run of symbol characters starting at `ch`.
    fn read_symbol(&mut self) -> String {
        let start = self.pos;
        while is_symbol_char(self.ch) {
            self.read_char();
        }
        String::from_utf8_lossy(&self.input[start..self.pos]).to_string()
    }

    /// Reads a maximal run of ASCII digits starting at `ch`.
    fn read_number(&mut self) -> String {
        let start = self.pos;
        while is_digit(self.ch) {
            self.read_char();
        }
        String::from_utf8_lossy(&self.input[start..self.pos]).to_string()
    }

    /// Reads until the closing `quote` (or end of input), resolving the
    /// escapes `\n`, `\t`, `\r`, `\\`, `\"`; any other escaped byte is kept
    /// as-is (backslash dropped). The opening quote must already have been
    /// consumed; the closing quote, if present, is consumed here.
    fn read_string(&mut self, quote: u8) -> String {
        let mut result = Vec::new();
        while self.ch != 0 && self.ch != quote {
            if self.ch == b'\\' {
                self.read_char();
                match self.ch {
                    b'n' => result.push(b'\n'),
                    b't' => result.push(b'\t'),
                    b'r' => result.push(b'\r'),
                    b'\\' => result.push(b'\\'),
                    b'"' => result.push(b'"'),
                    // NOTE(review): a backslash at end of input lands here
                    // with ch == 0 and pushes a NUL byte — confirm intended.
                    other => result.push(other),
                }
            } else {
                result.push(self.ch);
            }
            self.read_char();
        }
        // Consume the closing quote; absent on an unterminated string.
        if self.ch == quote {
            self.read_char();
        }
        String::from_utf8_lossy(&result).to_string()
    }

    /// Reads raw text until the brace matching the already-consumed `{`,
    /// tracking nesting depth so inner `{ }` pairs are kept verbatim.
    /// The closing `}` is consumed but not included in the result; an
    /// unbalanced guard runs to end of input.
    fn read_guard(&mut self) -> String {
        let mut result = Vec::new();
        let mut depth = 1;
        while self.ch != 0 && depth > 0 {
            if self.ch == b'{' {
                depth += 1;
            } else if self.ch == b'}' {
                depth -= 1;
                if depth == 0 {
                    self.read_char();
                    break;
                }
            }
            result.push(self.ch);
            self.read_char();
        }
        String::from_utf8_lossy(&result).to_string()
    }
}

/// True for bytes that may begin a symbol: ASCII letters and `_`.
fn is_symbol_start(ch: u8) -> bool {
    matches!(ch, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// True for bytes that may continue a symbol: ASCII alphanumerics plus
/// `_`, `-`, `.` and the square brackets.
fn is_symbol_char(ch: u8) -> bool {
    matches!(
        ch,
        b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'-' | b'[' | b']' | b'.'
    )
}

/// True for ASCII decimal digits `0`-`9`.
fn is_digit(ch: u8) -> bool {
    (b'0'..=b'9').contains(&ch)
}

285/// Tokenize the input into a list of tokens.
286pub fn tokenize(input: &str) -> Vec<Token> {
287    let mut lexer = Lexer::new(input);
288    let mut tokens = Vec::new();
289    loop {
290        let tok = lexer.next_token();
291        let is_eof = tok.typ == TokenType::Eof;
292        tokens.push(tok);
293        if is_eof {
294            break;
295        }
296    }
297    tokens
298}
299
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` into (type, literal) pairs, dropping the trailing EOF.
    fn lex(input: &str) -> Vec<(TokenType, String)> {
        let mut toks = tokenize(input);
        toks.pop(); // remove the Eof sentinel
        toks.into_iter().map(|t| (t.typ, t.literal)).collect()
    }

    #[test]
    fn test_basic_tokens() {
        assert_eq!(
            lex("(schema ERC-020)"),
            vec![
                (TokenType::LParen, "(".to_string()),
                (TokenType::Symbol, "schema".to_string()),
                (TokenType::Symbol, "ERC-020".to_string()),
                (TokenType::RParen, ")".to_string()),
            ]
        );
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            lex(":type :guard :keys"),
            vec![
                (TokenType::Keyword, ":type".to_string()),
                (TokenType::Keyword, ":guard".to_string()),
                (TokenType::Keyword, ":keys".to_string()),
            ]
        );
    }

    #[test]
    fn test_arrow() {
        let toks = lex("balances -> transfer");
        assert_eq!(toks[0].0, TokenType::Symbol);
        assert_eq!(toks[1].0, TokenType::Arrow);
        assert_eq!(toks[2].0, TokenType::Symbol);
    }

    #[test]
    fn test_guard() {
        assert_eq!(
            lex("{balances[from] >= amount}"),
            vec![(TokenType::Guard, "balances[from] >= amount".to_string())]
        );
    }

    #[test]
    fn test_numbers() {
        assert_eq!(
            lex("123 -456"),
            vec![
                (TokenType::Number, "123".to_string()),
                (TokenType::Number, "-456".to_string()),
            ]
        );
    }

    #[test]
    fn test_comments() {
        let toks = lex("; this is a comment\n(schema test)");
        assert_eq!(toks[0].0, TokenType::LParen);
        assert_eq!(toks[1].1, "schema");
    }
}
354}