envuse_parser/parser/
tokenizer.rs

1use std::{ops::RangeBounds, usize, vec};
2
3use super::span::{self, Span};
4use crate::syntax_error::SyntaxError;
5
6#[derive(Debug, Clone)]
7pub struct Token {
8    pub kind: String,
9    pub raw: String,
10    pub span: span::Span,
11}
12
13impl Token {
14    fn new<A: ToString>(kind: A, cursor: &Cursor, span: span::Span) -> Self {
15        Self {
16            kind: kind.to_string(),
17            raw: cursor.get_by_span(&span),
18            span,
19        }
20    }
21}
22
23#[derive(Debug)]
24struct Cursor {
25    payload: String,
26    index: usize,
27}
28
29impl Cursor {
30    fn current_char(&self) -> Option<char> {
31        self.payload.chars().nth(self.index)
32    }
33
34    fn current_char_expected(&self, val: char) {
35        if !self.current_matches_char(val) {
36            todo!(
37                "Require error expected the char {} (ascii code {})",
38                val,
39                val as usize
40            );
41        }
42    }
43
44    fn current_matches_char(&self, val: char) -> bool {
45        if let Some(c) = self.current_char() {
46            c == val
47        } else {
48            false
49        }
50    }
51
52    pub fn current_matches_range_char<T>(&self, vec_ranges: &Vec<T>) -> bool
53    where
54        T: RangeBounds<char> + std::fmt::Debug,
55    {
56        let current_char = self.current_char();
57        if let Some(current_char) = current_char {
58            for e in vec_ranges {
59                if e.contains(&current_char) {
60                    return true;
61                }
62            }
63        }
64
65        return false;
66    }
67
68    fn current_range_char_expected<T>(&self, vec_ranges: &Vec<T>)
69    where
70        T: RangeBounds<char> + std::fmt::Debug,
71    {
72        if !self.current_matches_range_char(&vec_ranges) {
73            todo!("Require error expected the char {:?}", &vec_ranges);
74        }
75    }
76
77    fn forward(&mut self, positions: usize) {
78        self.index = self.index + positions;
79    }
80
81    fn get_by_span(&self, span: &span::Span) -> String {
82        unsafe { self.payload.get_unchecked(span.start..span.end).to_string() }
83    }
84
85    fn has_current<'a>(&'a self) -> bool {
86        self.payload.len() > self.index
87    }
88
89    fn has_next<'a>(&'a self) -> bool {
90        (self.payload.len() > (self.index + 1)).clone()
91    }
92
93    fn new<A>(index: usize, payload: A) -> Self
94    where
95        A: ToString,
96    {
97        Self {
98            payload: payload.to_string(),
99            index,
100        }
101    }
102
103    fn next_char(&self) -> Option<char> {
104        self.payload.chars().nth(self.index + 1)
105    }
106
107    fn next_matches_char(&self, val: char) -> bool {
108        if let Some(c) = self.next_char() {
109            c == val
110        } else {
111            false
112        }
113    }
114
115    pub fn next_matches_range_char<T>(&self, vec_ranges: &Vec<T>) -> bool
116    where
117        T: RangeBounds<char> + std::fmt::Debug,
118    {
119        let current_char = self.next_char();
120        if let Some(current_char) = current_char {
121            for e in vec_ranges {
122                if e.contains(&current_char) {
123                    return true;
124                }
125            }
126        }
127
128        return false;
129    }
130}
131
132pub struct Tokenizer {}
133
134impl Tokenizer {
135    pub fn parse<A>(payload: A) -> Result<Vec<Token>, SyntaxError>
136    where
137        A: ToString,
138    {
139        let ref mut cursor = Cursor::new(0, payload);
140        return Ok(Self::parse_by_cursor(cursor)?);
141    }
142
143    fn parse_by_cursor(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
144        let mut tokens: Vec<Token> = vec![];
145
146        while cursor.has_current() {
147            if cursor.current_matches_char('#') {
148                tokens.extend(Self::parse_comment(cursor)?);
149                continue;
150            }
151
152            if cursor.current_matches_char('\n') {
153                tokens.extend(Self::parse_newline(cursor)?);
154                continue;
155            }
156
157            if cursor.current_matches_range_char(&vec![' '..=' ', '\t'..='\t']) {
158                tokens.extend(Self::parse_spaces(cursor)?);
159                continue;
160            }
161
162            if cursor.current_matches_char(':') {
163                tokens.extend(Self::parse_colon(cursor)?);
164                continue;
165            }
166
167            if cursor.current_matches_char('=') {
168                tokens.extend(Self::parse_equal(cursor)?);
169                continue;
170            }
171
172            if cursor.current_matches_char('"') {
173                tokens.extend(Self::parse_string(cursor)?);
174                continue;
175            }
176
177            if cursor.current_matches_range_char(&vec!['a'..='z', 'A'..='Z', '_'..='_']) {
178                tokens.extend(Self::parse_keyword(cursor)?);
179                continue;
180            }
181
182            if cursor.current_matches_range_char(&vec!['0'..='9']) {
183                tokens.extend(Self::parse_number(cursor)?);
184                continue;
185            }
186
187            if cursor.current_matches_range_char(&vec!['?'..='?']) {
188                tokens.extend(Self::parse_question_mark(cursor)?);
189                continue;
190            }
191
192            if cursor.current_matches_range_char(&vec!['<'..='<']) {
193                tokens.extend(Self::parse_less_than(cursor)?);
194                continue;
195            }
196
197            if cursor.current_matches_range_char(&vec!['>'..='>']) {
198                tokens.extend(Self::parse_greater_than(cursor)?);
199                continue;
200            }
201
202            // dbg!(&cursor);
203            // dbg!(&cursor.current_char());
204            // dbg!(&tokens);
205            do yeet SyntaxError::new(
206                "Unexpected token",
207                Span {
208                    start: cursor.index,
209                    end: cursor.index + 1,
210                },
211            )
212        }
213
214        Ok(tokens)
215    }
216
217    fn parse_comment(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
218        cursor.current_char_expected('#');
219        let span_start = cursor.index;
220
221        while cursor.has_current() {
222            cursor.forward(1);
223            if cursor.current_matches_char('\n') {
224                break;
225            }
226        }
227
228        let span = span::Span {
229            start: span_start,
230            end: cursor.index,
231        };
232        let comment_token = Token::new("comment", &cursor, span);
233        Ok(vec![comment_token])
234    }
235
236    fn parse_newline(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
237        cursor.current_char_expected('\n');
238        let span_start = cursor.index;
239        cursor.forward(1);
240        let span = span::Span {
241            start: span_start,
242            end: cursor.index,
243        };
244        let newline_token = Token::new("newline", &cursor, span);
245        Ok(vec![newline_token])
246    }
247
248    fn parse_spaces(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
249        cursor.current_range_char_expected(&vec![' '..=' ', '\t'..='\t']);
250        let span_start = cursor.index;
251        while cursor.has_current() {
252            if cursor.current_matches_range_char(&vec![' '..=' ', '\t'..='\t']) {
253                cursor.forward(1);
254                continue;
255            }
256            break;
257        }
258        let span = span::Span {
259            start: span_start,
260            end: cursor.index,
261        };
262        let newline_token = Token::new("space", &cursor, span);
263        Ok(vec![newline_token])
264    }
265
266    fn parse_equal(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
267        cursor.current_char_expected('=');
268        let span_start = cursor.index;
269        cursor.forward(1);
270        let span = span::Span {
271            start: span_start,
272            end: cursor.index,
273        };
274        let newline_token = Token::new("equal", &cursor, span);
275        Ok(vec![newline_token])
276    }
277
278    fn parse_colon(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
279        cursor.current_char_expected(':');
280        let span_start = cursor.index;
281        cursor.forward(1);
282        let span = span::Span {
283            start: span_start,
284            end: cursor.index,
285        };
286        let newline_token = Token::new("colon", &cursor, span);
287        Ok(vec![newline_token])
288    }
289
290    fn parse_question_mark(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
291        cursor.current_char_expected('?');
292        let span_start = cursor.index;
293        cursor.forward(1);
294        let span = span::Span {
295            start: span_start,
296            end: cursor.index,
297        };
298        let newline_token = Token::new("question_mark", &cursor, span);
299        Ok(vec![newline_token])
300    }
301
302    fn parse_less_than(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
303        cursor.current_char_expected('<');
304        let span_start = cursor.index;
305        cursor.forward(1);
306        let span = span::Span {
307            start: span_start,
308            end: cursor.index,
309        };
310        let newline_token = Token::new("less_than", &cursor, span);
311        Ok(vec![newline_token])
312    }
313
314    fn parse_greater_than(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
315        cursor.current_char_expected('>');
316        let span_start = cursor.index;
317        cursor.forward(1);
318        let span = span::Span {
319            start: span_start,
320            end: cursor.index,
321        };
322        let newline_token = Token::new("greater_than", &cursor, span);
323        Ok(vec![newline_token])
324    }
325
326    fn parse_string(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
327        cursor.current_char_expected('"');
328        cursor.forward(1);
329        let span_start = cursor.index;
330
331        while cursor.has_current() {
332            if cursor.current_matches_char('\\') {
333                cursor.forward(2);
334                continue;
335            }
336            if cursor.current_matches_char('"') {
337                break;
338            }
339            cursor.forward(1);
340        }
341
342        cursor.current_char_expected('"');
343
344        let span = span::Span {
345            start: span_start,
346            end: cursor.index,
347        };
348        cursor.forward(1);
349        let newline_token = Token::new("string", &cursor, span);
350        Ok(vec![newline_token])
351    }
352
353    fn parse_keyword(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
354        cursor.current_range_char_expected(&vec!['a'..='z', 'A'..='Z', '0'..='9', '_'..='_']);
355        let span_start = cursor.index;
356
357        while cursor.has_current() {
358            if !cursor.current_matches_range_char(&vec!['a'..='z', 'A'..='Z', '0'..='9', '_'..='_'])
359            {
360                break;
361            }
362            cursor.forward(1);
363        }
364
365        let span = span::Span {
366            start: span_start,
367            end: cursor.index,
368        };
369        let newline_token = Token::new("keyword", &cursor, span);
370        Ok(vec![newline_token])
371    }
372
373    fn parse_number(cursor: &mut Cursor) -> Result<Vec<Token>, SyntaxError> {
374        cursor.current_range_char_expected(&vec!['0'..='9']);
375        let span_start = cursor.index;
376        let mut decimal = false;
377
378        while cursor.has_current() {
379            if cursor.current_matches_range_char(&vec!['0'..='9']) {
380                cursor.forward(1);
381                continue;
382            }
383            if cursor.current_matches_char('_') {
384                if !cursor.next_matches_range_char(&vec!['0'..='9']) {
385                    do yeet SyntaxError::new(
386                        "Only one underscore is allowed as numeric separator",
387                        Span {
388                            start: span_start,
389                            end: cursor.index,
390                        },
391                    )
392                }
393                cursor.forward(1);
394                continue;
395            }
396            if cursor.current_matches_char('.') {
397                if decimal {
398                    do yeet SyntaxError::new(
399                        "Unexpected token",
400                        Span {
401                            start: span_start,
402                            end: cursor.index,
403                        },
404                    )
405                }
406                if !cursor.next_matches_range_char(&vec!['0'..='9']) {
407                    do yeet SyntaxError::new(
408                        "Invalid or unexpected token",
409                        Span {
410                            start: span_start,
411                            end: cursor.index,
412                        },
413                    )
414                }
415                decimal = true;
416                cursor.forward(1);
417                continue;
418            }
419            break;
420        }
421
422        let span = span::Span {
423            start: span_start,
424            end: cursor.index,
425        };
426        let newline_token = Token::new("number", &cursor, span);
427        Ok(vec![newline_token])
428    }
429}