mamba/parse/lex/tokenize.rs

use std::iter::Peekable;
use std::str::Chars;

use crate::common::position::CaretPos;
use crate::parse::lex::result::{LexErr, LexResult};
use crate::parse::lex::state::State;
use crate::parse::lex::token::{Lex, Token};
use crate::parse::lex::tokenize_direct;

#[allow(clippy::cognitive_complexity)]
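/// Convert the character `c`, plus any further characters consumed from `it`, into zero
/// or more lexemes, using `state` to keep track of the position within the source.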
pub fn into_tokens(c: char, it: &mut Peekable<Chars>, state: &mut State) -> LexResult {
    match c {
        ',' => create(state, Token::Comma),
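        // ':' may start '::=' (SliceIncl), '::' (Slice), or ':=' (Assign); on its own it is a DoublePoint.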
        ':' => match it.peek() {
            Some(':') => match (it.next(), it.peek()) {
                (_, Some('=')) => next_and_create(it, state, Token::SliceIncl),
                _ => create(state, Token::Slice),
            },
            Some('=') => next_and_create(it, state, Token::Assign),
            _ => create(state, Token::DoublePoint),
        },
        '(' => create(state, Token::LRBrack),
        ')' => create(state, Token::RRBrack),
        '[' => create(state, Token::LSBrack),
        ']' => create(state, Token::RSBrack),
        '{' => create(state, Token::LCBrack),
        '}' => create(state, Token::RCBrack),
        '|' => create(state, Token::Ver),
        '\n' => create(state, Token::NL),
        '\r' => match it.next() {
            Some('\n') => create(state, Token::NL),
            _ => Err(LexErr::new(state.pos, None, "carriage return not followed by newline")),
        },
        '.' => match it.peek() {
            Some('.') => match (it.next(), it.peek()) {
                (_, Some('=')) => next_and_create(it, state, Token::RangeIncl),
                _ => create(state, Token::Range),
            },
            _ => create(state, Token::Point),
        },
        '<' => match it.peek() {
            Some('<') => match (it.next(), it.peek()) {
                (_, Some('=')) => next_and_create(it, state, Token::BLShiftAssign),
                _ => create(state, Token::BLShift),
            },
            Some('=') => next_and_create(it, state, Token::Leq),
            _ => create(state, Token::Le),
        },
        '>' => match it.peek() {
            Some('>') => match (it.next(), it.peek()) {
                (_, Some('=')) => next_and_create(it, state, Token::BRShiftAssign),
                _ => create(state, Token::BRShift),
            },
            Some('=') => next_and_create(it, state, Token::Geq),
            _ => create(state, Token::Ge),
        },
        '+' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::AddAssign),
            _ => create(state, Token::Add),
        },
        '-' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::SubAssign),
            Some('>') => next_and_create(it, state, Token::To),
            _ => create(state, Token::Sub),
        },
        '*' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::MulAssign),
            _ => create(state, Token::Mul),
        },
        '/' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::DivAssign),
            Some('/') => next_and_create(it, state, Token::FDiv),
            _ => create(state, Token::Div),
        },
        '\\' => create(state, Token::BSlash),
        '^' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::PowAssign),
            _ => create(state, Token::Pow),
        },
        '=' => match it.peek() {
            Some('>') => next_and_create(it, state, Token::BTo),
            _ => create(state, Token::Eq),
        },
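        // A '#' starts a comment which runs until the end of the line.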
        '#' => {
            let mut comment = String::new();
            while it.peek().is_some() && *it.peek().unwrap() != '\n' && *it.peek().unwrap() != '\r'
            {
                comment.push(it.next().unwrap());
            }
            create(state, Token::Comment(comment))
        }
        '!' => match it.peek() {
            Some('=') => next_and_create(it, state, Token::Neq),
            _ => {
                let msg = String::from("'!' is not a valid character on its own");
                Err(LexErr::new(state.pos, None, &msg))
            }
        },
        '?' => create(state, Token::Question),
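        // Numeric literal: an integer, a real (containing '.'), or an E-notation number.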
        '0'..='9' => {
            let mut number = c.to_string();
            let mut exp = String::new();
            let mut float = false;
            let mut e_num = false;

            while let Some(&c) = it.peek() {
                match c {
                    '0'..='9' if !e_num => {
                        number.push(c);
                        it.next();
                    }
                    '0'..='9' if e_num => {
                        exp.push(c);
                        it.next();
                    }
                    'E' if e_num => break,
                    'E' => {
                        e_num = true;
                        it.next();
                    }
                    '.' if float || e_num => break,
                    '.' => {
                        {
                            // Check that this is not a range ('..') by peeking one extra character ahead
                            let mut it = it.clone();
                            it.next();
                            if let Some('.') = it.peek() {
                                break;
                            }
                        }

                        number.push(c);
                        float = true;
                        it.next();
                    }
                    _ => break,
                }
            }
            create(
                state,
                if e_num {
                    Token::ENum(number, exp)
                } else if float {
                    Token::Real(number)
                } else {
                    Token::Int(number)
                },
            )
        }
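        // Identifier or keyword: `as_op_or_id` maps reserved words to their tokens.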
        'a'..='z' | 'A'..='Z' | '_' => {
            let mut id_or_operation = c.to_string();
            while let Some(c) = it.peek() {
                match c {
                    'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
                        id_or_operation.push(*c);
                        it.next();
                    }
                    _ => break,
                }
            }
            create(state, as_op_or_id(id_or_operation))
        }
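        // String literal, which may contain '{ ... }' interpolation expressions.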
        '"' => {
            let mut string = String::new();
            let mut back_slash = false;

            let mut exprs: Vec<(CaretPos, String)> = vec![];
            let mut build_cur_expr = 0;
            let mut cur_offset = CaretPos::start();
            let mut cur_expr = String::new();

            for c in it {
                if !back_slash && build_cur_expr == 0 && c == '"' {
                    break;
                }
                string.push(c);

                if !back_slash {
                    if build_cur_expr > 0 {
                        cur_expr.push(c);
                    }

                    if c == '{' {
                        if build_cur_expr == 0 {
                            cur_offset = state.pos.offset_pos(string.len() + 1);
                        }
                        build_cur_expr += 1;
                    } else if c == '}' {
                        build_cur_expr -= 1;
                    }

                    if build_cur_expr == 0 && !cur_expr.is_empty() {
                        // Last char is always } due to counter
                        cur_expr = cur_expr[0..cur_expr.len() - 1].to_owned();
                        if !cur_expr.is_empty() {
                            exprs.push((cur_offset, cur_expr.clone()));
                        }
                        cur_expr.clear()
                    }
                }

                back_slash = c == '\\';
            }

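            // A string delimited by triple quotes is a docstring; otherwise every expression
            // found between braces is tokenized so that it can be parsed later on.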
            if string.starts_with("\"\"") && string.ends_with("\"\"") {
                let string = string.trim_start_matches("\"\"").trim_end_matches("\"\"");
                create(state, Token::DocStr(String::from(string)))
            } else {
                let tokens = exprs
                    .iter()
                    .map(|(offset, string)| match tokenize_direct(string) {
                        Ok(tokens) => Ok(tokens
                            .iter()
                            .map(|lex| Lex::new(lex.pos.offset(offset).start, lex.token.clone()))
                            .collect()),
                        Err(err) => Err(err),
                    })
                    .collect::<Result<_, _>>()?;

                create(state, Token::Str(string, tokens))
            }
        }
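        // A space produces no token of its own; the lexer state only records it.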
        ' ' => {
            state.space();
            Ok(vec![])
        }
        c => Err(LexErr::new(state.pos, None, &format!("unrecognized character: {c}"))),
    }
}

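/// Consume the peeked character before creating the lexeme for `token`.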
fn next_and_create(
    it: &mut Peekable<Chars>,
    state: &mut State,
    token: Token,
) -> LexResult<Vec<Lex>> {
    it.next();
    create(state, token)
}

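/// Produce the lexemes for `token` at the position currently tracked by `state`.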
fn create(state: &mut State, token: Token) -> LexResult<Vec<Lex>> {
    Ok(state.token(token))
}

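/// Map a reserved word to its keyword token; any other word becomes an identifier.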
fn as_op_or_id(string: String) -> Token {
    match string.as_ref() {
        "_" => Token::Underscore,

        "from" => Token::From,
        "type" => Token::Type,
        "class" => Token::Class,
        "pure" => Token::Pure,
        "as" => Token::As,

        "import" => Token::Import,
        "forward" => Token::Forward,
        "self" => Token::_Self,
        "vararg" => Token::Vararg,
        "init" => Token::Init,

        "def" => Token::Def,
        "fin" => Token::Fin,
        "and" => Token::And,
        "or" => Token::Or,
        "not" => Token::Not,
        "is" => Token::Is,
        "isa" => Token::IsA,
        "isnt" => Token::IsN,
        "isnta" => Token::IsNA,
        "mod" => Token::Mod,
        "sqrt" => Token::Sqrt,
        "while" => Token::While,
        "for" => Token::For,

        "_and_" => Token::BAnd,
        "_or_" => Token::BOr,
        "_xor_" => Token::BXOr,
        "_not_" => Token::BOneCmpl,

        "if" => Token::If,
        "else" => Token::Else,
        "match" => Token::Match,
        "continue" => Token::Continue,
        "break" => Token::Break,
        "return" => Token::Ret,
        "then" => Token::Then,
        "do" => Token::Do,
        "with" => Token::With,

        "in" => Token::In,

        "raise" => Token::Raise,
        "handle" => Token::Handle,
        "when" => Token::When,

        "True" => Token::Bool(true),
        "False" => Token::Bool(false),

        "None" => Token::Undefined,
        "pass" => Token::Pass,

        _ => Token::Id(string),
    }
}

#[cfg(test)]
mod test {
    use crate::parse::lex::result::LexErr;
    use crate::parse::lex::token::Token;
    use crate::parse::lex::tokenize;

    #[test]
    fn function_with_ret() -> Result<(), LexErr> {
        let source = "def f(x: Int) -> Int =>\n    return";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Def);
        assert_eq!(tokens[1].token, Token::Id(String::from("f")));
        assert_eq!(tokens[2].token, Token::LRBrack);
        assert_eq!(tokens[3].token, Token::Id(String::from("x")));
        assert_eq!(tokens[4].token, Token::DoublePoint);
        assert_eq!(tokens[5].token, Token::Id(String::from("Int")));
        assert_eq!(tokens[6].token, Token::RRBrack);
        assert_eq!(tokens[7].token, Token::To);
        assert_eq!(tokens[8].token, Token::Id(String::from("Int")));
        assert_eq!(tokens[9].token, Token::BTo);
        assert_eq!(tokens[10].token, Token::NL);
        assert_eq!(tokens[11].token, Token::Indent);
        assert_eq!(tokens[12].token, Token::Ret);
        assert_eq!(tokens[13].token, Token::Dedent);

        Ok(())
    }

    #[test]
    fn class_with_body_class_right_after() -> Result<(), LexErr> {
        let source = "class MyClass\n    def var := 10\nclass MyClass1\n";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Class);
        assert_eq!(tokens[1].token, Token::Id(String::from("MyClass")));
        assert_eq!(tokens[2].token, Token::NL);
        assert_eq!(tokens[3].token, Token::Indent);
        assert_eq!(tokens[4].token, Token::Def);
        assert_eq!(tokens[5].token, Token::Id(String::from("var")));
        assert_eq!(tokens[6].token, Token::Assign);
        assert_eq!(tokens[7].token, Token::Int(String::from("10")));
        assert_eq!(tokens[8].token, Token::NL);
        assert_eq!(tokens[9].token, Token::Dedent);
        assert_eq!(tokens[10].token, Token::NL);
        assert_eq!(tokens[11].token, Token::Class);
        assert_eq!(tokens[12].token, Token::Id(String::from("MyClass1")));

        Ok(())
    }

    #[test]
    fn if_statement() -> Result<(), LexErr> {
        let source = "if a then\n    b\nelse\n    c";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::If);
        assert_eq!(tokens[1].token, Token::Id(String::from("a")));
        assert_eq!(tokens[2].token, Token::Then);
        assert_eq!(tokens[3].token, Token::NL);
        assert_eq!(tokens[4].token, Token::Indent);
        assert_eq!(tokens[5].token, Token::Id(String::from("b")));
        assert_eq!(tokens[6].token, Token::NL);
        assert_eq!(tokens[7].token, Token::Dedent);
        assert_eq!(tokens[8].token, Token::NL);
        assert_eq!(tokens[9].token, Token::Else);
        assert_eq!(tokens[10].token, Token::NL);
        assert_eq!(tokens[11].token, Token::Indent);
        assert_eq!(tokens[12].token, Token::Id(String::from("c")));

        Ok(())
    }

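    // E-notation literals keep the mantissa and the exponent as separate strings.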
    #[test]
    fn e_number() -> Result<(), LexErr> {
        let source = "3E4";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::ENum(String::from("3"), String::from("4")));
        Ok(())
    }

    #[test]
    fn int() -> Result<(), LexErr> {
        let source = "0";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Int(String::from("0")));
        Ok(())
    }

    #[test]
    fn real() -> Result<(), LexErr> {
        let source = "0.";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Real(String::from("0.")));
        Ok(())
    }

    #[test]
    fn real2() -> Result<(), LexErr> {
        let source = "0.0";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Real(String::from("0.0")));
        Ok(())
    }

    #[test]
    fn real3() -> Result<(), LexErr> {
        let source = "0.0.";
        let tokens = tokenize(&source)
            .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

        assert_eq!(tokens[0].token, Token::Real(String::from("0.0")));
        assert_eq!(tokens[1].token, Token::Point);
        Ok(())
    }

    #[test]
    fn range_incl() -> Result<(), LexErr> {
        let sources = vec!["0 ..= 2", "0..= 2", "0 ..=2", "0..=2"];

        for source in sources {
            let tokens = tokenize(&source)
                .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

            assert_eq!(tokens[0].token, Token::Int(String::from("0")), "(0): {}", source);
            assert_eq!(tokens[1].token, Token::RangeIncl, "(..=): {}", source);
            assert_eq!(tokens[2].token, Token::Int(String::from("2")), "(2): {}", source);
        }

        Ok(())
    }

    #[test]
    fn range() -> Result<(), LexErr> {
        let sources = vec!["0 .. 2", "0.. 2", "0 ..2", "0..2"];

        for source in sources {
            let tokens = tokenize(&source)
                .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

            assert_eq!(tokens[0].token, Token::Int(String::from("0")), "(0): {}", source);
            assert_eq!(tokens[1].token, Token::Range, "(..): {}", source);
            assert_eq!(tokens[2].token, Token::Int(String::from("2")), "(2): {}", source);
        }

        Ok(())
    }

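    // A third '.' is not folded into the range token: '...' lexes as '..' followed by '.'.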
    #[test]
    fn range_tripped_up() -> Result<(), LexErr> {
        let sources = vec!["0 ... 2", "0... 2", "0 ...2", "0...2"];

        for source in sources {
            let tokens = tokenize(&source)
                .map_err(|e| e.into_with_source(&Some(String::from(source)), &None))?;

            assert_eq!(tokens[0].token, Token::Int(String::from("0")), "(0): {}", source);
            assert_eq!(tokens[1].token, Token::Range, "(..): {}", source);
            assert_eq!(tokens[2].token, Token::Point, "(.): {}", source);
            assert_eq!(tokens[3].token, Token::Int(String::from("2")), "(2): {}", source);
        }

        Ok(())
    }
}