makepad_code_editor/
tokenizer.rs

1use crate::{
2    text::{Change, Text},
3    token::TokenKind,
4    Token,
5};
6
7#[derive(Clone, Debug, Eq, Hash, PartialEq)]
8pub struct Tokenizer {
9    state: Vec<Option<(State, State)>>,
10}
11
12impl Tokenizer {
13    pub fn new(line_count: usize) -> Self {
14        Self {
15            state: (0..line_count).map(|_| None).collect(),
16        }
17    }
18
19    pub fn apply_change(&mut self, change: &Change) {
20        match *change {
21            Change::Insert(point, ref text) => {
22                self.state[point.line_index] = None;
23                let line_count = text.length().line_count;
24                if line_count > 0 {
25                    let line = point.line_index + 1;
26                    self.state.splice(line..line, (0..line_count).map(|_| None));
27                }
28            }
29            Change::Delete(start, length) => {
30                self.state[start.line_index] = None;
31                let line_count = length.line_count;
32                if line_count > 0 {
33                    let start_line = start.line_index + 1;
34                    let end_line = start_line + line_count;
35                    self.state.drain(start_line..end_line);
36                }
37            }
38        }
39    }
40
41    pub fn update(&mut self, text: &Text, tokens: &mut [Vec<Token>]) {
42        let mut state = State::default();
43        for line in 0..text.as_lines().len() {
44            match self.state[line] {
45                Some((start_state, end_state)) if state == start_state => {
46                    state = end_state;
47                }
48                _ => {
49                    let start_state = state;
50                    let mut new_tokens = Vec::new();
51                    let mut cursor = Cursor::new(&text.as_lines()[line]);
52                    loop {
53                        let (next_state, token) = state.next(&mut cursor);
54                        state = next_state;
55                        match token {
56                            Some(token) => new_tokens.push(token),
57                            None => break,
58                        }
59                    }
60                    self.state[line] = Some((start_state, state));
61                    tokens[line] = new_tokens;
62                }
63            }
64        }
65    }
66}
67
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub enum State {
70    Initial(InitialState),
71    BlockCommentTail(BlockCommentTailState),
72    DoubleQuotedStringTail(DoubleQuotedStringTailState),
73    RawDoubleQuotedStringTail(RawDoubleQuotedStringTailState),
74}
75
76impl Default for State {
77    fn default() -> State {
78        State::Initial(InitialState)
79    }
80}
81
82impl State {
83    pub fn next(self, cursor: &mut Cursor) -> (State, Option<Token>) {
84        if cursor.peek(0) == '\0' {
85            return (self, None);
86        }
87        let start = cursor.index;
88        let (next_state, kind) = match self {
89            State::Initial(state) => state.next(cursor),
90            State::BlockCommentTail(state) => state.next(cursor),
91            State::DoubleQuotedStringTail(state) => state.next(cursor),
92            State::RawDoubleQuotedStringTail(state) => state.next(cursor),
93        };
94        let end = cursor.index;
95        assert!(start < end);
96        (
97            next_state,
98            Some(Token {
99                len: end - start,
100                kind,
101            }),
102        )
103    }
104}
105
106#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
107pub struct InitialState;
108
109impl InitialState {
110    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
111        match (cursor.peek(0), cursor.peek(1), cursor.peek(2)) {
112            ('r', '#', '"') | ('r', '#', '#') => self.raw_string(cursor),
113            ('b', 'r', '"') | ('b', 'r', '#') => self.raw_byte_string(cursor),
114            ('/', '/', _) => self.line_comment(cursor),
115            ('/', '*', _) => self.block_comment(cursor),
116            ('b', '\'', _) => self.byte(cursor),
117            ('b', '"', _) => self.byte_string(cursor),
118            ('!', '=', _)
119            | ('%', '=', _)
120            | ('&', '&', _)
121            | ('&', '=', _)
122            | ('*', '=', _)
123            | ('+', '=', _)
124            | ('-', '=', _)
125            | ('-', '>', _)
126            | ('.', '.', _)
127            | ('/', '=', _)
128            | (':', ':', _)
129            | ('<', '<', _)
130            | ('<', '=', _)
131            | ('=', '=', _)
132            | ('=', '>', _)
133            | ('>', '=', _)
134            | ('>', '>', _)
135            | ('^', '=', _)
136            | ('|', '=', _)
137            | ('|', '|', _) => {
138                cursor.skip(2);
139                (State::Initial(InitialState), TokenKind::Punctuator)
140            }
141            ('\'', _, _) => self.char_or_lifetime(cursor),
142            ('"', _, _) => self.string(cursor),
143            ('(', _, _) => {
144                cursor.skip(1);
145                (State::Initial(InitialState), TokenKind::Delimiter)
146            }
147            (')', _, _) => {
148                cursor.skip(1);
149                (State::Initial(InitialState), TokenKind::Delimiter)
150            }
151            ('[', _, _) => {
152                cursor.skip(1);
153                (State::Initial(InitialState), TokenKind::Delimiter)
154            }
155            (']', _, _) => {
156                cursor.skip(1);
157                (State::Initial(InitialState), TokenKind::Delimiter)
158            }
159            ('{', _, _) => {
160                cursor.skip(1);
161                (State::Initial(InitialState), TokenKind::Delimiter)
162            }
163            ('}', _, _) => {
164                cursor.skip(1);
165                (State::Initial(InitialState), TokenKind::Delimiter)
166            }
167            ('.', char, _) if char.is_digit(10) => self.number(cursor),
168            ('!', _, _)
169            | ('#', _, _)
170            | ('$', _, _)
171            | ('%', _, _)
172            | ('&', _, _)
173            | ('*', _, _)
174            | ('+', _, _)
175            | (',', _, _)
176            | ('-', _, _)
177            | ('.', _, _)
178            | ('/', _, _)
179            | (':', _, _)
180            | (';', _, _)
181            | ('<', _, _)
182            | ('=', _, _)
183            | ('>', _, _)
184            | ('?', _, _)
185            | ('@', _, _)
186            | ('^', _, _)
187            | ('_', _, _)
188            | ('|', _, _) => {
189                cursor.skip(1);
190                (State::Initial(InitialState), TokenKind::Punctuator)
191            }
192            (char, _, _) if char.is_identifier_start() => self.identifier_or_keyword(cursor),
193            (char, _, _) if char.is_digit(10) => self.number(cursor),
194            (char, _, _) if char.is_whitespace() => self.whitespace(cursor),
195            _ => {
196                cursor.skip(1);
197                (State::Initial(InitialState), TokenKind::Unknown)
198            }
199        }
200    }
201
202    fn line_comment(self, cursor: &mut Cursor) -> (State, TokenKind) {
203        debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '/');
204        cursor.skip(2);
205        while cursor.skip_if(|ch| ch != '\0') {}
206        (State::Initial(InitialState), TokenKind::Comment)
207    }
208
209    fn block_comment(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
210        debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '*');
211        cursor.skip(2);
212        BlockCommentTailState { depth: 0 }.next(cursor)
213    }
214
215    fn identifier_or_keyword(self, cursor: &mut Cursor) -> (State, TokenKind) {
216        debug_assert!(cursor.peek(0).is_identifier_start());
217        let start = cursor.index;
218        cursor.skip(1);
219        while cursor.skip_if(|char| char.is_identifier_continue()) {}
220        let end = cursor.index;
221        let string = &cursor.string[start..end];
222        (
223            State::Initial(InitialState),
224            match string {
225                "else" | "if" | "match" | "return" => TokenKind::BranchKeyword,
226                "break" | "continue" | "for" | "loop" | "while" => TokenKind::LoopKeyword,
227                "Self" | "as" | "async" | "await" | "const" | "crate" | "dyn" | "enum"
228                | "extern" | "false" | "fn" | "impl" | "in" | "let" | "mod" | "move" | "mut"
229                | "pub" | "ref" | "self" | "static" | "struct" | "super" | "trait" | "true"
230                | "type" | "unsafe" | "use" | "where" | "usize" | "isize" | "u8" | "u16"
231                | "u32" | "u64" | "i8" | "i16" | "i32" | "i64" | "vec2" | "vec3" | "vec4"
232                | "bool" | "f32" | "f64" => TokenKind::OtherKeyword,
233                _ => {
234                    let mut chars = string.chars();
235                    if chars.next().unwrap().is_uppercase() {
236                        match chars.next() {
237                            Some(char) if char.is_uppercase() => TokenKind::Constant,
238                            _ => TokenKind::Typename,
239                        }
240                    } else if cursor.peek(0) == '(' {
241                        TokenKind::Function
242                    } else {
243                        TokenKind::Identifier
244                    }
245                }
246            },
247        )
248    }
249
250    fn number(self, cursor: &mut Cursor) -> (State, TokenKind) {
251        match (cursor.peek(0), cursor.peek(1)) {
252            ('0', 'b') => {
253                cursor.skip(2);
254                if !cursor.skip_digits(2) {
255                    return (State::Initial(InitialState), TokenKind::Unknown);
256                }
257                return (State::Initial(InitialState), TokenKind::Number);
258            }
259            ('0', 'o') => {
260                cursor.skip(2);
261                if !cursor.skip_digits(8) {
262                    return (State::Initial(InitialState), TokenKind::Unknown);
263                }
264                return (State::Initial(InitialState), TokenKind::Number);
265            }
266            ('0', 'x') => {
267                cursor.skip(2);
268                if !cursor.skip_digits(16) {
269                    return (State::Initial(InitialState), TokenKind::Unknown);
270                }
271                return (State::Initial(InitialState), TokenKind::Number);
272            }
273            _ => {
274                cursor.skip_digits(10);
275                match cursor.peek(0) {
276                    '.' if cursor.peek(1) != '.' && !cursor.peek(0).is_identifier_start() => {
277                        cursor.skip(1);
278                        if cursor.skip_digits(10) {
279                            if cursor.peek(0) == 'E' || cursor.peek(0) == 'e' {
280                                if !cursor.skip_exponent() {
281                                    return (State::Initial(InitialState), TokenKind::Unknown);
282                                }
283                            }
284                        }
285                        cursor.skip_suffix();
286                        return (State::Initial(InitialState), TokenKind::Number);
287                    }
288                    'E' | 'e' => {
289                        if !cursor.skip_exponent() {
290                            return (State::Initial(InitialState), TokenKind::Unknown);
291                        }
292                        cursor.skip_suffix();
293                        return (State::Initial(InitialState), TokenKind::Number);
294                    }
295                    _ => {
296                        cursor.skip_suffix();
297                        return (State::Initial(InitialState), TokenKind::Number);
298                    }
299                }
300            }
301        };
302    }
303
304    fn char_or_lifetime(self, cursor: &mut Cursor) -> (State, TokenKind) {
305        if cursor.peek(1).is_identifier_start() && cursor.peek(2) != '\'' {
306            debug_assert!(cursor.peek(0) == '\'');
307            cursor.skip(2);
308            while cursor.skip_if(|ch| ch.is_identifier_continue()) {}
309            if cursor.peek(0) == '\'' {
310                cursor.skip(1);
311                cursor.skip_suffix();
312                (State::Initial(InitialState), TokenKind::String)
313            } else {
314                (State::Initial(InitialState), TokenKind::String)
315            }
316        } else {
317            self.single_quoted_string(cursor)
318        }
319    }
320
321    fn byte(self, cursor: &mut Cursor) -> (State, TokenKind) {
322        debug_assert!(cursor.peek(0) == 'b');
323        cursor.skip(1);
324        self.single_quoted_string(cursor)
325    }
326
327    fn string(self, cursor: &mut Cursor) -> (State, TokenKind) {
328        self.double_quoted_string(cursor)
329    }
330
331    fn byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
332        debug_assert!(cursor.peek(0) == 'b');
333        cursor.skip(1);
334        self.double_quoted_string(cursor)
335    }
336
337    fn raw_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
338        debug_assert!(cursor.peek(0) == 'r');
339        cursor.skip(1);
340        self.raw_double_quoted_string(cursor)
341    }
342
343    fn raw_byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
344        debug_assert!(cursor.peek(0) == 'b' && cursor.peek(1) == 'r');
345        cursor.skip(2);
346        self.raw_double_quoted_string(cursor)
347    }
348
349    fn single_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
350        debug_assert!(cursor.peek(0) == '\'');
351        cursor.skip(1);
352        loop {
353            match (cursor.peek(0), cursor.peek(1)) {
354                ('\'', _) => {
355                    cursor.skip(1);
356                    cursor.skip_suffix();
357                    break;
358                }
359                ('\0', _) => return (State::Initial(InitialState), TokenKind::Unknown),
360                ('\\', '\'') | ('\\', '\\') => cursor.skip(2),
361                _ => cursor.skip(1),
362            }
363        }
364        (State::Initial(InitialState), TokenKind::String)
365    }
366
367    fn double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
368        debug_assert!(cursor.peek(0) == '"');
369        cursor.skip(1);
370        DoubleQuotedStringTailState.next(cursor)
371    }
372
373    fn raw_double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
374        let mut start_hash_count = 0;
375        while cursor.skip_if(|ch| ch == '#') {
376            start_hash_count += 1;
377        }
378        RawDoubleQuotedStringTailState { start_hash_count }.next(cursor)
379    }
380
381    fn whitespace(self, cursor: &mut Cursor) -> (State, TokenKind) {
382        debug_assert!(cursor.peek(0).is_whitespace());
383        cursor.skip(1);
384        while cursor.skip_if(|char| char.is_whitespace()) {}
385        (State::Initial(InitialState), TokenKind::Whitespace)
386    }
387}
388
389#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
390pub struct BlockCommentTailState {
391    depth: usize,
392}
393
394impl BlockCommentTailState {
395    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
396        let mut state = self;
397        loop {
398            match (cursor.peek(0), cursor.peek(1)) {
399                ('/', '*') => {
400                    cursor.skip(2);
401                    state.depth += 1;
402                }
403                ('*', '/') => {
404                    cursor.skip(2);
405                    if state.depth == 0 {
406                        break (State::Initial(InitialState), TokenKind::Comment);
407                    }
408                    state.depth -= 1;
409                }
410                ('\0', _) => {
411                    break (State::BlockCommentTail(state), TokenKind::Comment);
412                }
413                _ => cursor.skip(1),
414            }
415        }
416    }
417}
418
419#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
420pub struct DoubleQuotedStringTailState;
421
422impl DoubleQuotedStringTailState {
423    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
424        loop {
425            match (cursor.peek(0), cursor.peek(1)) {
426                ('"', _) => {
427                    cursor.skip(1);
428                    cursor.skip_suffix();
429                    break (State::Initial(InitialState), TokenKind::String);
430                }
431                ('\0', _) => {
432                    break (
433                        State::DoubleQuotedStringTail(DoubleQuotedStringTailState),
434                        TokenKind::String,
435                    );
436                }
437                ('\\', '"') | ('\\', '\\') => cursor.skip(2),
438                _ => cursor.skip(1),
439            }
440        }
441    }
442}
443
444#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
445pub struct RawDoubleQuotedStringTailState {
446    start_hash_count: usize,
447}
448
449impl RawDoubleQuotedStringTailState {
450    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
451        loop {
452            match cursor.peek(0) {
453                '"' => {
454                    cursor.skip(1);
455                    let mut end_hash_count = 0;
456                    while end_hash_count < self.start_hash_count && cursor.skip_if(|ch| ch == '#') {
457                        end_hash_count += 1;
458                    }
459                    if end_hash_count == self.start_hash_count {
460                        cursor.skip_suffix();
461                        break (State::Initial(InitialState), TokenKind::String);
462                    }
463                }
464                '\0' => {
465                    break (State::RawDoubleQuotedStringTail(self), TokenKind::String);
466                }
467                _ => cursor.skip(1),
468            }
469        }
470    }
471}
472
473#[derive(Debug)]
474pub struct Cursor<'a> {
475    string: &'a str,
476    index: usize,
477}
478
479impl<'a> Cursor<'a> {
480    pub fn new(string: &'a str) -> Self {
481        Cursor { string, index: 0 }
482    }
483
484    fn peek(&self, index: usize) -> char {
485        self.string[self.index..].chars().nth(index).unwrap_or('\0')
486    }
487
488    fn skip(&mut self, count: usize) {
489        self.index = self.string[self.index..]
490            .char_indices()
491            .nth(count)
492            .map_or(self.string.len(), |(index, _)| self.index + index);
493    }
494
495    fn skip_if<P>(&mut self, predicate: P) -> bool
496    where
497        P: FnOnce(char) -> bool,
498    {
499        if predicate(self.peek(0)) {
500            self.skip(1);
501            true
502        } else {
503            false
504        }
505    }
506
507    fn skip_exponent(&mut self) -> bool {
508        debug_assert!(self.peek(0) == 'E' || self.peek(0) == 'e');
509        self.skip(1);
510        if self.peek(0) == '+' || self.peek(0) == '-' {
511            self.skip(1);
512        }
513        self.skip_digits(10)
514    }
515
516    fn skip_digits(&mut self, radix: u32) -> bool {
517        let mut has_skip_digits = false;
518        loop {
519            match self.peek(0) {
520                '_' => {
521                    self.skip(1);
522                }
523                char if char.is_digit(radix) => {
524                    self.skip(1);
525                    has_skip_digits = true;
526                }
527                _ => break,
528            }
529        }
530        has_skip_digits
531    }
532
533    fn skip_suffix(&mut self) -> bool {
534        if self.peek(0).is_identifier_start() {
535            self.skip(1);
536            while self.skip_if(|char| char.is_identifier_continue()) {}
537            return true;
538        }
539        false
540    }
541}
542
/// Character classification used by the tokenizer for identifiers.
pub trait CharExt {
    /// Returns `true` if the character may begin an identifier.
    fn is_identifier_start(self) -> bool;
    /// Returns `true` if the character may appear after the first character
    /// of an identifier.
    fn is_identifier_continue(self) -> bool;
}

impl CharExt for char {
    /// ASCII letters and `_` start an identifier.
    fn is_identifier_start(self) -> bool {
        self == '_' || self.is_ascii_alphabetic()
    }

    /// ASCII letters, ASCII digits and `_` continue an identifier.
    fn is_identifier_continue(self) -> bool {
        self == '_' || self.is_ascii_alphanumeric()
    }
}