makepad_code_editor/
tokenizer.rs

1use crate::{
2    text::{Change, Text},
3    token::TokenKind,
4    Token,
5};
6
7#[derive(Clone, Debug, Eq, Hash, PartialEq)]
8pub struct Tokenizer {
9    state: Vec<Option<(State, State)>>,
10}
11
12impl Tokenizer {
13    pub fn new(line_count: usize) -> Self {
14        Self {
15            state: (0..line_count).map(|_| None).collect(),
16        }
17    }
18
19    pub fn apply_change(&mut self, change: &Change) {
20        match *change {
21            Change::Insert(point, ref text) => {
22                self.state[point.line_index] = None;
23                let line_count = text.length().line_count;
24                if line_count > 0 {
25                    let line = point.line_index + 1;
26                    self.state.splice(line..line, (0..line_count).map(|_| None));
27                }
28            }
29            Change::Delete(start, length) => {
30                self.state[start.line_index] = None;
31                let line_count = length.line_count;
32                if line_count > 0 {
33                    let start_line = start.line_index + 1;
34                    let end_line = start_line + line_count;
35                    self.state.drain(start_line..end_line);
36                }
37            }
38        }
39    }
40
41    pub fn update(&mut self, text: &Text, tokens: &mut [Vec<Token>]) {
42        let mut state = State::default();
43        for line in 0..text.as_lines().len() {
44            match self.state[line] {
45                Some((start_state, end_state)) if state == start_state => {
46                    state = end_state;
47                }
48                _ => {
49                    let start_state = state;
50                    let mut new_tokens = Vec::new();
51                    let mut cursor = Cursor::new(&text.as_lines()[line]);
52                    loop {
53                        let (next_state, token) = state.next(&mut cursor);
54                        state = next_state;
55                        match token {
56                            Some(token) => new_tokens.push(token),
57                            None => break,
58                        }
59                    }
60                    self.state[line] = Some((start_state, state));
61                    tokens[line] = new_tokens;
62                }
63            }
64        }
65    }
66}
67
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub enum State {
70    Initial(InitialState),
71    BlockCommentTail(BlockCommentTailState),
72    DoubleQuotedStringTail(DoubleQuotedStringTailState),
73    RawDoubleQuotedStringTail(RawDoubleQuotedStringTailState),
74}
75
76impl Default for State {
77    fn default() -> State {
78        State::Initial(InitialState)
79    }
80}
81
82impl State {
83    pub fn next(self, cursor: &mut Cursor) -> (State, Option<Token>) {
84        if cursor.peek(0) == '\0' {
85            return (self, None);
86        }
87        let start = cursor.index;
88        let (next_state, kind) = match self {
89            State::Initial(state) => state.next(cursor),
90            State::BlockCommentTail(state) => state.next(cursor),
91            State::DoubleQuotedStringTail(state) => state.next(cursor),
92            State::RawDoubleQuotedStringTail(state) => state.next(cursor),
93        };
94        let end = cursor.index;
95        assert!(start < end);
96        (
97            next_state,
98            Some(Token {
99                len: end - start,
100                kind,
101            }),
102        )
103    }
104}
105
106#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
107pub struct InitialState;
108
109impl InitialState {
110    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
111        match (cursor.peek(0), cursor.peek(1), cursor.peek(2)) {
112            ('r', '#', '"') | ('r', '#', '#') => self.raw_string(cursor),
113            ('b', 'r', '"') | ('b', 'r', '#') => self.raw_byte_string(cursor),
114            ('/', '/', _) => self.line_comment(cursor),
115            ('/', '*', _) => self.block_comment(cursor),
116            ('b', '\'', _) => self.byte(cursor),
117            ('b', '"', _) => self.byte_string(cursor),
118            ('!', '=', _)
119            | ('%', '=', _)
120            | ('&', '&', _)
121            | ('&', '=', _)
122            | ('*', '=', _)
123            | ('+', '=', _)
124            | ('-', '=', _)
125            | ('-', '>', _)
126            | ('.', '.', _)
127            | ('/', '=', _)
128            | (':', ':', _)
129            | ('<', '<', _)
130            | ('<', '=', _)
131            | ('=', '=', _)
132            | ('=', '>', _)
133            | ('>', '=', _)
134            | ('>', '>', _)
135            | ('^', '=', _)
136            | ('|', '=', _)
137            | ('|', '|', _) => {
138                cursor.skip(2);
139                (State::Initial(InitialState), TokenKind::Punctuator)
140            }
141            ('\'', _, _) => self.char_or_lifetime(cursor),
142            ('"', _, _) => self.string(cursor),
143            ('(', _, _) => {
144                cursor.skip(1);
145                (State::Initial(InitialState), TokenKind::Delimiter)
146            }
147            (')', _, _) => {
148                cursor.skip(1);
149                (State::Initial(InitialState), TokenKind::Delimiter)
150            }
151            ('[', _, _) => {
152                cursor.skip(1);
153                (State::Initial(InitialState), TokenKind::Delimiter)
154            }
155            (']', _, _) => {
156                cursor.skip(1);
157                (State::Initial(InitialState), TokenKind::Delimiter)
158            }
159            ('{', _, _) => {
160                cursor.skip(1);
161                (State::Initial(InitialState), TokenKind::Delimiter)
162            }
163            ('}', _, _) => {
164                cursor.skip(1);
165                (State::Initial(InitialState), TokenKind::Delimiter)
166            }
167            ('.', char, _) if char.is_digit(10) => self.number(cursor),
168            ('!', _, _)
169            | ('#', _, _)
170            | ('$', _, _)
171            | ('%', _, _)
172            | ('&', _, _)
173            | ('*', _, _)
174            | ('+', _, _)
175            | (',', _, _)
176            | ('-', _, _)
177            | ('.', _, _)
178            | ('/', _, _)
179            | (':', _, _)
180            | (';', _, _)
181            | ('<', _, _)
182            | ('=', _, _)
183            | ('>', _, _)
184            | ('?', _, _)
185            | ('@', _, _)
186            | ('^', _, _)
187            | ('_', _, _)
188            | ('|', _, _) => {
189                cursor.skip(1);
190                (State::Initial(InitialState), TokenKind::Punctuator)
191            }
192            (char, _, _) if char.is_identifier_start() => self.identifier_or_keyword(cursor),
193            (char, _, _) if char.is_digit(10) => self.number(cursor),
194            (char, _, _) if char.is_whitespace() => self.whitespace(cursor),
195            _ => {
196                cursor.skip(1);
197                (State::Initial(InitialState), TokenKind::Unknown)
198            }
199        }
200    }
201
202    fn line_comment(self, cursor: &mut Cursor) -> (State, TokenKind) {
203        debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '/');
204        cursor.skip(2);
205        while cursor.skip_if(|ch| ch != '\0') {}
206        (State::Initial(InitialState), TokenKind::Comment)
207    }
208
209    fn block_comment(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
210        debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '*');
211        cursor.skip(2);
212        BlockCommentTailState { depth: 0 }.next(cursor)
213    }
214
215    fn identifier_or_keyword(self, cursor: &mut Cursor) -> (State, TokenKind) {
216        debug_assert!(cursor.peek(0).is_identifier_start());
217        let start = cursor.index;
218        cursor.skip(1);
219        while cursor.skip_if(|char| char.is_identifier_continue()) {}
220        let end = cursor.index;
221        let string = &cursor.string[start..end];
222        (
223            State::Initial(InitialState),
224            match string {
225                "else" | "if" | "match" | "return" => TokenKind::BranchKeyword,
226                "break" | "continue" | "for" | "loop" | "while" => TokenKind::LoopKeyword,
227                "Self" | "as" | "async" | "await" | "const" | "crate" | "dyn" | "enum"
228                | "extern" | "false" | "fn" | "impl" | "in" | "let" | "mod" | "move" | "mut"
229                | "pub" | "ref" | "self" | "static" | "struct" | "super" | "trait" | "true"
230                | "type" | "unsafe" | "use" | "where" => TokenKind::OtherKeyword,
231                _ => {
232                    let mut chars = string.chars();
233                    if chars.next().unwrap().is_uppercase() {
234                        match chars.next() {
235                            Some(char) if char.is_uppercase() => TokenKind::Constant,
236                            _ => TokenKind::Typename,
237                        }
238                    } else {
239                        TokenKind::Identifier
240                    }
241                }
242            },
243        )
244    }
245
246    fn number(self, cursor: &mut Cursor) -> (State, TokenKind) {
247        match (cursor.peek(0), cursor.peek(1)) {
248            ('0', 'b') => {
249                cursor.skip(2);
250                if !cursor.skip_digits(2) {
251                    return (State::Initial(InitialState), TokenKind::Unknown);
252                }
253                return (State::Initial(InitialState), TokenKind::Number);
254            }
255            ('0', 'o') => {
256                cursor.skip(2);
257                if !cursor.skip_digits(8) {
258                    return (State::Initial(InitialState), TokenKind::Unknown);
259                }
260                return (State::Initial(InitialState), TokenKind::Number);
261            }
262            ('0', 'x') => {
263                cursor.skip(2);
264                if !cursor.skip_digits(16) {
265                    return (State::Initial(InitialState), TokenKind::Unknown);
266                }
267                return (State::Initial(InitialState), TokenKind::Number);
268            }
269            _ => {
270                cursor.skip_digits(10);
271                match cursor.peek(0) {
272                    '.' if cursor.peek(1) != '.' && !cursor.peek(0).is_identifier_start() => {
273                        cursor.skip(1);
274                        if cursor.skip_digits(10) {
275                            if cursor.peek(0) == 'E' || cursor.peek(0) == 'e' {
276                                if !cursor.skip_exponent() {
277                                    return (State::Initial(InitialState), TokenKind::Unknown);
278                                }
279                            }
280                        }
281                        cursor.skip_suffix();
282                        return (State::Initial(InitialState), TokenKind::Number);
283                    }
284                    'E' | 'e' => {
285                        if !cursor.skip_exponent() {
286                            return (State::Initial(InitialState), TokenKind::Unknown);
287                        }
288                        cursor.skip_suffix();
289                        return (State::Initial(InitialState), TokenKind::Number);
290                    }
291                    _ => {
292                        cursor.skip_suffix();
293                        return (State::Initial(InitialState), TokenKind::Number);
294                    }
295                }
296            }
297        };
298    }
299
300    fn char_or_lifetime(self, cursor: &mut Cursor) -> (State, TokenKind) {
301        if cursor.peek(1).is_identifier_start() && cursor.peek(2) != '\'' {
302            debug_assert!(cursor.peek(0) == '\'');
303            cursor.skip(2);
304            while cursor.skip_if(|ch| ch.is_identifier_continue()) {}
305            if cursor.peek(0) == '\'' {
306                cursor.skip(1);
307                cursor.skip_suffix();
308                (State::Initial(InitialState), TokenKind::String)
309            } else {
310                (State::Initial(InitialState), TokenKind::String)
311            }
312        } else {
313            self.single_quoted_string(cursor)
314        }
315    }
316
317    fn byte(self, cursor: &mut Cursor) -> (State, TokenKind) {
318        debug_assert!(cursor.peek(0) == 'b');
319        cursor.skip(1);
320        self.single_quoted_string(cursor)
321    }
322
323    fn string(self, cursor: &mut Cursor) -> (State, TokenKind) {
324        self.double_quoted_string(cursor)
325    }
326
327    fn byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
328        debug_assert!(cursor.peek(0) == 'b');
329        cursor.skip(1);
330        self.double_quoted_string(cursor)
331    }
332
333    fn raw_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
334        debug_assert!(cursor.peek(0) == 'r');
335        cursor.skip(1);
336        self.raw_double_quoted_string(cursor)
337    }
338
339    fn raw_byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
340        debug_assert!(cursor.peek(0) == 'b' && cursor.peek(1) == 'r');
341        cursor.skip(2);
342        self.raw_double_quoted_string(cursor)
343    }
344
345    fn single_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
346        debug_assert!(cursor.peek(0) == '\'');
347        cursor.skip(1);
348        loop {
349            match (cursor.peek(0), cursor.peek(1)) {
350                ('\'', _) => {
351                    cursor.skip(1);
352                    cursor.skip_suffix();
353                    break;
354                }
355                ('\0', _) => return (State::Initial(InitialState), TokenKind::Unknown),
356                ('\\', '\'') | ('\\', '\\') => cursor.skip(2),
357                _ => cursor.skip(1),
358            }
359        }
360        (State::Initial(InitialState), TokenKind::String)
361    }
362
363    fn double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
364        debug_assert!(cursor.peek(0) == '"');
365        cursor.skip(1);
366        DoubleQuotedStringTailState.next(cursor)
367    }
368
369    fn raw_double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
370        let mut start_hash_count = 0;
371        while cursor.skip_if(|ch| ch == '#') {
372            start_hash_count += 1;
373        }
374        RawDoubleQuotedStringTailState { start_hash_count }.next(cursor)
375    }
376
377    fn whitespace(self, cursor: &mut Cursor) -> (State, TokenKind) {
378        debug_assert!(cursor.peek(0).is_whitespace());
379        cursor.skip(1);
380        while cursor.skip_if(|char| char.is_whitespace()) {}
381        (State::Initial(InitialState), TokenKind::Whitespace)
382    }
383}
384
385#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
386pub struct BlockCommentTailState {
387    depth: usize,
388}
389
390impl BlockCommentTailState {
391    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
392        let mut state = self;
393        loop {
394            match (cursor.peek(0), cursor.peek(1)) {
395                ('/', '*') => {
396                    cursor.skip(2);
397                    state.depth += 1;
398                }
399                ('*', '/') => {
400                    cursor.skip(2);
401                    if state.depth == 0 {
402                        break (State::Initial(InitialState), TokenKind::Comment);
403                    }
404                    state.depth -= 1;
405                }
406                ('\0', _) => {
407                    break (State::BlockCommentTail(state), TokenKind::Comment);
408                }
409                _ => cursor.skip(1),
410            }
411        }
412    }
413}
414
415#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
416pub struct DoubleQuotedStringTailState;
417
418impl DoubleQuotedStringTailState {
419    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
420        loop {
421            match (cursor.peek(0), cursor.peek(1)) {
422                ('"', _) => {
423                    cursor.skip(1);
424                    cursor.skip_suffix();
425                    break (State::Initial(InitialState), TokenKind::String);
426                }
427                ('\0', _) => {
428                    break (
429                        State::DoubleQuotedStringTail(DoubleQuotedStringTailState),
430                        TokenKind::String,
431                    );
432                }
433                ('\\', '"') => cursor.skip(2),
434                _ => cursor.skip(1),
435            }
436        }
437    }
438}
439
440#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
441pub struct RawDoubleQuotedStringTailState {
442    start_hash_count: usize,
443}
444
445impl RawDoubleQuotedStringTailState {
446    fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
447        loop {
448            match cursor.peek(0) {
449                '"' => {
450                    cursor.skip(1);
451                    let mut end_hash_count = 0;
452                    while end_hash_count < self.start_hash_count && cursor.skip_if(|ch| ch == '#') {
453                        end_hash_count += 1;
454                    }
455                    if end_hash_count == self.start_hash_count {
456                        cursor.skip_suffix();
457                        break (State::Initial(InitialState), TokenKind::String);
458                    }
459                }
460                '\0' => {
461                    break (State::RawDoubleQuotedStringTail(self), TokenKind::String);
462                }
463                _ => cursor.skip(1),
464            }
465        }
466    }
467}
468
469#[derive(Debug)]
470pub struct Cursor<'a> {
471    string: &'a str,
472    index: usize,
473}
474
475impl<'a> Cursor<'a> {
476    pub fn new(string: &'a str) -> Self {
477        Cursor { string, index: 0 }
478    }
479
480    fn peek(&self, index: usize) -> char {
481        self.string[self.index..].chars().nth(index).unwrap_or('\0')
482    }
483
484    fn skip(&mut self, count: usize) {
485        self.index = self.string[self.index..]
486            .char_indices()
487            .nth(count)
488            .map_or(self.string.len(), |(index, _)| self.index + index);
489    }
490
491    fn skip_if<P>(&mut self, predicate: P) -> bool
492    where
493        P: FnOnce(char) -> bool,
494    {
495        if predicate(self.peek(0)) {
496            self.skip(1);
497            true
498        } else {
499            false
500        }
501    }
502
503    fn skip_exponent(&mut self) -> bool {
504        debug_assert!(self.peek(0) == 'E' || self.peek(0) == 'e');
505        self.skip(1);
506        if self.peek(0) == '+' || self.peek(0) == '-' {
507            self.skip(1);
508        }
509        self.skip_digits(10)
510    }
511
512    fn skip_digits(&mut self, radix: u32) -> bool {
513        let mut has_skip_digits = false;
514        loop {
515            match self.peek(0) {
516                '_' => {
517                    self.skip(1);
518                }
519                char if char.is_digit(radix) => {
520                    self.skip(1);
521                    has_skip_digits = true;
522                }
523                _ => break,
524            }
525        }
526        has_skip_digits
527    }
528
529    fn skip_suffix(&mut self) -> bool {
530        if self.peek(0).is_identifier_start() {
531            self.skip(1);
532            while self.skip_if(|char| char.is_identifier_continue()) {}
533            return true;
534        }
535        false
536    }
537}
538
/// Character classification used by the tokenizer for identifiers.
pub trait CharExt {
    /// Returns whether the character may start an identifier.
    fn is_identifier_start(self) -> bool;
    /// Returns whether the character may appear after the first character of an
    /// identifier.
    fn is_identifier_continue(self) -> bool;
}

impl CharExt for char {
    /// ASCII letters and `_` may start an identifier.
    fn is_identifier_start(self) -> bool {
        self == '_' || self.is_ascii_alphabetic()
    }

    /// ASCII letters, ASCII digits and `_` may continue an identifier.
    fn is_identifier_continue(self) -> bool {
        self == '_' || self.is_ascii_alphanumeric()
    }
}