1use crate::{
2 text::{Change, Text},
3 token::TokenKind,
4 Token,
5};
6
7#[derive(Clone, Debug, Eq, Hash, PartialEq)]
8pub struct Tokenizer {
9 state: Vec<Option<(State, State)>>,
10}
11
12impl Tokenizer {
13 pub fn new(line_count: usize) -> Self {
14 Self {
15 state: (0..line_count).map(|_| None).collect(),
16 }
17 }
18
19 pub fn apply_change(&mut self, change: &Change) {
20 match *change {
21 Change::Insert(point, ref text) => {
22 self.state[point.line_index] = None;
23 let line_count = text.length().line_count;
24 if line_count > 0 {
25 let line = point.line_index + 1;
26 self.state.splice(line..line, (0..line_count).map(|_| None));
27 }
28 }
29 Change::Delete(start, length) => {
30 self.state[start.line_index] = None;
31 let line_count = length.line_count;
32 if line_count > 0 {
33 let start_line = start.line_index + 1;
34 let end_line = start_line + line_count;
35 self.state.drain(start_line..end_line);
36 }
37 }
38 }
39 }
40
41 pub fn update(&mut self, text: &Text, tokens: &mut [Vec<Token>]) {
42 let mut state = State::default();
43 for line in 0..text.as_lines().len() {
44 match self.state[line] {
45 Some((start_state, end_state)) if state == start_state => {
46 state = end_state;
47 }
48 _ => {
49 let start_state = state;
50 let mut new_tokens = Vec::new();
51 let mut cursor = Cursor::new(&text.as_lines()[line]);
52 loop {
53 let (next_state, token) = state.next(&mut cursor);
54 state = next_state;
55 match token {
56 Some(token) => new_tokens.push(token),
57 None => break,
58 }
59 }
60 self.state[line] = Some((start_state, state));
61 tokens[line] = new_tokens;
62 }
63 }
64 }
65 }
66}
67
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub enum State {
70 Initial(InitialState),
71 BlockCommentTail(BlockCommentTailState),
72 DoubleQuotedStringTail(DoubleQuotedStringTailState),
73 RawDoubleQuotedStringTail(RawDoubleQuotedStringTailState),
74}
75
76impl Default for State {
77 fn default() -> State {
78 State::Initial(InitialState)
79 }
80}
81
82impl State {
83 pub fn next(self, cursor: &mut Cursor) -> (State, Option<Token>) {
84 if cursor.peek(0) == '\0' {
85 return (self, None);
86 }
87 let start = cursor.index;
88 let (next_state, kind) = match self {
89 State::Initial(state) => state.next(cursor),
90 State::BlockCommentTail(state) => state.next(cursor),
91 State::DoubleQuotedStringTail(state) => state.next(cursor),
92 State::RawDoubleQuotedStringTail(state) => state.next(cursor),
93 };
94 let end = cursor.index;
95 assert!(start < end);
96 (
97 next_state,
98 Some(Token {
99 len: end - start,
100 kind,
101 }),
102 )
103 }
104}
105
/// Lexer state outside of any unfinished block comment or string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct InitialState;
108
109impl InitialState {
110 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
111 match (cursor.peek(0), cursor.peek(1), cursor.peek(2)) {
112 ('r', '#', '"') | ('r', '#', '#') => self.raw_string(cursor),
113 ('b', 'r', '"') | ('b', 'r', '#') => self.raw_byte_string(cursor),
114 ('/', '/', _) => self.line_comment(cursor),
115 ('/', '*', _) => self.block_comment(cursor),
116 ('b', '\'', _) => self.byte(cursor),
117 ('b', '"', _) => self.byte_string(cursor),
118 ('!', '=', _)
119 | ('%', '=', _)
120 | ('&', '&', _)
121 | ('&', '=', _)
122 | ('*', '=', _)
123 | ('+', '=', _)
124 | ('-', '=', _)
125 | ('-', '>', _)
126 | ('.', '.', _)
127 | ('/', '=', _)
128 | (':', ':', _)
129 | ('<', '<', _)
130 | ('<', '=', _)
131 | ('=', '=', _)
132 | ('=', '>', _)
133 | ('>', '=', _)
134 | ('>', '>', _)
135 | ('^', '=', _)
136 | ('|', '=', _)
137 | ('|', '|', _) => {
138 cursor.skip(2);
139 (State::Initial(InitialState), TokenKind::Punctuator)
140 }
141 ('\'', _, _) => self.char_or_lifetime(cursor),
142 ('"', _, _) => self.string(cursor),
143 ('(', _, _) => {
144 cursor.skip(1);
145 (State::Initial(InitialState), TokenKind::Delimiter)
146 }
147 (')', _, _) => {
148 cursor.skip(1);
149 (State::Initial(InitialState), TokenKind::Delimiter)
150 }
151 ('[', _, _) => {
152 cursor.skip(1);
153 (State::Initial(InitialState), TokenKind::Delimiter)
154 }
155 (']', _, _) => {
156 cursor.skip(1);
157 (State::Initial(InitialState), TokenKind::Delimiter)
158 }
159 ('{', _, _) => {
160 cursor.skip(1);
161 (State::Initial(InitialState), TokenKind::Delimiter)
162 }
163 ('}', _, _) => {
164 cursor.skip(1);
165 (State::Initial(InitialState), TokenKind::Delimiter)
166 }
167 ('.', char, _) if char.is_digit(10) => self.number(cursor),
168 ('!', _, _)
169 | ('#', _, _)
170 | ('$', _, _)
171 | ('%', _, _)
172 | ('&', _, _)
173 | ('*', _, _)
174 | ('+', _, _)
175 | (',', _, _)
176 | ('-', _, _)
177 | ('.', _, _)
178 | ('/', _, _)
179 | (':', _, _)
180 | (';', _, _)
181 | ('<', _, _)
182 | ('=', _, _)
183 | ('>', _, _)
184 | ('?', _, _)
185 | ('@', _, _)
186 | ('^', _, _)
187 | ('_', _, _)
188 | ('|', _, _) => {
189 cursor.skip(1);
190 (State::Initial(InitialState), TokenKind::Punctuator)
191 }
192 (char, _, _) if char.is_identifier_start() => self.identifier_or_keyword(cursor),
193 (char, _, _) if char.is_digit(10) => self.number(cursor),
194 (char, _, _) if char.is_whitespace() => self.whitespace(cursor),
195 _ => {
196 cursor.skip(1);
197 (State::Initial(InitialState), TokenKind::Unknown)
198 }
199 }
200 }
201
202 fn line_comment(self, cursor: &mut Cursor) -> (State, TokenKind) {
203 debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '/');
204 cursor.skip(2);
205 while cursor.skip_if(|ch| ch != '\0') {}
206 (State::Initial(InitialState), TokenKind::Comment)
207 }
208
209 fn block_comment(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
210 debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '*');
211 cursor.skip(2);
212 BlockCommentTailState { depth: 0 }.next(cursor)
213 }
214
215 fn identifier_or_keyword(self, cursor: &mut Cursor) -> (State, TokenKind) {
216 debug_assert!(cursor.peek(0).is_identifier_start());
217 let start = cursor.index;
218 cursor.skip(1);
219 while cursor.skip_if(|char| char.is_identifier_continue()) {}
220 let end = cursor.index;
221 let string = &cursor.string[start..end];
222 (
223 State::Initial(InitialState),
224 match string {
225 "else" | "if" | "match" | "return" => TokenKind::BranchKeyword,
226 "break" | "continue" | "for" | "loop" | "while" => TokenKind::LoopKeyword,
227 "Self" | "as" | "async" | "await" | "const" | "crate" | "dyn" | "enum"
228 | "extern" | "false" | "fn" | "impl" | "in" | "let" | "mod" | "move" | "mut"
229 | "pub" | "ref" | "self" | "static" | "struct" | "super" | "trait" | "true"
230 | "type" | "unsafe" | "use" | "where" | "usize" | "isize" | "u8" | "u16"
231 | "u32" | "u64" | "i8" | "i16" | "i32" | "i64" | "vec2" | "vec3" | "vec4"
232 | "bool" | "f32" | "f64" => TokenKind::OtherKeyword,
233 _ => {
234 let mut chars = string.chars();
235 if chars.next().unwrap().is_uppercase() {
236 match chars.next() {
237 Some(char) if char.is_uppercase() => TokenKind::Constant,
238 _ => TokenKind::Typename,
239 }
240 } else if cursor.peek(0) == '(' {
241 TokenKind::Function
242 } else {
243 TokenKind::Identifier
244 }
245 }
246 },
247 )
248 }
249
250 fn number(self, cursor: &mut Cursor) -> (State, TokenKind) {
251 match (cursor.peek(0), cursor.peek(1)) {
252 ('0', 'b') => {
253 cursor.skip(2);
254 if !cursor.skip_digits(2) {
255 return (State::Initial(InitialState), TokenKind::Unknown);
256 }
257 return (State::Initial(InitialState), TokenKind::Number);
258 }
259 ('0', 'o') => {
260 cursor.skip(2);
261 if !cursor.skip_digits(8) {
262 return (State::Initial(InitialState), TokenKind::Unknown);
263 }
264 return (State::Initial(InitialState), TokenKind::Number);
265 }
266 ('0', 'x') => {
267 cursor.skip(2);
268 if !cursor.skip_digits(16) {
269 return (State::Initial(InitialState), TokenKind::Unknown);
270 }
271 return (State::Initial(InitialState), TokenKind::Number);
272 }
273 _ => {
274 cursor.skip_digits(10);
275 match cursor.peek(0) {
276 '.' if cursor.peek(1) != '.' && !cursor.peek(0).is_identifier_start() => {
277 cursor.skip(1);
278 if cursor.skip_digits(10) {
279 if cursor.peek(0) == 'E' || cursor.peek(0) == 'e' {
280 if !cursor.skip_exponent() {
281 return (State::Initial(InitialState), TokenKind::Unknown);
282 }
283 }
284 }
285 cursor.skip_suffix();
286 return (State::Initial(InitialState), TokenKind::Number);
287 }
288 'E' | 'e' => {
289 if !cursor.skip_exponent() {
290 return (State::Initial(InitialState), TokenKind::Unknown);
291 }
292 cursor.skip_suffix();
293 return (State::Initial(InitialState), TokenKind::Number);
294 }
295 _ => {
296 cursor.skip_suffix();
297 return (State::Initial(InitialState), TokenKind::Number);
298 }
299 }
300 }
301 };
302 }
303
304 fn char_or_lifetime(self, cursor: &mut Cursor) -> (State, TokenKind) {
305 if cursor.peek(1).is_identifier_start() && cursor.peek(2) != '\'' {
306 debug_assert!(cursor.peek(0) == '\'');
307 cursor.skip(2);
308 while cursor.skip_if(|ch| ch.is_identifier_continue()) {}
309 if cursor.peek(0) == '\'' {
310 cursor.skip(1);
311 cursor.skip_suffix();
312 (State::Initial(InitialState), TokenKind::String)
313 } else {
314 (State::Initial(InitialState), TokenKind::String)
315 }
316 } else {
317 self.single_quoted_string(cursor)
318 }
319 }
320
321 fn byte(self, cursor: &mut Cursor) -> (State, TokenKind) {
322 debug_assert!(cursor.peek(0) == 'b');
323 cursor.skip(1);
324 self.single_quoted_string(cursor)
325 }
326
327 fn string(self, cursor: &mut Cursor) -> (State, TokenKind) {
328 self.double_quoted_string(cursor)
329 }
330
331 fn byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
332 debug_assert!(cursor.peek(0) == 'b');
333 cursor.skip(1);
334 self.double_quoted_string(cursor)
335 }
336
337 fn raw_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
338 debug_assert!(cursor.peek(0) == 'r');
339 cursor.skip(1);
340 self.raw_double_quoted_string(cursor)
341 }
342
343 fn raw_byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
344 debug_assert!(cursor.peek(0) == 'b' && cursor.peek(1) == 'r');
345 cursor.skip(2);
346 self.raw_double_quoted_string(cursor)
347 }
348
349 fn single_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
350 debug_assert!(cursor.peek(0) == '\'');
351 cursor.skip(1);
352 loop {
353 match (cursor.peek(0), cursor.peek(1)) {
354 ('\'', _) => {
355 cursor.skip(1);
356 cursor.skip_suffix();
357 break;
358 }
359 ('\0', _) => return (State::Initial(InitialState), TokenKind::Unknown),
360 ('\\', '\'') | ('\\', '\\') => cursor.skip(2),
361 _ => cursor.skip(1),
362 }
363 }
364 (State::Initial(InitialState), TokenKind::String)
365 }
366
367 fn double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
368 debug_assert!(cursor.peek(0) == '"');
369 cursor.skip(1);
370 DoubleQuotedStringTailState.next(cursor)
371 }
372
373 fn raw_double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
374 let mut start_hash_count = 0;
375 while cursor.skip_if(|ch| ch == '#') {
376 start_hash_count += 1;
377 }
378 RawDoubleQuotedStringTailState { start_hash_count }.next(cursor)
379 }
380
381 fn whitespace(self, cursor: &mut Cursor) -> (State, TokenKind) {
382 debug_assert!(cursor.peek(0).is_whitespace());
383 cursor.skip(1);
384 while cursor.skip_if(|char| char.is_whitespace()) {}
385 (State::Initial(InitialState), TokenKind::Whitespace)
386 }
387}
388
/// Lexer state inside a block comment that has not been closed yet.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BlockCommentTailState {
    /// Number of nested `/*` openers that are still unclosed, not counting
    /// the outermost one.
    depth: usize,
}
393
394impl BlockCommentTailState {
395 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
396 let mut state = self;
397 loop {
398 match (cursor.peek(0), cursor.peek(1)) {
399 ('/', '*') => {
400 cursor.skip(2);
401 state.depth += 1;
402 }
403 ('*', '/') => {
404 cursor.skip(2);
405 if state.depth == 0 {
406 break (State::Initial(InitialState), TokenKind::Comment);
407 }
408 state.depth -= 1;
409 }
410 ('\0', _) => {
411 break (State::BlockCommentTail(state), TokenKind::Comment);
412 }
413 _ => cursor.skip(1),
414 }
415 }
416 }
417}
418
/// Lexer state inside a double-quoted string that has not been closed yet.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DoubleQuotedStringTailState;
421
422impl DoubleQuotedStringTailState {
423 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
424 loop {
425 match (cursor.peek(0), cursor.peek(1)) {
426 ('"', _) => {
427 cursor.skip(1);
428 cursor.skip_suffix();
429 break (State::Initial(InitialState), TokenKind::String);
430 }
431 ('\0', _) => {
432 break (
433 State::DoubleQuotedStringTail(DoubleQuotedStringTailState),
434 TokenKind::String,
435 );
436 }
437 ('\\', '"') | ('\\', '\\') => cursor.skip(2),
438 _ => cursor.skip(1),
439 }
440 }
441 }
442}
443
/// Lexer state inside a raw double-quoted string that has not been closed yet.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct RawDoubleQuotedStringTailState {
    /// Number of `#` characters that must follow a `"` for it to close the string.
    start_hash_count: usize,
}
448
449impl RawDoubleQuotedStringTailState {
450 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
451 loop {
452 match cursor.peek(0) {
453 '"' => {
454 cursor.skip(1);
455 let mut end_hash_count = 0;
456 while end_hash_count < self.start_hash_count && cursor.skip_if(|ch| ch == '#') {
457 end_hash_count += 1;
458 }
459 if end_hash_count == self.start_hash_count {
460 cursor.skip_suffix();
461 break (State::Initial(InitialState), TokenKind::String);
462 }
463 }
464 '\0' => {
465 break (State::RawDoubleQuotedStringTail(self), TokenKind::String);
466 }
467 _ => cursor.skip(1),
468 }
469 }
470 }
471}
472
/// A read position within a borrowed string.
#[derive(Debug)]
pub struct Cursor<'a> {
    /// The string being read.
    string: &'a str,
    /// Byte offset of the current position within `string`.
    index: usize,
}
478
479impl<'a> Cursor<'a> {
480 pub fn new(string: &'a str) -> Self {
481 Cursor { string, index: 0 }
482 }
483
484 fn peek(&self, index: usize) -> char {
485 self.string[self.index..].chars().nth(index).unwrap_or('\0')
486 }
487
488 fn skip(&mut self, count: usize) {
489 self.index = self.string[self.index..]
490 .char_indices()
491 .nth(count)
492 .map_or(self.string.len(), |(index, _)| self.index + index);
493 }
494
495 fn skip_if<P>(&mut self, predicate: P) -> bool
496 where
497 P: FnOnce(char) -> bool,
498 {
499 if predicate(self.peek(0)) {
500 self.skip(1);
501 true
502 } else {
503 false
504 }
505 }
506
507 fn skip_exponent(&mut self) -> bool {
508 debug_assert!(self.peek(0) == 'E' || self.peek(0) == 'e');
509 self.skip(1);
510 if self.peek(0) == '+' || self.peek(0) == '-' {
511 self.skip(1);
512 }
513 self.skip_digits(10)
514 }
515
516 fn skip_digits(&mut self, radix: u32) -> bool {
517 let mut has_skip_digits = false;
518 loop {
519 match self.peek(0) {
520 '_' => {
521 self.skip(1);
522 }
523 char if char.is_digit(radix) => {
524 self.skip(1);
525 has_skip_digits = true;
526 }
527 _ => break,
528 }
529 }
530 has_skip_digits
531 }
532
533 fn skip_suffix(&mut self) -> bool {
534 if self.peek(0).is_identifier_start() {
535 self.skip(1);
536 while self.skip_if(|char| char.is_identifier_continue()) {}
537 return true;
538 }
539 false
540 }
541}
542
/// Character classification used when lexing identifiers.
pub trait CharExt {
    /// Returns whether `self` may be the first character of an identifier.
    fn is_identifier_start(self) -> bool;
    /// Returns whether `self` may appear after the first character of an identifier.
    fn is_identifier_continue(self) -> bool;
}

impl CharExt for char {
    /// Accepts ASCII letters and `_`.
    fn is_identifier_start(self) -> bool {
        self == '_' || self.is_ascii_alphabetic()
    }

    /// Accepts ASCII letters, ASCII digits and `_`.
    fn is_identifier_continue(self) -> bool {
        self == '_' || self.is_ascii_alphanumeric()
    }
}