1use crate::{
2 text::{Change, Text},
3 token::TokenKind,
4 Token,
5};
6
7#[derive(Clone, Debug, Eq, Hash, PartialEq)]
8pub struct Tokenizer {
9 state: Vec<Option<(State, State)>>,
10}
11
12impl Tokenizer {
13 pub fn new(line_count: usize) -> Self {
14 Self {
15 state: (0..line_count).map(|_| None).collect(),
16 }
17 }
18
19 pub fn apply_change(&mut self, change: &Change) {
20 match *change {
21 Change::Insert(point, ref text) => {
22 self.state[point.line_index] = None;
23 let line_count = text.length().line_count;
24 if line_count > 0 {
25 let line = point.line_index + 1;
26 self.state.splice(line..line, (0..line_count).map(|_| None));
27 }
28 }
29 Change::Delete(start, length) => {
30 self.state[start.line_index] = None;
31 let line_count = length.line_count;
32 if line_count > 0 {
33 let start_line = start.line_index + 1;
34 let end_line = start_line + line_count;
35 self.state.drain(start_line..end_line);
36 }
37 }
38 }
39 }
40
41 pub fn update(&mut self, text: &Text, tokens: &mut [Vec<Token>]) {
42 let mut state = State::default();
43 for line in 0..text.as_lines().len() {
44 match self.state[line] {
45 Some((start_state, end_state)) if state == start_state => {
46 state = end_state;
47 }
48 _ => {
49 let start_state = state;
50 let mut new_tokens = Vec::new();
51 let mut cursor = Cursor::new(&text.as_lines()[line]);
52 loop {
53 let (next_state, token) = state.next(&mut cursor);
54 state = next_state;
55 match token {
56 Some(token) => new_tokens.push(token),
57 None => break,
58 }
59 }
60 self.state[line] = Some((start_state, state));
61 tokens[line] = new_tokens;
62 }
63 }
64 }
65 }
66}
67
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub enum State {
70 Initial(InitialState),
71 BlockCommentTail(BlockCommentTailState),
72 DoubleQuotedStringTail(DoubleQuotedStringTailState),
73 RawDoubleQuotedStringTail(RawDoubleQuotedStringTailState),
74}
75
76impl Default for State {
77 fn default() -> State {
78 State::Initial(InitialState)
79 }
80}
81
82impl State {
83 pub fn next(self, cursor: &mut Cursor) -> (State, Option<Token>) {
84 if cursor.peek(0) == '\0' {
85 return (self, None);
86 }
87 let start = cursor.index;
88 let (next_state, kind) = match self {
89 State::Initial(state) => state.next(cursor),
90 State::BlockCommentTail(state) => state.next(cursor),
91 State::DoubleQuotedStringTail(state) => state.next(cursor),
92 State::RawDoubleQuotedStringTail(state) => state.next(cursor),
93 };
94 let end = cursor.index;
95 assert!(start < end);
96 (
97 next_state,
98 Some(Token {
99 len: end - start,
100 kind,
101 }),
102 )
103 }
104}
105
106#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
107pub struct InitialState;
108
109impl InitialState {
110 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
111 match (cursor.peek(0), cursor.peek(1), cursor.peek(2)) {
112 ('r', '#', '"') | ('r', '#', '#') => self.raw_string(cursor),
113 ('b', 'r', '"') | ('b', 'r', '#') => self.raw_byte_string(cursor),
114 ('/', '/', _) => self.line_comment(cursor),
115 ('/', '*', _) => self.block_comment(cursor),
116 ('b', '\'', _) => self.byte(cursor),
117 ('b', '"', _) => self.byte_string(cursor),
118 ('!', '=', _)
119 | ('%', '=', _)
120 | ('&', '&', _)
121 | ('&', '=', _)
122 | ('*', '=', _)
123 | ('+', '=', _)
124 | ('-', '=', _)
125 | ('-', '>', _)
126 | ('.', '.', _)
127 | ('/', '=', _)
128 | (':', ':', _)
129 | ('<', '<', _)
130 | ('<', '=', _)
131 | ('=', '=', _)
132 | ('=', '>', _)
133 | ('>', '=', _)
134 | ('>', '>', _)
135 | ('^', '=', _)
136 | ('|', '=', _)
137 | ('|', '|', _) => {
138 cursor.skip(2);
139 (State::Initial(InitialState), TokenKind::Punctuator)
140 }
141 ('\'', _, _) => self.char_or_lifetime(cursor),
142 ('"', _, _) => self.string(cursor),
143 ('(', _, _) => {
144 cursor.skip(1);
145 (State::Initial(InitialState), TokenKind::Delimiter)
146 }
147 (')', _, _) => {
148 cursor.skip(1);
149 (State::Initial(InitialState), TokenKind::Delimiter)
150 }
151 ('[', _, _) => {
152 cursor.skip(1);
153 (State::Initial(InitialState), TokenKind::Delimiter)
154 }
155 (']', _, _) => {
156 cursor.skip(1);
157 (State::Initial(InitialState), TokenKind::Delimiter)
158 }
159 ('{', _, _) => {
160 cursor.skip(1);
161 (State::Initial(InitialState), TokenKind::Delimiter)
162 }
163 ('}', _, _) => {
164 cursor.skip(1);
165 (State::Initial(InitialState), TokenKind::Delimiter)
166 }
167 ('.', char, _) if char.is_digit(10) => self.number(cursor),
168 ('!', _, _)
169 | ('#', _, _)
170 | ('$', _, _)
171 | ('%', _, _)
172 | ('&', _, _)
173 | ('*', _, _)
174 | ('+', _, _)
175 | (',', _, _)
176 | ('-', _, _)
177 | ('.', _, _)
178 | ('/', _, _)
179 | (':', _, _)
180 | (';', _, _)
181 | ('<', _, _)
182 | ('=', _, _)
183 | ('>', _, _)
184 | ('?', _, _)
185 | ('@', _, _)
186 | ('^', _, _)
187 | ('_', _, _)
188 | ('|', _, _) => {
189 cursor.skip(1);
190 (State::Initial(InitialState), TokenKind::Punctuator)
191 }
192 (char, _, _) if char.is_identifier_start() => self.identifier_or_keyword(cursor),
193 (char, _, _) if char.is_digit(10) => self.number(cursor),
194 (char, _, _) if char.is_whitespace() => self.whitespace(cursor),
195 _ => {
196 cursor.skip(1);
197 (State::Initial(InitialState), TokenKind::Unknown)
198 }
199 }
200 }
201
202 fn line_comment(self, cursor: &mut Cursor) -> (State, TokenKind) {
203 debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '/');
204 cursor.skip(2);
205 while cursor.skip_if(|ch| ch != '\0') {}
206 (State::Initial(InitialState), TokenKind::Comment)
207 }
208
209 fn block_comment(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
210 debug_assert!(cursor.peek(0) == '/' && cursor.peek(1) == '*');
211 cursor.skip(2);
212 BlockCommentTailState { depth: 0 }.next(cursor)
213 }
214
215 fn identifier_or_keyword(self, cursor: &mut Cursor) -> (State, TokenKind) {
216 debug_assert!(cursor.peek(0).is_identifier_start());
217 let start = cursor.index;
218 cursor.skip(1);
219 while cursor.skip_if(|char| char.is_identifier_continue()) {}
220 let end = cursor.index;
221 let string = &cursor.string[start..end];
222 (
223 State::Initial(InitialState),
224 match string {
225 "else" | "if" | "match" | "return" => TokenKind::BranchKeyword,
226 "break" | "continue" | "for" | "loop" | "while" => TokenKind::LoopKeyword,
227 "Self" | "as" | "async" | "await" | "const" | "crate" | "dyn" | "enum"
228 | "extern" | "false" | "fn" | "impl" | "in" | "let" | "mod" | "move" | "mut"
229 | "pub" | "ref" | "self" | "static" | "struct" | "super" | "trait" | "true"
230 | "type" | "unsafe" | "use" | "where" => TokenKind::OtherKeyword,
231 _ => {
232 let mut chars = string.chars();
233 if chars.next().unwrap().is_uppercase() {
234 match chars.next() {
235 Some(char) if char.is_uppercase() => TokenKind::Constant,
236 _ => TokenKind::Typename,
237 }
238 } else {
239 TokenKind::Identifier
240 }
241 }
242 },
243 )
244 }
245
246 fn number(self, cursor: &mut Cursor) -> (State, TokenKind) {
247 match (cursor.peek(0), cursor.peek(1)) {
248 ('0', 'b') => {
249 cursor.skip(2);
250 if !cursor.skip_digits(2) {
251 return (State::Initial(InitialState), TokenKind::Unknown);
252 }
253 return (State::Initial(InitialState), TokenKind::Number);
254 }
255 ('0', 'o') => {
256 cursor.skip(2);
257 if !cursor.skip_digits(8) {
258 return (State::Initial(InitialState), TokenKind::Unknown);
259 }
260 return (State::Initial(InitialState), TokenKind::Number);
261 }
262 ('0', 'x') => {
263 cursor.skip(2);
264 if !cursor.skip_digits(16) {
265 return (State::Initial(InitialState), TokenKind::Unknown);
266 }
267 return (State::Initial(InitialState), TokenKind::Number);
268 }
269 _ => {
270 cursor.skip_digits(10);
271 match cursor.peek(0) {
272 '.' if cursor.peek(1) != '.' && !cursor.peek(0).is_identifier_start() => {
273 cursor.skip(1);
274 if cursor.skip_digits(10) {
275 if cursor.peek(0) == 'E' || cursor.peek(0) == 'e' {
276 if !cursor.skip_exponent() {
277 return (State::Initial(InitialState), TokenKind::Unknown);
278 }
279 }
280 }
281 cursor.skip_suffix();
282 return (State::Initial(InitialState), TokenKind::Number);
283 }
284 'E' | 'e' => {
285 if !cursor.skip_exponent() {
286 return (State::Initial(InitialState), TokenKind::Unknown);
287 }
288 cursor.skip_suffix();
289 return (State::Initial(InitialState), TokenKind::Number);
290 }
291 _ => {
292 cursor.skip_suffix();
293 return (State::Initial(InitialState), TokenKind::Number);
294 }
295 }
296 }
297 };
298 }
299
300 fn char_or_lifetime(self, cursor: &mut Cursor) -> (State, TokenKind) {
301 if cursor.peek(1).is_identifier_start() && cursor.peek(2) != '\'' {
302 debug_assert!(cursor.peek(0) == '\'');
303 cursor.skip(2);
304 while cursor.skip_if(|ch| ch.is_identifier_continue()) {}
305 if cursor.peek(0) == '\'' {
306 cursor.skip(1);
307 cursor.skip_suffix();
308 (State::Initial(InitialState), TokenKind::String)
309 } else {
310 (State::Initial(InitialState), TokenKind::String)
311 }
312 } else {
313 self.single_quoted_string(cursor)
314 }
315 }
316
317 fn byte(self, cursor: &mut Cursor) -> (State, TokenKind) {
318 debug_assert!(cursor.peek(0) == 'b');
319 cursor.skip(1);
320 self.single_quoted_string(cursor)
321 }
322
323 fn string(self, cursor: &mut Cursor) -> (State, TokenKind) {
324 self.double_quoted_string(cursor)
325 }
326
327 fn byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
328 debug_assert!(cursor.peek(0) == 'b');
329 cursor.skip(1);
330 self.double_quoted_string(cursor)
331 }
332
333 fn raw_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
334 debug_assert!(cursor.peek(0) == 'r');
335 cursor.skip(1);
336 self.raw_double_quoted_string(cursor)
337 }
338
339 fn raw_byte_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
340 debug_assert!(cursor.peek(0) == 'b' && cursor.peek(1) == 'r');
341 cursor.skip(2);
342 self.raw_double_quoted_string(cursor)
343 }
344
345 fn single_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
346 debug_assert!(cursor.peek(0) == '\'');
347 cursor.skip(1);
348 loop {
349 match (cursor.peek(0), cursor.peek(1)) {
350 ('\'', _) => {
351 cursor.skip(1);
352 cursor.skip_suffix();
353 break;
354 }
355 ('\0', _) => return (State::Initial(InitialState), TokenKind::Unknown),
356 ('\\', '\'') | ('\\', '\\') => cursor.skip(2),
357 _ => cursor.skip(1),
358 }
359 }
360 (State::Initial(InitialState), TokenKind::String)
361 }
362
363 fn double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
364 debug_assert!(cursor.peek(0) == '"');
365 cursor.skip(1);
366 DoubleQuotedStringTailState.next(cursor)
367 }
368
369 fn raw_double_quoted_string(self, cursor: &mut Cursor) -> (State, TokenKind) {
370 let mut start_hash_count = 0;
371 while cursor.skip_if(|ch| ch == '#') {
372 start_hash_count += 1;
373 }
374 RawDoubleQuotedStringTailState { start_hash_count }.next(cursor)
375 }
376
377 fn whitespace(self, cursor: &mut Cursor) -> (State, TokenKind) {
378 debug_assert!(cursor.peek(0).is_whitespace());
379 cursor.skip(1);
380 while cursor.skip_if(|char| char.is_whitespace()) {}
381 (State::Initial(InitialState), TokenKind::Whitespace)
382 }
383}
384
385#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
386pub struct BlockCommentTailState {
387 depth: usize,
388}
389
390impl BlockCommentTailState {
391 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
392 let mut state = self;
393 loop {
394 match (cursor.peek(0), cursor.peek(1)) {
395 ('/', '*') => {
396 cursor.skip(2);
397 state.depth += 1;
398 }
399 ('*', '/') => {
400 cursor.skip(2);
401 if state.depth == 0 {
402 break (State::Initial(InitialState), TokenKind::Comment);
403 }
404 state.depth -= 1;
405 }
406 ('\0', _) => {
407 break (State::BlockCommentTail(state), TokenKind::Comment);
408 }
409 _ => cursor.skip(1),
410 }
411 }
412 }
413}
414
415#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
416pub struct DoubleQuotedStringTailState;
417
418impl DoubleQuotedStringTailState {
419 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
420 loop {
421 match (cursor.peek(0), cursor.peek(1)) {
422 ('"', _) => {
423 cursor.skip(1);
424 cursor.skip_suffix();
425 break (State::Initial(InitialState), TokenKind::String);
426 }
427 ('\0', _) => {
428 break (
429 State::DoubleQuotedStringTail(DoubleQuotedStringTailState),
430 TokenKind::String,
431 );
432 }
433 ('\\', '"') => cursor.skip(2),
434 _ => cursor.skip(1),
435 }
436 }
437 }
438}
439
440#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
441pub struct RawDoubleQuotedStringTailState {
442 start_hash_count: usize,
443}
444
445impl RawDoubleQuotedStringTailState {
446 fn next(self, cursor: &mut Cursor<'_>) -> (State, TokenKind) {
447 loop {
448 match cursor.peek(0) {
449 '"' => {
450 cursor.skip(1);
451 let mut end_hash_count = 0;
452 while end_hash_count < self.start_hash_count && cursor.skip_if(|ch| ch == '#') {
453 end_hash_count += 1;
454 }
455 if end_hash_count == self.start_hash_count {
456 cursor.skip_suffix();
457 break (State::Initial(InitialState), TokenKind::String);
458 }
459 }
460 '\0' => {
461 break (State::RawDoubleQuotedStringTail(self), TokenKind::String);
462 }
463 _ => cursor.skip(1),
464 }
465 }
466 }
467}
468
469#[derive(Debug)]
470pub struct Cursor<'a> {
471 string: &'a str,
472 index: usize,
473}
474
475impl<'a> Cursor<'a> {
476 pub fn new(string: &'a str) -> Self {
477 Cursor { string, index: 0 }
478 }
479
480 fn peek(&self, index: usize) -> char {
481 self.string[self.index..].chars().nth(index).unwrap_or('\0')
482 }
483
484 fn skip(&mut self, count: usize) {
485 self.index = self.string[self.index..]
486 .char_indices()
487 .nth(count)
488 .map_or(self.string.len(), |(index, _)| self.index + index);
489 }
490
491 fn skip_if<P>(&mut self, predicate: P) -> bool
492 where
493 P: FnOnce(char) -> bool,
494 {
495 if predicate(self.peek(0)) {
496 self.skip(1);
497 true
498 } else {
499 false
500 }
501 }
502
503 fn skip_exponent(&mut self) -> bool {
504 debug_assert!(self.peek(0) == 'E' || self.peek(0) == 'e');
505 self.skip(1);
506 if self.peek(0) == '+' || self.peek(0) == '-' {
507 self.skip(1);
508 }
509 self.skip_digits(10)
510 }
511
512 fn skip_digits(&mut self, radix: u32) -> bool {
513 let mut has_skip_digits = false;
514 loop {
515 match self.peek(0) {
516 '_' => {
517 self.skip(1);
518 }
519 char if char.is_digit(radix) => {
520 self.skip(1);
521 has_skip_digits = true;
522 }
523 _ => break,
524 }
525 }
526 has_skip_digits
527 }
528
529 fn skip_suffix(&mut self) -> bool {
530 if self.peek(0).is_identifier_start() {
531 self.skip(1);
532 while self.skip_if(|char| char.is_identifier_continue()) {}
533 return true;
534 }
535 false
536 }
537}
538
/// Identifier classification helpers for `char`.
pub trait CharExt {
    /// Returns `true` if the character may start an identifier.
    fn is_identifier_start(self) -> bool;
    /// Returns `true` if the character may appear after the first character
    /// of an identifier.
    fn is_identifier_continue(self) -> bool;
}

impl CharExt for char {
    /// ASCII letters and `_` may start an identifier.
    fn is_identifier_start(self) -> bool {
        self == '_' || self.is_ascii_alphabetic()
    }

    /// ASCII letters, ASCII digits and `_` may continue an identifier.
    fn is_identifier_continue(self) -> bool {
        self == '_' || self.is_ascii_alphanumeric()
    }
}