thrift_analyzer/analyzer/
scanner.rs1use crate::analyzer::{
2 base::{Error, Position},
3 token::{Token, TokenKind},
4};
5
6pub struct Scanner<'a> {
8 input: &'a [char], state: ScannerState, }
11
/// Snapshot of the scanner cursor.
///
/// The value is `Copy`, so it can be captured and later handed back to the
/// scanner to rewind to an earlier position.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ScannerState {
    /// Index of the next unread character in the input slice.
    offset: usize,
    /// Line of the next unread character (the scanner starts at 1).
    line: usize,
    /// Column of the next unread character (reset to 1 after a line break).
    column: usize,
}
19
20impl Into<Position> for ScannerState {
21 fn into(self) -> Position {
22 Position {
23 line: self.line as u32,
24 column: self.column as u32,
25 }
26 }
27}
28
29impl<'a> Scanner<'a> {
30 pub fn new(input: &'a [char]) -> Self {
32 Scanner {
33 input,
34 state: ScannerState {
35 offset: 0,
36 line: 1,
37 column: 1,
38 },
39 }
40 }
41
42 pub fn scan(&mut self) -> (Token, Option<Error>) {
44 let mut token = None;
45 let mut err = None;
46
47 while self.state.offset < self.input.len() && token.is_none() {
48 let ch = self.input[self.state.offset];
49
50 match ch {
51 '\n' => {
52 self.state.offset += 1;
53 self.state.column = 1;
54 self.state.line += 1;
55 }
56 '\r' => {
57 self.state.offset += 1;
58 self.state.column = 1;
59 self.state.line += 1;
60
61 if self.state.offset < self.input.len() && self.input[self.state.offset] == '\n'
62 {
63 self.state.offset += 1;
64 }
65 }
66 ' ' | '\t' => {
67 self.state.offset += 1;
68 self.state.column += 1;
69 }
70 '/' => {
71 if self.state.offset + 1 >= self.input.len() {
72 token = Some(Token {
73 kind: TokenKind::Invalid(ch),
74 position: self.state.into(),
75 });
76 self.state.offset += 1;
77 self.state.column += 1;
78 break;
79 }
80
81 let start = self.state.offset;
82 let (offset, ok) = self.scan_line_comment();
83 if ok {
84 token = Some(Token {
85 kind: TokenKind::Comment(
86 self.input[start + 2..start + offset]
87 .iter()
88 .collect::<String>(),
89 ),
90 position: self.state.into(),
91 });
92 self.state.offset += offset;
93 self.state.column = 1;
94 self.state.line += 1;
95 break;
96 }
97
98 let (offset, line_offset, column_offset, ok) = self.scan_block_comment();
99 let position = self.state.into();
100 if ok {
101 token = Some(Token {
102 kind: TokenKind::BlockComment(
103 self.input[start + 2..start + offset - 2]
104 .iter()
105 .collect::<String>(),
106 ),
107 position,
108 })
109 } else {
110 let value = self.input[start..start + offset].iter().collect::<String>();
111 let tk = Token {
112 kind: TokenKind::InvalidString(value.clone()),
113 position,
114 };
115 err = Some(Error {
116 range: tk.range(),
117 message: format!("Unclosed block comment: {}", value),
118 });
119 token = Some(tk);
120 }
121
122 if line_offset > 0 {
123 debug_assert!(column_offset > 0);
124 self.state.column = 0;
125 }
126 self.state.offset += offset;
127 self.state.column += column_offset;
128 self.state.line += line_offset;
129 }
130 '#' => {
131 let start = self.state.offset;
132 let offset = self.scan_pound_comment();
133 let value = self.input[start..start + offset].iter().collect::<String>();
134 let position = self.state.into();
135
136 token = Some(Token {
137 kind: TokenKind::PoundComment(value),
138 position,
139 });
140
141 self.state.offset += offset;
142 self.state.column = 1;
143 self.state.line += 1;
144 }
145 'a'..='z' | 'A'..='Z' | '_' => {
146 let start = self.state.offset;
147 let offset = self.scan_identifier();
148 let value = self.input[start..start + offset].iter().collect::<String>();
149 let position = self.state.into();
150
151 if let Some(tok) = TokenKind::from_string(&value) {
152 token = Some(Token {
153 kind: tok,
154 position,
155 });
156 } else {
157 token = Some(Token {
158 kind: TokenKind::Identifier(value),
159 position,
160 });
161 }
162
163 self.state.offset += offset;
164 self.state.column += offset;
165 }
166 '\'' | '"' => {
167 let start = self.state.offset;
168 let (offset, line_offset, column_offset, ok) = self.scan_literal(ch);
169 let value = self.input[start + 1..start + offset - 1]
170 .iter()
171 .collect::<String>();
172 let position = self.state.into();
173
174 if ok {
175 token = Some(Token {
176 kind: TokenKind::Literal(value),
177 position,
178 });
179 } else {
180 let tk = Token {
181 kind: TokenKind::InvalidString(value.clone()),
182 position,
183 };
184 err = Some(Error {
185 range: tk.range(),
186 message: format!("Unclosed string: {}", value),
187 });
188 token = Some(tk);
189 }
190
191 if line_offset > 0 {
192 debug_assert!(column_offset > 0);
193 self.state.column = 0;
194 }
195 self.state.offset += offset;
196 self.state.column += column_offset;
197 self.state.line += line_offset;
198 }
199 '+' | '-' | '0'..='9' => {
200 let start = self.state.offset;
201 let mut offset: usize;
202 let mut int_ok: bool;
203 let mut double_ok = false;
204
205 (offset, int_ok) = self.scan_int_constant();
206 if !int_ok {
207 (offset, double_ok) = self.scan_double_constant();
208 } else {
209 if self.state.offset + offset < self.input.len() {
210 let next_ch = self.input[self.state.offset + offset];
211 if next_ch == '.' || next_ch == 'e' || next_ch == 'E' {
212 (offset, double_ok) = self.scan_double_constant();
213 if double_ok {
214 int_ok = false;
215 }
216 }
217 }
218 }
219
220 let value = self.input[start..start + offset].iter().collect::<String>();
221 let position = self.state.into();
222
223 if int_ok {
224 token = Some(Token {
225 kind: TokenKind::IntConstant(value),
226 position,
227 });
228 } else if double_ok {
229 token = Some(Token {
230 kind: TokenKind::DoubleConstant(value),
231 position,
232 });
233 } else {
234 token = Some(Token {
235 kind: TokenKind::InvalidString(value),
236 position,
237 })
238 }
239
240 self.state.offset += offset;
241 self.state.column += offset;
242 }
243 '.' => {
244 let start = self.state.offset;
245 let (offset, double_ok) = self.scan_double_constant();
246 let value = self.input[start..start + offset].iter().collect::<String>();
247 let position = self.state.into();
248
249 if !double_ok {
250 token = Some(Token {
251 kind: TokenKind::InvalidString(value),
252 position,
253 })
254 } else {
255 token = Some(Token {
256 kind: TokenKind::DoubleConstant(value),
257 position,
258 });
259 }
260
261 self.state.offset += offset;
262 self.state.column += offset;
263 }
264 _ => {
265 let position = self.state.into();
266
267 if let Some(tok) = TokenKind::from_char(ch) {
268 token = Some(Token {
269 kind: tok,
270 position,
271 });
272 } else {
273 token = Some(Token {
274 kind: TokenKind::Invalid(ch),
275 position,
276 })
277 }
278
279 self.state.offset += 1;
280 self.state.column += 1;
281 }
282 }
283 }
284
285 (token.unwrap_or(self.eof()), err)
286 }
287
288 pub fn skip_to_next_line(&mut self) {
290 while self.state.offset < self.input.len() {
291 let ch = self.input[self.state.offset] as char;
292 self.state.offset += 1;
293
294 if ch == '\n' {
295 self.state.line += 1;
296 self.state.column = 1;
297 break;
298 } else if ch == '\r' {
299 if self.state.offset < self.input.len()
300 && self.input[self.state.offset] as char == '\n'
301 {
302 self.state.offset += 1;
303 }
304 self.state.line += 1;
305 self.state.column = 1;
306 break;
307 }
308 }
309 }
310}
311
312impl<'a> Scanner<'a> {
313 pub fn save_state(&self) -> ScannerState {
315 self.state
316 }
317
318 pub fn restore_state(&mut self, state: ScannerState) {
320 self.state = state;
321 }
322}
323
324impl<'a> Scanner<'a> {
325 fn eof(&self) -> Token {
326 Token {
327 kind: TokenKind::Eof,
328 position: Position {
329 line: self.state.line as u32,
330 column: self.state.column as u32,
331 },
332 }
333 }
334
335 fn scan_identifier(&mut self) -> usize {
337 let mut offset = 1;
338 while self.state.offset + offset < self.input.len() {
339 let ch = self.input[self.state.offset + offset];
340
341 match ch {
342 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '.' => offset += 1,
343 _ => break,
344 }
345 }
346
347 offset
348 }
349
350 fn scan_literal(&mut self, delimiter: char) -> (usize, usize, usize, bool) {
352 let mut offset = 1;
353 let mut line_offset = 0;
354 let mut column_offset = 1;
355 let mut prev_ch = delimiter;
356
357 while self.state.offset + offset < self.input.len() {
358 let ch = self.input[self.state.offset + offset];
359 offset += 1;
360 column_offset += 1;
361
362 if ch == delimiter && prev_ch != '\\' {
363 return (offset, line_offset, column_offset, true);
364 }
365 if ch == '\n' {
366 line_offset += 1;
367 column_offset = 1;
368 } else if ch == '\r' {
369 if self.state.offset + offset < self.input.len()
370 && self.input[self.state.offset + offset] as char == '\n'
371 {
372 offset += 1;
373 }
374 line_offset += 1;
375 column_offset = 1;
376 }
377
378 prev_ch = ch;
379 }
380
381 (offset, line_offset, column_offset, false)
382 }
383
384 fn scan_int_constant(&mut self) -> (usize, bool) {
386 match self.input[self.state.offset] {
387 '0'..='9' | '+' | '-' => (),
388 _ => return (0, false),
389 }
390
391 let mut offset = 0;
392 while self.state.offset + offset < self.input.len() {
393 let ch = self.input[self.state.offset + offset];
394
395 if offset > 0 && (ch == '+' || ch == '-') {
397 break;
398 }
399
400 match ch {
401 '0'..='9' | '+' | '-' => offset += 1,
402 _ => break,
403 }
404 }
405
406 if offset > 1 {
407 (offset, true)
408 } else {
409 let ch = self.input[self.state.offset];
410 (offset, ch != '+' && ch != '-')
411 }
412 }
413
414 fn scan_double_constant(&mut self) -> (usize, bool) {
416 match self.input[self.state.offset] {
417 '0'..='9' | '+' | '-' | '.' | 'e' | 'E' => (),
418 _ => return (0, false),
419 }
420
421 enum State {
422 ParsePlusMinus,
423 ParseFirstDigits,
424 ParseDot,
425 ParseSecondDigits,
426 ParseE,
427 PraseIntConstant,
428 }
429
430 let mut state = State::ParsePlusMinus;
431 let mut offset = 0;
432
433 while self.state.offset + offset < self.input.len() {
434 let ch = self.input[self.state.offset + offset];
435
436 match state {
437 State::ParsePlusMinus => {
438 if ch == '+' || ch == '-' {
439 offset += 1;
440 }
441 state = State::ParseFirstDigits;
442 }
443 State::ParseFirstDigits => match ch {
444 '0'..='9' => {
445 offset += 1;
446 }
447 _ => {
448 state = State::ParseDot;
449 }
450 },
451 State::ParseDot => {
452 if ch == '.' {
453 offset += 1;
454 }
455 state = State::ParseSecondDigits;
456 }
457 State::ParseSecondDigits => match ch {
458 '0'..='9' => {
459 offset += 1;
460 }
461 _ => {
462 state = State::ParseE;
463 }
464 },
465 State::ParseE => {
466 if ch == 'e' || ch == 'E' {
467 offset += 1;
468 }
469 state = State::PraseIntConstant;
470 }
471 State::PraseIntConstant => {
472 let cur_state = self.save_state();
473 self.state.offset += offset;
474 let (int_offset, ok) = self.scan_int_constant();
475 self.restore_state(cur_state);
476
477 if ok {
478 offset += int_offset;
479 }
480 break;
481 }
482 }
483 }
484
485 let mut has_digit = false;
486 for i in 0..offset {
487 let ch = self.input[self.state.offset + i];
488 if ch >= '0' && ch <= '9' {
489 has_digit = true;
490 break;
491 }
492 }
493
494 (offset, has_digit)
495 }
496
497 fn scan_line_comment(&mut self) -> (usize, bool) {
499 let mut offset = 1;
500 if self.state.offset + offset >= self.input.len()
501 || self.input[self.state.offset + offset] != '/'
502 {
503 return (offset, false);
504 }
505
506 offset += 1;
507 while self.state.offset + offset < self.input.len() {
508 let ch = self.input[self.state.offset + offset];
509 offset += 1;
510 if ch == '\n' {
511 break;
512 }
513 }
514
515 (offset, true)
516 }
517
518 fn scan_block_comment(&mut self) -> (usize, usize, usize, bool) {
520 let mut offset = 1;
521 let mut line_offset = 0;
522 let mut column_offset = 1;
523 if self.state.offset + offset >= self.input.len()
524 || self.input[self.state.offset + offset] != '*'
525 {
526 return (offset, line_offset, column_offset, false);
527 }
528 offset += 1;
529 column_offset += 1;
530
531 while self.state.offset + offset < self.input.len() {
532 let ch = self.input[self.state.offset + offset];
533 offset += 1;
534 column_offset += 1;
535
536 if ch == '\n' {
537 line_offset += 1;
538 column_offset = 1;
539 } else if ch == '\r' {
540 if self.state.offset + offset < self.input.len()
541 && self.input[self.state.offset + offset] as char == '\n'
542 {
543 offset += 1;
544 }
545 line_offset += 1;
546 column_offset = 1;
547 }
548
549 if self.state.offset + offset >= self.input.len() {
550 return (offset, line_offset, column_offset, false);
551 }
552
553 let next_ch = self.input[self.state.offset + offset];
555 if ch == '*' && next_ch == '/' {
556 offset += 1;
557 column_offset += 1;
558 return (offset, line_offset, column_offset, true);
559 }
560
561 if ch == '/' && next_ch == '*' {
563 let state = self.save_state();
564 self.state.offset += offset - 1;
565 let (nested_offset, nested_line_offset, nested_column_offset, ok) =
566 self.scan_block_comment();
567 self.restore_state(state);
568 offset += nested_offset - 1;
569 line_offset += nested_line_offset;
570 column_offset += nested_column_offset;
571 if !ok {
572 return (offset, line_offset, column_offset, false);
573 }
574 }
575 }
576
577 (offset, line_offset, column_offset, true)
578 }
579
580 fn scan_pound_comment(&mut self) -> usize {
582 let mut offset = 1;
583
584 while self.state.offset + offset < self.input.len() {
585 let ch = self.input[self.state.offset + offset];
586 offset += 1;
587 if ch == '\n' {
588 break;
589 } else if ch == '\r' {
590 if self.state.offset + offset < self.input.len()
591 && self.input[self.state.offset + offset] as char == '\n'
592 {
593 offset += 1;
594 }
595 break;
596 }
597 }
598
599 offset
600 }
601}
602
#[cfg(test)]
mod tests {
    use std::{env, fs, path::Path};

    use super::*;

    /// Smoke test: scans the sample Thrift file up to the EOF token, printing
    /// every token and reporting the invalid ones. It makes no assertions.
    #[test]
    fn test_scan() {
        let file_path = env::current_dir()
            .unwrap()
            .join(Path::new("./lib/analyzer/test_file/ThriftTest.thrift"));
        let content: Vec<char> = fs::read_to_string(&file_path).unwrap().chars().collect();
        let mut scanner = Scanner::new(&content);

        loop {
            let (token, err) = scanner.scan();
            println!("{:?}", token);

            if token.is_eof() {
                break;
            }
            if token.is_invalid() {
                println!("invalid token: {:?}, err: {:?}", token, err)
            }
        }
    }
}