Skip to main content

thrift_analyzer/analyzer/
scanner.rs

1use crate::analyzer::{
2    base::{Error, Position},
3    token::{Token, TokenKind},
4};
5
/// Represents a Thrift scanner.
///
/// The scanner borrows the input as a slice of `char`s and keeps a small,
/// copyable cursor (`ScannerState`) describing how far it has read.
pub struct Scanner<'a> {
    input: &'a [char],   // borrowed characters being scanned; indexed by `state.offset`
    state: ScannerState, // current cursor: reading offset plus line/column bookkeeping
}
11
/// Represents a Thrift scanner state.
///
/// This is the scanner's cursor: a plain, copyable snapshot that can be saved
/// and restored to rewind the scanner. `Debug`, `PartialEq` and `Eq` are
/// derived so that snapshots can be logged and compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ScannerState {
    offset: usize, // index into the input of the next character to read
    line: usize,   // line of the next character (a fresh scanner starts at 1)
    column: usize, // column of the next character (a fresh scanner starts at 1)
}
19
20impl Into<Position> for ScannerState {
21    fn into(self) -> Position {
22        Position {
23            line: self.line as u32,
24            column: self.column as u32,
25        }
26    }
27}
28
29impl<'a> Scanner<'a> {
30    /// Creates a new scanner with the given input data.
31    pub fn new(input: &'a [char]) -> Self {
32        Scanner {
33            input,
34            state: ScannerState {
35                offset: 0,
36                line: 1,
37                column: 1,
38            },
39        }
40    }
41
42    /// Scans the next token and returns it.
43    pub fn scan(&mut self) -> (Token, Option<Error>) {
44        let mut token = None;
45        let mut err = None;
46
47        while self.state.offset < self.input.len() && token.is_none() {
48            let ch = self.input[self.state.offset];
49
50            match ch {
51                '\n' => {
52                    self.state.offset += 1;
53                    self.state.column = 1;
54                    self.state.line += 1;
55                }
56                '\r' => {
57                    self.state.offset += 1;
58                    self.state.column = 1;
59                    self.state.line += 1;
60
61                    if self.state.offset < self.input.len() && self.input[self.state.offset] == '\n'
62                    {
63                        self.state.offset += 1;
64                    }
65                }
66                ' ' | '\t' => {
67                    self.state.offset += 1;
68                    self.state.column += 1;
69                }
70                '/' => {
71                    if self.state.offset + 1 >= self.input.len() {
72                        token = Some(Token {
73                            kind: TokenKind::Invalid(ch),
74                            position: self.state.into(),
75                        });
76                        self.state.offset += 1;
77                        self.state.column += 1;
78                        break;
79                    }
80
81                    let start = self.state.offset;
82                    let (offset, ok) = self.scan_line_comment();
83                    if ok {
84                        token = Some(Token {
85                            kind: TokenKind::Comment(
86                                self.input[start + 2..start + offset]
87                                    .iter()
88                                    .collect::<String>(),
89                            ),
90                            position: self.state.into(),
91                        });
92                        self.state.offset += offset;
93                        self.state.column = 1;
94                        self.state.line += 1;
95                        break;
96                    }
97
98                    let (offset, line_offset, column_offset, ok) = self.scan_block_comment();
99                    let position = self.state.into();
100                    if ok {
101                        token = Some(Token {
102                            kind: TokenKind::BlockComment(
103                                self.input[start + 2..start + offset - 2]
104                                    .iter()
105                                    .collect::<String>(),
106                            ),
107                            position,
108                        })
109                    } else {
110                        let value = self.input[start..start + offset].iter().collect::<String>();
111                        let tk = Token {
112                            kind: TokenKind::InvalidString(value.clone()),
113                            position,
114                        };
115                        err = Some(Error {
116                            range: tk.range(),
117                            message: format!("Unclosed block comment: {}", value),
118                        });
119                        token = Some(tk);
120                    }
121
122                    if line_offset > 0 {
123                        debug_assert!(column_offset > 0);
124                        self.state.column = 0;
125                    }
126                    self.state.offset += offset;
127                    self.state.column += column_offset;
128                    self.state.line += line_offset;
129                }
130                '#' => {
131                    let start = self.state.offset;
132                    let offset = self.scan_pound_comment();
133                    let value = self.input[start..start + offset].iter().collect::<String>();
134                    let position = self.state.into();
135
136                    token = Some(Token {
137                        kind: TokenKind::PoundComment(value),
138                        position,
139                    });
140
141                    self.state.offset += offset;
142                    self.state.column = 1;
143                    self.state.line += 1;
144                }
145                'a'..='z' | 'A'..='Z' | '_' => {
146                    let start = self.state.offset;
147                    let offset = self.scan_identifier();
148                    let value = self.input[start..start + offset].iter().collect::<String>();
149                    let position = self.state.into();
150
151                    if let Some(tok) = TokenKind::from_string(&value) {
152                        token = Some(Token {
153                            kind: tok,
154                            position,
155                        });
156                    } else {
157                        token = Some(Token {
158                            kind: TokenKind::Identifier(value),
159                            position,
160                        });
161                    }
162
163                    self.state.offset += offset;
164                    self.state.column += offset;
165                }
166                '\'' | '"' => {
167                    let start = self.state.offset;
168                    let (offset, line_offset, column_offset, ok) = self.scan_literal(ch);
169                    let value = self.input[start + 1..start + offset - 1]
170                        .iter()
171                        .collect::<String>();
172                    let position = self.state.into();
173
174                    if ok {
175                        token = Some(Token {
176                            kind: TokenKind::Literal(value),
177                            position,
178                        });
179                    } else {
180                        let tk = Token {
181                            kind: TokenKind::InvalidString(value.clone()),
182                            position,
183                        };
184                        err = Some(Error {
185                            range: tk.range(),
186                            message: format!("Unclosed string: {}", value),
187                        });
188                        token = Some(tk);
189                    }
190
191                    if line_offset > 0 {
192                        debug_assert!(column_offset > 0);
193                        self.state.column = 0;
194                    }
195                    self.state.offset += offset;
196                    self.state.column += column_offset;
197                    self.state.line += line_offset;
198                }
199                '+' | '-' | '0'..='9' => {
200                    let start = self.state.offset;
201                    let mut offset: usize;
202                    let mut int_ok: bool;
203                    let mut double_ok = false;
204
205                    (offset, int_ok) = self.scan_int_constant();
206                    if !int_ok {
207                        (offset, double_ok) = self.scan_double_constant();
208                    } else {
209                        if self.state.offset + offset < self.input.len() {
210                            let next_ch = self.input[self.state.offset + offset];
211                            if next_ch == '.' || next_ch == 'e' || next_ch == 'E' {
212                                (offset, double_ok) = self.scan_double_constant();
213                                if double_ok {
214                                    int_ok = false;
215                                }
216                            }
217                        }
218                    }
219
220                    let value = self.input[start..start + offset].iter().collect::<String>();
221                    let position = self.state.into();
222
223                    if int_ok {
224                        token = Some(Token {
225                            kind: TokenKind::IntConstant(value),
226                            position,
227                        });
228                    } else if double_ok {
229                        token = Some(Token {
230                            kind: TokenKind::DoubleConstant(value),
231                            position,
232                        });
233                    } else {
234                        token = Some(Token {
235                            kind: TokenKind::InvalidString(value),
236                            position,
237                        })
238                    }
239
240                    self.state.offset += offset;
241                    self.state.column += offset;
242                }
243                '.' => {
244                    let start = self.state.offset;
245                    let (offset, double_ok) = self.scan_double_constant();
246                    let value = self.input[start..start + offset].iter().collect::<String>();
247                    let position = self.state.into();
248
249                    if !double_ok {
250                        token = Some(Token {
251                            kind: TokenKind::InvalidString(value),
252                            position,
253                        })
254                    } else {
255                        token = Some(Token {
256                            kind: TokenKind::DoubleConstant(value),
257                            position,
258                        });
259                    }
260
261                    self.state.offset += offset;
262                    self.state.column += offset;
263                }
264                _ => {
265                    let position = self.state.into();
266
267                    if let Some(tok) = TokenKind::from_char(ch) {
268                        token = Some(Token {
269                            kind: tok,
270                            position,
271                        });
272                    } else {
273                        token = Some(Token {
274                            kind: TokenKind::Invalid(ch),
275                            position,
276                        })
277                    }
278
279                    self.state.offset += 1;
280                    self.state.column += 1;
281                }
282            }
283        }
284
285        (token.unwrap_or(self.eof()), err)
286    }
287
288    /// Skips to the next line.
289    pub fn skip_to_next_line(&mut self) {
290        while self.state.offset < self.input.len() {
291            let ch = self.input[self.state.offset] as char;
292            self.state.offset += 1;
293
294            if ch == '\n' {
295                self.state.line += 1;
296                self.state.column = 1;
297                break;
298            } else if ch == '\r' {
299                if self.state.offset < self.input.len()
300                    && self.input[self.state.offset] as char == '\n'
301                {
302                    self.state.offset += 1;
303                }
304                self.state.line += 1;
305                self.state.column = 1;
306                break;
307            }
308        }
309    }
310}
311
312impl<'a> Scanner<'a> {
313    /// Saves the current state.
314    pub fn save_state(&self) -> ScannerState {
315        self.state
316    }
317
318    /// Restores the state.
319    pub fn restore_state(&mut self, state: ScannerState) {
320        self.state = state;
321    }
322}
323
324impl<'a> Scanner<'a> {
325    fn eof(&self) -> Token {
326        Token {
327            kind: TokenKind::Eof,
328            position: Position {
329                line: self.state.line as u32,
330                column: self.state.column as u32,
331            },
332        }
333    }
334
335    // scan the next identifier and return the end offset.
336    fn scan_identifier(&mut self) -> usize {
337        let mut offset = 1;
338        while self.state.offset + offset < self.input.len() {
339            let ch = self.input[self.state.offset + offset];
340
341            match ch {
342                'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '.' => offset += 1,
343                _ => break,
344            }
345        }
346
347        offset
348    }
349
350    // scan the next literal and return the end offset and line offset.
351    fn scan_literal(&mut self, delimiter: char) -> (usize, usize, usize, bool) {
352        let mut offset = 1;
353        let mut line_offset = 0;
354        let mut column_offset = 1;
355        let mut prev_ch = delimiter;
356
357        while self.state.offset + offset < self.input.len() {
358            let ch = self.input[self.state.offset + offset];
359            offset += 1;
360            column_offset += 1;
361
362            if ch == delimiter && prev_ch != '\\' {
363                return (offset, line_offset, column_offset, true);
364            }
365            if ch == '\n' {
366                line_offset += 1;
367                column_offset = 1;
368            } else if ch == '\r' {
369                if self.state.offset + offset < self.input.len()
370                    && self.input[self.state.offset + offset] as char == '\n'
371                {
372                    offset += 1;
373                }
374                line_offset += 1;
375                column_offset = 1;
376            }
377
378            prev_ch = ch;
379        }
380
381        (offset, line_offset, column_offset, false)
382    }
383
384    // scan the next integer constant and return the end offset.
385    fn scan_int_constant(&mut self) -> (usize, bool) {
386        match self.input[self.state.offset] {
387            '0'..='9' | '+' | '-' => (),
388            _ => return (0, false),
389        }
390
391        let mut offset = 0;
392        while self.state.offset + offset < self.input.len() {
393            let ch = self.input[self.state.offset + offset];
394
395            // only allow + or - at the beginning
396            if offset > 0 && (ch == '+' || ch == '-') {
397                break;
398            }
399
400            match ch {
401                '0'..='9' | '+' | '-' => offset += 1,
402                _ => break,
403            }
404        }
405
406        if offset > 1 {
407            (offset, true)
408        } else {
409            let ch = self.input[self.state.offset];
410            (offset, ch != '+' && ch != '-')
411        }
412    }
413
414    // scan the next double constant and return the end offset.
415    fn scan_double_constant(&mut self) -> (usize, bool) {
416        match self.input[self.state.offset] {
417            '0'..='9' | '+' | '-' | '.' | 'e' | 'E' => (),
418            _ => return (0, false),
419        }
420
421        enum State {
422            ParsePlusMinus,
423            ParseFirstDigits,
424            ParseDot,
425            ParseSecondDigits,
426            ParseE,
427            PraseIntConstant,
428        }
429
430        let mut state = State::ParsePlusMinus;
431        let mut offset = 0;
432
433        while self.state.offset + offset < self.input.len() {
434            let ch = self.input[self.state.offset + offset];
435
436            match state {
437                State::ParsePlusMinus => {
438                    if ch == '+' || ch == '-' {
439                        offset += 1;
440                    }
441                    state = State::ParseFirstDigits;
442                }
443                State::ParseFirstDigits => match ch {
444                    '0'..='9' => {
445                        offset += 1;
446                    }
447                    _ => {
448                        state = State::ParseDot;
449                    }
450                },
451                State::ParseDot => {
452                    if ch == '.' {
453                        offset += 1;
454                    }
455                    state = State::ParseSecondDigits;
456                }
457                State::ParseSecondDigits => match ch {
458                    '0'..='9' => {
459                        offset += 1;
460                    }
461                    _ => {
462                        state = State::ParseE;
463                    }
464                },
465                State::ParseE => {
466                    if ch == 'e' || ch == 'E' {
467                        offset += 1;
468                    }
469                    state = State::PraseIntConstant;
470                }
471                State::PraseIntConstant => {
472                    let cur_state = self.save_state();
473                    self.state.offset += offset;
474                    let (int_offset, ok) = self.scan_int_constant();
475                    self.restore_state(cur_state);
476
477                    if ok {
478                        offset += int_offset;
479                    }
480                    break;
481                }
482            }
483        }
484
485        let mut has_digit = false;
486        for i in 0..offset {
487            let ch = self.input[self.state.offset + i];
488            if ch >= '0' && ch <= '9' {
489                has_digit = true;
490                break;
491            }
492        }
493
494        (offset, has_digit)
495    }
496
497    // scan the next line comment and return the end offset.
498    fn scan_line_comment(&mut self) -> (usize, bool) {
499        let mut offset = 1;
500        if self.state.offset + offset >= self.input.len()
501            || self.input[self.state.offset + offset] != '/'
502        {
503            return (offset, false);
504        }
505
506        offset += 1;
507        while self.state.offset + offset < self.input.len() {
508            let ch = self.input[self.state.offset + offset];
509            offset += 1;
510            if ch == '\n' {
511                break;
512            }
513        }
514
515        (offset, true)
516    }
517
518    // scan the next block comment and return the end offset.
519    fn scan_block_comment(&mut self) -> (usize, usize, usize, bool) {
520        let mut offset = 1;
521        let mut line_offset = 0;
522        let mut column_offset = 1;
523        if self.state.offset + offset >= self.input.len()
524            || self.input[self.state.offset + offset] != '*'
525        {
526            return (offset, line_offset, column_offset, false);
527        }
528        offset += 1;
529        column_offset += 1;
530
531        while self.state.offset + offset < self.input.len() {
532            let ch = self.input[self.state.offset + offset];
533            offset += 1;
534            column_offset += 1;
535
536            if ch == '\n' {
537                line_offset += 1;
538                column_offset = 1;
539            } else if ch == '\r' {
540                if self.state.offset + offset < self.input.len()
541                    && self.input[self.state.offset + offset] as char == '\n'
542                {
543                    offset += 1;
544                }
545                line_offset += 1;
546                column_offset = 1;
547            }
548
549            if self.state.offset + offset >= self.input.len() {
550                return (offset, line_offset, column_offset, false);
551            }
552
553            // scan delimiter
554            let next_ch = self.input[self.state.offset + offset];
555            if ch == '*' && next_ch == '/' {
556                offset += 1;
557                column_offset += 1;
558                return (offset, line_offset, column_offset, true);
559            }
560
561            // scan nested block comments
562            if ch == '/' && next_ch == '*' {
563                let state = self.save_state();
564                self.state.offset += offset - 1;
565                let (nested_offset, nested_line_offset, nested_column_offset, ok) =
566                    self.scan_block_comment();
567                self.restore_state(state);
568                offset += nested_offset - 1;
569                line_offset += nested_line_offset;
570                column_offset += nested_column_offset;
571                if !ok {
572                    return (offset, line_offset, column_offset, false);
573                }
574            }
575        }
576
577        (offset, line_offset, column_offset, true)
578    }
579
580    // scan the next pound comment and return the end offset.
581    fn scan_pound_comment(&mut self) -> usize {
582        let mut offset = 1;
583
584        while self.state.offset + offset < self.input.len() {
585            let ch = self.input[self.state.offset + offset];
586            offset += 1;
587            if ch == '\n' {
588                break;
589            } else if ch == '\r' {
590                if self.state.offset + offset < self.input.len()
591                    && self.input[self.state.offset + offset] as char == '\n'
592                {
593                    offset += 1;
594                }
595                break;
596            }
597        }
598
599        offset
600    }
601}
602
#[cfg(test)]
mod tests {
    use std::{env, fs, path::Path};

    use super::*;

    /// Smoke test: scans the sample Thrift file token by token until `Eof`,
    /// printing every token and flagging the invalid ones. The file is
    /// resolved relative to the current working directory, and the test
    /// panics if it cannot be read.
    #[test]
    fn test_scan() {
        let file_path = env::current_dir()
            .unwrap()
            .join(Path::new("./lib/analyzer/test_file/ThriftTest.thrift"));
        let content: Vec<char> = fs::read_to_string(&file_path).unwrap().chars().collect();
        let mut scanner = Scanner::new(&content);

        let mut reached_eof = false;
        while !reached_eof {
            let (token, err) = scanner.scan();
            println!("{:?}", token);

            reached_eof = token.is_eof();
            if !reached_eof && token.is_invalid() {
                println!("invalid token: {:?}, err: {:?}", token, err)
            }
        }
    }
}