uscan/
scanner.rs

1use std::io::Write;
2
/// Floating-point type holding the parsed value of a number literal
/// (see `TokenType::NumberLiteral`).
pub type Number = f64;
4
/// Error reported when the scanner cannot tokenize its input.
///
/// Both variants carry `(line, offset)`: the line number (the scanner counts
/// lines from 1) and the start offset, in characters, of the offending token.
///
/// NOTE(review): the scanner fills `offset` with the value it stores in
/// `ScannerData::token_start`, which is an index into the whole source rather
/// than a column inside the line — confirm which one consumers expect.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanError {
    /// Unrecognized token.
    UnknownToken(usize, usize),
    /// End of file reached before the end of the current token
    /// (for example, an unterminated string).
    UnexpectedEof(usize, usize),
}
14
15impl std::fmt::Display for ScanError {
16    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
17        let (line, offset) = match self {
18            ScanError::UnknownToken(line, offset) => (line, offset),
19            ScanError::UnexpectedEof(line, offset) => (line, offset),
20        };
21        write!(
22            f,
23            "{}:{} : {}",
24            line,
25            offset,
26            match self {
27                ScanError::UnknownToken(_, _) => "unknown token",
28                ScanError::UnexpectedEof(_, _) => "unexpected end of file",
29            }
30        )
31    }
32}
33
34#[derive(Debug, PartialEq)]
35pub enum TokenType {
36    /// a symbol from the symbols list
37    Symbol(String),
38    /// an identifier
39    Identifier(String),
40    /// a string litteral. value is the string value, without the delimiting quotes
41    StringLiteral(String),
42    /// a number literal, with its string representation in the code and its parsed value
43    NumberLiteral(String, Number),
44    /// a keyword from the keywords list
45    Keyword(String),
46    /// a single or multi-line comment. The value contains the delimiting characters.
47    Comment(String),
48    /// space, tabulations, ...
49    Ignore,
50    /// a newline character
51    NewLine,
52    Eof,
53    /// only if Scanner::run returns an error
54    Unknown,
55}
56
57impl TokenType {
58    pub fn len(&self) -> usize {
59        match self {
60            TokenType::Symbol(s) => s.len(),
61            TokenType::Identifier(s) => s.len(),
62            TokenType::StringLiteral(s) => s.len() + 2,
63            TokenType::Keyword(s) => s.len(),
64            TokenType::NumberLiteral(s, _) => s.len(),
65            TokenType::Comment(s) => s.len(),
66            _ => 0,
67        }
68    }
69}
70
71#[derive(Default)]
72pub struct ScannerData {
73    /// complete source code
74    pub source: Vec<char>,
75    /// resulting list of tokens
76    pub token_types: Vec<TokenType>,
77    /// token start line in the source code
78    pub token_lines: Vec<usize>,
79    /// token start offset from its line beginning
80    pub token_start: Vec<usize>,
81    /// token length in characters (not in bytes!)
82    /// not always = token value's length.
83    /// For example for TokenType::StringLiteral("aa") the value length is 2 but the token length including the quotes is 4
84    /// Also when using unicode,  the length of "à" in bytes is 4, but the token_len is 3
85    pub token_len: Vec<usize>,
86}
87
88impl ScannerData {
89    pub fn dump(&self, out: &mut dyn Write) {
90        for (i, token) in self.token_types.iter().enumerate() {
91            writeln!(out, "[#{:03} line {}] {:?}", i, self.token_lines[i], *token).ok();
92        }
93    }
94}
95
/// Cursor state of the scanner while it walks through the source characters.
#[derive(Debug, Default)]
pub struct Scanner {
    /// Position in the source where the current token starts.
    start: usize,
    /// Position in the source of the next character to examine.
    current: usize,
    /// Current line in the file.
    line: usize,
}
105
/// Language description driving the scanner.
///
/// Keywords and symbols are tried in list order and the first match wins, which
/// is why both lists must be ordered by descending length (otherwise `=` would
/// shadow `==`).
#[derive(Debug, Clone, Copy)]
pub struct ScannerConfig {
    /// List of keywords, ordered by descending length.
    pub keywords: &'static [&'static str],
    /// List of symbols, ordered by descending length.
    pub symbols: &'static [&'static str],
    /// Token starting a single line comment.
    pub single_line_cmt: Option<&'static str>,
    /// Token starting a multi line comment.
    pub multi_line_cmt_start: Option<&'static str>,
    /// Token ending a multi line comment.
    pub multi_line_cmt_end: Option<&'static str>,
}
118
119impl Scanner {
120    /// scan the provided source code and return a list of tokens in the ScannerData structure.
121    /// The ScannerData is not returned in the Result because we want it even when there is a scan error.
122    /// We don't return an iterator because the parser needs to easily move back and forth in the token list
123    pub fn run(
124        &mut self,
125        source: &str,
126        config: &ScannerConfig,
127        data: &mut ScannerData,
128    ) -> Result<(), ScanError> {
129        data.source = source.chars().collect();
130        self.current = 0;
131        self.line = 1;
132        self.start = self.current;
133        let mut exit = false;
134        while !exit {
135            let token = self.scan_token(data, config)?;
136            match token {
137                TokenType::Eof => exit = true,
138                TokenType::Ignore => self.start = self.current,
139                TokenType::NewLine => (),
140                _ => self.add_token(token, data),
141            }
142        }
143        Ok(())
144    }
145    fn add_token(&mut self, token: TokenType, data: &mut ScannerData) {
146        data.token_start.push(self.start);
147        data.token_len.push(self.current - self.start);
148        data.token_types.push(token);
149        data.token_lines.push(self.line);
150        self.start = self.current;
151    }
152    fn scan_token(
153        &mut self,
154        data: &mut ScannerData,
155        config: &ScannerConfig,
156    ) -> Result<TokenType, ScanError> {
157        if self.current >= data.source.len() {
158            return Ok(TokenType::Eof);
159        }
160        if let Some(token) = self.scan_comment(config, data) {
161            return Ok(token);
162        }
163        if let Some(token) = self.scan_newline(data) {
164            return Ok(token);
165        }
166        if let Some(token) = self.scan_space(data) {
167            return Ok(token);
168        }
169        if let Some(token) = self.scan_symbol(data, config) {
170            return Ok(token);
171        }
172        if let Some(token) = self.scan_keyword(data, config) {
173            return Ok(token);
174        }
175        if let Some(token) = self.scan_string(data)? {
176            return Ok(token);
177        }
178        if let Some(token) = self.scan_identifier(data) {
179            return Ok(token);
180        }
181        if let Some(token) = self.scan_number(data) {
182            return Ok(token);
183        }
184        data.token_len.push(1);
185        data.token_start.push(self.current);
186        data.token_types.push(TokenType::Unknown);
187        data.token_lines.push(self.line);
188        let token_id = data.token_len.len() - 1;
189        Err(ScanError::UnknownToken(
190            self.line,
191            data.token_start[token_id],
192        ))
193    }
194    fn scan_comment(
195        &mut self,
196        config: &ScannerConfig,
197        data: &mut ScannerData,
198    ) -> Option<TokenType> {
199        if let Some(multi_start) = config.multi_line_cmt_start {
200            if self.matches(multi_start, data) {
201                if let Some(multi_end) = config.multi_line_cmt_end {
202                    return self.scan_multi_line_comment(multi_start, multi_end, data);
203                }
204            }
205        }
206        if let Some(single_start) = config.single_line_cmt {
207            if self.matches(single_start, data) {
208                return self.scan_single_line_comment(data);
209            }
210        }
211        None
212    }
213    fn scan_single_line_comment(&mut self, data: &mut ScannerData) -> Option<TokenType> {
214        let source_len = data.source.len();
215        while self.current < source_len && data.source[self.current] != '\n' {
216            self.current += 1;
217        }
218        let end=self.current;
219        if self.current < source_len {
220            self.current += 1;
221            self.line += 1;
222        }
223        return Some(TokenType::Comment(
224            data.source[self.start..end]
225                .iter()
226                .cloned()
227                .collect::<String>(),
228        ));
229    }
230    fn scan_multi_line_comment(
231        &mut self,
232        multi_start: &str,
233        multi_end: &str,
234        data: &mut ScannerData,
235    ) -> Option<TokenType> {
236        let mut level = 0;
237        let mut in_string = false;
238        let mut escape = false;
239        while self.current < data.source.len() {
240            let c = data.source[self.current];
241            if c == '\n' {
242                self.line += 1;
243            } else if c == '\\' && !escape {
244                escape = true;
245            } else {
246                if c == '\"' && !escape {
247                    in_string = !in_string;
248                } else if !in_string {
249                    if self.matches(multi_end, data) {
250                        level -= 1;
251                        self.current += multi_end.len() - 1;
252                        if level == 0 {
253                            self.current += 1;
254                            return Some(TokenType::Comment(
255                                data.source[self.start..self.current]
256                                    .iter()
257                                    .cloned()
258                                    .collect::<String>(),
259                            ));
260                        }
261                    } else if self.matches(multi_start, data) {
262                        self.current += multi_start.len() - 1;
263                        level += 1;
264                    }
265                }
266                escape = false;
267            }
268            self.current += 1;
269        }
270        self.add_token(
271            TokenType::Comment(
272                data.source[self.start..self.current - 1]
273                    .iter()
274                    .cloned()
275                    .collect::<String>(),
276            ),
277            data,
278        );
279        Some(TokenType::Eof)
280    }
281    fn scan_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
282        if is_digit(data.source[self.current]) {
283            let source_len = data.source.len();
284            if self.current < source_len - 2 {
285                if data.source[self.current + 1] == 'x' || data.source[self.current + 1] == 'X' {
286                    self.current += 2;
287                    return self.scan_hex_number(data);
288                } else if data.source[self.current + 1] == 'b'
289                    || data.source[self.current + 1] == 'B'
290                {
291                    self.current += 2;
292                    return self.scan_binary_number(data);
293                }
294            }
295            let mut number = 0.0;
296            let mut value = String::new();
297            while self.current < source_len && is_digit(data.source[self.current]) {
298                let c = data.source[self.current];
299                value.push(c);
300                number = number * 10.0 + Number::from((c as u8) - b'0');
301                self.current += 1;
302            }
303            if self.current < source_len - 1
304                && data.source[self.current] == '.'
305                && is_digit(data.source[self.current + 1])
306            {
307                self.current += 1;
308                value.push('.');
309                let mut div = 1.0;
310                while self.current < source_len && is_digit(data.source[self.current]) {
311                    let c = data.source[self.current];
312                    value.push(c);
313                    number = number * 10.0 + Number::from((c as u8) - b'0');
314                    self.current += 1;
315                    div *= 10.0;
316                }
317                number /= div;
318            }
319            return Some(TokenType::NumberLiteral(value, number));
320        }
321        None
322    }
323    fn scan_binary_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
324        let mut number = 0.0;
325        let mut value = String::new();
326        loop {
327            let c = data.source[self.current];
328            match c {
329                '0' | '1' => {
330                    number = number * 2.0 + Number::from((c as u8) - b'0');
331                    value.push(c);
332                }
333                _ => break,
334            }
335            self.current += 1;
336            if self.current == data.source.len() {
337                break;
338            }
339        }
340        Some(TokenType::NumberLiteral(format!("0b{}", value), number))
341    }
342    fn scan_hex_number(&mut self, data: &mut ScannerData) -> Option<TokenType> {
343        let mut number = 0.0;
344        let mut value = String::new();
345        loop {
346            let c = data.source[self.current];
347            match c {
348                '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => {
349                    number = number * 16.0 + Number::from((c as u8) - b'0');
350                    value.push(c);
351                }
352                'a' | 'b' | 'c' | 'd' | 'e' | 'f' => {
353                    number = number * 16.0 + Number::from((c as u8) - b'a' + 10);
354                    value.push(c);
355                }
356                'A' | 'B' | 'C' | 'D' | 'E' | 'F' => {
357                    number = number * 16.0 + Number::from((c as u8) - b'A' + 10);
358                    value.push(c);
359                }
360                _ => break,
361            }
362            self.current += 1;
363            if self.current == data.source.len() {
364                break;
365            }
366        }
367        Some(TokenType::NumberLiteral(format!("0x{}", value), number))
368    }
369    fn scan_identifier(&mut self, data: &mut ScannerData) -> Option<TokenType> {
370        if is_alpha(data.source[self.current]) {
371            let mut value = String::new();
372            while self.current < data.source.len() && is_alphanum(data.source[self.current]) {
373                value.push(data.source[self.current]);
374                self.current += 1;
375            }
376            return Some(TokenType::Identifier(value));
377        }
378        None
379    }
380    fn scan_space(&mut self, data: &mut ScannerData) -> Option<TokenType> {
381        let start = self.current;
382        while self.current < data.source.len() && is_space(data.source[self.current]) {
383            self.current += 1;
384        }
385        if start == self.current {
386            return None;
387        }
388        Some(TokenType::Ignore)
389    }
390    fn scan_string(&mut self, data: &mut ScannerData) -> Result<Option<TokenType>, ScanError> {
391        if data.source[self.current] == '\"' {
392            self.current += 1;
393            let mut escape = false;
394            let mut value = String::new();
395            while self.current < data.source.len() {
396                let c = data.source[self.current];
397                if c == '\\' && !escape {
398                    escape = true;
399                } else {
400                    if c == '\"' && !escape {
401                        self.current += 1;
402                        return Ok(Some(TokenType::StringLiteral(value)));
403                    } else if c == 'n' && escape {
404                        value.push('\n');
405                    } else if c == 't' && escape {
406                        value.push('\t');
407                    } else {
408                        value.push(c);
409                        if c == '\n' {
410                            self.line += 1;
411                        }
412                    }
413                    escape = false;
414                }
415                self.current += 1;
416            }
417            data.token_len.push(data.source.len() - self.start + 1);
418            data.token_start.push(self.start);
419            data.token_types.push(TokenType::StringLiteral(value));
420            data.token_lines.push(self.line);
421            let token_id = data.token_len.len() - 1;
422            return Err(ScanError::UnexpectedEof(
423                self.line,
424                data.token_start[token_id],
425            ));
426        }
427        Ok(None)
428    }
429    fn scan_newline(&mut self, data: &ScannerData) -> Option<TokenType> {
430        if data.source[self.current] == '\n' {
431            self.current += 1;
432            self.line += 1;
433            return Some(TokenType::NewLine);
434        }
435        None
436    }
437    fn scan_symbol(&mut self, data: &ScannerData, config: &ScannerConfig) -> Option<TokenType> {
438        for s in config.symbols.iter() {
439            if self.matches(s, data) {
440                self.current += s.len();
441                return Some(TokenType::Symbol((*s).to_owned()));
442            }
443        }
444        None
445    }
446    fn scan_keyword(&mut self, data: &ScannerData, config: &ScannerConfig) -> Option<TokenType> {
447        let source_len = data.source.len();
448        for s in config.keywords.iter() {
449            let keyword_len = s.len();
450            if self.matches(s, data)
451                && (self.current + keyword_len >= source_len
452                    || !is_alphanum(data.source[self.current + keyword_len]))
453            {
454                self.current += s.len();
455                return Some(TokenType::Keyword((*s).to_owned()));
456            }
457        }
458        None
459    }
460    fn matches(&self, s: &str, data: &ScannerData) -> bool {
461        let mut check = true;
462        let source_len = data.source.len();
463        for (i, c) in s.chars().enumerate() {
464            if self.current + i >= source_len || data.source[self.current + i] != c {
465                check = false;
466                break;
467            }
468        }
469        check
470    }
471}
472
/// True for the ASCII decimal digits `0` through `9`.
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
476
/// True for ASCII letters and the underscore, i.e. the characters that may
/// start an identifier.
fn is_alpha(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}
480
481fn is_alphanum(c: char) -> bool {
482    is_digit(c) || is_alpha(c)
483}
484
/// True for the blank characters skipped between tokens: space, tab and
/// carriage return. The newline is excluded on purpose.
fn is_space(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\r')
}