rustql_parser/
lexer.rs

1#[allow(clippy::all)]
2use std::str::CharIndices;
3use rustql_common::position::Position;
4use rustql_common::token::TokenKind;
5use crate::{lexer_error, internal_error};
6
7pub struct Lexer<'a> {
8    source: &'a str,
9    iter: CharIndices<'a>,
10    iter_byte_index: usize,
11    iter_char: Option<char>,
12
13    tok: TokenKind,
14    pos: Position,
15    start_pos: Position,
16    end_pos: Position,
17    start_byte_index: usize,
18    end_byte_index: usize,
19}
20
21impl<'a> Lexer<'a> {
22    pub fn new(source: &'a str) -> Self {
23        let mut iter = source.char_indices();
24        let frist_tuple = iter.next();
25        match frist_tuple {
26            Some((index, frist_char)) => {
27                Self {
28                    source,
29                    iter,
30                    iter_char: Some(frist_char),
31                    iter_byte_index: index,
32
33                    tok: TokenKind::Start,
34                    pos: Position::new(),
35                    start_pos: Position::new(),
36                    end_pos: Position::new(),
37                    start_byte_index: 0,
38                    end_byte_index: 0,
39                }
40            }
41            None => {
42                Self {
43                    source,
44                    iter,
45                    iter_char: None,
46                    iter_byte_index: 0,
47
48                    tok: TokenKind::EOFToken,
49                    pos: Position::new(),
50                    start_pos: Position::new(),
51                    end_pos: Position::new(),
52                    start_byte_index: 0,
53                    end_byte_index: 0,
54                }
55            }
56        }
57    }
58    fn is_char(&self, target: char) -> bool {
59        if let Some(ch) = self.get_char() {
60            if ch == target {
61                return true;
62            }
63        }
64        false
65    }
66    fn get_char(&self) -> Option<char> {
67        self.iter_char
68    }
69    fn eat_char(&mut self, mut n: usize) {
70        while n != 0 {
71            if let Some(code) = self.get_char() {
72                match code {
73                    '\n' => {
74                        self.pos.col = 0;
75                        self.pos.row += 1; 
76                    }
77                    _ => {
78                        self.pos.col += 1;
79                    }
80                }
81                self.pos.index += 1;
82                n -= 1;
83                match self.iter.next() {
84                    Some(tuple) => {
85                        self.iter_char = Some(tuple.1);
86                        self.iter_byte_index = tuple.0;
87                    }
88                    None => {
89                        self.iter_char = None; 
90                        self.iter_byte_index = self.source.len();
91                    }
92                }
93            }else {
94                break;
95            }
96        }
97    }
98    fn start_with(&self, pat: &str) -> bool {
99        self.source[self.iter_byte_index..].starts_with(pat)
100    }
101    fn start_token(&mut self) {
102        self.start_byte_index = self.iter_byte_index;
103        self.start_pos = self.pos.clone();
104    }
105    fn finish_token(&mut self) {
106        self.end_byte_index = self.iter_byte_index;
107        self.end_pos = self.pos.clone();
108    }
109    fn skip_ignore_token(&mut self) {
110        while let Some(code) = self.get_char() {
111            match code {
112                '\n' | ' ' | '\t' | ',' | '\r'=> self.eat_char(1) ,
113                _ => break
114            }
115        }
116    }
117    pub fn get_start_pos(&self) -> Position {
118        self.start_pos.clone()
119    }
120    pub fn get_end_pos(&self) -> Position {
121        self.end_pos.clone()
122    }
123    /* this method only used for debug  */
124    pub fn get_pos(&self) -> Position {
125        self.pos.clone()
126    }
127    pub fn get_start_byte_index(&self) -> usize {
128        self.start_byte_index
129    }
130    pub fn get_end_byte_index(&self) -> usize {
131        self.end_byte_index
132    }
133    pub fn get_source_string(&self, start: usize, end: usize) -> &'a str {
134        &self.source[start..end]
135    }
136    pub fn get_value(&self)-> &'a str {
137        &self.source[self.start_byte_index..self.end_byte_index]
138    }
139    pub fn get_token(&mut self) -> TokenKind {
140        if self.tok == TokenKind::Start {
141            self.next_token()
142        }else {
143            self.tok.clone()
144        }
145    }
146    pub fn next_token(&mut self) -> TokenKind {
147        self.skip_ignore_token();
148        self.start_token();
149        self.tok = match self.get_char() {
150            None => {
151                self.finish_token();
152                TokenKind::EOFToken
153            }
154            Some(code) => {
155                match code  {
156                    '!' => {
157                        self.eat_char(1);
158                        self.finish_token();
159                        TokenKind::Point
160                    }
161                    '|' => {
162                        self.eat_char(1);
163                        self.finish_token();
164                        TokenKind::Pipe
165                    }
166                    '$' => {
167                        self.eat_char(1);
168                        self.finish_token();
169                        TokenKind::DollarSign
170                    }
171                    '(' => {
172                        self.eat_char(1);
173                        self.finish_token();
174                        TokenKind::ParenthesesLeft
175                    }
176                    ')' => {
177                        self.eat_char(1);
178                        self.finish_token();
179                        TokenKind::ParenthesesRight
180                    }
181                    ':' => {
182                        self.eat_char(1);
183                        self.finish_token();
184                        TokenKind::Colon
185                    }
186                    '=' => {
187                        self.eat_char(1);
188                        self.finish_token();
189                        TokenKind::Eqal
190                    }
191                    '@' => {
192                        self.eat_char(1);
193                        self.finish_token();
194                        TokenKind::At
195                    }
196                    '[' => {
197                        self.eat_char(1);
198                        self.finish_token();
199                        TokenKind::BracketLeft
200                    }
201                    ']' => {
202                        self.eat_char(1);
203                        self.finish_token();
204                        TokenKind::BracketRight
205                    }
206                    '{' => {
207                        self.eat_char(1);
208                        self.finish_token();
209                        TokenKind::BracesLeft
210                    }
211                    '}' => {
212                        self.eat_char(1);
213                        self.finish_token();
214                        TokenKind::BracesRight
215                    }
216                    '.' => {
217                        self.read_dot()
218                    }
219                    '&' => {
220                        self.eat_char(1);
221                        self.finish_token();
222                        TokenKind::And
223                    }
224                    '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => {
225                        self.read_number()
226                    }
227                    '#' => {
228                        self.read_comment()
229                    }
230                    '\"' => {
231                        if self.start_with("\"\"\"") {
232                            self.read_block_string()
233                        } else {
234                            self.read_string()
235                        }
236                    }
237                    _ => {
238                        if is_name_start(code) {
239                            self.read_name()
240                        }else {
241                            lexer_error!("this char can not be parsed", self);
242                        }
243                    }
244                }
245            }
246        };
247        self.tok.clone()
248    }
249    fn read_dot(&mut self) -> TokenKind {
250        if !self.start_with(".") {
251            internal_error!("unreach code, read_dot function must be called when start with .");
252        }
253        if self.start_with("...") {
254            self.eat_char(3);
255            self.finish_token();
256            return TokenKind::Ellipsis;
257        }
258        self.read_number()
259    }
260    fn read_name(&mut self) -> TokenKind {
261        match self.get_char() {
262            Some(ch) => {
263                if !is_name_start(ch) {
264                    internal_error!(format!("unreach code, read_name must be called with start name char, but got {:?}", ch));
265                }
266                self.eat_char(1);
267            }
268            None => {
269                internal_error!("unreach code, rread_name must be called with start name char, but got EOF");
270            }
271        }
272        while let Some(ch) = self.get_char() {
273            if is_name_body(ch) {
274                self.eat_char(1)
275            }else {
276                break;
277            }
278        }
279        self.finish_token();
280        TokenKind::Name
281
282    }
283    fn read_number(&mut self) -> TokenKind {
284        let mut is_float = false;
285        // Read nagaive
286        if self.is_char('-') {
287            self.eat_char(1);
288        }
289        // Read int part
290        // if not start with 0, caume util not digital
291        // if start with 0, must not start with digial next.
292        if !self.is_char('0') {
293            self.helper_read_digital();
294        }else {
295            self.eat_char(1);
296            if let Some(ch) = self.get_char() {
297                if is_digital(ch) {
298                    lexer_error!("0 can not be followed by digial when it in begin of number", self);
299                }
300            }
301        }
302        // Read dot and if start with dot
303        if self.is_char('.') {
304            self.eat_char(1);
305            is_float = true;
306            self.helper_read_digital();
307        }
308        if self.is_char('e') || self.is_char('E') {
309            self.eat_char(1);
310            is_float = true;
311            if self.is_char('+') || self.is_char('-') {
312                self.eat_char(1);
313            }
314            self.helper_read_digital();
315        }
316        self.finish_token();
317        // next char can not be any char belong to start with name 
318        if let Some(ch) = self.get_char() {
319            if is_name_start(ch) {
320                lexer_error!("number can not be followed by this char", self);
321            }
322        }
323        if is_float { TokenKind::FloatValue } else { TokenKind::IntValue }
324    }
325    fn helper_read_digital(&mut self) {
326        while let Some(ch) = self.get_char() {
327            if is_digital(ch) {
328                self.eat_char(1);
329            }else {
330                break;
331            }
332        }
333    }
334    fn read_comment(&mut self) -> TokenKind {
335        while let Some(ch) = self.get_char() {
336            match ch {
337                '\n' => break,
338                _ => self.eat_char(1)
339            }
340        }
341        self.finish_token();
342        TokenKind::Comment
343    }
344    fn read_string(&mut self) -> TokenKind {
345        if !self.start_with("\"") {
346            internal_error!("unreach code, read_block_string must be call when start with '...'");
347        }
348        self.eat_char(1);
349
350        while !self.start_with("\"")  {
351            match self.get_char() {
352                Some(code) => {
353                    match code {
354                        '\n' => { lexer_error!("non block string can not use lineterminator", self); },
355                        _ => self.eat_char(1)
356                    };
357                }
358                None => {
359                    lexer_error!("unclose string.", self);
360                }
361            }
362        };
363        self.eat_char(1);
364        self.finish_token();
365        TokenKind::StringValue
366    }
367    fn read_block_string(&mut self) -> TokenKind {
368        if !self.start_with("\"\"\"") {
369            internal_error!("unreach code, read_block_string must be call when start with '...'");
370        }
371        self.eat_char(3);
372
373        while !self.start_with("\"\"\"")  {
374            if self.start_with("\\\"\"\"") {
375                self.eat_char(4);
376                continue;
377            }
378            match self.get_char() {
379                Some(_) => {
380                    self.eat_char(1);
381                }
382                None => {
383                    lexer_error!("unclose block string.", self);
384                }
385            }
386        }
387        self.eat_char(3);
388        self.finish_token();
389        TokenKind::StringValue
390    }
391}
392
/// Returns `true` when `ch` is one of the ASCII digits `0`-`9`.
fn is_digital(ch: char) -> bool {
    ch.is_ascii_digit()
}
/// Returns `true` when `ch` may begin a name: an ASCII letter
/// (`a`-`z`, `A`-`Z`) or an underscore.
fn is_name_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || ch == '_'
}
/// Returns `true` when `ch` may appear after the first character of a name:
/// an ASCII letter, an ASCII digit or an underscore.
fn is_name_body(ch: char) -> bool {
    ch.is_ascii_alphanumeric() || ch == '_'
}