1#[allow(clippy::all)]
2use std::str::CharIndices;
3use rustql_common::position::Position;
4use rustql_common::token::TokenKind;
5use crate::{lexer_error, internal_error};
6
7pub struct Lexer<'a> {
8 source: &'a str,
9 iter: CharIndices<'a>,
10 iter_byte_index: usize,
11 iter_char: Option<char>,
12
13 tok: TokenKind,
14 pos: Position,
15 start_pos: Position,
16 end_pos: Position,
17 start_byte_index: usize,
18 end_byte_index: usize,
19}
20
21impl<'a> Lexer<'a> {
22 pub fn new(source: &'a str) -> Self {
23 let mut iter = source.char_indices();
24 let frist_tuple = iter.next();
25 match frist_tuple {
26 Some((index, frist_char)) => {
27 Self {
28 source,
29 iter,
30 iter_char: Some(frist_char),
31 iter_byte_index: index,
32
33 tok: TokenKind::Start,
34 pos: Position::new(),
35 start_pos: Position::new(),
36 end_pos: Position::new(),
37 start_byte_index: 0,
38 end_byte_index: 0,
39 }
40 }
41 None => {
42 Self {
43 source,
44 iter,
45 iter_char: None,
46 iter_byte_index: 0,
47
48 tok: TokenKind::EOFToken,
49 pos: Position::new(),
50 start_pos: Position::new(),
51 end_pos: Position::new(),
52 start_byte_index: 0,
53 end_byte_index: 0,
54 }
55 }
56 }
57 }
58 fn is_char(&self, target: char) -> bool {
59 if let Some(ch) = self.get_char() {
60 if ch == target {
61 return true;
62 }
63 }
64 false
65 }
66 fn get_char(&self) -> Option<char> {
67 self.iter_char
68 }
69 fn eat_char(&mut self, mut n: usize) {
70 while n != 0 {
71 if let Some(code) = self.get_char() {
72 match code {
73 '\n' => {
74 self.pos.col = 0;
75 self.pos.row += 1;
76 }
77 _ => {
78 self.pos.col += 1;
79 }
80 }
81 self.pos.index += 1;
82 n -= 1;
83 match self.iter.next() {
84 Some(tuple) => {
85 self.iter_char = Some(tuple.1);
86 self.iter_byte_index = tuple.0;
87 }
88 None => {
89 self.iter_char = None;
90 self.iter_byte_index = self.source.len();
91 }
92 }
93 }else {
94 break;
95 }
96 }
97 }
98 fn start_with(&self, pat: &str) -> bool {
99 self.source[self.iter_byte_index..].starts_with(pat)
100 }
101 fn start_token(&mut self) {
102 self.start_byte_index = self.iter_byte_index;
103 self.start_pos = self.pos.clone();
104 }
105 fn finish_token(&mut self) {
106 self.end_byte_index = self.iter_byte_index;
107 self.end_pos = self.pos.clone();
108 }
109 fn skip_ignore_token(&mut self) {
110 while let Some(code) = self.get_char() {
111 match code {
112 '\n' | ' ' | '\t' | ',' | '\r'=> self.eat_char(1) ,
113 _ => break
114 }
115 }
116 }
117 pub fn get_start_pos(&self) -> Position {
118 self.start_pos.clone()
119 }
120 pub fn get_end_pos(&self) -> Position {
121 self.end_pos.clone()
122 }
123 pub fn get_pos(&self) -> Position {
125 self.pos.clone()
126 }
127 pub fn get_start_byte_index(&self) -> usize {
128 self.start_byte_index
129 }
130 pub fn get_end_byte_index(&self) -> usize {
131 self.end_byte_index
132 }
133 pub fn get_source_string(&self, start: usize, end: usize) -> &'a str {
134 &self.source[start..end]
135 }
136 pub fn get_value(&self)-> &'a str {
137 &self.source[self.start_byte_index..self.end_byte_index]
138 }
139 pub fn get_token(&mut self) -> TokenKind {
140 if self.tok == TokenKind::Start {
141 self.next_token()
142 }else {
143 self.tok.clone()
144 }
145 }
146 pub fn next_token(&mut self) -> TokenKind {
147 self.skip_ignore_token();
148 self.start_token();
149 self.tok = match self.get_char() {
150 None => {
151 self.finish_token();
152 TokenKind::EOFToken
153 }
154 Some(code) => {
155 match code {
156 '!' => {
157 self.eat_char(1);
158 self.finish_token();
159 TokenKind::Point
160 }
161 '|' => {
162 self.eat_char(1);
163 self.finish_token();
164 TokenKind::Pipe
165 }
166 '$' => {
167 self.eat_char(1);
168 self.finish_token();
169 TokenKind::DollarSign
170 }
171 '(' => {
172 self.eat_char(1);
173 self.finish_token();
174 TokenKind::ParenthesesLeft
175 }
176 ')' => {
177 self.eat_char(1);
178 self.finish_token();
179 TokenKind::ParenthesesRight
180 }
181 ':' => {
182 self.eat_char(1);
183 self.finish_token();
184 TokenKind::Colon
185 }
186 '=' => {
187 self.eat_char(1);
188 self.finish_token();
189 TokenKind::Eqal
190 }
191 '@' => {
192 self.eat_char(1);
193 self.finish_token();
194 TokenKind::At
195 }
196 '[' => {
197 self.eat_char(1);
198 self.finish_token();
199 TokenKind::BracketLeft
200 }
201 ']' => {
202 self.eat_char(1);
203 self.finish_token();
204 TokenKind::BracketRight
205 }
206 '{' => {
207 self.eat_char(1);
208 self.finish_token();
209 TokenKind::BracesLeft
210 }
211 '}' => {
212 self.eat_char(1);
213 self.finish_token();
214 TokenKind::BracesRight
215 }
216 '.' => {
217 self.read_dot()
218 }
219 '&' => {
220 self.eat_char(1);
221 self.finish_token();
222 TokenKind::And
223 }
224 '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => {
225 self.read_number()
226 }
227 '#' => {
228 self.read_comment()
229 }
230 '\"' => {
231 if self.start_with("\"\"\"") {
232 self.read_block_string()
233 } else {
234 self.read_string()
235 }
236 }
237 _ => {
238 if is_name_start(code) {
239 self.read_name()
240 }else {
241 lexer_error!("this char can not be parsed", self);
242 }
243 }
244 }
245 }
246 };
247 self.tok.clone()
248 }
249 fn read_dot(&mut self) -> TokenKind {
250 if !self.start_with(".") {
251 internal_error!("unreach code, read_dot function must be called when start with .");
252 }
253 if self.start_with("...") {
254 self.eat_char(3);
255 self.finish_token();
256 return TokenKind::Ellipsis;
257 }
258 self.read_number()
259 }
260 fn read_name(&mut self) -> TokenKind {
261 match self.get_char() {
262 Some(ch) => {
263 if !is_name_start(ch) {
264 internal_error!(format!("unreach code, read_name must be called with start name char, but got {:?}", ch));
265 }
266 self.eat_char(1);
267 }
268 None => {
269 internal_error!("unreach code, rread_name must be called with start name char, but got EOF");
270 }
271 }
272 while let Some(ch) = self.get_char() {
273 if is_name_body(ch) {
274 self.eat_char(1)
275 }else {
276 break;
277 }
278 }
279 self.finish_token();
280 TokenKind::Name
281
282 }
283 fn read_number(&mut self) -> TokenKind {
284 let mut is_float = false;
285 if self.is_char('-') {
287 self.eat_char(1);
288 }
289 if !self.is_char('0') {
293 self.helper_read_digital();
294 }else {
295 self.eat_char(1);
296 if let Some(ch) = self.get_char() {
297 if is_digital(ch) {
298 lexer_error!("0 can not be followed by digial when it in begin of number", self);
299 }
300 }
301 }
302 if self.is_char('.') {
304 self.eat_char(1);
305 is_float = true;
306 self.helper_read_digital();
307 }
308 if self.is_char('e') || self.is_char('E') {
309 self.eat_char(1);
310 is_float = true;
311 if self.is_char('+') || self.is_char('-') {
312 self.eat_char(1);
313 }
314 self.helper_read_digital();
315 }
316 self.finish_token();
317 if let Some(ch) = self.get_char() {
319 if is_name_start(ch) {
320 lexer_error!("number can not be followed by this char", self);
321 }
322 }
323 if is_float { TokenKind::FloatValue } else { TokenKind::IntValue }
324 }
325 fn helper_read_digital(&mut self) {
326 while let Some(ch) = self.get_char() {
327 if is_digital(ch) {
328 self.eat_char(1);
329 }else {
330 break;
331 }
332 }
333 }
334 fn read_comment(&mut self) -> TokenKind {
335 while let Some(ch) = self.get_char() {
336 match ch {
337 '\n' => break,
338 _ => self.eat_char(1)
339 }
340 }
341 self.finish_token();
342 TokenKind::Comment
343 }
344 fn read_string(&mut self) -> TokenKind {
345 if !self.start_with("\"") {
346 internal_error!("unreach code, read_block_string must be call when start with '...'");
347 }
348 self.eat_char(1);
349
350 while !self.start_with("\"") {
351 match self.get_char() {
352 Some(code) => {
353 match code {
354 '\n' => { lexer_error!("non block string can not use lineterminator", self); },
355 _ => self.eat_char(1)
356 };
357 }
358 None => {
359 lexer_error!("unclose string.", self);
360 }
361 }
362 };
363 self.eat_char(1);
364 self.finish_token();
365 TokenKind::StringValue
366 }
367 fn read_block_string(&mut self) -> TokenKind {
368 if !self.start_with("\"\"\"") {
369 internal_error!("unreach code, read_block_string must be call when start with '...'");
370 }
371 self.eat_char(3);
372
373 while !self.start_with("\"\"\"") {
374 if self.start_with("\\\"\"\"") {
375 self.eat_char(4);
376 continue;
377 }
378 match self.get_char() {
379 Some(_) => {
380 self.eat_char(1);
381 }
382 None => {
383 lexer_error!("unclose block string.", self);
384 }
385 }
386 }
387 self.eat_char(3);
388 self.finish_token();
389 TokenKind::StringValue
390 }
391}
392
/// Returns `true` for the ASCII decimal digits `0` through `9` only.
fn is_digital(ch: char) -> bool {
    ch.is_ascii_digit()
}
/// Returns `true` when `ch` may begin a name: an ASCII letter (either case)
/// or an underscore.
fn is_name_start(ch: char) -> bool {
    ch == '_' || ch.is_ascii_alphabetic()
}
403fn is_name_body(ch: char) -> bool {
404 if is_digital(ch) || is_name_start(ch) {
405 return true
406 }
407 false
408}