pipeline_script/lexer/
mod.rs

1pub mod iter;
2pub mod position;
3pub mod token;
4
5use crate::lexer::position::Position;
6use crate::lexer::token::Token;
7use std::ops::Add;
8
9/// 词法分析器状态
10#[derive(Debug, Clone, PartialEq)]
11enum State {
12    /// 初始状态
13    Initial,
14    /// 识别标识符状态
15    Identifier,
16    /// 识别数字状态
17    Number,
18    /// 识别小数状态
19    Decimal,
20    /// 识别字符串状态
21    String,
22    /// 识别格式化字符串状态
23    FormatString,
24    /// 识别注释状态
25    Comment,
26    /// 识别多行注释状态
27    MultilineComment,
28    /// 完成状态,带有类型信息
29    Done(TokenType),
30}
/// Category of a completed lexeme, used to pick how the buffer is turned
/// into a token.
#[derive(Clone, Copy, Debug, PartialEq)]
enum TokenType {
    /// Identifier (also covers keywords and boolean literals).
    Identifier,
    /// Integer literal.
    Integer,
    /// Floating-point literal.
    Float,
    /// Operator or delimiter.
    Symbol,
}
43
44#[derive(Debug, Clone)]
45pub struct Lexer {
46    file_name: String,
47    chars: Vec<char>,
48    index: usize,
49    col: usize,
50    row: usize,
51    keywords: Vec<&'static str>,
52    /// 当前状态
53    state: State,
54    /// 当前词素缓冲区
55    buffer: String,
56    /// 开始位置
57    start_pos: usize,
58    start_col: usize,
59    start_row: usize,
60}
61
62// impl IntoIterator for Lexer {
63//     type Item = (Token, Position);
64//     type IntoIter = TokenStream;
65//
66//     fn into_iter(self) -> Self::IntoIter {
67//         TokenStream::new(self)
68//     }
69// }
70impl Lexer {
71    pub fn is_eof(&self) -> bool {
72        self.index >= self.chars.len()
73    }
74    pub fn new(file_name: impl Into<String>) -> Self {
75        Self {
76            chars: vec![],
77            index: 0,
78            col: 1,
79            row: 1,
80            file_name: file_name.into(),
81            keywords: vec![
82                "let", "fn", "fun", "return", "if", "while", "import", "else", "val", "var",
83                "break", "continue", "for", "in", "class", "static", "trait", "struct", "extern",
84                "module", "const", "enum", "match",
85            ],
86            state: State::Initial,
87            buffer: String::new(),
88            start_pos: 0,
89            start_col: 1,
90            start_row: 1,
91        }
92    }
93    #[allow(unused)]
94    pub fn get_file_name(&self) -> String {
95        self.file_name.clone()
96    }
97    #[allow(unused)]
98    pub fn line(&self, line: usize) -> String {
99        let mut s = String::new();
100        let mut current_line = 1;
101        for i in &self.chars {
102            if i == &'\n' {
103                current_line += 1;
104                continue;
105            }
106            if current_line == line {
107                s.push(*i);
108            }
109            if current_line > line {
110                break;
111            }
112        }
113        s
114    }
115    #[allow(unused)]
116    pub fn set_chars(&mut self, chars: Vec<char>) {
117        self.chars = chars;
118    }
119
120    /// 开始新的词法分析
121    fn start_token(&mut self) {
122        self.buffer.clear();
123        self.start_pos = self.index;
124        self.start_col = self.col;
125        self.start_row = self.row;
126        self.state = State::Initial;
127    }
128
129    /// 创建token位置
130    fn create_position(&self) -> Position {
131        Position::new(
132            self.start_pos,
133            self.buffer.len(),
134            self.start_row,
135            self.start_col,
136            &self.file_name,
137        )
138    }
139
140    pub fn next_token(&mut self) -> Option<(Token, Position)> {
141        self.start_token();
142
143        while self.index < self.chars.len() {
144            let c = self.current_char().unwrap();
145            let peek = self.peek_char().unwrap_or('\0');
146
147            match &self.state {
148                State::Initial => self.process_initial_state(c, peek),
149                State::Identifier => self.process_identifier_state(c),
150                State::Number => self.process_number_state(c, peek),
151                State::Decimal => self.process_decimal_state(c),
152                State::String => {
153                    if let Some(token) = self.process_string_state(c) {
154                        return Some(token);
155                    }
156                }
157                State::FormatString => {
158                    if let Some(token) = self.process_format_string_state(c) {
159                        return Some(token);
160                    }
161                }
162                State::Comment => {
163                    if c == '\n' {
164                        self.state = State::Initial;
165                    }
166                    self.increase_index();
167                }
168                State::MultilineComment => {
169                    if c == '*' && peek == '/' {
170                        self.increase_index();
171                        self.increase_index();
172                        self.state = State::Initial;
173                    } else {
174                        self.increase_index();
175                    }
176                }
177                State::Done(token_type) => {
178                    return self.finalize_token(*token_type);
179                }
180            }
181
182            // 如果已经到达Done状态,返回结果
183            if let State::Done(token_type) = &self.state {
184                return self.finalize_token(*token_type);
185            }
186        }
187
188        // 处理文件结束情况
189        if !self.buffer.is_empty() {
190            if let State::Identifier = self.state {
191                return self.finalize_token(TokenType::Identifier);
192            } else if let State::Number = self.state {
193                return self.finalize_token(TokenType::Integer);
194            } else if let State::Decimal = self.state {
195                return self.finalize_token(TokenType::Float);
196            } else {
197                // 默认作为标识符处理未完成的token
198                return self.finalize_token(TokenType::Identifier);
199            }
200        }
201
202        None
203    }
204
205    /// 处理初始状态
206    fn process_initial_state(&mut self, c: char, peek: char) {
207        match (c, peek) {
208            ('r', '"') => {
209                self.buffer.push(c);
210                self.buffer.push(peek);
211                self.increase_index();
212                self.increase_index();
213                self.state = State::FormatString;
214            }
215            ('.', p) if !p.is_numeric() => {
216                self.buffer.push(c);
217                self.increase_index();
218                self.state = State::Done(TokenType::Symbol);
219            }
220            ('0'..='9', _) => {
221                self.buffer.push(c);
222                self.increase_index();
223                self.state = State::Number;
224            }
225            ('.', p) if p.is_numeric() => {
226                self.buffer.push(c);
227                self.increase_index();
228                self.state = State::Decimal;
229            }
230            ('a'..='z' | 'A'..='Z' | '_', _) => {
231                self.buffer.push(c);
232                self.increase_index();
233                self.state = State::Identifier;
234            }
235            ('"', _) => {
236                self.buffer.push(c);
237                self.increase_index();
238                self.state = State::String;
239            }
240            ('\'', _) => {
241                self.buffer.push(c);
242                self.increase_index();
243                self.state = State::String;
244            }
245            ('/', '/') => {
246                self.increase_index();
247                self.increase_index();
248                self.state = State::Comment;
249            }
250            ('/', '*') => {
251                self.increase_index();
252                self.increase_index();
253                self.state = State::MultilineComment;
254            }
255            (':', ':') => {
256                self.buffer.push(':');
257                self.buffer.push(':');
258                self.increase_index();
259                self.increase_index();
260                self.state = State::Done(TokenType::Symbol);
261            }
262            ('!', '=') => {
263                self.buffer.push('!');
264                self.buffer.push('=');
265                self.increase_index();
266                self.increase_index();
267                self.state = State::Done(TokenType::Symbol);
268            }
269            ('=', '=') => {
270                self.buffer.push('=');
271                self.buffer.push('=');
272                self.increase_index();
273                self.increase_index();
274                self.state = State::Done(TokenType::Symbol);
275            }
276            ('|', '|') => {
277                self.buffer.push('|');
278                self.buffer.push('|');
279                self.increase_index();
280                self.increase_index();
281                self.state = State::Done(TokenType::Symbol);
282            }
283            ('&', '&') => {
284                self.buffer.push('&');
285                self.buffer.push('&');
286                self.increase_index();
287                self.increase_index();
288                self.state = State::Done(TokenType::Symbol);
289            }
290            ('-', '>') => {
291                self.buffer.push('-');
292                self.buffer.push('>');
293                self.increase_index();
294                self.increase_index();
295                self.state = State::Done(TokenType::Symbol);
296            }
297            ('>', '=') => {
298                self.buffer.push('>');
299                self.buffer.push('=');
300                self.increase_index();
301                self.increase_index();
302                self.state = State::Done(TokenType::Symbol);
303            }
304            ('<', '=') => {
305                self.buffer.push('<');
306                self.buffer.push('=');
307                self.increase_index();
308                self.increase_index();
309                self.state = State::Done(TokenType::Symbol);
310            }
311            ('(', _)
312            | (')', _)
313            | ('{', _)
314            | ('}', _)
315            | ('[', _)
316            | (']', _)
317            | (':', _)
318            | (',', _)
319            | ('|', _)
320            | ('!', _)
321            | ('@', _)
322            | ('=', _)
323            | ('&', _)
324            | ('>', _)
325            | ('<', _)
326            | ('+', _)
327            | ('-', _)
328            | ('*', _)
329            | ('%', _)
330            | ('/', _) => {
331                self.buffer.push(c);
332                self.increase_index();
333                self.state = State::Done(TokenType::Symbol);
334            }
335            (' ' | '\r' | ';' | '\t' | '\n', _) => {
336                self.increase_index();
337            }
338            _ => {
339                println!("Unexpected character: {:?}", c);
340                self.increase_index();
341            }
342        }
343    }
344
345    /// 处理标识符状态
346    fn process_identifier_state(&mut self, c: char) {
347        if c.is_alphanumeric() || c == '_' {
348            self.buffer.push(c);
349            self.increase_index();
350        } else {
351            self.state = State::Done(TokenType::Identifier);
352        }
353    }
354
355    /// 处理数字状态
356    fn process_number_state(&mut self, c: char, peek: char) {
357        if c.is_numeric() {
358            self.buffer.push(c);
359            self.increase_index();
360        } else if c == '.' && peek.is_numeric() {
361            self.buffer.push(c);
362            self.increase_index();
363            self.state = State::Decimal;
364        } else {
365            self.state = State::Done(TokenType::Integer);
366        }
367    }
368
369    /// 处理小数状态
370    fn process_decimal_state(&mut self, c: char) {
371        if c.is_numeric() {
372            self.buffer.push(c);
373            self.increase_index();
374        } else {
375            self.state = State::Done(TokenType::Float);
376        }
377    }
378
379    /// 处理字符串状态
380    fn process_string_state(&mut self, c: char) -> Option<(Token, Position)> {
381        let quote = self.buffer.chars().next().unwrap();
382        if c == quote {
383            self.buffer.push(c);
384            self.increase_index();
385            let pos = self.create_position();
386            let content = self.buffer[1..self.buffer.len() - 1]
387                .to_string()
388                .replace("\\n", "\n");
389            Some((Token::String(content), pos))
390        } else {
391            self.buffer.push(c);
392            self.increase_index();
393            None
394        }
395    }
396
397    /// 处理格式化字符串状态
398    fn process_format_string_state(&mut self, c: char) -> Option<(Token, Position)> {
399        if c == '"' {
400            self.buffer.push(c);
401            self.increase_index();
402            let pos = self.create_position();
403            let content = self.buffer[2..self.buffer.len() - 1]
404                .to_string()
405                .replace("\\n", "\n");
406            Some((Token::FormatString(content), pos))
407        } else {
408            self.buffer.push(c);
409            self.increase_index();
410            None
411        }
412    }
413
414    /// 完成token处理
415    fn finalize_token(&mut self, token_type: TokenType) -> Option<(Token, Position)> {
416        let pos = self.create_position();
417
418        // 根据缓冲区内容和当前状态生成token
419        let token = match token_type {
420            TokenType::Identifier => {
421                let ident = self.buffer.clone();
422                if self.keywords.contains(&ident.as_str()) {
423                    Token::Keyword(ident)
424                } else if ident == "true" {
425                    Token::Boolean(true)
426                } else if ident == "false" {
427                    Token::Boolean(false)
428                } else {
429                    Token::Identifier(ident)
430                }
431            }
432            TokenType::Integer => {
433                let i: i64 = self.buffer.parse().unwrap();
434                Token::Int(i)
435            }
436            TokenType::Float => {
437                let f: f32 = self.buffer.parse().unwrap();
438                Token::Float(f)
439            }
440            TokenType::Symbol => match self.buffer.as_str() {
441                "(" => Token::BraceLeft,
442                ")" => Token::BraceRight,
443                "{" => Token::ParenLeft,
444                "}" => Token::ParenRight,
445                "[" => Token::BracketLeft,
446                "]" => Token::BracketRight,
447                "." => Token::Dot,
448                ":" => Token::Colon,
449                "::" => Token::ScopeSymbol,
450                "=" => Token::Assign,
451                "," => Token::Comma,
452                "+" => Token::Plus,
453                "-" => Token::Minus,
454                "*" => Token::Star,
455                "/" => Token::Slash,
456                "%" => Token::Mod,
457                ">" => Token::Greater,
458                "<" => Token::Less,
459                "<=" => Token::LessEqual,
460                ">=" => Token::GreaterEqual,
461                "==" => Token::Equal,
462                "!=" => Token::NotEqual,
463                "->" => Token::Arrow,
464                "!" => Token::Not,
465                "&&" => Token::And,
466                "||" => Token::Or,
467                "|" => Token::Vertical,
468                "@" => Token::Annotation,
469                "&" => Token::BitAnd,
470                _ => {
471                    println!("Unknown token: {}", self.buffer);
472                    return None;
473                }
474            },
475            // TokenType::String => {
476            //     let content = self.buffer[1..self.buffer.len() - 1]
477            //         .to_string()
478            //         .replace("\\n", "\n");
479            //     Token::String(content)
480            // }
481            // TokenType::FormatString => {
482            //     let content = self.buffer[2..self.buffer.len() - 1]
483            //         .to_string()
484            //         .replace("\\n", "\n");
485            //     Token::FormatString(content)
486            // }
487        };
488
489        Some((token, pos))
490    }
491
492    fn peek_char(&self) -> Option<char> {
493        self.chars.get(self.index + 1).copied()
494    }
495
496    fn current_char(&self) -> Option<char> {
497        self.chars.get(self.index).copied()
498    }
499
500    fn increase_index(&mut self) {
501        if self.index < self.chars.len() && self.chars[self.index] == '\n' {
502            self.row += 1;
503            self.col = 0;
504        }
505        self.index = self.index.add(1);
506        self.col += 1;
507    }
508
509    #[allow(unused)]
510    pub fn get_source(&self) -> Vec<char> {
511        self.chars.clone()
512    }
513
514    pub fn from_script(module_name: impl Into<String>, script: impl AsRef<str>) -> Self {
515        let script = script.as_ref().chars().collect();
516        let mut lexer = Lexer::new(module_name);
517        lexer.set_chars(script);
518        lexer
519    }
520}