pipeline_script/lexer/
mod.rs

1pub mod iter;
2pub mod position;
3pub mod token;
4
5use crate::lexer::position::Position;
6use crate::lexer::token::Token;
7use std::ops::Add;
8
9/// 词法分析器状态
10#[derive(Debug, Clone, PartialEq)]
11enum State {
12    /// 初始状态
13    Initial,
14    /// 识别标识符状态
15    Identifier,
16    /// 识别数字状态
17    Number,
18    /// 识别小数状态
19    Decimal,
20    /// 识别字符串状态
21    String,
22    /// 识别格式化字符串状态
23    FormatString,
24    /// 识别注释状态
25    Comment,
26    /// 识别多行注释状态
27    MultilineComment,
28    /// 完成状态,带有类型信息
29    Done(TokenType),
30}
/// Coarse token category carried by `State::Done`.
///
/// A plain field-less tag, so it derives `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenType {
    /// An identifier.
    Identifier,
    /// An integer literal.
    Integer,
    /// A floating-point literal.
    Float,
    /// An operator or delimiter.
    Symbol,
}
43
44#[derive(Debug, Clone)]
45pub struct Lexer {
46    file_name: String,
47    chars: Vec<char>,
48    index: usize,
49    col: usize,
50    row: usize,
51    keywords: Vec<&'static str>,
52    /// 当前状态
53    state: State,
54    /// 当前词素缓冲区
55    buffer: String,
56    /// 开始位置
57    start_pos: usize,
58    start_col: usize,
59    start_row: usize,
60}
61
62// impl IntoIterator for Lexer {
63//     type Item = (Token, Position);
64//     type IntoIter = TokenStream;
65//
66//     fn into_iter(self) -> Self::IntoIter {
67//         TokenStream::new(self)
68//     }
69// }
70impl Lexer {
71    pub fn is_eof(&self) -> bool {
72        self.index >= self.chars.len()
73    }
74    pub fn new(file_name: impl Into<String>) -> Self {
75        Self {
76            chars: vec![],
77            index: 0,
78            col: 1,
79            row: 1,
80            file_name: file_name.into(),
81            keywords: vec![
82                "let", "fn", "fun", "return", "if", "while", "import", "else", "val", "var",
83                "break", "continue", "for", "in", "class", "static", "trait", "struct", "extern",
84                "module", "const", "enum", "match",
85            ],
86            state: State::Initial,
87            buffer: String::new(),
88            start_pos: 0,
89            start_col: 1,
90            start_row: 1,
91        }
92    }
93    #[allow(unused)]
94    pub fn get_file_name(&self) -> String {
95        self.file_name.clone()
96    }
97    #[allow(unused)]
98    pub fn line(&self, line: usize) -> String {
99        let mut s = String::new();
100        let mut current_line = 1;
101        for i in &self.chars {
102            if i == &'\n' {
103                current_line += 1;
104                continue;
105            }
106            if current_line == line {
107                s.push(*i);
108            }
109            if current_line > line {
110                break;
111            }
112        }
113        s
114    }
115    #[allow(unused)]
116    pub fn set_chars(&mut self, chars: Vec<char>) {
117        self.chars = chars;
118    }
119
120    /// 开始新的词法分析
121    fn start_token(&mut self) {
122        self.buffer.clear();
123        self.start_pos = self.index;
124        self.start_col = self.col;
125        self.start_row = self.row;
126        self.state = State::Initial;
127    }
128
129    /// 创建token位置
130    fn create_position(&self) -> Position {
131        Position::new(
132            self.start_pos,
133            self.buffer.len(),
134            self.start_row,
135            self.start_col,
136            &self.file_name,
137        )
138    }
139
140    pub fn next_token(&mut self) -> Option<(Token, Position)> {
141        self.start_token();
142
143        while self.index < self.chars.len() {
144            let c = self.current_char().unwrap();
145            let peek = self.peek_char().unwrap_or('\0');
146
147            match &self.state {
148                State::Initial => self.process_initial_state(c, peek),
149                State::Identifier => self.process_identifier_state(c),
150                State::Number => self.process_number_state(c, peek),
151                State::Decimal => self.process_decimal_state(c),
152                State::String => {
153                    if let Some(token) = self.process_string_state(c) {
154                        return Some(token);
155                    }
156                }
157                State::FormatString => {
158                    if let Some(token) = self.process_format_string_state(c) {
159                        return Some(token);
160                    }
161                }
162                State::Comment => {
163                    if c == '\n' {
164                        self.state = State::Initial;
165                    }
166                    self.increase_index();
167                }
168                State::MultilineComment => {
169                    if c == '*' && peek == '/' {
170                        self.increase_index();
171                        self.increase_index();
172                        self.state = State::Initial;
173                    } else {
174                        self.increase_index();
175                    }
176                }
177                State::Done(token_type) => {
178                    return self.finalize_token(*token_type);
179                }
180            }
181
182            // 如果已经到达Done状态,返回结果
183            if let State::Done(token_type) = &self.state {
184                return self.finalize_token(*token_type);
185            }
186        }
187
188        // 处理文件结束情况
189        if !self.buffer.is_empty() {
190            if let State::Identifier = self.state {
191                return self.finalize_token(TokenType::Identifier);
192            } else if let State::Number = self.state {
193                return self.finalize_token(TokenType::Integer);
194            } else if let State::Decimal = self.state {
195                return self.finalize_token(TokenType::Float);
196            } else {
197                // 默认作为标识符处理未完成的token
198                return self.finalize_token(TokenType::Identifier);
199            }
200        }
201
202        None
203    }
204
205    /// 处理初始状态
206    fn process_initial_state(&mut self, c: char, peek: char) {
207        match (c, peek) {
208            ('r', '"') => {
209                self.buffer.push(c);
210                self.buffer.push(peek);
211                self.increase_index();
212                self.increase_index();
213                self.state = State::FormatString;
214            }
215            ('.', p) if !p.is_numeric() => {
216                self.buffer.push(c);
217                self.increase_index();
218                self.state = State::Done(TokenType::Symbol);
219            }
220            ('-','0'..='9') => {
221                self.buffer.push(c);
222                self.increase_index();
223                self.state = State::Number;
224            }
225            ('0'..='9', _) => {
226                self.buffer.push(c);
227                self.increase_index();
228                self.state = State::Number;
229            }
230            ('.', p) if p.is_numeric() => {
231                self.buffer.push(c);
232                self.increase_index();
233                self.state = State::Decimal;
234            }
235            ('a'..='z' | 'A'..='Z' | '_', _) => {
236                self.buffer.push(c);
237                self.increase_index();
238                self.state = State::Identifier;
239            }
240            ('"', _) => {
241                self.buffer.push(c);
242                self.increase_index();
243                self.state = State::String;
244            }
245            ('\'', _) => {
246                self.buffer.push(c);
247                self.increase_index();
248                self.state = State::String;
249            }
250            ('/', '/') => {
251                self.increase_index();
252                self.increase_index();
253                self.state = State::Comment;
254            }
255            ('/', '*') => {
256                self.increase_index();
257                self.increase_index();
258                self.state = State::MultilineComment;
259            }
260            (':', ':') => {
261                self.buffer.push(':');
262                self.buffer.push(':');
263                self.increase_index();
264                self.increase_index();
265                self.state = State::Done(TokenType::Symbol);
266            }
267            ('!', '=') => {
268                self.buffer.push('!');
269                self.buffer.push('=');
270                self.increase_index();
271                self.increase_index();
272                self.state = State::Done(TokenType::Symbol);
273            }
274            ('=', '=') => {
275                self.buffer.push('=');
276                self.buffer.push('=');
277                self.increase_index();
278                self.increase_index();
279                self.state = State::Done(TokenType::Symbol);
280            }
281            ('|', '|') => {
282                self.buffer.push('|');
283                self.buffer.push('|');
284                self.increase_index();
285                self.increase_index();
286                self.state = State::Done(TokenType::Symbol);
287            }
288            ('&', '&') => {
289                self.buffer.push('&');
290                self.buffer.push('&');
291                self.increase_index();
292                self.increase_index();
293                self.state = State::Done(TokenType::Symbol);
294            }
295            ('-', '>') => {
296                self.buffer.push('-');
297                self.buffer.push('>');
298                self.increase_index();
299                self.increase_index();
300                self.state = State::Done(TokenType::Symbol);
301            }
302            ('>', '=') => {
303                self.buffer.push('>');
304                self.buffer.push('=');
305                self.increase_index();
306                self.increase_index();
307                self.state = State::Done(TokenType::Symbol);
308            }
309            ('<', '=') => {
310                self.buffer.push('<');
311                self.buffer.push('=');
312                self.increase_index();
313                self.increase_index();
314                self.state = State::Done(TokenType::Symbol);
315            }
316            ('(', _)
317            | (')', _)
318            | ('{', _)
319            | ('}', _)
320            | ('[', _)
321            | (']', _)
322            | (':', _)
323            | (',', _)
324            | ('|', _)
325            | ('!', _)
326            | ('@', _)
327            | ('=', _)
328            | ('&', _)
329            | ('>', _)
330            | ('<', _)
331            | ('+', _)
332            | ('-', _)
333            | ('*', _)
334            | ('%', _)
335            | ('/', _) => {
336                self.buffer.push(c);
337                self.increase_index();
338                self.state = State::Done(TokenType::Symbol);
339            }
340            (' ' | '\r' | ';' | '\t' | '\n', _) => {
341                self.increase_index();
342            }
343            _ => {
344                println!("Unexpected character: {:?}", c);
345                self.increase_index();
346            }
347        }
348    }
349
350    /// 处理标识符状态
351    fn process_identifier_state(&mut self, c: char) {
352        if c.is_alphanumeric() || c == '_' {
353            self.buffer.push(c);
354            self.increase_index();
355        } else {
356            self.state = State::Done(TokenType::Identifier);
357        }
358    }
359
360    /// 处理数字状态
361    fn process_number_state(&mut self, c: char, peek: char) {
362        if c.is_numeric() {
363            self.buffer.push(c);
364            self.increase_index();
365        } else if c == '.' && peek.is_numeric() {
366            self.buffer.push(c);
367            self.increase_index();
368            self.state = State::Decimal;
369        } else {
370            self.state = State::Done(TokenType::Integer);
371        }
372    }
373
374    /// 处理小数状态
375    fn process_decimal_state(&mut self, c: char) {
376        if c.is_numeric() {
377            self.buffer.push(c);
378            self.increase_index();
379        } else {
380            self.state = State::Done(TokenType::Float);
381        }
382    }
383
384    /// 处理字符串状态
385    fn process_string_state(&mut self, c: char) -> Option<(Token, Position)> {
386        let quote = self.buffer.chars().next().unwrap();
387        if c == quote {
388            self.buffer.push(c);
389            self.increase_index();
390            let pos = self.create_position();
391            let content = self.buffer[1..self.buffer.len() - 1]
392                .to_string()
393                .replace("\\n", "\n");
394            Some((Token::String(content), pos))
395        } else {
396            self.buffer.push(c);
397            self.increase_index();
398            None
399        }
400    }
401
402    /// 处理格式化字符串状态
403    fn process_format_string_state(&mut self, c: char) -> Option<(Token, Position)> {
404        if c == '"' {
405            self.buffer.push(c);
406            self.increase_index();
407            let pos = self.create_position();
408            let content = self.buffer[2..self.buffer.len() - 1]
409                .to_string()
410                .replace("\\n", "\n");
411            Some((Token::FormatString(content), pos))
412        } else {
413            self.buffer.push(c);
414            self.increase_index();
415            None
416        }
417    }
418
419    /// 完成token处理
420    fn finalize_token(&mut self, token_type: TokenType) -> Option<(Token, Position)> {
421        let pos = self.create_position();
422
423        // 根据缓冲区内容和当前状态生成token
424        let token = match token_type {
425            TokenType::Identifier => {
426                let ident = self.buffer.clone();
427                if self.keywords.contains(&ident.as_str()) {
428                    Token::Keyword(ident)
429                } else if ident == "true" {
430                    Token::Boolean(true)
431                } else if ident == "false" {
432                    Token::Boolean(false)
433                } else {
434                    Token::Identifier(ident)
435                }
436            }
437            TokenType::Integer => {
438                let i: i64 = self.buffer.parse().unwrap();
439                Token::Int(i)
440            }
441            TokenType::Float => {
442                let f: f64 = self.buffer.parse().unwrap();
443                Token::Float(f)
444            }
445            TokenType::Symbol => match self.buffer.as_str() {
446                "(" => Token::BraceLeft,
447                ")" => Token::BraceRight,
448                "{" => Token::ParenLeft,
449                "}" => Token::ParenRight,
450                "[" => Token::BracketLeft,
451                "]" => Token::BracketRight,
452                "." => Token::Dot,
453                ":" => Token::Colon,
454                "::" => Token::ScopeSymbol,
455                "=" => Token::Assign,
456                "," => Token::Comma,
457                "+" => Token::Plus,
458                "-" => Token::Minus,
459                "*" => Token::Star,
460                "/" => Token::Slash,
461                "%" => Token::Mod,
462                ">" => Token::Greater,
463                "<" => Token::Less,
464                "<=" => Token::LessEqual,
465                ">=" => Token::GreaterEqual,
466                "==" => Token::Equal,
467                "!=" => Token::NotEqual,
468                "->" => Token::Arrow,
469                "!" => Token::Not,
470                "&&" => Token::And,
471                "||" => Token::Or,
472                "|" => Token::Vertical,
473                "@" => Token::Annotation,
474                "&" => Token::BitAnd,
475                _ => {
476                    println!("Unknown token: {}", self.buffer);
477                    return None;
478                }
479            },
480            // TokenType::String => {
481            //     let content = self.buffer[1..self.buffer.len() - 1]
482            //         .to_string()
483            //         .replace("\\n", "\n");
484            //     Token::String(content)
485            // }
486            // TokenType::FormatString => {
487            //     let content = self.buffer[2..self.buffer.len() - 1]
488            //         .to_string()
489            //         .replace("\\n", "\n");
490            //     Token::FormatString(content)
491            // }
492        };
493
494        Some((token, pos))
495    }
496
497    fn peek_char(&self) -> Option<char> {
498        self.chars.get(self.index + 1).copied()
499    }
500
501    fn current_char(&self) -> Option<char> {
502        self.chars.get(self.index).copied()
503    }
504
505    fn increase_index(&mut self) {
506        if self.index < self.chars.len() && self.chars[self.index] == '\n' {
507            self.row += 1;
508            self.col = 0;
509        }
510        self.index = self.index.add(1);
511        self.col += 1;
512    }
513
514    #[allow(unused)]
515    pub fn get_source(&self) -> Vec<char> {
516        self.chars.clone()
517    }
518
519    pub fn from_script(module_name: impl Into<String>, script: impl AsRef<str>) -> Self {
520        let script = script.as_ref().chars().collect();
521        let mut lexer = Lexer::new(module_name);
522        lexer.set_chars(script);
523        lexer
524    }
525}