pipeline_script/lexer/
mod.rs

1pub mod iter;
2pub mod position;
3pub mod token;
4
5use crate::lexer::position::Position;
6use crate::lexer::token::Token;
7use std::ops::Add;
8
9/// 词法分析器状态
10#[derive(Debug, Clone, PartialEq)]
11enum State {
12    /// 初始状态
13    Initial,
14    /// 识别标识符状态
15    Identifier,
16    /// 识别数字状态
17    Number,
18    /// 识别小数状态
19    Decimal,
20    /// 识别字符串状态
21    String,
22    /// 识别格式化字符串状态
23    FormatString,
24    /// 识别注释状态
25    Comment,
26    /// 识别多行注释状态
27    MultilineComment,
28    /// 完成状态,带有类型信息
29    Done(TokenType),
30}
/// Coarse lexeme category decided by the state machine before the concrete
/// token is built.
#[derive(Debug, Clone, Copy, PartialEq)]
enum TokenType {
    /// Identifier-shaped lexeme (may turn out to be a keyword or boolean).
    Identifier,
    /// Integer literal.
    Integer,
    /// Floating-point literal.
    Float,
    /// Operator or delimiter.
    Symbol,
}
43
44#[derive(Debug, Clone)]
45pub struct Lexer {
46    file_name: String,
47    chars: Vec<char>,
48    index: usize,
49    col: usize,
50    row: usize,
51    keywords: Vec<&'static str>,
52    /// 当前状态
53    state: State,
54    /// 当前词素缓冲区
55    buffer: String,
56    /// 开始位置
57    start_pos: usize,
58    start_col: usize,
59    start_row: usize,
60}
61
62// impl IntoIterator for Lexer {
63//     type Item = (Token, Position);
64//     type IntoIter = TokenStream;
65//
66//     fn into_iter(self) -> Self::IntoIter {
67//         TokenStream::new(self)
68//     }
69// }
70impl Lexer {
71    pub fn is_eof(&self) -> bool {
72        self.index >= self.chars.len()
73    }
74    pub fn new(file_name: impl Into<String>) -> Self {
75        Self {
76            chars: vec![],
77            index: 0,
78            col: 1,
79            row: 1,
80            file_name: file_name.into(),
81            keywords: vec![
82                "let", "fn", "fun", "return", "if", "while", "import", "else", "val", "var",
83                "break", "continue", "for", "in", "class", "static", "trait", "struct", "extern",
84                "module", "const", "enum", "match",
85            ],
86            state: State::Initial,
87            buffer: String::new(),
88            start_pos: 0,
89            start_col: 1,
90            start_row: 1,
91        }
92    }
93    #[allow(unused)]
94    pub fn get_file_name(&self) -> String {
95        self.file_name.clone()
96    }
97    #[allow(unused)]
98    pub fn line(&self, line: usize) -> String {
99        let mut s = String::new();
100        let mut current_line = 1;
101        for i in &self.chars {
102            if i == &'\n' {
103                current_line += 1;
104                continue;
105            }
106            if current_line == line {
107                s.push(*i);
108            }
109            if current_line > line {
110                break;
111            }
112        }
113        s
114    }
115    #[allow(unused)]
116    pub fn set_chars(&mut self, chars: Vec<char>) {
117        self.chars = chars;
118    }
119
120    /// 开始新的词法分析
121    fn start_token(&mut self) {
122        self.buffer.clear();
123        self.start_pos = self.index;
124        self.start_col = self.col;
125        self.start_row = self.row;
126        self.state = State::Initial;
127    }
128
129    /// 创建token位置
130    fn create_position(&self) -> Position {
131        Position::new(
132            self.start_pos,
133            self.buffer.len(),
134            self.start_row,
135            self.start_col,
136            &self.file_name,
137        )
138    }
139
140    pub fn next_token(&mut self) -> Option<(Token, Position)> {
141        self.start_token();
142
143        while self.index < self.chars.len() {
144            let c = self.current_char().unwrap();
145            let peek = self.peek_char().unwrap_or('\0');
146
147            match &self.state {
148                State::Initial => self.process_initial_state(c, peek),
149                State::Identifier => self.process_identifier_state(c),
150                State::Number => self.process_number_state(c, peek),
151                State::Decimal => self.process_decimal_state(c),
152                State::String => {
153                    if let Some(token) = self.process_string_state(c) {
154                        return Some(token);
155                    }
156                }
157                State::FormatString => {
158                    if let Some(token) = self.process_format_string_state(c) {
159                        return Some(token);
160                    }
161                }
162                State::Comment => {
163                    if c == '\n' {
164                        self.state = State::Initial;
165                    }
166                    self.increase_index();
167                }
168                State::MultilineComment => {
169                    if c == '*' && peek == '/' {
170                        self.increase_index();
171                        self.increase_index();
172                        self.state = State::Initial;
173                    } else {
174                        self.increase_index();
175                    }
176                }
177                State::Done(token_type) => {
178                    return self.finalize_token(*token_type);
179                }
180            }
181
182            // 如果已经到达Done状态,返回结果
183            if let State::Done(token_type) = &self.state {
184                return self.finalize_token(*token_type);
185            }
186        }
187
188        // 处理文件结束情况
189        if !self.buffer.is_empty() {
190            if let State::Identifier = self.state {
191                return self.finalize_token(TokenType::Identifier);
192            } else if let State::Number = self.state {
193                return self.finalize_token(TokenType::Integer);
194            } else if let State::Decimal = self.state {
195                return self.finalize_token(TokenType::Float);
196            } else {
197                // 默认作为标识符处理未完成的token
198                return self.finalize_token(TokenType::Identifier);
199            }
200        }
201
202        None
203    }
204
205    /// 处理初始状态
206    fn process_initial_state(&mut self, c: char, peek: char) {
207        match (c, peek) {
208            ('r', '"') => {
209                self.buffer.push(c);
210                self.buffer.push(peek);
211                self.increase_index();
212                self.increase_index();
213                self.state = State::FormatString;
214            }
215            ('.', p) if !p.is_numeric() => {
216                self.buffer.push(c);
217                self.increase_index();
218                self.state = State::Done(TokenType::Symbol);
219            }
220            ('0'..='9', _) => {
221                self.buffer.push(c);
222                self.increase_index();
223                self.state = State::Number;
224            }
225            ('.', p) if p.is_numeric() => {
226                self.buffer.push(c);
227                self.increase_index();
228                self.state = State::Decimal;
229            }
230            ('a'..='z' | 'A'..='Z' | '_', _) => {
231                self.buffer.push(c);
232                self.increase_index();
233                self.state = State::Identifier;
234            }
235            ('"', _) => {
236                self.buffer.push(c);
237                self.increase_index();
238                self.state = State::String;
239            }
240            ('\'', _) => {
241                self.buffer.push(c);
242                self.increase_index();
243                self.state = State::String;
244            }
245            ('/', '/') => {
246                self.increase_index();
247                self.increase_index();
248                self.state = State::Comment;
249            }
250            ('/', '*') => {
251                self.increase_index();
252                self.increase_index();
253                self.state = State::MultilineComment;
254            }
255            (':', ':') => {
256                self.buffer.push(':');
257                self.buffer.push(':');
258                self.increase_index();
259                self.increase_index();
260                self.state = State::Done(TokenType::Symbol);
261            }
262            ('!', '=') => {
263                self.buffer.push('!');
264                self.buffer.push('=');
265                self.increase_index();
266                self.increase_index();
267                self.state = State::Done(TokenType::Symbol);
268            }
269            ('=', '=') => {
270                self.buffer.push('=');
271                self.buffer.push('=');
272                self.increase_index();
273                self.increase_index();
274                self.state = State::Done(TokenType::Symbol);
275            }
276            ('&', '&') => {
277                self.buffer.push('&');
278                self.buffer.push('&');
279                self.increase_index();
280                self.increase_index();
281                self.state = State::Done(TokenType::Symbol);
282            }
283            ('-', '>') => {
284                self.buffer.push('-');
285                self.buffer.push('>');
286                self.increase_index();
287                self.increase_index();
288                self.state = State::Done(TokenType::Symbol);
289            }
290            ('>', '=') => {
291                self.buffer.push('>');
292                self.buffer.push('=');
293                self.increase_index();
294                self.increase_index();
295                self.state = State::Done(TokenType::Symbol);
296            }
297            ('<', '=') => {
298                self.buffer.push('<');
299                self.buffer.push('=');
300                self.increase_index();
301                self.increase_index();
302                self.state = State::Done(TokenType::Symbol);
303            }
304            ('(', _)
305            | (')', _)
306            | ('{', _)
307            | ('}', _)
308            | ('[', _)
309            | (']', _)
310            | (':', _)
311            | (',', _)
312            | ('|', _)
313            | ('!', _)
314            | ('@', _)
315            | ('=', _)
316            | ('&', _)
317            | ('>', _)
318            | ('<', _)
319            | ('+', _)
320            | ('-', _)
321            | ('*', _)
322            | ('%', _)
323            | ('/', _) => {
324                self.buffer.push(c);
325                self.increase_index();
326                self.state = State::Done(TokenType::Symbol);
327            }
328            (' ' | '\r' | ';' | '\t' | '\n', _) => {
329                self.increase_index();
330            }
331            _ => {
332                println!("Unexpected character: {:?}", c);
333                self.increase_index();
334            }
335        }
336    }
337
338    /// 处理标识符状态
339    fn process_identifier_state(&mut self, c: char) {
340        if c.is_alphanumeric() || c == '_' {
341            self.buffer.push(c);
342            self.increase_index();
343        } else {
344            self.state = State::Done(TokenType::Identifier);
345        }
346    }
347
348    /// 处理数字状态
349    fn process_number_state(&mut self, c: char, peek: char) {
350        if c.is_numeric() {
351            self.buffer.push(c);
352            self.increase_index();
353        } else if c == '.' && peek.is_numeric() {
354            self.buffer.push(c);
355            self.increase_index();
356            self.state = State::Decimal;
357        } else {
358            self.state = State::Done(TokenType::Integer);
359        }
360    }
361
362    /// 处理小数状态
363    fn process_decimal_state(&mut self, c: char) {
364        if c.is_numeric() {
365            self.buffer.push(c);
366            self.increase_index();
367        } else {
368            self.state = State::Done(TokenType::Float);
369        }
370    }
371
372    /// 处理字符串状态
373    fn process_string_state(&mut self, c: char) -> Option<(Token, Position)> {
374        let quote = self.buffer.chars().next().unwrap();
375        if c == quote {
376            self.buffer.push(c);
377            self.increase_index();
378            let pos = self.create_position();
379            let content = self.buffer[1..self.buffer.len() - 1]
380                .to_string()
381                .replace("\\n", "\n");
382            Some((Token::String(content), pos))
383        } else {
384            self.buffer.push(c);
385            self.increase_index();
386            None
387        }
388    }
389
390    /// 处理格式化字符串状态
391    fn process_format_string_state(&mut self, c: char) -> Option<(Token, Position)> {
392        if c == '"' {
393            self.buffer.push(c);
394            self.increase_index();
395            let pos = self.create_position();
396            let content = self.buffer[2..self.buffer.len() - 1]
397                .to_string()
398                .replace("\\n", "\n");
399            Some((Token::FormatString(content), pos))
400        } else {
401            self.buffer.push(c);
402            self.increase_index();
403            None
404        }
405    }
406
407    /// 完成token处理
408    fn finalize_token(&mut self, token_type: TokenType) -> Option<(Token, Position)> {
409        let pos = self.create_position();
410
411        // 根据缓冲区内容和当前状态生成token
412        let token = match token_type {
413            TokenType::Identifier => {
414                let ident = self.buffer.clone();
415                if self.keywords.contains(&ident.as_str()) {
416                    Token::Keyword(ident)
417                } else if ident == "true" {
418                    Token::Boolean(true)
419                } else if ident == "false" {
420                    Token::Boolean(false)
421                } else {
422                    Token::Identifier(ident)
423                }
424            }
425            TokenType::Integer => {
426                let i: i64 = self.buffer.parse().unwrap();
427                Token::Int(i)
428            }
429            TokenType::Float => {
430                let f: f32 = self.buffer.parse().unwrap();
431                Token::Float(f)
432            }
433            TokenType::Symbol => match self.buffer.as_str() {
434                "(" => Token::BraceLeft,
435                ")" => Token::BraceRight,
436                "{" => Token::ParenLeft,
437                "}" => Token::ParenRight,
438                "[" => Token::BracketLeft,
439                "]" => Token::BracketRight,
440                "." => Token::Dot,
441                ":" => Token::Colon,
442                "::" => Token::ScopeSymbol,
443                "=" => Token::Assign,
444                "," => Token::Comma,
445                "+" => Token::Plus,
446                "-" => Token::Minus,
447                "*" => Token::Star,
448                "/" => Token::Slash,
449                "%" => Token::Mod,
450                ">" => Token::Greater,
451                "<" => Token::Less,
452                "<=" => Token::LessEqual,
453                ">=" => Token::GreaterEqual,
454                "==" => Token::Equal,
455                "!=" => Token::NotEqual,
456                "->" => Token::Arrow,
457                "!" => Token::Not,
458                "&&" => Token::And,
459                "|" => Token::Vertical,
460                "@" => Token::Annotation,
461                "&" => Token::BitAnd,
462                _ => {
463                    println!("Unknown token: {}", self.buffer);
464                    return None;
465                }
466            },
467            // TokenType::String => {
468            //     let content = self.buffer[1..self.buffer.len() - 1]
469            //         .to_string()
470            //         .replace("\\n", "\n");
471            //     Token::String(content)
472            // }
473            // TokenType::FormatString => {
474            //     let content = self.buffer[2..self.buffer.len() - 1]
475            //         .to_string()
476            //         .replace("\\n", "\n");
477            //     Token::FormatString(content)
478            // }
479        };
480
481        Some((token, pos))
482    }
483
484    fn peek_char(&self) -> Option<char> {
485        self.chars.get(self.index + 1).copied()
486    }
487
488    fn current_char(&self) -> Option<char> {
489        self.chars.get(self.index).copied()
490    }
491
492    fn increase_index(&mut self) {
493        if self.index < self.chars.len() && self.chars[self.index] == '\n' {
494            self.row += 1;
495            self.col = 0;
496        }
497        self.index = self.index.add(1);
498        self.col += 1;
499    }
500
501    #[allow(unused)]
502    pub fn get_source(&self) -> Vec<char> {
503        self.chars.clone()
504    }
505
506    pub fn from_script(module_name: impl Into<String>, script: impl AsRef<str>) -> Self {
507        let script = script.as_ref().chars().collect();
508        let mut lexer = Lexer::new(module_name);
509        lexer.set_chars(script);
510        lexer
511    }
512}