oak_smalltalk/lexer/
mod.rs

1use crate::{kind::SmalltalkKind, language::SmalltalkLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, SmalltalkLanguage>;
5
6#[derive(Clone)]
7pub struct SmalltalkLexer<'config> {
8    config: &'config SmalltalkLanguage,
9}
10
11impl<'config> SmalltalkLexer<'config> {
12    pub fn new(config: &'config SmalltalkLanguage) -> Self {
13        Self { config }
14    }
15
16    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            let safe_point = state.get_position();
19
20            if self.skip_whitespace(state) {
21                continue;
22            }
23
24            if self.lex_newline(state) {
25                continue;
26            }
27
28            if self.lex_comment(state) {
29                continue;
30            }
31
32            if self.lex_number(state) {
33                continue;
34            }
35
36            if self.lex_identifier(state) {
37                continue;
38            }
39
40            if self.lex_punctuation(state) {
41                continue;
42            }
43
44            // 错误处理：如果没有匹配任何规则，跳过当前字符并标记为错误
45            let start_pos = state.get_position();
46            if let Some(ch) = state.peek() {
47                state.advance(ch.len_utf8());
48                state.add_token(SmalltalkKind::Error, start_pos, state.get_position());
49            }
50
51            state.safe_check(safe_point);
52        }
53
54        // 添加 EOF token
55        let eof_pos = state.get_position();
56        state.add_token(SmalltalkKind::Eof, eof_pos, eof_pos);
57        Ok(())
58    }
59
60    /// 跳过空白字符
61    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
62        let start_pos = state.get_position();
63
64        while let Some(ch) = state.peek() {
65            if ch == ' ' || ch == '\t' {
66                state.advance(ch.len_utf8());
67            }
68            else {
69                break;
70            }
71        }
72
73        if state.get_position() > start_pos {
74            state.add_token(SmalltalkKind::Whitespace, start_pos, state.get_position());
75            true
76        }
77        else {
78            false
79        }
80    }
81
82    /// 处理换行符
83    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
84        let start_pos = state.get_position();
85
86        if let Some('\n') = state.peek() {
87            state.advance(1);
88            state.add_token(SmalltalkKind::Newline, start_pos, state.get_position());
89            true
90        }
91        else if let Some('\r') = state.peek() {
92            state.advance(1);
93            if let Some('\n') = state.peek() {
94                state.advance(1);
95            }
96            state.add_token(SmalltalkKind::Newline, start_pos, state.get_position());
97            true
98        }
99        else {
100            false
101        }
102    }
103
104    /// 处理注释
105    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
106        let start_pos = state.get_position();
107
108        if let Some('"') = state.peek() {
109            state.advance(1);
110
111            while let Some(ch) = state.peek() {
112                if ch == '"' {
113                    state.advance(1);
114                    break;
115                }
116                state.advance(ch.len_utf8());
117            }
118
119            state.add_token(SmalltalkKind::Comment, start_pos, state.get_position());
120            true
121        }
122        else {
123            false
124        }
125    }
126
127    /// 处理标识符
128    fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
129        let start_pos = state.get_position();
130
131        if let Some(ch) = state.peek() {
132            if ch.is_alphabetic() || ch == '_' {
133                state.advance(ch.len_utf8());
134
135                while let Some(ch) = state.peek() {
136                    if ch.is_alphanumeric() || ch == '_' {
137                        state.advance(ch.len_utf8());
138                    }
139                    else {
140                        break;
141                    }
142                }
143
144                state.add_token(SmalltalkKind::Identifier, start_pos, state.get_position());
145                true
146            }
147            else {
148                false
149            }
150        }
151        else {
152            false
153        }
154    }
155
156    /// 处理数字
157    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
158        let start_pos = state.get_position();
159
160        if let Some(ch) = state.peek() {
161            if ch.is_ascii_digit() {
162                state.advance(1);
163
164                while let Some(ch) = state.peek() {
165                    if ch.is_ascii_digit() {
166                        state.advance(1);
167                    }
168                    else {
169                        break;
170                    }
171                }
172
173                state.add_token(SmalltalkKind::Number, start_pos, state.get_position());
174                true
175            }
176            else {
177                false
178            }
179        }
180        else {
181            false
182        }
183    }
184
185    /// 处理标点符号
186    fn lex_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
187        let start_pos = state.get_position();
188
189        if let Some(ch) = state.peek() {
190            let kind = match ch {
191                '(' => SmalltalkKind::LeftParen,
192                ')' => SmalltalkKind::RightParen,
193                '[' => SmalltalkKind::LeftBracket,
194                ']' => SmalltalkKind::RightBracket,
195                '{' => SmalltalkKind::LeftBrace,
196                '}' => SmalltalkKind::RightBrace,
197                '.' => SmalltalkKind::Dot,
198                ';' => SmalltalkKind::Semicolon,
199                ',' => SmalltalkKind::Comma,
200                '+' => SmalltalkKind::Plus,
201                '-' => SmalltalkKind::Minus,
202                '*' => SmalltalkKind::Star,
203                '/' => SmalltalkKind::Slash,
204                '=' => SmalltalkKind::Equal,
205                '<' => SmalltalkKind::Less,
206                '>' => SmalltalkKind::Greater,
207                _ => return false,
208            };
209
210            state.advance(1);
211            state.add_token(kind, start_pos, state.get_position());
212            true
213        }
214        else {
215            false
216        }
217    }
218}
219
220impl<'config> Lexer<SmalltalkLanguage> for SmalltalkLexer<'config> {
221    fn lex_incremental(
222        &self,
223        source: impl Source,
224        changed: usize,
225        cache: IncrementalCache<SmalltalkLanguage>,
226    ) -> LexOutput<SmalltalkLanguage> {
227        let mut state = LexerState::new_with_cache(source, changed, cache);
228        let result = self.run(&mut state);
229        state.finish(result)
230    }
231}
oak_smalltalk/lexer/mod.rs

oak_smalltalk/lexer/
mod.rs